diff options
| author | Pavan Sondur <pavan@gluster.com> | 2010-08-06 05:31:45 +0000 | 
|---|---|---|
| committer | Anand V. Avati <avati@dev.gluster.com> | 2010-08-06 04:09:07 -0700 | 
| commit | acdeed002d30209e0a058c2df0346d4f16c08994 (patch) | |
| tree | 9c6acda8d92494952f4a80134303b9d2d1c3e1ac | |
| parent | 453cb4bf0b70c876eb468def34054095cfd66359 (diff) | |
add pump xlator and changes for replace-brick
Signed-off-by: Pavan Vilas Sondur <pavan@gluster.com>
Signed-off-by: Anand V. Avati <avati@dev.gluster.com>
BUG: 1235 (Bug for all pump/migrate commits)
URL: http://bugs.gluster.com/cgi-bin/bugzilla3/show_bug.cgi?id=1235
25 files changed, 5461 insertions, 2617 deletions
diff --git a/cli/src/cli-cmd-parser.c b/cli/src/cli-cmd-parser.c index 6c96d0836c8..ddb376bcbdc 100644 --- a/cli/src/cli-cmd-parser.c +++ b/cli/src/cli-cmd-parser.c @@ -442,8 +442,8 @@ cli_cmd_volume_replace_brick_parse (const char **words, int wordcount,          if (!strcasecmp ("start", op)) {                  replace_op = GF_REPLACE_OP_START; -        } else if (!strcasecmp ("stop", op)) { -                replace_op = GF_REPLACE_OP_STOP; +        } else if (!strcasecmp ("commit", op)) { +                replace_op = GF_REPLACE_OP_COMMIT;          } else if (!strcasecmp ("pause", op)) {                  replace_op = GF_REPLACE_OP_PAUSE;          } else if (!strcasecmp ("abort", op)) { diff --git a/cli/src/cli.h b/cli/src/cli.h index c31b4631ab8..7b2de667358 100644 --- a/cli/src/cli.h +++ b/cli/src/cli.h @@ -37,6 +37,14 @@ enum argp_option_keys {  	ARGP_PORT_KEY = 'p',  }; +typedef enum replace_brick_cmd { +        REPLACE_BRICK_START, +        REPLACE_BRICK_PAUSE, +        REPLACE_BRICK_ABORT, +        REPLACE_BRICK_STATUS, +        REPLACE_BRICK_COMMIT, +} replace_brick_cmd_t; +  struct cli_state;  struct cli_cmd_word;  struct cli_cmd_tree; @@ -116,6 +124,13 @@ struct cli_local {                  struct {                          char    *volname;                  } defrag_vol; + +                struct { +                        char    *volume; +                        replace_brick_cmd_t op; +                        char *src_brick; +                        char *dst_brick; +                } replace_brick;          } u;  }; diff --git a/cli/src/cli3_1-cops.c b/cli/src/cli3_1-cops.c index cfede807651..929f490c22e 100644 --- a/cli/src/cli3_1-cops.c +++ b/cli/src/cli3_1-cops.c @@ -31,6 +31,7 @@  #include "cli1-xdr.h"  #include "cli1.h"  #include "protocol-common.h" +#include "cli-mem-types.h"  extern rpc_clnt_prog_t *cli_rpc_prog;  extern int      cli_op_ret; @@ -650,27 +651,133 @@ out:  } +static int +replace_brick_mount (char *volname) +{ +        char  cmd_str[8192] = {0,}; + +        gf_log ("", GF_LOG_DEBUG, +                "creating directory"); + +        snprintf (cmd_str, 4096, "mkdir -p /tmp/mnt"); +        system (cmd_str); + +        gf_log ("", GF_LOG_DEBUG, +                "creating maintenance mount"); + +        snprintf (cmd_str, 4096, "glusterfs -f /tmp/replace_brick.vol /tmp/mnt -l /tmp/pav_log -LTRACE"); + +        system (cmd_str); + +        return 0; +} +  int  gf_cli3_1_replace_brick_cbk (struct rpc_req *req, struct iovec *iov,                               int count, void *myframe)  {          gf1_cli_replace_brick_rsp       rsp   = {0,};          int                             ret   = 0; +        cli_local_t                     *local = NULL; +        call_frame_t                    *frame = NULL; +        char *src_brick = NULL; +        char *dst_brick = NULL; +        char *replace_brick_op = NULL; +        char  status_msg[8192] = {0,}; +        char  cmd_str[8192] = {0,};          if (-1 == req->rpc_status) {                  goto out;          } +        frame = (call_frame_t *) myframe; +        local = frame->local; +          ret = gf_xdr_to_cli_replace_brick_rsp (*iov, &rsp);          if (ret < 0) {                  gf_log ("", GF_LOG_ERROR, "error");                  goto out;          } +        switch (local->u.replace_brick.op) { +        case REPLACE_BRICK_START: + +                replace_brick_op = "Replace brick start operation"; + +                replace_brick_mount (local->u.replace_brick.volume); + +                gf_log ("", GF_LOG_DEBUG, +                        "sending setxattr"); + +                snprintf (cmd_str, 4096, "sleep 2; stat /tmp/mnt/ >/dev/null;setfattr -n trusted.glusterfs.pump.start /tmp/mnt/"); + +                system (cmd_str); + +                gf_log ("", GF_LOG_DEBUG, +                        "umounting"); + +                snprintf (cmd_str, 4096, "umount -f /tmp/mnt 2>/dev/null"); + +                ret = system (cmd_str); + +                system ("rmdir /tmp/mnt"); +                break; + +        case REPLACE_BRICK_STATUS: + +                replace_brick_op = "Replace brick status operation"; + +                snprintf (cmd_str, 4096, "mkdir -p /tmp/mnt"); +                system (cmd_str); + +                snprintf (cmd_str, 4096, "glusterfs -f /tmp/replace_brick.vol /tmp/mnt -l /tmp/pav_log -LTRACE"); +                system (cmd_str); + +                gf_log ("", GF_LOG_DEBUG, +                        "sending getxattr"); + +                ret = getxattr ("/tmp/mnt/", "trusted.glusterfs.pump.status", status_msg, 8192); +                fprintf (stdout, "%s\n", status_msg); + +                gf_log ("", GF_LOG_DEBUG, +                        "umounting"); + +                snprintf (cmd_str, 4096, "umount -f /tmp/mnt 2>/dev/null"); + +                ret = system (cmd_str); + +                system ("rmdir /tmp/mnt"); +                break; + +        case REPLACE_BRICK_COMMIT: + +                replace_brick_op = "Replace brick commit operation"; + +                src_brick = local->u.replace_brick.src_brick; +                dst_brick = local->u.replace_brick.dst_brick; + +                snprintf (cmd_str, 4096, "gluster volume add-brick %s %s >/dev/null", +                          local->u.replace_brick.volume, dst_brick); + +                ret = system (cmd_str); + +                snprintf (cmd_str, 4096, "gluster volume remove-brick %s %s >/dev/null", +                          local->u.replace_brick.volume, src_brick); + +                ret = system (cmd_str); + +                break; + +        default: +                break; +        } +          gf_log ("cli", GF_LOG_NORMAL, "Received resp to replace brick"); -        cli_out ("Replace Brick %s", (rsp.op_ret) ? "unsuccessful": -                                        "successful"); +        cli_out ("%s %s", +                 replace_brick_op ? replace_brick_op : "Unknown operation", +                 (rsp.op_ret) ? "unsuccessful": +                 "successful");          ret = rsp.op_ret; @@ -1183,25 +1290,76 @@ gf_cli3_1_replace_brick (call_frame_t *frame, xlator_t *this,          gf1_cli_replace_brick_req  req = {0,};          int                        ret = 0;          dict_t                     *dict = NULL; +        cli_local_t                *local = NULL; +        char                       *src_brick = NULL; +        char                       *dst_brick = NULL;          if (!frame || !this ||  !data) {                  ret = -1;                  goto out;          } -        dict = data; +        local = GF_CALLOC (1, sizeof (*local), cli_mt_cli_local_t); + +	dict = data; + +       ret = dict_get_int32 (dict, "operation", (int32_t *)&req.op); + +		if (ret) +			                       goto out; + +        switch (req.op) { +        case GF_REPLACE_OP_START: +                local->u.replace_brick.op = REPLACE_BRICK_START; +                break; +        case GF_REPLACE_OP_PAUSE: +                local->u.replace_brick.op = REPLACE_BRICK_PAUSE; +                break; +        case GF_REPLACE_OP_ABORT: +                local->u.replace_brick.op = REPLACE_BRICK_ABORT; +                break; +        case GF_REPLACE_OP_STATUS: +                local->u.replace_brick.op = REPLACE_BRICK_STATUS; +                break; +        case GF_REPLACE_OP_COMMIT: +                local->u.replace_brick.op = REPLACE_BRICK_COMMIT; +                break; + +        default: +                break; +        } +          ret = dict_get_str (dict, "volname", &req.volname);          if (ret)                  goto out; -        ret = dict_get_int32 (dict, "operation", (int32_t *)&req.op); +        ret = dict_get_str (dict, "src-brick", &src_brick); -        if (ret) +        if (ret) { +                goto out; +        } + +        ret = dict_get_str (dict, "dst-brick", &dst_brick); + +        if (ret) {                  goto out; +        } + +        local->u.replace_brick.volume = strdup (req.volname); +        local->u.replace_brick.src_brick = strdup (src_brick); +        local->u.replace_brick.dst_brick = strdup (dst_brick); +        frame->local = local; -        if (GF_REPLACE_OP_START == req.op) { + +        ret = dict_allocate_and_serialize (dict, +                                           &req.bricks.bricks_val, +                                           (size_t *)&req.bricks.bricks_len); +        if (ret < 0) { +                gf_log (this->name, GF_LOG_DEBUG, +                        "failed to get serialized length of dict"); +                goto out;          }          ret = cli_cmd_submit (&req, frame, cli_rpc_prog, @@ -1216,10 +1374,6 @@ out:                  GF_FREE (req.bricks.bricks_val);          } -        if (req.bricks.bricks_val) { -                GF_FREE (req.bricks.bricks_val); -        } -          return ret;  } diff --git a/libglusterfs/src/syncop.c b/libglusterfs/src/syncop.c index beb5d9db4a1..cfece659171 100644 --- a/libglusterfs/src/syncop.c +++ b/libglusterfs/src/syncop.c @@ -24,6 +24,20 @@  #include "syncop.h" +call_frame_t * +syncop_create_frame () +{ +        struct synctask *task = NULL; +        struct call_frame_t *frame = NULL; + +        task = synctask_get (); + +        if (task) { +                frame = task->opaque; +        } + +        return (call_frame_t *)frame; +}  void  synctask_yield (struct synctask *task) @@ -95,6 +109,8 @@ synctask_wrap (struct synctask *task)          */          task->complete = 1;          synctask_wake (task); + +        synctask_yield (task);  } @@ -105,7 +121,7 @@ synctask_destroy (struct synctask *task)                  return;          if (task->stack) -                FREE (task); +                FREE (task->stack);          FREE (task);  } @@ -305,7 +321,181 @@ syncop_lookup (xlator_t *subvol, loc_t *loc, dict_t *xattr_req,          return args.op_ret;  } +static gf_dirent_t * +entry_copy (gf_dirent_t *source) +{ +        gf_dirent_t *sink = NULL; + +        sink = gf_dirent_for_name (source->d_name); + +        sink->d_off = source->d_off; +        sink->d_ino = source->d_ino; +        sink->d_type = source->d_type; + +        return sink; +} + +int32_t +syncop_readdirp_cbk (call_frame_t *frame, +		      void *cookie, +		      xlator_t *this, +		      int32_t op_ret, +		      int32_t op_errno, +		      gf_dirent_t *entries) +{ +        struct syncargs *args = NULL; +        gf_dirent_t *entry = NULL; +        gf_dirent_t  *tmp = NULL; + +        int count = 0; + +        args = cookie; + +        INIT_LIST_HEAD (&args->entries.list); + +        args->op_ret   = op_ret; +        args->op_errno = op_errno; + +        if (op_ret >= 0) { +                list_for_each_entry (entry, &entries->list, list) { +                        tmp = entry_copy (entry); +                        gf_log (this->name, GF_LOG_TRACE, +                                "adding entry=%s, count=%d", +                                tmp->d_name, count); +                        list_add_tail (&tmp->list, &(args->entries.list)); +                        count++; +                } +        } + +        __wake (args); + +        return 0; + +} + +int +syncop_readdirp (xlator_t *subvol, +		 fd_t *fd, +		 size_t size, +		 off_t off, +                 gf_dirent_t *entries) +{ +        struct syncargs args = {0, }; + +        SYNCOP (subvol, (&args), syncop_readdirp_cbk, subvol->fops->readdirp, +                fd, size, off); + +        if (entries) +                list_splice_init (&args.entries.list, &entries->list); + +        errno = args.op_errno; +        return args.op_ret; + +} + +int32_t +syncop_opendir_cbk (call_frame_t *frame, +                    void *cookie, +                    xlator_t *this, +                    int32_t op_ret, +                    int32_t op_errno, +                    fd_t *fd) +{ +        struct syncargs *args = NULL; + +        args = cookie; + +        args->op_ret   = op_ret; +        args->op_errno = op_errno; + +        __wake (args); + +        return 0; +} + +int +syncop_opendir (xlator_t *subvol, +                loc_t *loc, +                fd_t *fd) +{ +        struct syncargs args = {0, }; +        SYNCOP (subvol, (&args), syncop_opendir_cbk, subvol->fops->opendir, +                loc, fd); + +        errno = args.op_errno; +        return args.op_ret; + +} + + +int +syncop_setxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +                     int op_ret, int op_errno) +{ +        struct syncargs *args = NULL; + +        args = cookie; + +        args->op_ret   = op_ret; +        args->op_errno = op_errno; + +        __wake (args); + +        return 0; +} + + +int +syncop_setxattr (xlator_t *subvol, loc_t *loc, dict_t *dict, int32_t flags) +{ +        struct syncargs args = {0, }; + +        SYNCOP (subvol, (&args), syncop_setxattr_cbk, subvol->fops->setxattr, +                loc, dict, flags); + +        errno = args.op_errno; +        return args.op_ret; +} + +int +syncop_statfs_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +		    int32_t op_ret, int32_t op_errno, +		    struct statvfs *buf) + +{ +        struct syncargs *args = NULL; + +        args = cookie; + +        args->op_ret   = op_ret; +        args->op_errno = op_errno; + +        if (op_ret == 0) { +                args->statvfs_buf  = *buf; +        } + +        __wake (args); + +        return 0; +} + + +int +syncop_statfs (xlator_t *subvol, loc_t *loc, struct statvfs *buf) + +{ +        struct syncargs args = {0, }; + +        SYNCOP (subvol, (&args), syncop_statfs_cbk, subvol->fops->statfs, +                loc); + +        if (buf) +                *buf = args.statvfs_buf; + +        errno = args.op_errno; +        return args.op_ret; +}  int  syncop_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, diff --git a/libglusterfs/src/syncop.h b/libglusterfs/src/syncop.h index ce364b07301..13b07ed31fd 100644 --- a/libglusterfs/src/syncop.h +++ b/libglusterfs/src/syncop.h @@ -76,6 +76,8 @@ struct syncargs {          struct iatt         iatt1;          struct iatt         iatt2;          dict_t             *xattr; +        gf_dirent_t        entries; +        struct statvfs     statvfs_buf;          /* do not touch */          pthread_mutex_t     mutex; @@ -134,10 +136,10 @@ struct syncargs {  #define SYNCOP(subvol, stb, cbk, op, params ...) do {                   \          call_frame_t    *frame = NULL;                                  \                                                                          \ -        frame = create_frame (THIS, THIS->ctx->pool);                   \ +        frame = syncop_create_frame ();                                 \                                                                          \          __yawn (stb);                                                   \ -        STACK_WIND_COOKIE (frame, (void *)stb, cbk, subvol, op, params);\ +        STACK_WIND_COOKIE (frame, cbk, (void *)stb, subvol, op, params);\          __yield (stb);                                                  \  } while (0) @@ -157,8 +159,23 @@ int syncop_lookup (xlator_t *subvol, loc_t *loc, dict_t *xattr_req,                     /* out */                     struct iatt *iatt, dict_t **xattr_rsp, struct iatt *parent); +int syncop_readdirp (xlator_t *subvol, fd_t *fd, size_t size, off_t off, +                     /* out */ +                     gf_dirent_t *entries); + +int +syncop_opendir (xlator_t *subvol, +                loc_t *loc, +                fd_t *fd); +  int syncop_setattr (xlator_t *subvol, loc_t *loc, struct iatt *iatt, int valid,                      /* out */                      struct iatt *preop, struct iatt *postop); +int +syncop_statfs (xlator_t *subvol, loc_t *loc, struct statvfs *buf); + +int +syncop_setxattr (xlator_t *subvol, loc_t *loc, dict_t *dict, int32_t flags); +  #endif /* _SYNCOP_H */ diff --git a/libglusterfs/src/xlator.h b/libglusterfs/src/xlator.h index e373cc42400..ef79c354fcc 100644 --- a/libglusterfs/src/xlator.h +++ b/libglusterfs/src/xlator.h @@ -76,6 +76,7 @@ typedef int32_t (*event_notify_fn_t) (xlator_t *this, int32_t event, void *data,  #include "iatt.h" +  struct _loc {  	const char *path;  	const char *name; diff --git a/rpc/xdr/src/cli1-xdr.h b/rpc/xdr/src/cli1-xdr.h index 51fce3e9296..2f71df4a891 100644 --- a/rpc/xdr/src/cli1-xdr.h +++ b/rpc/xdr/src/cli1-xdr.h @@ -24,7 +24,7 @@ typedef enum gf1_cluster_type gf1_cluster_type;  enum gf1_cli_replace_op {  	GF_REPLACE_OP_NONE = 0,  	GF_REPLACE_OP_START = 0 + 1, -	GF_REPLACE_OP_STOP = 0 + 2, +	GF_REPLACE_OP_COMMIT = 0 + 2,  	GF_REPLACE_OP_PAUSE = 0 + 3,  	GF_REPLACE_OP_ABORT = 0 + 4,  	GF_REPLACE_OP_STATUS = 0 + 5, diff --git a/xlators/cluster/afr/src/Makefile.am b/xlators/cluster/afr/src/Makefile.am index ece459ca772..699b3da77be 100644 --- a/xlators/cluster/afr/src/Makefile.am +++ b/xlators/cluster/afr/src/Makefile.am @@ -1,12 +1,17 @@ -xlator_LTLIBRARIES = afr.la +xlator_LTLIBRARIES = afr.la pump.la  xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/cluster -afr_la_LDFLAGS = -module -avoidversion +afr_common_source = afr-dir-read.c afr-dir-write.c afr-inode-read.c afr-inode-write.c afr-open.c afr-transaction.c afr-self-heal-data.c afr-self-heal-common.c afr-self-heal-metadata.c afr-self-heal-entry.c afr-self-heal-algorithm.c -afr_la_SOURCES = afr.c afr-dir-read.c afr-dir-write.c afr-inode-read.c afr-inode-write.c afr-open.c afr-transaction.c afr-self-heal-data.c afr-self-heal-common.c afr-self-heal-metadata.c afr-self-heal-entry.c afr-self-heal-algorithm.c +afr_la_LDFLAGS = -module -avoidversion +afr_la_SOURCES = $(afr_common_source) afr.c  afr_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la -noinst_HEADERS = afr.h afr-transaction.h afr-inode-write.h afr-inode-read.h afr-dir-read.h afr-dir-write.h afr-self-heal.h afr-self-heal-common.h afr-self-heal-algorithm.h afr-mem-types.h +pump_la_LDFLAGS = -module -avoidversion +pump_la_SOURCES =  $(afr_common_source) pump.c +pump_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la + +noinst_HEADERS = afr.h afr-transaction.h afr-inode-write.h afr-inode-read.h afr-dir-read.h afr-dir-write.h afr-self-heal.h afr-self-heal-common.h afr-self-heal-algorithm.h pump.h afr-mem-types.h afr-common.c  AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS) \  	    -I$(top_srcdir)/libglusterfs/src -I$(top_srcdir)/contrib/md5 -shared -nostartfiles $(GF_CFLAGS) @@ -15,6 +20,7 @@ CLEANFILES =  uninstall-local:  	rm -f $(DESTDIR)$(xlatordir)/replicate.so +	rm -f $(DESTDIR)$(xlatordir)/pump.so  install-data-hook: -	ln -sf afr.so $(DESTDIR)$(xlatordir)/replicate.so
\ No newline at end of file +	ln -sf afr.so $(DESTDIR)$(xlatordir)/replicate.so diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c new file mode 100644 index 00000000000..aeadceddb6d --- /dev/null +++ b/xlators/cluster/afr/src/afr-common.c @@ -0,0 +1,2642 @@ +/* +   Copyright (c) 2007-2009 Gluster, Inc. <http://www.gluster.com> +   This file is part of GlusterFS. + +   GlusterFS is free software; you can redistribute it and/or modify +   it under the terms of the GNU General Public License as published +   by the Free Software Foundation; either version 3 of the License, +   or (at your option) any later version. + +   GlusterFS is distributed in the hope that it will be useful, but +   WITHOUT ANY WARRANTY; without even the implied warranty of +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU +   General Public License for more details. + +   You should have received a copy of the GNU General Public License +   along with this program.  If not, see +   <http://www.gnu.org/licenses/>. +*/ + +#include <libgen.h> +#include <unistd.h> +#include <fnmatch.h> +#include <sys/time.h> +#include <stdlib.h> +#include <signal.h> + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include "afr.h" +#include "dict.h" +#include "xlator.h" +#include "hashfn.h" +#include "logging.h" +#include "stack.h" +#include "list.h" +#include "call-stub.h" +#include "defaults.h" +#include "common-utils.h" +#include "compat-errno.h" +#include "compat.h" +#include "byte-order.h" +#include "statedump.h" + +#include "fd.h" + +#include "afr-inode-read.h" +#include "afr-inode-write.h" +#include "afr-dir-read.h" +#include "afr-dir-write.h" +#include "afr-transaction.h" +#include "afr-self-heal.h" +#include "afr-self-heal-common.h" +#include "pump.h" + +#define AFR_ICTX_OPENDIR_DONE_MASK     0x0000000200000000ULL +#define AFR_ICTX_SPLIT_BRAIN_MASK      0x0000000100000000ULL +#define AFR_ICTX_READ_CHILD_MASK       0x00000000FFFFFFFFULL + +void +afr_set_lk_owner (call_frame_t *frame, xlator_t *this) +{ +        if (!frame->root->lk_owner) { +                gf_log (this->name, GF_LOG_TRACE, +                        "Setting lk-owner=%llu", +                        (unsigned long long) frame->root); +                frame->root->lk_owner = (uint64_t) frame->root; +        } +} + +uint64_t +afr_is_split_brain (xlator_t *this, inode_t *inode) +{ +        int ret = 0; + +        uint64_t ctx         = 0; +        uint64_t split_brain = 0; + +        VALIDATE_OR_GOTO (inode, out); + +        LOCK (&inode->lock); +        { +                ret = __inode_ctx_get (inode, this, &ctx); + +                if (ret < 0) +                        goto unlock; + +                split_brain = ctx & AFR_ICTX_SPLIT_BRAIN_MASK; +        } +unlock: +        UNLOCK (&inode->lock); + +out: +        return split_brain; +} + + +void +afr_set_split_brain (xlator_t *this, inode_t *inode, gf_boolean_t set) +{ +        uint64_t ctx = 0; +        int      ret = 0; + +        VALIDATE_OR_GOTO (inode, out); + +        LOCK (&inode->lock); +        { +                ret = __inode_ctx_get (inode, this, &ctx); + +                if (ret < 0) { +                        ctx = 0; +                } + +                if (set) { +			ctx = (~AFR_ICTX_SPLIT_BRAIN_MASK & ctx) +                                | (0xFFFFFFFFFFFFFFFFULL & AFR_ICTX_SPLIT_BRAIN_MASK); +		} else { +			ctx = (~AFR_ICTX_SPLIT_BRAIN_MASK & ctx); +		} +                __inode_ctx_put (inode, this, ctx); +        } +        UNLOCK (&inode->lock); +out: +        return; +} + + +uint64_t +afr_is_opendir_done (xlator_t *this, inode_t *inode) +{ +        int ret = 0; + +        uint64_t ctx          = 0; +        uint64_t opendir_done = 0; + +        VALIDATE_OR_GOTO (inode, out); + +        LOCK (&inode->lock); +        { +                ret = __inode_ctx_get (inode, this, &ctx); + +                if (ret < 0) +                        goto unlock; + +                opendir_done = ctx & AFR_ICTX_OPENDIR_DONE_MASK; +        } +unlock: +        UNLOCK (&inode->lock); + +out: +        return opendir_done; +} + + +void +afr_set_opendir_done (xlator_t *this, inode_t *inode) +{ +        uint64_t ctx = 0; +        int      ret = 0; + +        VALIDATE_OR_GOTO (inode, out); + +        LOCK (&inode->lock); +        { +                ret = __inode_ctx_get (inode, this, &ctx); + +                if (ret < 0) { +                        ctx = 0; +                } + +                ctx = (~AFR_ICTX_OPENDIR_DONE_MASK & ctx) +                        | (0xFFFFFFFFFFFFFFFFULL & AFR_ICTX_OPENDIR_DONE_MASK); + +                __inode_ctx_put (inode, this, ctx); +        } +        UNLOCK (&inode->lock); +out: +        return; +} + + +uint64_t +afr_read_child (xlator_t *this, inode_t *inode) +{ +        int ret = 0; + +        uint64_t ctx         = 0; +        uint64_t read_child  = 0; + +        VALIDATE_OR_GOTO (inode, out); + +        LOCK (&inode->lock); +        { +                ret = __inode_ctx_get (inode, this, &ctx); + +                if (ret < 0) +                        goto unlock; + +                read_child = ctx & AFR_ICTX_READ_CHILD_MASK; +        } +unlock: +        UNLOCK (&inode->lock); + +out: +        return read_child; +} + + +void +afr_set_read_child (xlator_t *this, inode_t *inode, int32_t read_child) +{ +        uint64_t ctx = 0; +        int      ret = 0; + +        VALIDATE_OR_GOTO (inode, out); + +        LOCK (&inode->lock); +        { +                ret = __inode_ctx_get (inode, this, &ctx); + +                if (ret < 0) { +                        ctx = 0; +                } + +                ctx = (~AFR_ICTX_READ_CHILD_MASK & ctx) +                        | (AFR_ICTX_READ_CHILD_MASK & read_child); + +                __inode_ctx_put (inode, this, ctx); +        } +        UNLOCK (&inode->lock); + +out: +        return; +} + + +/** + * afr_local_cleanup - cleanup everything in frame->local + */ + +void +afr_local_sh_cleanup (afr_local_t *local, xlator_t *this) +{ +	afr_self_heal_t *sh = NULL; +	afr_private_t   *priv = NULL; +	int              i = 0; + + +	sh = &local->self_heal; +	priv = this->private; + +	if (sh->buf) +		GF_FREE (sh->buf); + +	if (sh->xattr) { +		for (i = 0; i < priv->child_count; i++) { +			if (sh->xattr[i]) { +				dict_unref (sh->xattr[i]); +				sh->xattr[i] = NULL; +			} +		} +		GF_FREE (sh->xattr); +	} + +	if (sh->child_errno) +		GF_FREE (sh->child_errno); + +	if (sh->pending_matrix) { +		for (i = 0; i < priv->child_count; i++) { +			GF_FREE (sh->pending_matrix[i]); +		} +		GF_FREE (sh->pending_matrix); +	} + +	if (sh->delta_matrix) { +		for (i = 0; i < priv->child_count; i++) { +			GF_FREE (sh->delta_matrix[i]); +		} +		GF_FREE (sh->delta_matrix); +	} + +	if (sh->sources) +		GF_FREE (sh->sources); + +	if (sh->success) +		GF_FREE (sh->success); + +	if (sh->locked_nodes) +		GF_FREE (sh->locked_nodes); + +	if (sh->healing_fd && !sh->healing_fd_opened) { +		fd_unref (sh->healing_fd); +		sh->healing_fd = NULL; +	} + +        if (sh->linkname) +                GF_FREE ((char *)sh->linkname); + +	loc_wipe (&sh->parent_loc); +} + + +void +afr_local_transaction_cleanup (afr_local_t *local, xlator_t *this) +{ +        int             i = 0; +        afr_private_t * priv = NULL; + +        priv = this->private; + +        for (i = 0; i < priv->child_count; i++) { +                if (local->pending && local->pending[i]) +                        GF_FREE (local->pending[i]); +        } + +        GF_FREE (local->pending); + +	GF_FREE (local->transaction.locked_nodes); +	GF_FREE (local->transaction.child_errno); +	GF_FREE (local->child_errno); + +	GF_FREE (local->transaction.basename); +	GF_FREE (local->transaction.new_basename); + +	loc_wipe (&local->transaction.parent_loc); +	loc_wipe (&local->transaction.new_parent_loc); +} + + +void +afr_local_cleanup (afr_local_t *local, xlator_t *this) +{ +        int i; +        afr_private_t * priv = NULL; + +	if (!local) +		return; + +	afr_local_sh_cleanup (local, this); + +        afr_local_transaction_cleanup (local, this); + +        priv = this->private; + +	loc_wipe (&local->loc); +	loc_wipe (&local->newloc); + +	if (local->fd) +		fd_unref (local->fd); + +	if (local->xattr_req) +		dict_unref (local->xattr_req); + +	GF_FREE (local->child_up); + +	{ /* lookup */ +                if (local->cont.lookup.xattrs) { +                        for (i = 0; i < priv->child_count; i++) { +                                if (local->cont.lookup.xattrs[i]) { +                                        dict_unref (local->cont.lookup.xattrs[i]); +                                        local->cont.lookup.xattrs[i] = NULL; +                                } +                        } +                        GF_FREE (local->cont.lookup.xattrs); +                        local->cont.lookup.xattrs = NULL; +                } + +		if (local->cont.lookup.xattr) { +                        dict_unref (local->cont.lookup.xattr); +                } + +                if (local->cont.lookup.inode) { +                        inode_unref (local->cont.lookup.inode); +                } +	} + +	{ /* getxattr */ +		if (local->cont.getxattr.name) +			GF_FREE (local->cont.getxattr.name); +	} + +	{ /* lk */ +		if (local->cont.lk.locked_nodes) +			GF_FREE (local->cont.lk.locked_nodes); +	} + +	{ /* create */ +		if (local->cont.create.fd) +			fd_unref (local->cont.create.fd); +	} + +	{ /* writev */ +		GF_FREE (local->cont.writev.vector); +	} + +	{ /* setxattr */ +		if (local->cont.setxattr.dict) +			dict_unref (local->cont.setxattr.dict); +	} + +	{ /* removexattr */ +		GF_FREE (local->cont.removexattr.name); +	} + +	{ /* symlink */ +		GF_FREE (local->cont.symlink.linkpath); +	} + +        { /* opendir */ +                if (local->cont.opendir.checksum) +                        GF_FREE (local->cont.opendir.checksum); +        } +} + + +int +afr_frame_return (call_frame_t *frame) +{ +	afr_local_t *local = NULL; +	int          call_count = 0; + +	local = frame->local; + +	LOCK (&frame->lock); +	{ +		call_count = --local->call_count; +	} +	UNLOCK (&frame->lock); + +	return call_count; +} + + +/** + * up_children_count - return the number of children that are up + */ + +int +afr_up_children_count (int child_count, unsigned char *child_up) +{ +	int i   = 0; +	int ret = 0; + +	for (i = 0; i < child_count; i++) +		if (child_up[i]) +			ret++; +	return ret; +} + + +int +afr_locked_nodes_count (unsigned char *locked_nodes, int child_count) +{ +	int ret = 0; +	int i; + +	for (i = 0; i < child_count; i++) +		if (locked_nodes[i]) +			ret++; + +	return ret; +} + + +ino64_t +afr_itransform (ino64_t ino, int child_count, int child_index) +{ +	ino64_t scaled_ino = -1; + +	if (ino == ((uint64_t) -1)) { +		scaled_ino = ((uint64_t) -1); +		goto out; +	} + +	scaled_ino = (ino * child_count) + child_index; + +out: +	return scaled_ino; +} + + +int +afr_deitransform_orig (ino64_t ino, int child_count) +{ +	int index = -1; + +	index = ino % child_count; + +	return index; +} + + +int +afr_deitransform (ino64_t ino, int child_count) +{ +	return 0; +} + + +int +afr_self_heal_lookup_unwind (call_frame_t *frame, xlator_t *this) +{ +	afr_local_t *local = NULL; + +	local = frame->local; + +	if (local->govinda_gOvinda) { +                afr_set_split_brain (this, local->cont.lookup.inode, _gf_true); +	} + +	AFR_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno, +			  local->cont.lookup.inode, +			  &local->cont.lookup.buf, +			  local->cont.lookup.xattr, +                          &local->cont.lookup.postparent); + +	return 0; +} + + +static void +afr_lookup_collect_xattr (afr_local_t *local, xlator_t *this, +                          int child_index, dict_t *xattr) +{ +	uint32_t        open_fd_count = 0; +        uint32_t        inodelk_count = 0; +        uint32_t        entrylk_count = 0; + +        int ret = 0; + +        if (afr_sh_has_metadata_pending (xattr, child_index, this)) { +                local->self_heal.need_metadata_self_heal = _gf_true; +                gf_log(this->name, GF_LOG_DEBUG, +                       "metadata self-heal is pending for %s.", +                       local->loc.path); +        } + +        if (afr_sh_has_entry_pending (xattr, child_index, this)) { +                local->self_heal.need_entry_self_heal = _gf_true; +                gf_log(this->name, GF_LOG_DEBUG, +                       "entry self-heal is pending for %s.", local->loc.path); +        } + +        if (afr_sh_has_data_pending (xattr, child_index, this)) { +                local->self_heal.need_data_self_heal = _gf_true; +                gf_log(this->name, GF_LOG_DEBUG, +                       "data self-heal is pending for %s.", local->loc.path); +        } + +        ret = dict_get_uint32 (xattr, GLUSTERFS_OPEN_FD_COUNT, +                               &open_fd_count); +        if (ret == 0) +                local->open_fd_count += open_fd_count; + +        ret = dict_get_uint32 (xattr, GLUSTERFS_INODELK_COUNT, +                               &inodelk_count); +        if (ret == 0) +                local->inodelk_count += inodelk_count; + +        ret = dict_get_uint32 (xattr, GLUSTERFS_ENTRYLK_COUNT, +                               &entrylk_count); +        if (ret == 0) +                local->entrylk_count += entrylk_count; +} + + +static void +afr_lookup_self_heal_check (xlator_t *this, afr_local_t *local, +                            struct iatt *buf, struct iatt *lookup_buf) +{ +        if (FILETYPE_DIFFERS (buf, lookup_buf)) { +                /* mismatching filetypes with same name +                */ + +                gf_log (this->name, GF_LOG_NORMAL, +                        "filetype differs for %s ", local->loc.path); + +                local->govinda_gOvinda = 1; +        } + +        if (PERMISSION_DIFFERS (buf, lookup_buf)) { +                /* mismatching permissions */ +                gf_log (this->name, GF_LOG_NORMAL, +                        "permissions differ for %s ", local->loc.path); +                local->self_heal.need_metadata_self_heal = _gf_true; +        } + +        if (OWNERSHIP_DIFFERS (buf, lookup_buf)) { +                /* mismatching permissions */ +                local->self_heal.need_metadata_self_heal = _gf_true; +                gf_log (this->name, GF_LOG_NORMAL, +                        "ownership differs for %s ", local->loc.path); +        } + +        if (SIZE_DIFFERS (buf, lookup_buf) +            && IA_ISREG (buf->ia_type)) { +                gf_log (this->name, GF_LOG_NORMAL, +                        "size differs for %s ", local->loc.path); +                local->self_heal.need_data_self_heal = _gf_true; +        } + +} + + +static void +afr_lookup_done (call_frame_t *frame, xlator_t *this, struct iatt *lookup_buf) +{ +        int  unwind = 1; +        int  source = -1; +        char sh_type_str[256] = {0,}; + +        afr_local_t *local = NULL; + +        local = frame->local; + +        local->cont.lookup.postparent.ia_ino  = local->cont.lookup.parent_ino; + +        if (local->cont.lookup.ino) { +                local->cont.lookup.buf.ia_ino = local->cont.lookup.ino; +                local->cont.lookup.buf.ia_gen = local->cont.lookup.gen; +        } + +        if (local->op_ret == 0) { +                /* KLUDGE: assuming DHT will not itransform in +                   revalidate */ +                if (local->cont.lookup.inode->ino) { +                        local->cont.lookup.buf.ia_ino = +                                local->cont.lookup.inode->ino; +                        local->cont.lookup.buf.ia_gen = +                                local->cont.lookup.inode->generation; +                } +        } + +        if (local->success_count && local->enoent_count) { +                local->self_heal.need_metadata_self_heal = _gf_true; +                local->self_heal.need_data_self_heal     = _gf_true; +                local->self_heal.need_entry_self_heal    = _gf_true; +                gf_log(this->name, GF_LOG_NORMAL, +                       "entries are missing in lookup of %s.", +                       local->loc.path); +        } + +        if (local->success_count) { +                /* check for split-brain case in previous lookup */ +                if (afr_is_split_brain (this, +                                        local->cont.lookup.inode)) { +                        local->self_heal.need_data_self_heal = _gf_true; +                        gf_log(this->name, GF_LOG_NORMAL, +                               "split brain detected during lookup of " +                               "%s.", local->loc.path); +                } +        } + +        if ((local->self_heal.need_metadata_self_heal +             || local->self_heal.need_data_self_heal +             || local->self_heal.need_entry_self_heal) +            && ((!local->cont.lookup.is_revalidate) +                || (local->op_ret != -1))) { + +                if (local->open_fd_count +                    || local->inodelk_count +                    || local->entrylk_count) { + +                        /* Someone else is doing self-heal on this file. +                           So just make a best effort to set the read-subvolume +                           and return */ + +                        if (IA_ISREG (local->cont.lookup.inode->ia_type)) { +                                source = afr_self_heal_get_source (this, local, local->cont.lookup.xattrs); + +                                if (source >= 0) { +                                        afr_set_read_child (this, +                                                            local->cont.lookup.inode, +                                                            source); +                                } +                        } +                } else { +                        if (!local->cont.lookup.inode->ia_type) { +                                /* fix for RT #602 */ +                                local->cont.lookup.inode->ia_type = +                                        lookup_buf->ia_type; +                        } + +                        local->self_heal.background = _gf_true; +                        local->self_heal.type       = local->cont.lookup.buf.ia_type; +                        local->self_heal.unwind     = afr_self_heal_lookup_unwind; + +                        unwind = 0; + +                        afr_self_heal_type_str_get(&local->self_heal, +                                                   sh_type_str, +                                                   sizeof(sh_type_str)); + +                        gf_log (this->name, GF_LOG_NORMAL, "background %s " +                                "self-heal triggered. path: %s", +                                sh_type_str, local->loc.path); + +                        afr_self_heal (frame, this); +                } +        } + +        if (unwind) { +                AFR_STACK_UNWIND (lookup, frame, local->op_ret, +                                  local->op_errno, +                                  local->cont.lookup.inode, +                                  &local->cont.lookup.buf, +                                  local->cont.lookup.xattr, +                                  &local->cont.lookup.postparent); +        } +} + + +/* + * During a lookup, some errors are more "important" than + * others in that they must be given higher priority while + * returning to the user. + * + * The hierarchy is ESTALE > ENOENT > others + * + */ + +static gf_boolean_t +__error_more_important (int32_t old_errno, int32_t new_errno) +{ +        gf_boolean_t ret = _gf_true; + +        /* Nothing should ever overwrite ESTALE */ +        if (old_errno == ESTALE) +                ret = _gf_false; + +        /* Nothing should overwrite ENOENT, except ESTALE */ +        else if ((old_errno == ENOENT) && (new_errno != ESTALE)) +                ret = _gf_false; + +        return ret; +} + + +int +afr_fresh_lookup_cbk (call_frame_t *frame, void *cookie, +                      xlator_t *this,  int32_t op_ret,	int32_t op_errno, +                      inode_t *inode,	struct iatt *buf, dict_t *xattr, +                      struct iatt *postparent) +{ +	afr_local_t *   local = NULL; +	afr_private_t * priv  = NULL; +	struct iatt *   lookup_buf = NULL; + +	int             call_count      = -1; +	int             child_index     = -1; +        int             first_up_child  = -1; + +	child_index = (long) cookie; +	priv = this->private; + +	LOCK (&frame->lock); +	{ +		local = frame->local; + +                lookup_buf = &local->cont.lookup.buf; + +		if (op_ret == -1) { +			if (op_errno == ENOENT) +				local->enoent_count++; + +                        if (__error_more_important (local->op_errno, op_errno)) +                                local->op_errno = op_errno; + +                        if (local->op_errno == ESTALE) { +                                local->op_ret = -1; +                        } + +                        goto unlock; +		} + +                afr_lookup_collect_xattr (local, this, child_index, xattr); + +                first_up_child = afr_first_up_child (priv); + +                if (child_index == first_up_child) { +                        local->cont.lookup.ino = +                                afr_itransform (buf->ia_ino, +                                                priv->child_count, +                                                first_up_child); +                        local->cont.lookup.gen = buf->ia_gen; +                } + +		if (local->success_count == 0) { +                        if (local->op_errno != ESTALE) +                                local->op_ret = op_ret; + +			local->cont.lookup.inode               = inode_ref (inode); +			local->cont.lookup.xattr               = dict_ref (xattr); +			local->cont.lookup.xattrs[child_index] = dict_ref (xattr); +                        local->cont.lookup.postparent          = *postparent; + +                        if (priv->first_lookup && inode->ino == 1) { +                                gf_log (this->name, GF_LOG_TRACE, +                                        "added root inode"); +                                priv->root_inode = inode_ref (inode); +                                priv->first_lookup = 0; +                                priv->child_up[1] = 0; + +                                LOCK (&priv->lock); +                                { +                                        priv->down_count++; +                                } +                                UNLOCK (&priv->lock); + +                        } + +                        *lookup_buf = *buf; + +                        lookup_buf->ia_ino = afr_itransform (buf->ia_ino, +                                                             priv->child_count, +                                                             child_index); +                        if (priv->read_child >= 0) { +                                afr_set_read_child (this, +                                                    local->cont.lookup.inode, +                                                    priv->read_child); +                        } else { +                                afr_set_read_child (this, +                                                    local->cont.lookup.inode, +                                                    child_index); +                        } + +		} else { +                        afr_lookup_self_heal_check (this, local, buf, lookup_buf); + +                        if (child_index == local->read_child_index) { +                                /* +                                   lookup has succeeded on the read child. +                                   So use its inode number +                                */ +                                if (local->cont.lookup.xattr) +                                        dict_unref (local->cont.lookup.xattr); + +                                local->cont.lookup.xattr = dict_ref (xattr); +                                local->cont.lookup.xattrs[child_index] = dict_ref (xattr); +                                local->cont.lookup.postparent          = *postparent; + +                                *lookup_buf = *buf; + +                                if (priv->read_child >= 0) { +                                        afr_set_read_child (this, +                                                            local->cont.lookup.inode, +                                                            priv->read_child); +                                } else { +                                        afr_set_read_child (this, +                                                            local->cont.lookup.inode, +                                                            local->read_child_index); +                                } +                        } + +		} + +		local->success_count++; +	} +unlock: +	UNLOCK (&frame->lock); + +	call_count = afr_frame_return (frame); + +	if (call_count == 0) { +                afr_lookup_done (frame, this, lookup_buf); +	} + +	return 0; +} + + +int +afr_revalidate_lookup_cbk (call_frame_t *frame, void *cookie, +                           xlator_t *this, int32_t op_ret, int32_t op_errno, +                           inode_t *inode, struct iatt *buf, dict_t *xattr, +                           struct iatt *postparent) +{ +	afr_local_t *   local = NULL; +	afr_private_t * priv  = NULL; +	struct iatt *   lookup_buf = NULL; + +	int             call_count      = -1; +	int             child_index     = -1; +        int             first_up_child  = -1; + +	child_index = (long) cookie; +	priv = this->private; + +	LOCK (&frame->lock); +	{ +		local = frame->local; + +		lookup_buf = &local->cont.lookup.buf; + +		if (op_ret == -1) { +			if (op_errno == ENOENT) +				local->enoent_count++; + +                        if (__error_more_important (local->op_errno, op_errno)) +                            local->op_errno = op_errno; + +                            if (local->op_errno == ESTALE) { +                                    local->op_ret = -1; +                            } + +                            goto unlock; +                } + +                afr_lookup_collect_xattr (local, this, child_index, xattr); + +                first_up_child = afr_first_up_child (priv); + +                if (child_index == first_up_child) { +                        local->cont.lookup.ino = +                                afr_itransform (buf->ia_ino, +                                                priv->child_count, +                                                first_up_child); +                        local->cont.lookup.gen = buf->ia_gen; +                } + +		/* in case of revalidate, we need to send stat of the +		 * child whose stat was sent during the first lookup. +		 * (so that time stamp does not vary with revalidate. +		 * in case it is down, stat of the fist success will +		 * be replied */ + +		/* inode number should be preserved across revalidates */ + +		if (local->success_count == 0) { +                        if (local->op_errno != ESTALE) +                                local->op_ret = op_ret; + +			local->cont.lookup.inode               = inode_ref (inode); +			local->cont.lookup.xattr               = dict_ref (xattr); +			local->cont.lookup.xattrs[child_index] = dict_ref (xattr); +                        local->cont.lookup.postparent          = *postparent; + +			*lookup_buf = *buf; + +                        lookup_buf->ia_ino = afr_itransform (buf->ia_ino, +                                                             priv->child_count, +                                                             child_index); + +                        if (priv->read_child >= 0) { +                                afr_set_read_child (this, +                                                    local->cont.lookup.inode, +                                                    priv->read_child); +                        } else { +                                afr_set_read_child (this, +                                                    local->cont.lookup.inode, +                                                    child_index); +                        } + +		} else { +                        afr_lookup_self_heal_check (this, local, buf, lookup_buf); + +                        if (child_index == local->read_child_index) { + +                                /* +                                   lookup has succeeded on the read child. +                                   So use its inode number +                                */ + +                                if (local->cont.lookup.xattr) +                                        dict_unref (local->cont.lookup.xattr); + +                                local->cont.lookup.xattr               = dict_ref (xattr); +                                local->cont.lookup.xattrs[child_index] = dict_ref (xattr); +                                local->cont.lookup.postparent          = *postparent; + +                                *lookup_buf = *buf; + +                                if (priv->read_child >= 0) { +                                        afr_set_read_child (this, +                                                            local->cont.lookup.inode, +                                                            priv->read_child); +                                } else { +                                        afr_set_read_child (this, +                                                            local->cont.lookup.inode, +                                                            local->read_child_index); +                                } +                        } + +		} + +		local->success_count++; +	} +unlock: +	UNLOCK (&frame->lock); + +	call_count = afr_frame_return (frame); + +	if (call_count == 0) { +                afr_lookup_done (frame, this, lookup_buf); +	} + +	return 0; +} + + +int +afr_lookup (call_frame_t *frame, xlator_t *this, +	    loc_t *loc, dict_t *xattr_req) +{ +	afr_private_t *priv = NULL; +	afr_local_t   *local = NULL; +	int            ret = -1; +	int            i = 0; + +        fop_lookup_cbk_t callback; + +        int call_count = 0; + +        uint64_t       ctx; + +	int32_t        op_errno = 0; + +	priv = this->private; + +	ALLOC_OR_GOTO (local, afr_local_t, out); + +	local->op_ret = -1; + +	frame->local = local; + +        if (!strcmp (loc->path, "/" GF_REPLICATE_TRASH_DIR)) { +                op_errno = ENOENT; +                goto out; +        } + +	loc_copy (&local->loc, loc); + +        ret = inode_ctx_get (loc->inode, this, &ctx); +        if (ret == 0) { +                /* lookup is a revalidate */ + +                callback = afr_revalidate_lookup_cbk; + +                local->cont.lookup.is_revalidate = _gf_true; +                local->read_child_index          = afr_read_child (this, +                                                                   loc->inode); +        } else { +                callback = afr_fresh_lookup_cbk; + +                LOCK (&priv->read_child_lock); +                { +                        local->read_child_index = (++priv->read_child_rr) +                                % (priv->child_count); +                } +                UNLOCK (&priv->read_child_lock); +        } + +        if (loc->parent) +                local->cont.lookup.parent_ino = loc->parent->ino; + +	local->child_up = memdup (priv->child_up, priv->child_count); + +        local->cont.lookup.xattrs = GF_CALLOC (priv->child_count, +                                    sizeof (*local->cont.lookup.xattr), +                                    gf_afr_mt_dict_t); + +	local->call_count = afr_up_children_count (priv->child_count, +                                                   local->child_up); +        call_count = local->call_count; + +        if (local->call_count == 0) { +                ret      = -1; +                op_errno = ENOTCONN; +                goto out; +        } + +	/* By default assume ENOTCONN. On success it will be set to 0. */ +	local->op_errno = ENOTCONN; + +	if (xattr_req == NULL) +		local->xattr_req = dict_new (); +	else +		local->xattr_req = dict_ref (xattr_req); + +        for (i = 0; i < priv->child_count; i++) { +		ret = dict_set_uint64 (local->xattr_req, priv->pending_key[i], +				       3 * sizeof(int32_t)); + +                /* 3 = data+metadata+entry */ +        } + +	ret = dict_set_uint64 (local->xattr_req, GLUSTERFS_OPEN_FD_COUNT, 0); +        ret = dict_set_uint64 (local->xattr_req, GLUSTERFS_INODELK_COUNT, 0); +        ret = dict_set_uint64 (local->xattr_req, GLUSTERFS_ENTRYLK_COUNT, 0); + +	for (i = 0; i < priv->child_count; i++) { +                if (local->child_up[i]) { +                        STACK_WIND_COOKIE (frame, callback, (void *) (long) i, +                                           priv->children[i], +                                           priv->children[i]->fops->lookup, +                                           loc, local->xattr_req); +                        if (!--call_count) +                                break; +                } +	} + +	ret = 0; +out: +	if (ret == -1) +		AFR_STACK_UNWIND (lookup, frame, -1, op_errno, +                                  NULL, NULL, NULL, NULL); + +	return 0; +} + + +/* {{{ open */ + +int +afr_fd_ctx_set (xlator_t *this, fd_t *fd) +{ +        afr_private_t * priv = NULL; + +        int op_ret = 0; +        int ret    = 0; + +        uint64_t       ctx; +        afr_fd_ctx_t * fd_ctx = NULL; + +        VALIDATE_OR_GOTO (this->private, out); +        VALIDATE_OR_GOTO (fd, out); + +        priv = this->private; + +        LOCK (&fd->lock); +        { +                ret = __fd_ctx_get (fd, this, &ctx); + +                if (ret == 0) +                        goto unlock; + +                fd_ctx = GF_CALLOC (1, sizeof (afr_fd_ctx_t), +                                    gf_afr_mt_afr_fd_ctx_t); +                if (!fd_ctx) { +                        gf_log (this->name, GF_LOG_ERROR, +                                "Out of memory"); + +                        op_ret = -ENOMEM; +                        goto unlock; +                } + +                fd_ctx->pre_op_done = GF_CALLOC (sizeof (*fd_ctx->pre_op_done), +                                                 priv->child_count, +                                                 gf_afr_mt_char); +                if (!fd_ctx->pre_op_done) { +                        gf_log (this->name, GF_LOG_ERROR, +                                "Out of memory"); +                        op_ret = -ENOMEM; +                        goto unlock; +                } + +                fd_ctx->opened_on = GF_CALLOC (sizeof (*fd_ctx->opened_on), +                                               priv->child_count, +                                               gf_afr_mt_char); +                if (!fd_ctx->opened_on) { +                        gf_log (this->name, GF_LOG_ERROR, +                                "Out of memory"); +                        op_ret = -ENOMEM; +                        goto unlock; +                } + +                fd_ctx->child_failed = GF_CALLOC ( +                                         sizeof (*fd_ctx->child_failed), +                                         priv->child_count, +                                         gf_afr_mt_char); + +                if (!fd_ctx->child_failed) { +                        gf_log (this->name, GF_LOG_ERROR, +                                "Out of memory"); + +                        op_ret = -ENOMEM; +                        goto unlock; +                } + +                fd_ctx->up_count   = priv->up_count; +                fd_ctx->down_count = priv->down_count; + +                ret = __fd_ctx_set (fd, this, (uint64_t)(long) fd_ctx); +                if (ret < 0) { +                        op_ret = ret; +                } + +                INIT_LIST_HEAD (&fd_ctx->entries); +        } +unlock: +        UNLOCK (&fd->lock); +out: +        return ret; +} + +/* {{{ flush */ + +int +afr_flush_unwind (call_frame_t *frame, xlator_t *this) +{ +	afr_local_t *   local = NULL; +	afr_private_t * priv  = NULL; +	call_frame_t   *main_frame = NULL; + +	local = frame->local; +	priv  = this->private; + +	LOCK (&frame->lock); +	{ +		if (local->transaction.main_frame) +			main_frame = local->transaction.main_frame; +		local->transaction.main_frame = NULL; +	} +	UNLOCK (&frame->lock); + +	if (main_frame) { +		AFR_STACK_UNWIND (flush, main_frame, +                                  local->op_ret, local->op_errno); +	} + +	return 0; +} + + +int +afr_flush_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +		      int32_t op_ret, int32_t op_errno) +{ +        afr_local_t *   local = NULL; +	afr_private_t * priv  = NULL; + +	int call_count  = -1; +	int child_index = (long) cookie; +	int need_unwind = 0; + +	local = frame->local; +	priv  = this->private; + +	LOCK (&frame->lock); +	{ +		if (afr_fop_failed (op_ret, op_errno)) +			afr_transaction_fop_failed (frame, this, child_index); + +		if (op_ret != -1) { +			if (local->success_count == 0) { +				local->op_ret = op_ret; +			} +			local->success_count++; + +			if (local->success_count == priv->wait_count) { +				need_unwind = 1; +			} +		} + +		local->op_errno = op_errno; +	} +	UNLOCK (&frame->lock); + +	if (need_unwind) +		afr_flush_unwind (frame, this); + +	call_count = afr_frame_return (frame); + +	if (call_count == 0) { +		local->transaction.resume (frame, this); +	} + +	return 0; +} + + +int +afr_flush_wind (call_frame_t *frame, xlator_t *this) +{ +	afr_local_t *local = NULL; +	afr_private_t *priv = NULL; + +	int i = 0; +	int call_count = -1; + +	local = frame->local; +	priv = this->private; + +	call_count = afr_up_children_count (priv->child_count, local->child_up); + +	if (call_count == 0) { +		local->transaction.resume (frame, this); +		return 0; +	} + +	local->call_count = call_count; + +	for (i = 0; i < priv->child_count; i++) { +		if (local->child_up[i]) { +			STACK_WIND_COOKIE (frame, afr_flush_wind_cbk, +					   (void *) (long) i, +					   priv->children[i], +					   priv->children[i]->fops->flush, +					   local->fd); + +			if (!--call_count) +				break; +		} +	} + +	return 0; +} + + +int +afr_flush_done (call_frame_t *frame, xlator_t *this) +{ +	afr_local_t *local = NULL; + +	local = frame->local; + +	local->transaction.unwind (frame, this); + +        AFR_STACK_DESTROY (frame); + +	return 0; +} + + +int +afr_plain_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +                     int32_t op_ret, int32_t op_errno) + +{ +	afr_local_t *local = NULL; + +	int call_count = -1; + +	local = frame->local; + +	LOCK (&frame->lock); +	{ +		if (op_ret == 0) +			local->op_ret = 0; + +		local->op_errno = op_errno; +	} +	UNLOCK (&frame->lock); + +	call_count = afr_frame_return (frame); + +	if (call_count == 0) +		AFR_STACK_UNWIND (flush, frame, local->op_ret, local->op_errno); + +	return 0; +} + + +static int +__no_pre_op_done (xlator_t *this, fd_t *fd) +{ +        int i      = 0; +        int op_ret = 1; + +        int _ret = 0; +        uint64_t       ctx; +        afr_fd_ctx_t * fd_ctx = NULL; + +        afr_private_t *priv = NULL; + +        priv = this->private; + +        LOCK (&fd->lock); +        { +                _ret = __fd_ctx_get (fd, this, &ctx); + +                if (_ret < 0) { +                        goto out; +                } + +                fd_ctx = (afr_fd_ctx_t *)(long) ctx; + +                for (i = 0; i < priv->child_count; i++) { +                        if (fd_ctx->pre_op_done[i]) { +                                op_ret = 0; +                                break; +                        } +                } +        } +out: +        UNLOCK (&fd->lock); + +        return op_ret; +} + + +int +afr_flush (call_frame_t *frame, xlator_t *this, fd_t *fd) +{ +	afr_private_t * priv  = NULL; +	afr_local_t   * local = NULL; + +        call_frame_t  * transaction_frame = NULL; + +	int ret        = -1; + +	int op_ret   = -1; +	int op_errno = 0; + +        int i          = 0; +        int call_count = 0; + +	VALIDATE_OR_GOTO (frame, out); +	VALIDATE_OR_GOTO (this, out); +	VALIDATE_OR_GOTO (this->private, out); + +	priv = this->private; + +	ALLOC_OR_GOTO (local, afr_local_t, out); + +	ret = AFR_LOCAL_INIT (local, priv); +	if (ret < 0) { +		op_errno = -ret; +		goto out; +	} + +        call_count = afr_up_children_count (priv->child_count, local->child_up); + +        if (__no_pre_op_done (this, fd)) { +                frame->local = local; + +                for (i = 0; i < priv->child_count; i++) { +                        if (local->child_up[i]) { +                                STACK_WIND_COOKIE (frame, afr_plain_flush_cbk, +                                                   (void *) (long) i, +                                                   priv->children[i], +                                                   priv->children[i]->fops->flush, +                                                   fd); +                                if (!--call_count) +                                        break; +                        } +                } +        } else { +                transaction_frame = copy_frame (frame); +                if (!transaction_frame) { +                        op_errno = ENOMEM; +                        gf_log (this->name, GF_LOG_ERROR, +                                "Out of memory."); +                        goto out; +                } + +                transaction_frame->local = local; + +                local->op = GF_FOP_FLUSH; + +                local->transaction.fop    = afr_flush_wind; +                local->transaction.done   = afr_flush_done; +                local->transaction.unwind = afr_flush_unwind; + +                local->fd                 = fd_ref (fd); + +                local->transaction.main_frame = frame; +                local->transaction.start  = 0; +                local->transaction.len    = 0; + +                afr_transaction (transaction_frame, this, AFR_FLUSH_TRANSACTION); +        } + +	op_ret = 0; +out: +	if (op_ret == -1) { +                if (transaction_frame) +                        AFR_STACK_DESTROY (transaction_frame); + +		AFR_STACK_UNWIND (flush, frame, op_ret, op_errno); +	} + +	return 0; +} + +/* }}} */ + + +int +afr_cleanup_fd_ctx (xlator_t *this, fd_t *fd) +{ +        uint64_t        ctx = 0; +        afr_fd_ctx_t    *fd_ctx = NULL; +        int             ret = 0; + +        ret = fd_ctx_get (fd, this, &ctx); + +        if (ret < 0) +                goto out; + +        fd_ctx = (afr_fd_ctx_t *)(long) ctx; + +        if (fd_ctx) { +                if (fd_ctx->child_failed) +                        GF_FREE (fd_ctx->child_failed); + +                if (fd_ctx->pre_op_done) +                        GF_FREE (fd_ctx->pre_op_done); + +                if (fd_ctx->opened_on) +                        GF_FREE (fd_ctx->opened_on); + +                GF_FREE (fd_ctx); +        } + +out: +        return 0; +} + + +int +afr_release (xlator_t *this, fd_t *fd) +{ +        afr_cleanup_fd_ctx (this, fd); + +        return 0; +} + + +/* {{{ fsync */ + +int +afr_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +               int32_t op_ret, int32_t op_errno, struct iatt *prebuf, +               struct iatt *postbuf) +{ +	afr_local_t *local = NULL; + +	int call_count = -1; + +        int child_index = (long) cookie; +        int read_child  = 0; + +	local = frame->local; + +        read_child = afr_read_child (this, local->fd->inode); + +	LOCK (&frame->lock); +	{ +                if (child_index == read_child) { +                        local->read_child_returned = _gf_true; +                } + +		if (op_ret == 0) { +			local->op_ret = 0; + +			if (local->success_count == 0) { +				local->cont.fsync.prebuf  = *prebuf; +				local->cont.fsync.postbuf = *postbuf; +			} + +                        if (child_index == read_child) { +                                local->cont.fsync.prebuf  = *prebuf; +                                local->cont.fsync.postbuf = *postbuf; +                        } + +			local->success_count++; +                } + +		local->op_errno = op_errno; +	} +	UNLOCK (&frame->lock); + +	call_count = afr_frame_return (frame); + +	if (call_count == 0) { +                local->cont.fsync.prebuf.ia_ino  = local->cont.fsync.ino; +                local->cont.fsync.postbuf.ia_ino = local->cont.fsync.ino; + +		AFR_STACK_UNWIND (fsync, frame, local->op_ret, local->op_errno, +                                  &local->cont.fsync.prebuf, +                                  &local->cont.fsync.postbuf); +        } + +	return 0; +} + + +int +afr_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, +	   int32_t datasync) +{ +	afr_private_t *priv = NULL; +	afr_local_t *local = NULL; + +	int ret = -1; + +	int i = 0; +	int32_t call_count = 0; +	int32_t op_ret   = -1; +	int32_t op_errno = 0; + +	VALIDATE_OR_GOTO (frame, out); +	VALIDATE_OR_GOTO (this, out); +	VALIDATE_OR_GOTO (this->private, out); + +	priv = this->private; + +	ALLOC_OR_GOTO (local, afr_local_t, out); + +	ret = AFR_LOCAL_INIT (local, priv); +	if (ret < 0) { +		op_errno = -ret; +		goto out; +	} + +	call_count = local->call_count; +	frame->local = local; + +        local->fd             = fd_ref (fd); +        local->cont.fsync.ino = fd->inode->ino; + +	for (i = 0; i < priv->child_count; i++) { +		if (local->child_up[i]) { +			STACK_WIND_COOKIE (frame, afr_fsync_cbk, +                                           (void *) (long) i, +                                           priv->children[i], +                                           priv->children[i]->fops->fsync, +                                           fd, datasync); +			if (!--call_count) +				break; +		} +	} + +	op_ret = 0; +out: +	if (op_ret == -1) { +		AFR_STACK_UNWIND (fsync, frame, op_ret, op_errno, NULL, NULL); +	} +	return 0; +} + +/* }}} */ + +/* {{{ fsync */ + +int32_t +afr_fsyncdir_cbk (call_frame_t *frame, void *cookie, +		  xlator_t *this, int32_t op_ret, int32_t op_errno) +{ +	afr_local_t *local = NULL; + +	int call_count = -1; + +	local = frame->local; + +	LOCK (&frame->lock); +	{ +		if (op_ret == 0) +			local->op_ret = 0; + +		local->op_errno = op_errno; +	} +	UNLOCK (&frame->lock); + +	call_count = afr_frame_return (frame); + +	if (call_count == 0) +		AFR_STACK_UNWIND (fsyncdir, frame, local->op_ret, +                                  local->op_errno); + +	return 0; +} + + +int32_t +afr_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd, +	      int32_t datasync) +{ +	afr_private_t *priv = NULL; +	afr_local_t *local = NULL; + +	int ret = -1; + +	int i = 0; +	int32_t call_count = 0; +	int32_t op_ret   = -1; +	int32_t op_errno = 0; + +	VALIDATE_OR_GOTO (frame, out); +	VALIDATE_OR_GOTO (this, out); +	VALIDATE_OR_GOTO (this->private, out); + +	priv = this->private; + +	ALLOC_OR_GOTO (local, afr_local_t, out); + +	ret = AFR_LOCAL_INIT (local, priv); +	if (ret < 0) { +		op_errno = -ret; +		goto out; +	} + +	call_count = local->call_count; +	frame->local = local; + +	for (i = 0; i < priv->child_count; i++) { +		if (local->child_up[i]) { +			STACK_WIND (frame, afr_fsyncdir_cbk, +				    priv->children[i], +				    priv->children[i]->fops->fsyncdir, +				    fd, datasync); +			if (!--call_count) +				break; +		} +	} + +	op_ret = 0; +out: +	if (op_ret == -1) { +		AFR_STACK_UNWIND (fsyncdir, frame, op_ret, op_errno); +	} +	return 0; +} + +/* }}} */ + +/* {{{ xattrop */ + +int32_t +afr_xattrop_cbk (call_frame_t *frame, void *cookie, +		 xlator_t *this, int32_t op_ret, int32_t op_errno, +		 dict_t *xattr) +{ +	afr_local_t *local = NULL; + +	int call_count = -1; + +	local = frame->local; + +	LOCK (&frame->lock); +	{ +		if (op_ret == 0) +			local->op_ret = 0; + +		local->op_errno = op_errno; +	} +	UNLOCK (&frame->lock); + +	call_count = afr_frame_return (frame); + +	if (call_count == 0) +		AFR_STACK_UNWIND (xattrop, frame, local->op_ret, local->op_errno, +                                  xattr); + +	return 0; +} + + +int32_t +afr_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc, +	     gf_xattrop_flags_t optype, dict_t *xattr) +{ +	afr_private_t *priv = NULL; +	afr_local_t *local  = NULL; + +	int ret = -1; + +	int i = 0; +	int32_t call_count = 0; +	int32_t op_ret   = -1; +	int32_t op_errno = 0; + +	VALIDATE_OR_GOTO (frame, out); +	VALIDATE_OR_GOTO (this, out); +	VALIDATE_OR_GOTO (this->private, out); + +	priv = this->private; + +	ALLOC_OR_GOTO (local, afr_local_t, out); + +	ret = AFR_LOCAL_INIT (local, priv); +	if (ret < 0) { +		op_errno = -ret; +		goto out; +	} + +	call_count = local->call_count; +	frame->local = local; + +	for (i = 0; i < priv->child_count; i++) { +		if (local->child_up[i]) { +			STACK_WIND (frame, afr_xattrop_cbk, +				    priv->children[i], +				    priv->children[i]->fops->xattrop, +				    loc, optype, xattr); +			if (!--call_count) +				break; +		} +	} + +	op_ret = 0; +out: +	if (op_ret == -1) { +		AFR_STACK_UNWIND (xattrop, frame, op_ret, op_errno, NULL); +	} +	return 0; +} + +/* }}} */ + +/* {{{ fxattrop */ + +int32_t +afr_fxattrop_cbk (call_frame_t *frame, void *cookie, +		  xlator_t *this, int32_t op_ret, int32_t op_errno, +		  dict_t *xattr) +{ +	afr_local_t *local = NULL; + +	int call_count = -1; + +	local = frame->local; + +	LOCK (&frame->lock); +	{ +		if (op_ret == 0) +			local->op_ret = 0; + +		local->op_errno = op_errno; +	} +	UNLOCK (&frame->lock); + +	call_count = afr_frame_return (frame); + +	if (call_count == 0) +		AFR_STACK_UNWIND (fxattrop, frame, local->op_ret, local->op_errno, +                                  xattr); + +	return 0; +} + + +int32_t +afr_fxattrop (call_frame_t *frame, xlator_t *this, fd_t *fd, +	      gf_xattrop_flags_t optype, dict_t *xattr) +{ +	afr_private_t *priv = NULL; +	afr_local_t *local  = NULL; + +	int ret = -1; + +	int i = 0; +	int32_t call_count = 0; +	int32_t op_ret   = -1; +	int32_t op_errno = 0; + +	VALIDATE_OR_GOTO (frame, out); +	VALIDATE_OR_GOTO (this, out); +	VALIDATE_OR_GOTO (this->private, out); + +	priv = this->private; + +	ALLOC_OR_GOTO (local, afr_local_t, out); + +	ret = AFR_LOCAL_INIT (local, priv); +	if (ret < 0) { +		op_errno = -ret; +		goto out; +	} + +	call_count = local->call_count; +	frame->local = local; + +	for (i = 0; i < priv->child_count; i++) { +		if (local->child_up[i]) { +			STACK_WIND (frame, afr_fxattrop_cbk, +				    priv->children[i], +				    priv->children[i]->fops->fxattrop, +				    fd, optype, xattr); +			if (!--call_count) +				break; +		} +	} + +	op_ret = 0; +out: +	if (op_ret == -1) { +		AFR_STACK_UNWIND (fxattrop, frame, op_ret, op_errno, NULL); +	} +	return 0; +} + +/* }}} */ + + +int32_t +afr_inodelk_cbk (call_frame_t *frame, void *cookie, +		 xlator_t *this, int32_t op_ret, int32_t op_errno) + +{ +	afr_local_t *local = NULL; + +	int call_count = -1; + +	local = frame->local; + +	LOCK (&frame->lock); +	{ +		if (op_ret == 0) +			local->op_ret = 0; + +		local->op_errno = op_errno; +	} +	UNLOCK (&frame->lock); + +	call_count = afr_frame_return (frame); + +	if (call_count == 0) +		AFR_STACK_UNWIND (inodelk, frame, local->op_ret, +                                  local->op_errno); + +	return 0; +} + + +int32_t +afr_inodelk (call_frame_t *frame, xlator_t *this, +             const char *volume, loc_t *loc, int32_t cmd, struct flock *flock) +{ +	afr_private_t *priv = NULL; +	afr_local_t *local  = NULL; + +	int ret = -1; + +	int i = 0; +	int32_t call_count = 0; +	int32_t op_ret   = -1; +	int32_t op_errno = 0; + +	VALIDATE_OR_GOTO (frame, out); +	VALIDATE_OR_GOTO (this, out); +	VALIDATE_OR_GOTO (this->private, out); + +	priv = this->private; + +	ALLOC_OR_GOTO (local, afr_local_t, out); + +	ret = AFR_LOCAL_INIT (local, priv); +	if (ret < 0) { +		op_errno = -ret; +		goto out; +	} + +	call_count = local->call_count; +	frame->local = local; + +	for (i = 0; i < priv->child_count; i++) { +		if (local->child_up[i]) { +			STACK_WIND (frame, afr_inodelk_cbk, +				    priv->children[i], +				    priv->children[i]->fops->inodelk, +				    volume, loc, cmd, flock); + +			if (!--call_count) +				break; +		} +	} + +	op_ret = 0; +out: +	if (op_ret == -1) { +		AFR_STACK_UNWIND (inodelk, frame, op_ret, op_errno); +	} +	return 0; +} + + +int32_t +afr_finodelk_cbk (call_frame_t *frame, void *cookie, +		  xlator_t *this, int32_t op_ret, int32_t op_errno) + +{ +	afr_local_t *local = NULL; + +	int call_count = -1; + +	local = frame->local; + +	LOCK (&frame->lock); +	{ +		if (op_ret == 0) +			local->op_ret = 0; + +		local->op_errno = op_errno; +	} +	UNLOCK (&frame->lock); + +	call_count = afr_frame_return (frame); + +	if (call_count == 0) +		AFR_STACK_UNWIND (finodelk, frame, local->op_ret, +                                  local->op_errno); + +	return 0; +} + + +int32_t +afr_finodelk (call_frame_t *frame, xlator_t *this, +              const char *volume, fd_t *fd, int32_t cmd, struct flock *flock) +{ +	afr_private_t *priv = NULL; +	afr_local_t *local  = NULL; + +	int ret = -1; + +	int i = 0; +	int32_t call_count = 0; +	int32_t op_ret   = -1; +	int32_t op_errno = 0; + +	VALIDATE_OR_GOTO (frame, out); +	VALIDATE_OR_GOTO (this, out); +	VALIDATE_OR_GOTO (this->private, out); + +	priv = this->private; + +	ALLOC_OR_GOTO (local, afr_local_t, out); + +	ret = AFR_LOCAL_INIT (local, priv); +	if (ret < 0) { +		op_errno = -ret; +		goto out; +	} + +	call_count = local->call_count; +	frame->local = local; + +	for (i = 0; i < priv->child_count; i++) { +		if (local->child_up[i]) { +			STACK_WIND (frame, afr_finodelk_cbk, +				    priv->children[i], +				    priv->children[i]->fops->finodelk, +				    volume, fd, cmd, flock); + +			if (!--call_count) +				break; +		} +	} + +	op_ret = 0; +out: +	if (op_ret == -1) { +		AFR_STACK_UNWIND (finodelk, frame, op_ret, op_errno); +	} +	return 0; +} + + +int32_t +afr_entrylk_cbk (call_frame_t *frame, void *cookie, +		 xlator_t *this, int32_t op_ret, int32_t op_errno) + +{ +	afr_local_t *local = NULL; + +	int call_count = -1; + +	local = frame->local; + +	LOCK (&frame->lock); +	{ +		if (op_ret == 0) +			local->op_ret = 0; + +		local->op_errno = op_errno; +	} +	UNLOCK (&frame->lock); + +	call_count = afr_frame_return (frame); + +	if (call_count == 0) +		AFR_STACK_UNWIND (entrylk, frame, local->op_ret, +                                  local->op_errno); + +	return 0; +} + + +int32_t +afr_entrylk (call_frame_t *frame, xlator_t *this, +             const char *volume, loc_t *loc, +	     const char *basename, entrylk_cmd cmd, entrylk_type type) +{ +	afr_private_t *priv = NULL; +	afr_local_t *local  = NULL; + +	int ret = -1; + +	int i = 0; +	int32_t call_count = 0; +	int32_t op_ret   = -1; +	int32_t op_errno = 0; + +	VALIDATE_OR_GOTO (frame, out); +	VALIDATE_OR_GOTO (this, out); +	VALIDATE_OR_GOTO (this->private, out); + +	priv = this->private; + +	ALLOC_OR_GOTO (local, afr_local_t, out); + +	ret = AFR_LOCAL_INIT (local, priv); +	if (ret < 0) { +		op_errno = -ret; +		goto out; +	} + +	call_count = local->call_count; +	frame->local = local; + +	for (i = 0; i < priv->child_count; i++) { +		if (local->child_up[i]) { +			STACK_WIND (frame, afr_entrylk_cbk, +				    priv->children[i], +				    priv->children[i]->fops->entrylk, +				    volume, loc, basename, cmd, type); + +			if (!--call_count) +				break; +		} +	} + +	op_ret = 0; +out: +	if (op_ret == -1) { +		AFR_STACK_UNWIND (entrylk, frame, op_ret, op_errno); +	} +	return 0; +} + + + +int32_t +afr_fentrylk_cbk (call_frame_t *frame, void *cookie, +		 xlator_t *this, int32_t op_ret, int32_t op_errno) + +{ +	afr_local_t *local = NULL; + +	int call_count = -1; + +	local = frame->local; + +	LOCK (&frame->lock); +	{ +		if (op_ret == 0) +			local->op_ret = 0; + +		local->op_errno = op_errno; +	} +	UNLOCK (&frame->lock); + +	call_count = afr_frame_return (frame); + +	if (call_count == 0) +		AFR_STACK_UNWIND (fentrylk, frame, local->op_ret, +                                  local->op_errno); + +	return 0; +} + + +int32_t +afr_fentrylk (call_frame_t *frame, xlator_t *this, +              const char *volume, fd_t *fd, +	      const char *basename, entrylk_cmd cmd, entrylk_type type) +{ +	afr_private_t *priv = NULL; +	afr_local_t *local  = NULL; + +	int ret = -1; + +	int i = 0; +	int32_t call_count = 0; +	int32_t op_ret   = -1; +	int32_t op_errno = 0; + +	VALIDATE_OR_GOTO (frame, out); +	VALIDATE_OR_GOTO (this, out); +	VALIDATE_OR_GOTO (this->private, out); + +	priv = this->private; + +	ALLOC_OR_GOTO (local, afr_local_t, out); + +	ret = AFR_LOCAL_INIT (local, priv); +	if (ret < 0) { +		op_errno = -ret; +		goto out; +	} + +	call_count = local->call_count; +	frame->local = local; + +	for (i = 0; i < priv->child_count; i++) { +		if (local->child_up[i]) { +			STACK_WIND (frame, afr_fentrylk_cbk, +				    priv->children[i], +				    priv->children[i]->fops->fentrylk, +				    volume, fd, basename, cmd, type); + +			if (!--call_count) +				break; +		} +	} + +	op_ret = 0; +out: +	if (op_ret == -1) { +		AFR_STACK_UNWIND (fentrylk, frame, op_ret, op_errno); +	} +	return 0; +} + +int32_t +afr_statfs_cbk (call_frame_t *frame, void *cookie, +		xlator_t *this, int32_t op_ret, int32_t op_errno, +		struct statvfs *statvfs) +{ +	afr_local_t *local = NULL; + +	int call_count = 0; + +	LOCK (&frame->lock); +	{ +		local = frame->local; + +		if (op_ret == 0) { +			local->op_ret   = op_ret; + +			if (local->cont.statfs.buf_set) { +				if (statvfs->f_bavail < local->cont.statfs.buf.f_bavail) +					local->cont.statfs.buf = *statvfs; +			} else { +				local->cont.statfs.buf = *statvfs; +				local->cont.statfs.buf_set = 1; +			} +		} + +		if (op_ret == -1) +			local->op_errno = op_errno; + +	} +	UNLOCK (&frame->lock); + +	call_count = afr_frame_return (frame); + +	if (call_count == 0) +		AFR_STACK_UNWIND (statfs, frame, local->op_ret, local->op_errno, +				  &local->cont.statfs.buf); + +	return 0; +} + + +int32_t +afr_statfs (call_frame_t *frame, xlator_t *this, +	    loc_t *loc) +{ +	afr_private_t *  priv        = NULL; +	int              child_count = 0; +	afr_local_t   *  local       = NULL; +	int              i           = 0; + +	int ret = -1; +	int              call_count = 0; +	int32_t          op_ret      = -1; +	int32_t          op_errno    = 0; + +	VALIDATE_OR_GOTO (this, out); +	VALIDATE_OR_GOTO (this->private, out); +	VALIDATE_OR_GOTO (loc, out); + +	priv = this->private; +	child_count = priv->child_count; + +	ALLOC_OR_GOTO (local, afr_local_t, out); + +	ret = AFR_LOCAL_INIT (local, priv); +	if (ret < 0) { +		op_errno = -ret; +		goto out; +	} + +	frame->local = local; +	call_count = local->call_count; + +	for (i = 0; i < child_count; i++) { +		if (local->child_up[i]) { +			STACK_WIND (frame, afr_statfs_cbk, +				    priv->children[i], +				    priv->children[i]->fops->statfs, +				    loc); +			if (!--call_count) +				break; +		} +	} + +	op_ret = 0; +out: +	if (op_ret == -1) { +		AFR_STACK_UNWIND (statfs, frame, op_ret, op_errno, NULL); +	} +	return 0; +} + + +int32_t +afr_lk_unlock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +		   int32_t op_ret, int32_t op_errno, struct flock *lock) +{ +	afr_local_t * local = NULL; + +	int call_count = -1; + +	local = frame->local; +	call_count = afr_frame_return (frame); + +	if (call_count == 0) +		AFR_STACK_UNWIND (lk, frame, local->op_ret, local->op_errno, +				  lock); + +	return 0; +} + + +int32_t +afr_lk_unlock (call_frame_t *frame, xlator_t *this) +{ +	afr_local_t   * local = NULL; +	afr_private_t * priv  = NULL; + +	int i; +	int call_count = 0; + +	local = frame->local; +	priv  = this->private; + +	call_count = afr_locked_nodes_count (local->cont.lk.locked_nodes, +					     priv->child_count); + +	if (call_count == 0) { +		AFR_STACK_UNWIND (lk, frame, local->op_ret, local->op_errno, +				  &local->cont.lk.flock); +		return 0; +	} + +	local->call_count = call_count; + +	local->cont.lk.flock.l_type = F_UNLCK; + +	for (i = 0; i < priv->child_count; i++) { +		if (local->cont.lk.locked_nodes[i]) { +			STACK_WIND (frame, afr_lk_unlock_cbk, +				    priv->children[i], +				    priv->children[i]->fops->lk, +				    local->fd, F_SETLK, +				    &local->cont.lk.flock); + +			if (!--call_count) +				break; +		} +	} + +	return 0; +} + + +int32_t +afr_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +	    int32_t op_ret, int32_t op_errno, struct flock *lock) +{ +	afr_local_t *local = NULL; +	afr_private_t *priv = NULL; + +	int call_count  = -1; +	int child_index = -1; + +	local = frame->local; +	priv  = this->private; + +	child_index = (long) cookie; + +	call_count = --local->call_count; + +	if (!child_went_down (op_ret, op_errno) && (op_ret == -1)) { +		local->op_ret   = -1; +		local->op_errno = op_errno; + +		afr_lk_unlock (frame, this); +		return 0; +	} + +	if (op_ret == 0) { +		local->op_ret        = 0; +		local->op_errno      = 0; +		local->cont.lk.locked_nodes[child_index] = 1; +		local->cont.lk.flock = *lock; +	} + +	child_index++; + +	if (child_index < priv->child_count) { +		STACK_WIND_COOKIE (frame, afr_lk_cbk, (void *) (long) child_index, +				   priv->children[child_index], +				   priv->children[child_index]->fops->lk, +				   local->fd, local->cont.lk.cmd, +				   &local->cont.lk.flock); +	} else if (local->op_ret == -1) { +		/* all nodes have gone down */ + +		AFR_STACK_UNWIND (lk, frame, -1, ENOTCONN, &local->cont.lk.flock); +	} else { +		/* locking has succeeded on all nodes that are up */ + +		AFR_STACK_UNWIND (lk, frame, local->op_ret, local->op_errno, +                                  &local->cont.lk.flock); +	} + +	return 0; +} + + +int +afr_lk (call_frame_t *frame, xlator_t *this, +	fd_t *fd, int32_t cmd, +	struct flock *flock) +{ +	afr_private_t *priv = NULL; +	afr_local_t *local = NULL; + +	int i = 0; + +	int32_t op_ret   = -1; +	int32_t op_errno = 0; + +	VALIDATE_OR_GOTO (frame, out); +	VALIDATE_OR_GOTO (this, out); +	VALIDATE_OR_GOTO (this->private, out); + +	priv = this->private; + +	ALLOC_OR_GOTO (local, afr_local_t, out); +	AFR_LOCAL_INIT (local, priv); + +	frame->local  = local; + +	local->cont.lk.locked_nodes = GF_CALLOC (priv->child_count, +					      sizeof (*local->cont.lk.locked_nodes), +                                              gf_afr_mt_char); + +	if (!local->cont.lk.locked_nodes) { +		gf_log (this->name, GF_LOG_ERROR, "Out of memory"); +		op_errno = ENOMEM; +		goto out; +	} + +	local->fd            = fd_ref (fd); +	local->cont.lk.cmd   = cmd; +	local->cont.lk.flock = *flock; + +	STACK_WIND_COOKIE (frame, afr_lk_cbk, (void *) (long) 0, +			   priv->children[i], +			   priv->children[i]->fops->lk, +			   fd, cmd, flock); + +	op_ret = 0; +out: +	if (op_ret == -1) { +		AFR_STACK_UNWIND (lk, frame, op_ret, op_errno, NULL); +	} +	return 0; +} + +int +afr_priv_dump (xlator_t *this) +{ +	afr_private_t *priv = NULL; +        char  key_prefix[GF_DUMP_MAX_BUF_LEN]; +        char  key[GF_DUMP_MAX_BUF_LEN]; +        int   i = 0; + + +        assert(this); +	priv = this->private; + +        assert(priv); +        snprintf(key_prefix, GF_DUMP_MAX_BUF_LEN, "%s.%s", this->type, this->name); +        gf_proc_dump_add_section(key_prefix); +        gf_proc_dump_build_key(key, key_prefix, "child_count"); +        gf_proc_dump_write(key, "%u", priv->child_count); +        gf_proc_dump_build_key(key, key_prefix, "read_child_rr"); +        gf_proc_dump_write(key, "%u", priv->read_child_rr); +	for (i = 0; i < priv->child_count; i++) { +                gf_proc_dump_build_key(key, key_prefix, "child_up[%d]", i); +                gf_proc_dump_write(key, "%d", priv->child_up[i]); +                gf_proc_dump_build_key(key, key_prefix, +                                        "pending_key[%d]", i); +                gf_proc_dump_write(key, "%s", priv->pending_key[i]); +        } +        gf_proc_dump_build_key(key, key_prefix, "data_self_heal"); +        gf_proc_dump_write(key, "%d", priv->data_self_heal); +        gf_proc_dump_build_key(key, key_prefix, "metadata_self_heal"); +        gf_proc_dump_write(key, "%d", priv->metadata_self_heal); +        gf_proc_dump_build_key(key, key_prefix, "entry_self_heal"); +        gf_proc_dump_write(key, "%d", priv->entry_self_heal); +        gf_proc_dump_build_key(key, key_prefix, "data_change_log"); +        gf_proc_dump_write(key, "%d", priv->data_change_log); +        gf_proc_dump_build_key(key, key_prefix, "metadata_change_log"); +        gf_proc_dump_write(key, "%d", priv->metadata_change_log); +        gf_proc_dump_build_key(key, key_prefix, "entry_change_log"); +        gf_proc_dump_write(key, "%d", priv->entry_change_log); +        gf_proc_dump_build_key(key, key_prefix, "read_child"); +        gf_proc_dump_write(key, "%d", priv->read_child); +        gf_proc_dump_build_key(key, key_prefix, "favorite_child"); +        gf_proc_dump_write(key, "%u", priv->favorite_child); +        gf_proc_dump_build_key(key, key_prefix, "data_lock_server_count"); +        gf_proc_dump_write(key, "%u", priv->data_lock_server_count); +        gf_proc_dump_build_key(key, key_prefix, "metadata_lock_server_count"); +        gf_proc_dump_write(key, "%u", priv->metadata_lock_server_count); +        gf_proc_dump_build_key(key, key_prefix, "entry_lock_server_count"); +        gf_proc_dump_write(key, "%u", priv->entry_lock_server_count); +        gf_proc_dump_build_key(key, key_prefix, "wait_count"); +        gf_proc_dump_write(key, "%u", priv->wait_count); + +        return 0; +} + + +/** + * find_child_index - find the child's index in the array of subvolumes + * @this: AFR + * @child: child + */ + +static int +find_child_index (xlator_t *this, xlator_t *child) +{ +	afr_private_t *priv = NULL; + +	int i = -1; + +	priv = this->private; + +	for (i = 0; i < priv->child_count; i++) { +		if ((xlator_t *) child == priv->children[i]) +			break; +	} + +	return i; +} + +int32_t +afr_notify (xlator_t *this, int32_t event, +            void *data, ...) +{ +	afr_private_t *     priv     = NULL; +	unsigned char *     child_up = NULL; + +	int i           = -1; +	int up_children = 0; + +	priv = this->private; + +	if (!priv) +		return 0; + +	child_up = priv->child_up; + +	switch (event) { +	case GF_EVENT_CHILD_UP: +		i = find_child_index (this, data); + +		child_up[i] = 1; + +                LOCK (&priv->lock); +                { +                        priv->up_count++; +                } +                UNLOCK (&priv->lock); + +		/* +		   if all the children were down, and one child came up, +		   send notify to parent +		*/ + +		for (i = 0; i < priv->child_count; i++) +			if (child_up[i]) +				up_children++; + +		if (up_children == 1) { +                        gf_log (this->name, GF_LOG_NORMAL, +                                "Subvolume '%s' came back up; " +                                "going online.", ((xlator_t *)data)->name); + +			default_notify (this, event, data); +                } + +		break; + +	case GF_EVENT_CHILD_DOWN: +		i = find_child_index (this, data); + +		child_up[i] = 0; + +                LOCK (&priv->lock); +                { +                        priv->down_count++; +                } +                UNLOCK (&priv->lock); + +		/* +		   if all children are down, and this was the last to go down, +		   send notify to parent +		*/ + +		for (i = 0; i < priv->child_count; i++) +			if (child_up[i]) +				up_children++; + +		if (up_children == 0) { +                        gf_log (this->name, GF_LOG_ERROR, +                                "All subvolumes are down. Going offline " +                                "until atleast one of them comes back up."); + +			default_notify (this, event, data); +                } + +		break; + +	default: +		default_notify (this, event, data); +	} + +	return 0; + +} diff --git a/xlators/cluster/afr/src/afr-dir-read.c b/xlators/cluster/afr/src/afr-dir-read.c index 344bdf74ece..6125ac0dd05 100644 --- a/xlators/cluster/afr/src/afr-dir-read.c +++ b/xlators/cluster/afr/src/afr-dir-read.c @@ -228,11 +228,14 @@ afr_opendir_cbk (call_frame_t *frame, void *cookie,  		 xlator_t *this, int32_t op_ret, int32_t op_errno,  		 fd_t *fd)  { +        afr_private_t *priv = NULL;  	afr_local_t * local  = NULL;  	int call_count = -1;          int ret        = 0; +        priv = this->private; +  	LOCK (&frame->lock);  	{  		local = frame->local; diff --git a/xlators/cluster/afr/src/afr-inode-read.c b/xlators/cluster/afr/src/afr-inode-read.c index ef72fb19779..74f3d9ddb0e 100644 --- a/xlators/cluster/afr/src/afr-inode-read.c +++ b/xlators/cluster/afr/src/afr-inode-read.c @@ -656,7 +656,6 @@ out:  	return 0;  } -  int32_t  afr_getxattr (call_frame_t *frame, xlator_t *this,  	      loc_t *loc, const char *name) @@ -671,6 +670,7 @@ afr_getxattr (call_frame_t *frame, xlator_t *this,  	int32_t op_ret   = -1;  	int32_t op_errno = 0; +  	VALIDATE_OR_GOTO (frame, out);  	VALIDATE_OR_GOTO (this, out);  	VALIDATE_OR_GOTO (this->private, out); @@ -690,6 +690,7 @@ afr_getxattr (call_frame_t *frame, xlator_t *this,                          op_errno = ENODATA;                          goto out;                  } +          }          read_child = afr_read_child (this, loc->inode); diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c index eff01da8610..f02b5fe1e6d 100644 --- a/xlators/cluster/afr/src/afr-inode-write.c +++ b/xlators/cluster/afr/src/afr-inode-write.c @@ -47,7 +47,6 @@  #include "afr.h"  #include "afr-transaction.h" -  /* {{{ writev */  int @@ -1366,11 +1365,10 @@ afr_setxattr_done (call_frame_t *frame, xlator_t *this)  	local->transaction.unwind (frame, this);  	AFR_STACK_DESTROY (frame); -	 +  	return 0;  } -  int  afr_setxattr (call_frame_t *frame, xlator_t *this,  	      loc_t *loc, dict_t *dict, int32_t flags) @@ -1390,13 +1388,6 @@ afr_setxattr (call_frame_t *frame, xlator_t *this,  	priv = this->private; -	transaction_frame = copy_frame (frame); -	if (!transaction_frame) { -		gf_log (this->name, GF_LOG_ERROR, -			"Out of memory."); -		goto out; -	} -  	ALLOC_OR_GOTO (local, afr_local_t, out);  	ret = AFR_LOCAL_INIT (local, priv); @@ -1405,6 +1396,13 @@ afr_setxattr (call_frame_t *frame, xlator_t *this,  		goto out;  	} +	transaction_frame = copy_frame (frame); +	if (!transaction_frame) { +		gf_log (this->name, GF_LOG_ERROR, +			"Out of memory."); +		goto out; +	} +  	transaction_frame->local = local;  	local->op_ret = -1; diff --git a/xlators/cluster/afr/src/afr-mem-types.h b/xlators/cluster/afr/src/afr-mem-types.h index 27117c1848c..3720d85e61b 100644 --- a/xlators/cluster/afr/src/afr-mem-types.h +++ b/xlators/cluster/afr/src/afr-mem-types.h @@ -40,6 +40,7 @@ enum gf_afr_mem_types_ {          gf_afr_mt_uint8_t,          gf_afr_mt_loc_t,          gf_afr_mt_entry_name, +        gf_afr_mt_pump_priv,          gf_afr_mt_end  };  #endif diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c index e3e484aabbe..61fe89dfe42 100644 --- a/xlators/cluster/afr/src/afr-self-heal-common.c +++ b/xlators/cluster/afr/src/afr-self-heal-common.c @@ -25,7 +25,7 @@  #include "afr-transaction.h"  #include "afr-self-heal-common.h"  #include "afr-self-heal.h" - +#include "pump.h"  /**   * select_source - select a source and return it @@ -1536,7 +1536,6 @@ afr_self_heal_completion_cbk (call_frame_t *bgsh_frame, xlator_t *this)  	return 0;  } -  int  afr_self_heal (call_frame_t *frame, xlator_t *this)  { @@ -1548,9 +1547,12 @@ afr_self_heal (call_frame_t *frame, xlator_t *this)          call_frame_t *sh_frame = NULL;          afr_local_t  *sh_local = NULL; +  	local = frame->local;  	priv  = this->private; +        afr_set_lk_owner (frame, this); +          if (local->self_heal.background) {                  LOCK (&priv->lock);                  { diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c index 2d689e389bc..51d1455f455 100644 --- a/xlators/cluster/afr/src/afr-transaction.c +++ b/xlators/cluster/afr/src/afr-transaction.c @@ -1376,7 +1376,7 @@ int32_t afr_lock (call_frame_t *frame, xlator_t *this)          afr_pid_save (frame);          frame->root->pid = (long) frame->root; - +        afr_set_lk_owner (frame, this);  	return afr_lock_rec (frame, this, 0);  } diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c index b795ea1d17f..629e1875e6e 100644 --- a/xlators/cluster/afr/src/afr.c +++ b/xlators/cluster/afr/src/afr.c @@ -28,2591 +28,17 @@  #define _CONFIG_H  #include "config.h"  #endif - -#include "glusterfs.h" -#include "afr.h" -#include "dict.h" -#include "xlator.h" -#include "hashfn.h" -#include "logging.h" -#include "stack.h" -#include "list.h" -#include "call-stub.h" -#include "defaults.h" -#include "common-utils.h" -#include "compat-errno.h" -#include "compat.h" -#include "byte-order.h" -#include "statedump.h" - -#include "fd.h" - -#include "afr-inode-read.h" -#include "afr-inode-write.h" -#include "afr-dir-read.h" -#include "afr-dir-write.h" -#include "afr-transaction.h" -#include "afr-self-heal.h" -#include "afr-self-heal-common.h" - -#define AFR_ICTX_OPENDIR_DONE_MASK     0x0000000200000000ULL -#define AFR_ICTX_SPLIT_BRAIN_MASK      0x0000000100000000ULL -#define AFR_ICTX_READ_CHILD_MASK       0x00000000FFFFFFFFULL - - -uint64_t -afr_is_split_brain (xlator_t *this, inode_t *inode) -{ -        int ret = 0; - -        uint64_t ctx         = 0; -        uint64_t split_brain = 0; - -        VALIDATE_OR_GOTO (inode, out); - -        LOCK (&inode->lock); -        { -                ret = __inode_ctx_get (inode, this, &ctx); - -                if (ret < 0) -                        goto unlock; -                 -                split_brain = ctx & AFR_ICTX_SPLIT_BRAIN_MASK; -        } -unlock: -        UNLOCK (&inode->lock); - -out: -        return split_brain; -} - - -void -afr_set_split_brain (xlator_t *this, inode_t *inode, gf_boolean_t set) -{ -        uint64_t ctx = 0; -        int      ret = 0; - -        VALIDATE_OR_GOTO (inode, out); - -        LOCK (&inode->lock); -        { -                ret = __inode_ctx_get (inode, this, &ctx); - -                if (ret < 0) { -                        ctx = 0; -                } - -                if (set) { -			ctx = (~AFR_ICTX_SPLIT_BRAIN_MASK & ctx) -                                | (0xFFFFFFFFFFFFFFFFULL & AFR_ICTX_SPLIT_BRAIN_MASK); -		} else { -			ctx = (~AFR_ICTX_SPLIT_BRAIN_MASK & ctx); -		} -                __inode_ctx_put (inode, this, ctx); -        } -        UNLOCK (&inode->lock); -out: -        return; -} - - -uint64_t -afr_is_opendir_done (xlator_t *this, inode_t *inode) -{ -        int ret = 0; - -        uint64_t ctx          = 0; -        uint64_t opendir_done = 0; - -        VALIDATE_OR_GOTO (inode, out); - -        LOCK (&inode->lock); -        { -                ret = __inode_ctx_get (inode, this, &ctx); - -                if (ret < 0) -                        goto unlock; -                 -                opendir_done = ctx & AFR_ICTX_OPENDIR_DONE_MASK; -        } -unlock: -        UNLOCK (&inode->lock); - -out: -        return opendir_done; -} - - -void -afr_set_opendir_done (xlator_t *this, inode_t *inode) -{ -        uint64_t ctx = 0; -        int      ret = 0; - -        VALIDATE_OR_GOTO (inode, out); - -        LOCK (&inode->lock); -        { -                ret = __inode_ctx_get (inode, this, &ctx); - -                if (ret < 0) { -                        ctx = 0; -                } - -                ctx = (~AFR_ICTX_OPENDIR_DONE_MASK & ctx) -                        | (0xFFFFFFFFFFFFFFFFULL & AFR_ICTX_OPENDIR_DONE_MASK); - -                __inode_ctx_put (inode, this, ctx); -        } -        UNLOCK (&inode->lock); -out: -        return; -} - - -uint64_t -afr_read_child (xlator_t *this, inode_t *inode) -{ -        int ret = 0; - -        uint64_t ctx         = 0; -        uint64_t read_child  = 0; - -        VALIDATE_OR_GOTO (inode, out); - -        LOCK (&inode->lock); -        { -                ret = __inode_ctx_get (inode, this, &ctx); - -                if (ret < 0) -                        goto unlock; -                 -                read_child = ctx & AFR_ICTX_READ_CHILD_MASK; -        } -unlock: -        UNLOCK (&inode->lock); - -out: -        return read_child; -} - - -void -afr_set_read_child (xlator_t *this, inode_t *inode, int32_t read_child) -{ -        uint64_t ctx = 0; -        int      ret = 0; - -        VALIDATE_OR_GOTO (inode, out); - -        LOCK (&inode->lock); -        { -                ret = __inode_ctx_get (inode, this, &ctx); - -                if (ret < 0) { -                        ctx = 0; -                } - -                ctx = (~AFR_ICTX_READ_CHILD_MASK & ctx)  -                        | (AFR_ICTX_READ_CHILD_MASK & read_child); - -                __inode_ctx_put (inode, this, ctx); -        } -        UNLOCK (&inode->lock); -         -out: -        return; -} - - -/** - * afr_local_cleanup - cleanup everything in frame->local - */ - -void -afr_local_sh_cleanup (afr_local_t *local, xlator_t *this) -{ -	afr_self_heal_t *sh = NULL; -	afr_private_t   *priv = NULL; -	int              i = 0; - - -	sh = &local->self_heal; -	priv = this->private; - -	if (sh->buf) -		GF_FREE (sh->buf); - -	if (sh->xattr) { -		for (i = 0; i < priv->child_count; i++) { -			if (sh->xattr[i]) { -				dict_unref (sh->xattr[i]); -				sh->xattr[i] = NULL; -			} -		} -		GF_FREE (sh->xattr); -	} - -	if (sh->child_errno) -		GF_FREE (sh->child_errno); - -	if (sh->pending_matrix) { -		for (i = 0; i < priv->child_count; i++) { -			GF_FREE (sh->pending_matrix[i]); -		} -		GF_FREE (sh->pending_matrix); -	} - -	if (sh->delta_matrix) { -		for (i = 0; i < priv->child_count; i++) { -			GF_FREE (sh->delta_matrix[i]); -		} -		GF_FREE (sh->delta_matrix); -	} - -	if (sh->sources) -		GF_FREE (sh->sources); - -	if (sh->success) -		GF_FREE (sh->success); - -	if (sh->locked_nodes) -		GF_FREE (sh->locked_nodes); - -	if (sh->healing_fd && !sh->healing_fd_opened) { -		fd_unref (sh->healing_fd); -		sh->healing_fd = NULL; -	} - -        if (sh->linkname) -                GF_FREE ((char *)sh->linkname); - -	loc_wipe (&sh->parent_loc); -} - - -void -afr_local_transaction_cleanup (afr_local_t *local, xlator_t *this) -{ -        int             i = 0; -        afr_private_t * priv = NULL; - -        priv = this->private; - -        for (i = 0; i < priv->child_count; i++) { -                if (local->pending && local->pending[i]) -                        GF_FREE (local->pending[i]); -        } - -        GF_FREE (local->pending); - -	GF_FREE (local->transaction.locked_nodes); -	GF_FREE (local->transaction.child_errno); -	GF_FREE (local->child_errno); - -	GF_FREE (local->transaction.basename); -	GF_FREE (local->transaction.new_basename); - -	loc_wipe (&local->transaction.parent_loc); -	loc_wipe (&local->transaction.new_parent_loc); -} - - -void -afr_local_cleanup (afr_local_t *local, xlator_t *this) -{ -        int i; -        afr_private_t * priv = NULL; - -	if (!local) -		return; - -	afr_local_sh_cleanup (local, this); - -        afr_local_transaction_cleanup (local, this); - -        priv = this->private; - -	loc_wipe (&local->loc); -	loc_wipe (&local->newloc); - -	if (local->fd) -		fd_unref (local->fd); -	 -	if (local->xattr_req) -		dict_unref (local->xattr_req); - -	GF_FREE (local->child_up); - -	{ /* lookup */ -                if (local->cont.lookup.xattrs) { -                        for (i = 0; i < priv->child_count; i++) { -                                if (local->cont.lookup.xattrs[i]) { -                                        dict_unref (local->cont.lookup.xattrs[i]); -                                        local->cont.lookup.xattrs[i] = NULL; -                                } -                        } -                        GF_FREE (local->cont.lookup.xattrs); -                        local->cont.lookup.xattrs = NULL; -                } - -		if (local->cont.lookup.xattr) { -                        dict_unref (local->cont.lookup.xattr); -                } - -                if (local->cont.lookup.inode) { -                        inode_unref (local->cont.lookup.inode); -                } -	} - -	{ /* getxattr */ -		if (local->cont.getxattr.name) -			GF_FREE (local->cont.getxattr.name); -	} - -	{ /* lk */ -		if (local->cont.lk.locked_nodes) -			GF_FREE (local->cont.lk.locked_nodes); -	} - -	{ /* create */ -		if (local->cont.create.fd) -			fd_unref (local->cont.create.fd); -	} - -	{ /* writev */ -		GF_FREE (local->cont.writev.vector); -	} - -	{ /* setxattr */ -		if (local->cont.setxattr.dict) -			dict_unref (local->cont.setxattr.dict); -	} - -	{ /* removexattr */ -		GF_FREE (local->cont.removexattr.name); -	} - -	{ /* symlink */ -		GF_FREE (local->cont.symlink.linkpath); -	} - -        { /* opendir */ -                if (local->cont.opendir.checksum) -                        GF_FREE (local->cont.opendir.checksum); -        } -} - - -int -afr_frame_return (call_frame_t *frame) -{ -	afr_local_t *local = NULL; -	int          call_count = 0; - -	local = frame->local; - -	LOCK (&frame->lock); -	{ -		call_count = --local->call_count; -	} -	UNLOCK (&frame->lock); - -	return call_count; -} - - -/** - * up_children_count - return the number of children that are up - */ - -int -afr_up_children_count (int child_count, unsigned char *child_up) -{ -	int i   = 0; -	int ret = 0; - -	for (i = 0; i < child_count; i++) -		if (child_up[i]) -			ret++; -	return ret; -} - - -int -afr_locked_nodes_count (unsigned char *locked_nodes, int child_count) -{ -	int ret = 0; -	int i; - -	for (i = 0; i < child_count; i++) -		if (locked_nodes[i]) -			ret++; - -	return ret; -} - - -ino64_t -afr_itransform (ino64_t ino, int child_count, int child_index) -{ -	ino64_t scaled_ino = -1; - -	if (ino == ((uint64_t) -1)) { -		scaled_ino = ((uint64_t) -1); -		goto out; -	} - -	scaled_ino = (ino * child_count) + child_index; - -out: -	return scaled_ino; -} - - -int -afr_deitransform_orig (ino64_t ino, int child_count) -{ -	int index = -1; - -	index = ino % child_count; - -	return index; -} - - -int -afr_deitransform (ino64_t ino, int child_count) -{ -	return 0; -} - - -int -afr_self_heal_lookup_unwind (call_frame_t *frame, xlator_t *this) -{ -	afr_local_t *local = NULL; - -	local = frame->local; - -	if (local->govinda_gOvinda) { -                afr_set_split_brain (this, local->cont.lookup.inode, _gf_true); -	} - -	AFR_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno, -			  local->cont.lookup.inode, -			  &local->cont.lookup.buf, -			  local->cont.lookup.xattr, -                          &local->cont.lookup.postparent); - -	return 0; -} - - -static void -afr_lookup_collect_xattr (afr_local_t *local, xlator_t *this, -                          int child_index, dict_t *xattr) -{ -	uint32_t        open_fd_count = 0; -        uint32_t        inodelk_count = 0; -        uint32_t        entrylk_count = 0; - -        int ret = 0; - -        if (afr_sh_has_metadata_pending (xattr, child_index, this)) { -                local->self_heal.need_metadata_self_heal = _gf_true; -                gf_log(this->name, GF_LOG_DEBUG, -                       "metadata self-heal is pending for %s.", -                       local->loc.path); -        } - -        if (afr_sh_has_entry_pending (xattr, child_index, this)) { -                local->self_heal.need_entry_self_heal = _gf_true; -                gf_log(this->name, GF_LOG_DEBUG, -                       "entry self-heal is pending for %s.", local->loc.path); -        } - -        if (afr_sh_has_data_pending (xattr, child_index, this)) { -                local->self_heal.need_data_self_heal = _gf_true; -                gf_log(this->name, GF_LOG_DEBUG, -                       "data self-heal is pending for %s.", local->loc.path); -        } - -        ret = dict_get_uint32 (xattr, GLUSTERFS_OPEN_FD_COUNT, -                               &open_fd_count); -        if (ret == 0) -                local->open_fd_count += open_fd_count; - -        ret = dict_get_uint32 (xattr, GLUSTERFS_INODELK_COUNT, -                               &inodelk_count); -        if (ret == 0) -                local->inodelk_count += inodelk_count; - -        ret = dict_get_uint32 (xattr, GLUSTERFS_ENTRYLK_COUNT, -                               &entrylk_count); -        if (ret == 0) -                local->entrylk_count += entrylk_count; -} - - -static void -afr_lookup_self_heal_check (xlator_t *this, afr_local_t *local, -                            struct iatt *buf, struct iatt *lookup_buf) -{ -        if (FILETYPE_DIFFERS (buf, lookup_buf)) { -                /* mismatching filetypes with same name -                */ - -                gf_log (this->name, GF_LOG_NORMAL, -                        "filetype differs for %s ", local->loc.path); - -                local->govinda_gOvinda = 1; -        } - -        if (PERMISSION_DIFFERS (buf, lookup_buf)) { -                /* mismatching permissions */ -                gf_log (this->name, GF_LOG_NORMAL, -                        "permissions differ for %s ", local->loc.path); -                local->self_heal.need_metadata_self_heal = _gf_true; -        } - -        if (OWNERSHIP_DIFFERS (buf, lookup_buf)) { -                /* mismatching permissions */ -                local->self_heal.need_metadata_self_heal = _gf_true; -                gf_log (this->name, GF_LOG_NORMAL, -                        "ownership differs for %s ", local->loc.path); -        } - -        if (SIZE_DIFFERS (buf, lookup_buf) -            && IA_ISREG (buf->ia_type)) { -                gf_log (this->name, GF_LOG_NORMAL, -                        "size differs for %s ", local->loc.path); -                local->self_heal.need_data_self_heal = _gf_true; -        } - -} - - -static void -afr_lookup_done (call_frame_t *frame, xlator_t *this, struct iatt *lookup_buf) -{ -        int  unwind = 1; -        int  source = -1; -        char sh_type_str[256] = {0,}; - -        afr_local_t *local = NULL; - -        local = frame->local; - -        local->cont.lookup.postparent.ia_ino  = local->cont.lookup.parent_ino; - -        if (local->cont.lookup.ino) { -                local->cont.lookup.buf.ia_ino = local->cont.lookup.ino; -                local->cont.lookup.buf.ia_gen = local->cont.lookup.gen; -        } - -        if (local->op_ret == 0) { -                /* KLUDGE: assuming DHT will not itransform in -                   revalidate */ -                if (local->cont.lookup.inode->ino) { -                        local->cont.lookup.buf.ia_ino = -                                local->cont.lookup.inode->ino; -                        local->cont.lookup.buf.ia_gen = -                                local->cont.lookup.inode->generation; -                } -        } - -        if (local->success_count && local->enoent_count) { -                local->self_heal.need_metadata_self_heal = _gf_true; -                local->self_heal.need_data_self_heal     = _gf_true; -                local->self_heal.need_entry_self_heal    = _gf_true; -                gf_log(this->name, GF_LOG_NORMAL, -                       "entries are missing in lookup of %s.", -                       local->loc.path); -        } - -        if (local->success_count) { -                /* check for split-brain case in previous lookup */ -                if (afr_is_split_brain (this, -                                        local->cont.lookup.inode)) { -                        local->self_heal.need_data_self_heal = _gf_true; -                        gf_log(this->name, GF_LOG_NORMAL, -                               "split brain detected during lookup of " -                               "%s.", local->loc.path); -                } -        } - -        if ((local->self_heal.need_metadata_self_heal -             || local->self_heal.need_data_self_heal -             || local->self_heal.need_entry_self_heal) -            && ((!local->cont.lookup.is_revalidate) -                || (local->op_ret != -1))) { - -                if (local->open_fd_count -                    || local->inodelk_count -                    || local->entrylk_count) { - -                        /* Someone else is doing self-heal on this file. -                           So just make a best effort to set the read-subvolume -                           and return */ - -                        if (IA_ISREG (local->cont.lookup.inode->ia_type)) { -                                source = afr_self_heal_get_source (this, local, local->cont.lookup.xattrs); - -                                if (source >= 0) { -                                        afr_set_read_child (this, -                                                            local->cont.lookup.inode, -                                                            source); -                                } -                        } -                } else { -                        if (!local->cont.lookup.inode->ia_type) { -                                /* fix for RT #602 */ -                                local->cont.lookup.inode->ia_type = -                                        lookup_buf->ia_type; -                        } - -                        local->self_heal.background = _gf_true; -                        local->self_heal.type       = local->cont.lookup.buf.ia_type; -                        local->self_heal.unwind     = afr_self_heal_lookup_unwind; - -                        unwind = 0; - -                        afr_self_heal_type_str_get(&local->self_heal, -                                                   sh_type_str, -                                                   sizeof(sh_type_str)); - -                        gf_log (this->name, GF_LOG_NORMAL, "background %s " -                                "self-heal triggered. path: %s", -                                sh_type_str, local->loc.path); - -                        afr_self_heal (frame, this); -                } -        } - -        if (unwind) { -                AFR_STACK_UNWIND (lookup, frame, local->op_ret, -                                  local->op_errno, -                                  local->cont.lookup.inode, -                                  &local->cont.lookup.buf, -                                  local->cont.lookup.xattr, -                                  &local->cont.lookup.postparent); -        } -} - - -/* - * During a lookup, some errors are more "important" than - * others in that they must be given higher priority while - * returning to the user. - * - * The hierarchy is ESTALE > ENOENT > others - * - */ - -static gf_boolean_t -__error_more_important (int32_t old_errno, int32_t new_errno) -{ -        gf_boolean_t ret = _gf_true; - -        /* Nothing should ever overwrite ESTALE */ -        if (old_errno == ESTALE) -                ret = _gf_false; - -        /* Nothing should overwrite ENOENT, except ESTALE */ -        else if ((old_errno == ENOENT) && (new_errno != ESTALE)) -                ret = _gf_false; - -        return ret; -} - - -int -afr_fresh_lookup_cbk (call_frame_t *frame, void *cookie, -                      xlator_t *this,  int32_t op_ret,	int32_t op_errno, -                      inode_t *inode,	struct iatt *buf, dict_t *xattr, -                      struct iatt *postparent) -{ -	afr_local_t *   local = NULL; -	afr_private_t * priv  = NULL; -	struct iatt *   lookup_buf = NULL; - -	int             call_count      = -1; -	int             child_index     = -1; -        int             first_up_child  = -1; - -	child_index = (long) cookie; -	priv = this->private; - -	LOCK (&frame->lock); -	{ -		local = frame->local; - -                lookup_buf = &local->cont.lookup.buf; - -		if (op_ret == -1) { -			if (op_errno == ENOENT) -				local->enoent_count++; - -                        if (__error_more_important (local->op_errno, op_errno)) -                                local->op_errno = op_errno; - -                        if (local->op_errno == ESTALE) { -                                local->op_ret = -1; -                        } - -                        goto unlock; -		} - -                afr_lookup_collect_xattr (local, this, child_index, xattr); - -                first_up_child = afr_first_up_child (priv); - -                if (child_index == first_up_child) { -                        local->cont.lookup.ino = -                                afr_itransform (buf->ia_ino, -                                                priv->child_count, -                                                first_up_child); -                        local->cont.lookup.gen = buf->ia_gen; -                } - -		if (local->success_count == 0) { -                        if (local->op_errno != ESTALE) -                                local->op_ret = op_ret; - -			local->cont.lookup.inode               = inode_ref (inode); -			local->cont.lookup.xattr               = dict_ref (xattr); -			local->cont.lookup.xattrs[child_index] = dict_ref (xattr); -                        local->cont.lookup.postparent          = *postparent; - -                        *lookup_buf = *buf; - -                        lookup_buf->ia_ino = afr_itransform (buf->ia_ino, -                                                             priv->child_count, -                                                             child_index); -                        if (priv->read_child >= 0) { -                                afr_set_read_child (this, -                                                    local->cont.lookup.inode, -                                                    priv->read_child); -                        } else { -                                afr_set_read_child (this, -                                                    local->cont.lookup.inode, -                                                    child_index); -                        } - -		} else { -                        afr_lookup_self_heal_check (this, local, buf, lookup_buf); - -                        if (child_index == local->read_child_index) { -                                /* -                                   lookup has succeeded on the read child. -                                   So use its inode number -                                */ -                                if (local->cont.lookup.xattr) -                                        dict_unref (local->cont.lookup.xattr); - -                                local->cont.lookup.xattr = dict_ref (xattr); -                                local->cont.lookup.xattrs[child_index] = dict_ref (xattr); -                                local->cont.lookup.postparent          = *postparent; - -                                *lookup_buf = *buf; - -                                if (priv->read_child >= 0) { -                                        afr_set_read_child (this, -                                                            local->cont.lookup.inode, -                                                            priv->read_child); -                                } else { -                                        afr_set_read_child (this, -                                                            local->cont.lookup.inode, -                                                            local->read_child_index); -                                } -                        } - -		} - -		local->success_count++; -	} -unlock: -	UNLOCK (&frame->lock); - -	call_count = afr_frame_return (frame); - -	if (call_count == 0) { -                afr_lookup_done (frame, this, lookup_buf); -	} - -	return 0; -} - - -int -afr_revalidate_lookup_cbk (call_frame_t *frame, void *cookie, -                           xlator_t *this, int32_t op_ret, int32_t op_errno, -                           inode_t *inode, struct iatt *buf, dict_t *xattr, -                           struct iatt *postparent) -{ -	afr_local_t *   local = NULL; -	afr_private_t * priv  = NULL; -	struct iatt *   lookup_buf = NULL; -         -	int             call_count      = -1; -	int             child_index     = -1; -        int             first_up_child  = -1; - -	child_index = (long) cookie; -	priv = this->private; - -	LOCK (&frame->lock); -	{ -		local = frame->local; - -		lookup_buf = &local->cont.lookup.buf; - -		if (op_ret == -1) { -			if (op_errno == ENOENT) -				local->enoent_count++; - -                        if (__error_more_important (local->op_errno, op_errno)) -                            local->op_errno = op_errno; - -                            if (local->op_errno == ESTALE) { -                                    local->op_ret = -1; -                            } - -                            goto unlock; -                } - -                afr_lookup_collect_xattr (local, this, child_index, xattr); - -                first_up_child = afr_first_up_child (priv); - -                if (child_index == first_up_child) { -                        local->cont.lookup.ino =  -                                afr_itransform (buf->ia_ino, -                                                priv->child_count, -                                                first_up_child); -                        local->cont.lookup.gen = buf->ia_gen; -                } - -		/* in case of revalidate, we need to send stat of the -		 * child whose stat was sent during the first lookup. -		 * (so that time stamp does not vary with revalidate. -		 * in case it is down, stat of the fist success will -		 * be replied */ - -		/* inode number should be preserved across revalidates */ - -		if (local->success_count == 0) { -                        if (local->op_errno != ESTALE) -                                local->op_ret = op_ret; - -			local->cont.lookup.inode               = inode_ref (inode); -			local->cont.lookup.xattr               = dict_ref (xattr); -			local->cont.lookup.xattrs[child_index] = dict_ref (xattr); -                        local->cont.lookup.postparent          = *postparent; - -			*lookup_buf = *buf; - -                        lookup_buf->ia_ino = afr_itransform (buf->ia_ino, -                                                             priv->child_count, -                                                             child_index); - -                        if (priv->read_child >= 0) { -                                afr_set_read_child (this, -                                                    local->cont.lookup.inode,  -                                                    priv->read_child); -                        } else { -                                afr_set_read_child (this, -                                                    local->cont.lookup.inode,  -                                                    child_index); -                        } - -		} else { -                        afr_lookup_self_heal_check (this, local, buf, lookup_buf); - -                        if (child_index == local->read_child_index) { - -                                /* -                                   lookup has succeeded on the read child. -                                   So use its inode number -                                */ - -                                if (local->cont.lookup.xattr) -                                        dict_unref (local->cont.lookup.xattr); - -                                local->cont.lookup.xattr               = dict_ref (xattr); -                                local->cont.lookup.xattrs[child_index] = dict_ref (xattr); -                                local->cont.lookup.postparent          = *postparent; - -                                *lookup_buf = *buf; - -                                if (priv->read_child >= 0) { -                                        afr_set_read_child (this, -                                                            local->cont.lookup.inode, -                                                            priv->read_child); -                                } else { -                                        afr_set_read_child (this, -                                                            local->cont.lookup.inode, -                                                            local->read_child_index); -                                } -                        } - -		} - -		local->success_count++; -	} -unlock: -	UNLOCK (&frame->lock); - -	call_count = afr_frame_return (frame); - -	if (call_count == 0) { -                afr_lookup_done (frame, this, lookup_buf); -	} - -	return 0; -} - - -int -afr_lookup (call_frame_t *frame, xlator_t *this, -	    loc_t *loc, dict_t *xattr_req) -{ -	afr_private_t *priv = NULL; -	afr_local_t   *local = NULL; -	int            ret = -1; -	int            i = 0; - -        fop_lookup_cbk_t callback; - -        int call_count = 0; - -        uint64_t       ctx; - -	int32_t        op_errno = 0; - -	priv = this->private; - -	ALLOC_OR_GOTO (local, afr_local_t, out); - -	local->op_ret = -1; - -	frame->local = local; - -        if (!strcmp (loc->path, "/" GF_REPLICATE_TRASH_DIR)) { -                op_errno = ENOENT; -                goto out; -        } - -	loc_copy (&local->loc, loc); - -        ret = inode_ctx_get (loc->inode, this, &ctx); -        if (ret == 0) { -                /* lookup is a revalidate */ - -                callback = afr_revalidate_lookup_cbk; - -                local->cont.lookup.is_revalidate = _gf_true; -                local->read_child_index          = afr_read_child (this, -                                                                   loc->inode); -        } else { -                callback = afr_fresh_lookup_cbk; - -                LOCK (&priv->read_child_lock); -                { -                        local->read_child_index = (++priv->read_child_rr) -                                % (priv->child_count); -                } -                UNLOCK (&priv->read_child_lock); -        } - -        if (loc->parent) -                local->cont.lookup.parent_ino = loc->parent->ino; - -	local->child_up = memdup (priv->child_up, priv->child_count); - -        local->cont.lookup.xattrs = GF_CALLOC (priv->child_count, -                                    sizeof (*local->cont.lookup.xattr), -                                    gf_afr_mt_dict_t); - -	local->call_count = afr_up_children_count (priv->child_count, -                                                   local->child_up); -        call_count = local->call_count; - -        if (local->call_count == 0) { -                ret      = -1; -                op_errno = ENOTCONN; -                goto out; -        } - -	/* By default assume ENOTCONN. On success it will be set to 0. */ -	local->op_errno = ENOTCONN; -	 -	if (xattr_req == NULL) -		local->xattr_req = dict_new (); -	else -		local->xattr_req = dict_ref (xattr_req); - -        for (i = 0; i < priv->child_count; i++) { -		ret = dict_set_uint64 (local->xattr_req, priv->pending_key[i], -				       3 * sizeof(int32_t)); -                 -                /* 3 = data+metadata+entry */ -        } - -	ret = dict_set_uint64 (local->xattr_req, GLUSTERFS_OPEN_FD_COUNT, 0); -        ret = dict_set_uint64 (local->xattr_req, GLUSTERFS_INODELK_COUNT, 0); -        ret = dict_set_uint64 (local->xattr_req, GLUSTERFS_ENTRYLK_COUNT, 0); - -	for (i = 0; i < priv->child_count; i++) { -                if (local->child_up[i]) { -                        STACK_WIND_COOKIE (frame, callback, (void *) (long) i, -                                           priv->children[i], -                                           priv->children[i]->fops->lookup, -                                           loc, local->xattr_req); -                        if (!--call_count) -                                break; -                } -	} - -	ret = 0; -out: -	if (ret == -1) -		AFR_STACK_UNWIND (lookup, frame, -1, op_errno, -                                  NULL, NULL, NULL, NULL); - -	return 0; -} - - -/* {{{ open */ - -int -afr_fd_ctx_set (xlator_t *this, fd_t *fd) -{ -        afr_private_t * priv = NULL; - -        int op_ret = 0; -        int ret    = 0; - -        uint64_t       ctx; -        afr_fd_ctx_t * fd_ctx = NULL; - -        VALIDATE_OR_GOTO (this->private, out); -        VALIDATE_OR_GOTO (fd, out); - -        priv = this->private; - -        LOCK (&fd->lock); -        { -                ret = __fd_ctx_get (fd, this, &ctx); -                 -                if (ret == 0) -                        goto unlock; - -                fd_ctx = GF_CALLOC (1, sizeof (afr_fd_ctx_t), -                                    gf_afr_mt_afr_fd_ctx_t); -                if (!fd_ctx) { -                        gf_log (this->name, GF_LOG_ERROR, -                                "Out of memory"); -                         -                        op_ret = -ENOMEM; -                        goto unlock; -                } - -                fd_ctx->pre_op_done = GF_CALLOC (sizeof (*fd_ctx->pre_op_done), -                                                 priv->child_count, -                                                 gf_afr_mt_char); -                if (!fd_ctx->pre_op_done) { -                        gf_log (this->name, GF_LOG_ERROR, -                                "Out of memory"); -                        op_ret = -ENOMEM; -                        goto unlock; -                } - -                fd_ctx->opened_on = GF_CALLOC (sizeof (*fd_ctx->opened_on), -                                               priv->child_count, -                                               gf_afr_mt_char); -                if (!fd_ctx->opened_on) { -                        gf_log (this->name, GF_LOG_ERROR, -                                "Out of memory"); -                        op_ret = -ENOMEM; -                        goto unlock; -                } - -                fd_ctx->child_failed = GF_CALLOC ( -                                         sizeof (*fd_ctx->child_failed), -                                         priv->child_count, -                                         gf_afr_mt_char); -                 -                if (!fd_ctx->child_failed) { -                        gf_log (this->name, GF_LOG_ERROR, -                                "Out of memory"); - -                        op_ret = -ENOMEM; -                        goto unlock; -                } - -                fd_ctx->up_count   = priv->up_count; -                fd_ctx->down_count = priv->down_count; - -                ret = __fd_ctx_set (fd, this, (uint64_t)(long) fd_ctx); -                if (ret < 0) { -                        op_ret = ret; -                } - -                INIT_LIST_HEAD (&fd_ctx->entries); -        } -unlock: -        UNLOCK (&fd->lock); -out: -        return ret; -} - -/* {{{ flush */ - -int -afr_flush_unwind (call_frame_t *frame, xlator_t *this) -{ -	afr_local_t *   local = NULL; -	afr_private_t * priv  = NULL; -	call_frame_t   *main_frame = NULL; - -	local = frame->local; -	priv  = this->private; - -	LOCK (&frame->lock); -	{ -		if (local->transaction.main_frame) -			main_frame = local->transaction.main_frame; -		local->transaction.main_frame = NULL; -	} -	UNLOCK (&frame->lock); - -	if (main_frame) { -		AFR_STACK_UNWIND (flush, main_frame, -                                  local->op_ret, local->op_errno); -	} -         -	return 0; -} - - -int -afr_flush_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,  -		      int32_t op_ret, int32_t op_errno) -{ -        afr_local_t *   local = NULL; -	afr_private_t * priv  = NULL; - -	int call_count  = -1; -	int child_index = (long) cookie; -	int need_unwind = 0; - -	local = frame->local; -	priv  = this->private; - -	LOCK (&frame->lock); -	{ -		if (afr_fop_failed (op_ret, op_errno)) -			afr_transaction_fop_failed (frame, this, child_index); - -		if (op_ret != -1) { -			if (local->success_count == 0) { -				local->op_ret = op_ret; -			} -			local->success_count++; - -			if (local->success_count == priv->wait_count) { -				need_unwind = 1; -			} -		} - -		local->op_errno = op_errno; -	} -	UNLOCK (&frame->lock); - -	if (need_unwind) -		afr_flush_unwind (frame, this); - -	call_count = afr_frame_return (frame); - -	if (call_count == 0) { -		local->transaction.resume (frame, this); -	} -	 -	return 0; -} - - -int -afr_flush_wind (call_frame_t *frame, xlator_t *this) -{ -	afr_local_t *local = NULL; -	afr_private_t *priv = NULL; -	 -	int i = 0; -	int call_count = -1; - -	local = frame->local; -	priv = this->private; - -	call_count = afr_up_children_count (priv->child_count, local->child_up); - -	if (call_count == 0) { -		local->transaction.resume (frame, this); -		return 0; -	} - -	local->call_count = call_count; - -	for (i = 0; i < priv->child_count; i++) {				 -		if (local->child_up[i]) { -			STACK_WIND_COOKIE (frame, afr_flush_wind_cbk,  -					   (void *) (long) i,	 -					   priv->children[i],  -					   priv->children[i]->fops->flush, -					   local->fd); -		 -			if (!--call_count) -				break; -		} -	} -	 -	return 0; -} - - -int -afr_flush_done (call_frame_t *frame, xlator_t *this) -{ -	afr_local_t *local = NULL; - -	local = frame->local; - -	local->transaction.unwind (frame, this); - -        AFR_STACK_DESTROY (frame); - -	return 0; -} - - -int -afr_plain_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this, -                     int32_t op_ret, int32_t op_errno) - -{ -	afr_local_t *local = NULL; - -	int call_count = -1; - -	local = frame->local; - -	LOCK (&frame->lock); -	{ -		if (op_ret == 0) -			local->op_ret = 0; - -		local->op_errno = op_errno; -	} -	UNLOCK (&frame->lock); - -	call_count = afr_frame_return (frame); - -	if (call_count == 0) -		AFR_STACK_UNWIND (flush, frame, local->op_ret, local->op_errno); - -	return 0; -} - - -static int -__no_pre_op_done (xlator_t *this, fd_t *fd) -{ -        int i      = 0; -        int op_ret = 1; - -        int _ret = 0; -        uint64_t       ctx; -        afr_fd_ctx_t * fd_ctx = NULL; - -        afr_private_t *priv = NULL; - -        priv = this->private; - -        LOCK (&fd->lock); -        { -                _ret = __fd_ctx_get (fd, this, &ctx); - -                if (_ret < 0) { -                        goto out; -                } - -                fd_ctx = (afr_fd_ctx_t *)(long) ctx; - -                for (i = 0; i < priv->child_count; i++) { -                        if (fd_ctx->pre_op_done[i]) { -                                op_ret = 0; -                                break; -                        } -                } -        } -out: -        UNLOCK (&fd->lock); - -        return op_ret; -} - - -int -afr_flush (call_frame_t *frame, xlator_t *this, fd_t *fd) -{ -	afr_private_t * priv  = NULL; -	afr_local_t   * local = NULL; - -        call_frame_t  * transaction_frame = NULL; - -	int ret        = -1; - -	int op_ret   = -1; -	int op_errno = 0; - -        int i          = 0; -        int call_count = 0; - -	VALIDATE_OR_GOTO (frame, out); -	VALIDATE_OR_GOTO (this, out); -	VALIDATE_OR_GOTO (this->private, out); - -	priv = this->private; - -	ALLOC_OR_GOTO (local, afr_local_t, out); - -	ret = AFR_LOCAL_INIT (local, priv); -	if (ret < 0) { -		op_errno = -ret; -		goto out; -	} - -        call_count = afr_up_children_count (priv->child_count, local->child_up); - -        if (__no_pre_op_done (this, fd)) { -                frame->local = local; - -                for (i = 0; i < priv->child_count; i++) { -                        if (local->child_up[i]) { -                                STACK_WIND_COOKIE (frame, afr_plain_flush_cbk, -                                                   (void *) (long) i, -                                                   priv->children[i], -                                                   priv->children[i]->fops->flush, -                                                   fd); -                                if (!--call_count) -                                        break; -                        } -                } -        } else { -                transaction_frame = copy_frame (frame); -                if (!transaction_frame) { -                        op_errno = ENOMEM; -                        gf_log (this->name, GF_LOG_ERROR, -                                "Out of memory."); -                        goto out; -                } - -                transaction_frame->local = local; - -                local->op = GF_FOP_FLUSH; -         -                local->transaction.fop    = afr_flush_wind; -                local->transaction.done   = afr_flush_done; -                local->transaction.unwind = afr_flush_unwind; - -                local->fd                 = fd_ref (fd); - -                local->transaction.main_frame = frame; -                local->transaction.start  = 0; -                local->transaction.len    = 0; - -                afr_transaction (transaction_frame, this, AFR_FLUSH_TRANSACTION); -        } - -	op_ret = 0; -out: -	if (op_ret == -1) { -                if (transaction_frame) -                        AFR_STACK_DESTROY (transaction_frame); - -		AFR_STACK_UNWIND (flush, frame, op_ret, op_errno); -	} - -	return 0; -} - -/* }}} */ - - -int -afr_cleanup_fd_ctx (xlator_t *this, fd_t *fd) -{ -        uint64_t        ctx = 0; -        afr_fd_ctx_t    *fd_ctx = NULL; -        int             ret = 0; - -        ret = fd_ctx_get (fd, this, &ctx); - -        if (ret < 0) -                goto out; - -        fd_ctx = (afr_fd_ctx_t *)(long) ctx; - -        if (fd_ctx) { -                if (fd_ctx->child_failed) -                        GF_FREE (fd_ctx->child_failed); - -                if (fd_ctx->pre_op_done) -                        GF_FREE (fd_ctx->pre_op_done); - -                if (fd_ctx->opened_on) -                        GF_FREE (fd_ctx->opened_on); - -                GF_FREE (fd_ctx); -        } -         -out: -        return 0; -} - - -int -afr_release (xlator_t *this, fd_t *fd) -{ -        afr_cleanup_fd_ctx (this, fd); - -        return 0; -} - - -/* {{{ fsync */ - -int -afr_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, -               int32_t op_ret, int32_t op_errno, struct iatt *prebuf, -               struct iatt *postbuf) -{ -	afr_local_t *local = NULL; -	 -	int call_count = -1; - -        int child_index = (long) cookie; -        int read_child  = 0; - -	local = frame->local; - -        read_child = afr_read_child (this, local->fd->inode); - -	LOCK (&frame->lock); -	{ -                if (child_index == read_child) { -                        local->read_child_returned = _gf_true; -                } - -		if (op_ret == 0) { -			local->op_ret = 0; - -			if (local->success_count == 0) { -				local->cont.fsync.prebuf  = *prebuf; -				local->cont.fsync.postbuf = *postbuf; -			} - -                        if (child_index == read_child) { -                                local->cont.fsync.prebuf  = *prebuf; -                                local->cont.fsync.postbuf = *postbuf; -                        } - -			local->success_count++; -                } - -		local->op_errno = op_errno; -	} -	UNLOCK (&frame->lock); - -	call_count = afr_frame_return (frame); - -	if (call_count == 0) { -                local->cont.fsync.prebuf.ia_ino  = local->cont.fsync.ino; -                local->cont.fsync.postbuf.ia_ino = local->cont.fsync.ino; - -		AFR_STACK_UNWIND (fsync, frame, local->op_ret, local->op_errno, -                                  &local->cont.fsync.prebuf, -                                  &local->cont.fsync.postbuf); -        } - -	return 0; -} - - -int -afr_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, -	   int32_t datasync) -{ -	afr_private_t *priv = NULL; -	afr_local_t *local = NULL; - -	int ret = -1; - -	int i = 0; -	int32_t call_count = 0; -	int32_t op_ret   = -1; -	int32_t op_errno = 0; - -	VALIDATE_OR_GOTO (frame, out); -	VALIDATE_OR_GOTO (this, out); -	VALIDATE_OR_GOTO (this->private, out); - -	priv = this->private; - -	ALLOC_OR_GOTO (local, afr_local_t, out); - -	ret = AFR_LOCAL_INIT (local, priv); -	if (ret < 0) { -		op_errno = -ret; -		goto out; -	} - -	call_count = local->call_count; -	frame->local = local; - -        local->fd             = fd_ref (fd); -        local->cont.fsync.ino = fd->inode->ino; - -	for (i = 0; i < priv->child_count; i++) { -		if (local->child_up[i]) { -			STACK_WIND_COOKIE (frame, afr_fsync_cbk, -                                           (void *) (long) i, -                                           priv->children[i], -                                           priv->children[i]->fops->fsync, -                                           fd, datasync); -			if (!--call_count) -				break; -		} -	} - -	op_ret = 0; -out: -	if (op_ret == -1) { -		AFR_STACK_UNWIND (fsync, frame, op_ret, op_errno, NULL, NULL); -	} -	return 0; -} - -/* }}} */ - -/* {{{ fsync */ - -int32_t -afr_fsyncdir_cbk (call_frame_t *frame, void *cookie, -		  xlator_t *this, int32_t op_ret, int32_t op_errno) -{ -	afr_local_t *local = NULL; -	 -	int call_count = -1; - -	local = frame->local; - -	LOCK (&frame->lock); -	{ -		if (op_ret == 0) -			local->op_ret = 0; - -		local->op_errno = op_errno; -	} -	UNLOCK (&frame->lock); - -	call_count = afr_frame_return (frame); - -	if (call_count == 0) -		AFR_STACK_UNWIND (fsyncdir, frame, local->op_ret, -                                  local->op_errno); - -	return 0; -} - - -int32_t -afr_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd, -	      int32_t datasync) -{ -	afr_private_t *priv = NULL; -	afr_local_t *local = NULL; - -	int ret = -1; - -	int i = 0; -	int32_t call_count = 0; -	int32_t op_ret   = -1; -	int32_t op_errno = 0; - -	VALIDATE_OR_GOTO (frame, out); -	VALIDATE_OR_GOTO (this, out); -	VALIDATE_OR_GOTO (this->private, out); - -	priv = this->private; - -	ALLOC_OR_GOTO (local, afr_local_t, out); - -	ret = AFR_LOCAL_INIT (local, priv); -	if (ret < 0) { -		op_errno = -ret; -		goto out; -	} - -	call_count = local->call_count; -	frame->local = local; - -	for (i = 0; i < priv->child_count; i++) { -		if (local->child_up[i]) { -			STACK_WIND (frame, afr_fsyncdir_cbk, -				    priv->children[i], -				    priv->children[i]->fops->fsyncdir, -				    fd, datasync); -			if (!--call_count) -				break; -		} -	} - -	op_ret = 0; -out: -	if (op_ret == -1) { -		AFR_STACK_UNWIND (fsyncdir, frame, op_ret, op_errno); -	} -	return 0; -} - -/* }}} */ - -/* {{{ xattrop */ - -int32_t -afr_xattrop_cbk (call_frame_t *frame, void *cookie, -		 xlator_t *this, int32_t op_ret, int32_t op_errno, -		 dict_t *xattr) -{ -	afr_local_t *local = NULL; -	 -	int call_count = -1; - -	local = frame->local; - -	LOCK (&frame->lock); -	{ -		if (op_ret == 0) -			local->op_ret = 0; - -		local->op_errno = op_errno; -	} -	UNLOCK (&frame->lock); - -	call_count = afr_frame_return (frame); - -	if (call_count == 0) -		AFR_STACK_UNWIND (xattrop, frame, local->op_ret, local->op_errno, -                                  xattr); - -	return 0; -} - - -int32_t -afr_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc, -	     gf_xattrop_flags_t optype, dict_t *xattr) -{ -	afr_private_t *priv = NULL; -	afr_local_t *local  = NULL; - -	int ret = -1; - -	int i = 0; -	int32_t call_count = 0; -	int32_t op_ret   = -1; -	int32_t op_errno = 0; - -	VALIDATE_OR_GOTO (frame, out); -	VALIDATE_OR_GOTO (this, out); -	VALIDATE_OR_GOTO (this->private, out); - -	priv = this->private; - -	ALLOC_OR_GOTO (local, afr_local_t, out); - -	ret = AFR_LOCAL_INIT (local, priv); -	if (ret < 0) { -		op_errno = -ret; -		goto out; -	} - -	call_count = local->call_count; -	frame->local = local; - -	for (i = 0; i < priv->child_count; i++) { -		if (local->child_up[i]) { -			STACK_WIND (frame, afr_xattrop_cbk, -				    priv->children[i], -				    priv->children[i]->fops->xattrop, -				    loc, optype, xattr); -			if (!--call_count) -				break; -		} -	} - -	op_ret = 0; -out: -	if (op_ret == -1) { -		AFR_STACK_UNWIND (xattrop, frame, op_ret, op_errno, NULL); -	} -	return 0; -} - -/* }}} */ - -/* {{{ fxattrop */ - -int32_t -afr_fxattrop_cbk (call_frame_t *frame, void *cookie, -		  xlator_t *this, int32_t op_ret, int32_t op_errno, -		  dict_t *xattr) -{ -	afr_local_t *local = NULL; -	 -	int call_count = -1; - -	local = frame->local; - -	LOCK (&frame->lock); -	{ -		if (op_ret == 0) -			local->op_ret = 0; - -		local->op_errno = op_errno; -	} -	UNLOCK (&frame->lock); - -	call_count = afr_frame_return (frame); - -	if (call_count == 0) -		AFR_STACK_UNWIND (fxattrop, frame, local->op_ret, local->op_errno, -                                  xattr); - -	return 0; -} - - -int32_t -afr_fxattrop (call_frame_t *frame, xlator_t *this, fd_t *fd, -	      gf_xattrop_flags_t optype, dict_t *xattr) -{ -	afr_private_t *priv = NULL; -	afr_local_t *local  = NULL; - -	int ret = -1; - -	int i = 0; -	int32_t call_count = 0; -	int32_t op_ret   = -1; -	int32_t op_errno = 0; - -	VALIDATE_OR_GOTO (frame, out); -	VALIDATE_OR_GOTO (this, out); -	VALIDATE_OR_GOTO (this->private, out); - -	priv = this->private; - -	ALLOC_OR_GOTO (local, afr_local_t, out); - -	ret = AFR_LOCAL_INIT (local, priv); -	if (ret < 0) { -		op_errno = -ret; -		goto out; -	} - -	call_count = local->call_count; -	frame->local = local; - -	for (i = 0; i < priv->child_count; i++) { -		if (local->child_up[i]) { -			STACK_WIND (frame, afr_fxattrop_cbk, -				    priv->children[i], -				    priv->children[i]->fops->fxattrop, -				    fd, optype, xattr); -			if (!--call_count) -				break; -		} -	} - -	op_ret = 0; -out: -	if (op_ret == -1) { -		AFR_STACK_UNWIND (fxattrop, frame, op_ret, op_errno, NULL); -	} -	return 0; -} - -/* }}} */ - - -int32_t -afr_inodelk_cbk (call_frame_t *frame, void *cookie, -		 xlator_t *this, int32_t op_ret, int32_t op_errno) -		 -{ -	afr_local_t *local = NULL; -	 -	int call_count = -1; - -	local = frame->local; - -	LOCK (&frame->lock); -	{ -		if (op_ret == 0) -			local->op_ret = 0; - -		local->op_errno = op_errno; -	} -	UNLOCK (&frame->lock); - -	call_count = afr_frame_return (frame); - -	if (call_count == 0) -		AFR_STACK_UNWIND (inodelk, frame, local->op_ret, -                                  local->op_errno); - -	return 0; -} - - -int32_t -afr_inodelk (call_frame_t *frame, xlator_t *this,  -             const char *volume, loc_t *loc, int32_t cmd, struct flock *flock) -{ -	afr_private_t *priv = NULL; -	afr_local_t *local  = NULL; - -	int ret = -1; - -	int i = 0; -	int32_t call_count = 0; -	int32_t op_ret   = -1; -	int32_t op_errno = 0; - -	VALIDATE_OR_GOTO (frame, out); -	VALIDATE_OR_GOTO (this, out); -	VALIDATE_OR_GOTO (this->private, out); - -	priv = this->private; - -	ALLOC_OR_GOTO (local, afr_local_t, out); - -	ret = AFR_LOCAL_INIT (local, priv); -	if (ret < 0) { -		op_errno = -ret; -		goto out; -	} - -	call_count = local->call_count; -	frame->local = local; - -	for (i = 0; i < priv->child_count; i++) { -		if (local->child_up[i]) { -			STACK_WIND (frame, afr_inodelk_cbk, -				    priv->children[i], -				    priv->children[i]->fops->inodelk, -				    volume, loc, cmd, flock); - -			if (!--call_count) -				break; -		} -	} - -	op_ret = 0; -out: -	if (op_ret == -1) { -		AFR_STACK_UNWIND (inodelk, frame, op_ret, op_errno); -	} -	return 0; -} - - -int32_t -afr_finodelk_cbk (call_frame_t *frame, void *cookie, -		  xlator_t *this, int32_t op_ret, int32_t op_errno) -		 -{ -	afr_local_t *local = NULL; -	 -	int call_count = -1; - -	local = frame->local; - -	LOCK (&frame->lock); -	{ -		if (op_ret == 0) -			local->op_ret = 0; - -		local->op_errno = op_errno; -	} -	UNLOCK (&frame->lock); - -	call_count = afr_frame_return (frame); - -	if (call_count == 0) -		AFR_STACK_UNWIND (finodelk, frame, local->op_ret, -                                  local->op_errno); - -	return 0; -} - - -int32_t -afr_finodelk (call_frame_t *frame, xlator_t *this,  -              const char *volume, fd_t *fd, int32_t cmd, struct flock *flock) -{ -	afr_private_t *priv = NULL; -	afr_local_t *local  = NULL; - -	int ret = -1; - -	int i = 0; -	int32_t call_count = 0; -	int32_t op_ret   = -1; -	int32_t op_errno = 0; - -	VALIDATE_OR_GOTO (frame, out); -	VALIDATE_OR_GOTO (this, out); -	VALIDATE_OR_GOTO (this->private, out); - -	priv = this->private; - -	ALLOC_OR_GOTO (local, afr_local_t, out); - -	ret = AFR_LOCAL_INIT (local, priv); -	if (ret < 0) { -		op_errno = -ret; -		goto out; -	} - -	call_count = local->call_count; -	frame->local = local; - -	for (i = 0; i < priv->child_count; i++) { -		if (local->child_up[i]) { -			STACK_WIND (frame, afr_finodelk_cbk, -				    priv->children[i], -				    priv->children[i]->fops->finodelk, -				    volume, fd, cmd, flock); - -			if (!--call_count) -				break; -		} -	} - -	op_ret = 0; -out: -	if (op_ret == -1) { -		AFR_STACK_UNWIND (finodelk, frame, op_ret, op_errno); -	} -	return 0; -} - - -int32_t -afr_entrylk_cbk (call_frame_t *frame, void *cookie, -		 xlator_t *this, int32_t op_ret, int32_t op_errno) -		 -{ -	afr_local_t *local = NULL; -	 -	int call_count = -1; - -	local = frame->local; - -	LOCK (&frame->lock); -	{ -		if (op_ret == 0) -			local->op_ret = 0; - -		local->op_errno = op_errno; -	} -	UNLOCK (&frame->lock); - -	call_count = afr_frame_return (frame); - -	if (call_count == 0) -		AFR_STACK_UNWIND (entrylk, frame, local->op_ret, -                                  local->op_errno); - -	return 0; -} - - -int32_t -afr_entrylk (call_frame_t *frame, xlator_t *this,  -             const char *volume, loc_t *loc, -	     const char *basename, entrylk_cmd cmd, entrylk_type type) -{ -	afr_private_t *priv = NULL; -	afr_local_t *local  = NULL; - -	int ret = -1; - -	int i = 0; -	int32_t call_count = 0; -	int32_t op_ret   = -1; -	int32_t op_errno = 0; - -	VALIDATE_OR_GOTO (frame, out); -	VALIDATE_OR_GOTO (this, out); -	VALIDATE_OR_GOTO (this->private, out); - -	priv = this->private; - -	ALLOC_OR_GOTO (local, afr_local_t, out); - -	ret = AFR_LOCAL_INIT (local, priv); -	if (ret < 0) { -		op_errno = -ret; -		goto out; -	} - -	call_count = local->call_count; -	frame->local = local; - -	for (i = 0; i < priv->child_count; i++) { -		if (local->child_up[i]) { -			STACK_WIND (frame, afr_entrylk_cbk, -				    priv->children[i], -				    priv->children[i]->fops->entrylk, -				    volume, loc, basename, cmd, type); - -			if (!--call_count) -				break; -		} -	} - -	op_ret = 0; -out: -	if (op_ret == -1) { -		AFR_STACK_UNWIND (entrylk, frame, op_ret, op_errno); -	} -	return 0; -} - - - -int32_t -afr_fentrylk_cbk (call_frame_t *frame, void *cookie, -		 xlator_t *this, int32_t op_ret, int32_t op_errno) -		 -{ -	afr_local_t *local = NULL; -	 -	int call_count = -1; - -	local = frame->local; - -	LOCK (&frame->lock); -	{ -		if (op_ret == 0) -			local->op_ret = 0; - -		local->op_errno = op_errno; -	} -	UNLOCK (&frame->lock); - -	call_count = afr_frame_return (frame); - -	if (call_count == 0) -		AFR_STACK_UNWIND (fentrylk, frame, local->op_ret, -                                  local->op_errno); - -	return 0; -} - - -int32_t -afr_fentrylk (call_frame_t *frame, xlator_t *this,  -              const char *volume, fd_t *fd, -	      const char *basename, entrylk_cmd cmd, entrylk_type type) -{ -	afr_private_t *priv = NULL; -	afr_local_t *local  = NULL; - -	int ret = -1; - -	int i = 0; -	int32_t call_count = 0; -	int32_t op_ret   = -1; -	int32_t op_errno = 0; - -	VALIDATE_OR_GOTO (frame, out); -	VALIDATE_OR_GOTO (this, out); -	VALIDATE_OR_GOTO (this->private, out); - -	priv = this->private; - -	ALLOC_OR_GOTO (local, afr_local_t, out); - -	ret = AFR_LOCAL_INIT (local, priv); -	if (ret < 0) { -		op_errno = -ret; -		goto out; -	} - -	call_count = local->call_count; -	frame->local = local; - -	for (i = 0; i < priv->child_count; i++) { -		if (local->child_up[i]) { -			STACK_WIND (frame, afr_fentrylk_cbk, -				    priv->children[i], -				    priv->children[i]->fops->fentrylk, -				    volume, fd, basename, cmd, type); - -			if (!--call_count) -				break; -		} -	} - -	op_ret = 0; -out: -	if (op_ret == -1) { -		AFR_STACK_UNWIND (fentrylk, frame, op_ret, op_errno); -	} -	return 0; -} - -int32_t -afr_statfs_cbk (call_frame_t *frame, void *cookie, -		xlator_t *this, int32_t op_ret, int32_t op_errno, -		struct statvfs *statvfs) -{ -	afr_local_t *local = NULL; - -	int call_count = 0; - -	LOCK (&frame->lock); -	{ -		local = frame->local; - -		if (op_ret == 0) { -			local->op_ret   = op_ret; -			 -			if (local->cont.statfs.buf_set) { -				if (statvfs->f_bavail < local->cont.statfs.buf.f_bavail) -					local->cont.statfs.buf = *statvfs; -			} else { -				local->cont.statfs.buf = *statvfs; -				local->cont.statfs.buf_set = 1; -			} -		} - -		if (op_ret == -1) -			local->op_errno = op_errno; - -	} -	UNLOCK (&frame->lock); - -	call_count = afr_frame_return (frame); - -	if (call_count == 0) -		AFR_STACK_UNWIND (statfs, frame, local->op_ret, local->op_errno,  -				  &local->cont.statfs.buf); - -	return 0; -} - - -int32_t -afr_statfs (call_frame_t *frame, xlator_t *this, -	    loc_t *loc) -{ -	afr_private_t *  priv        = NULL; -	int              child_count = 0; -	afr_local_t   *  local       = NULL; -	int              i           = 0; - -	int ret = -1; -	int              call_count = 0; -	int32_t          op_ret      = -1; -	int32_t          op_errno    = 0; - -	VALIDATE_OR_GOTO (this, out); -	VALIDATE_OR_GOTO (this->private, out); -	VALIDATE_OR_GOTO (loc, out); - -	priv = this->private; -	child_count = priv->child_count; - -	ALLOC_OR_GOTO (local, afr_local_t, out); -	 -	ret = AFR_LOCAL_INIT (local, priv); -	if (ret < 0) { -		op_errno = -ret; -		goto out; -	} - -	frame->local = local; -	call_count = local->call_count; - -	for (i = 0; i < child_count; i++) { -		if (local->child_up[i]) { -			STACK_WIND (frame, afr_statfs_cbk, -				    priv->children[i], -				    priv->children[i]->fops->statfs,  -				    loc); -			if (!--call_count) -				break; -		} -	} -	 -	op_ret = 0; -out: -	if (op_ret == -1) { -		AFR_STACK_UNWIND (statfs, frame, op_ret, op_errno, NULL); -	} -	return 0; -} - - -int32_t -afr_lk_unlock_cbk (call_frame_t *frame, void *cookie, xlator_t *this,  -		   int32_t op_ret, int32_t op_errno, struct flock *lock) -{ -	afr_local_t * local = NULL; - -	int call_count = -1; - -	local = frame->local; -	call_count = afr_frame_return (frame); - -	if (call_count == 0) -		AFR_STACK_UNWIND (lk, frame, local->op_ret, local->op_errno, -				  lock); - -	return 0; -} - - -int32_t  -afr_lk_unlock (call_frame_t *frame, xlator_t *this) -{ -	afr_local_t   * local = NULL; -	afr_private_t * priv  = NULL; - -	int i; -	int call_count = 0; - -	local = frame->local; -	priv  = this->private; - -	call_count = afr_locked_nodes_count (local->cont.lk.locked_nodes,  -					     priv->child_count); - -	if (call_count == 0) { -		AFR_STACK_UNWIND (lk, frame, local->op_ret, local->op_errno, -				  &local->cont.lk.flock); -		return 0; -	} - -	local->call_count = call_count; - -	local->cont.lk.flock.l_type = F_UNLCK; - -	for (i = 0; i < priv->child_count; i++) { -		if (local->cont.lk.locked_nodes[i]) { -			STACK_WIND (frame, afr_lk_unlock_cbk, -				    priv->children[i], -				    priv->children[i]->fops->lk, -				    local->fd, F_SETLK,  -				    &local->cont.lk.flock); - -			if (!--call_count) -				break; -		} -	} - -	return 0; -} - - -int32_t -afr_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,  -	    int32_t op_ret, int32_t op_errno, struct flock *lock) -{ -	afr_local_t *local = NULL; -	afr_private_t *priv = NULL; - -	int call_count  = -1; -	int child_index = -1; - -	local = frame->local; -	priv  = this->private; - -	child_index = (long) cookie; - -	call_count = --local->call_count; - -	if (!child_went_down (op_ret, op_errno) && (op_ret == -1)) { -		local->op_ret   = -1; -		local->op_errno = op_errno; - -		afr_lk_unlock (frame, this); -		return 0; -	} - -	if (op_ret == 0) { -		local->op_ret        = 0; -		local->op_errno      = 0; -		local->cont.lk.locked_nodes[child_index] = 1; -		local->cont.lk.flock = *lock; -	} - -	child_index++; - -	if (child_index < priv->child_count) { -		STACK_WIND_COOKIE (frame, afr_lk_cbk, (void *) (long) child_index, -				   priv->children[child_index], -				   priv->children[child_index]->fops->lk, -				   local->fd, local->cont.lk.cmd,  -				   &local->cont.lk.flock); -	} else if (local->op_ret == -1) { -		/* all nodes have gone down */ -		 -		AFR_STACK_UNWIND (lk, frame, -1, ENOTCONN, &local->cont.lk.flock); -	} else { -		/* locking has succeeded on all nodes that are up */ -		 -		AFR_STACK_UNWIND (lk, frame, local->op_ret, local->op_errno, -                                  &local->cont.lk.flock); -	} - -	return 0; -} - - -int -afr_lk (call_frame_t *frame, xlator_t *this, -	fd_t *fd, int32_t cmd, -	struct flock *flock) -{ -	afr_private_t *priv = NULL; -	afr_local_t *local = NULL; - -	int i = 0; - -	int32_t op_ret   = -1; -	int32_t op_errno = 0; - -	VALIDATE_OR_GOTO (frame, out); -	VALIDATE_OR_GOTO (this, out); -	VALIDATE_OR_GOTO (this->private, out); - -	priv = this->private; - -	ALLOC_OR_GOTO (local, afr_local_t, out); -	AFR_LOCAL_INIT (local, priv); - -	frame->local  = local; - -	local->cont.lk.locked_nodes = GF_CALLOC (priv->child_count,  -					      sizeof (*local->cont.lk.locked_nodes), -                                              gf_afr_mt_char); -	 -	if (!local->cont.lk.locked_nodes) { -		gf_log (this->name, GF_LOG_ERROR, "Out of memory"); -		op_errno = ENOMEM; -		goto out; -	} - -	local->fd            = fd_ref (fd); -	local->cont.lk.cmd   = cmd; -	local->cont.lk.flock = *flock; - -	STACK_WIND_COOKIE (frame, afr_lk_cbk, (void *) (long) 0, -			   priv->children[i], -			   priv->children[i]->fops->lk, -			   fd, cmd, flock); - -	op_ret = 0; -out: -	if (op_ret == -1) { -		AFR_STACK_UNWIND (lk, frame, op_ret, op_errno, NULL); -	} -	return 0; -} - -int -afr_priv_dump (xlator_t *this) -{ -	afr_private_t *priv = NULL; -        char  key_prefix[GF_DUMP_MAX_BUF_LEN]; -        char  key[GF_DUMP_MAX_BUF_LEN]; -        int   i = 0; - - -        assert(this); -	priv = this->private; - -        assert(priv); -        snprintf(key_prefix, GF_DUMP_MAX_BUF_LEN, "%s.%s", this->type, this->name); -        gf_proc_dump_add_section(key_prefix); -        gf_proc_dump_build_key(key, key_prefix, "child_count"); -        gf_proc_dump_write(key, "%u", priv->child_count); -        gf_proc_dump_build_key(key, key_prefix, "read_child_rr"); -        gf_proc_dump_write(key, "%u", priv->read_child_rr); -	for (i = 0; i < priv->child_count; i++) { -                gf_proc_dump_build_key(key, key_prefix, "child_up[%d]", i); -                gf_proc_dump_write(key, "%d", priv->child_up[i]); -                gf_proc_dump_build_key(key, key_prefix,  -                                        "pending_key[%d]", i); -                gf_proc_dump_write(key, "%s", priv->pending_key[i]); -        } -        gf_proc_dump_build_key(key, key_prefix, "data_self_heal"); -        gf_proc_dump_write(key, "%d", priv->data_self_heal); -        gf_proc_dump_build_key(key, key_prefix, "metadata_self_heal"); -        gf_proc_dump_write(key, "%d", priv->metadata_self_heal); -        gf_proc_dump_build_key(key, key_prefix, "entry_self_heal"); -        gf_proc_dump_write(key, "%d", priv->entry_self_heal); -        gf_proc_dump_build_key(key, key_prefix, "data_change_log"); -        gf_proc_dump_write(key, "%d", priv->data_change_log); -        gf_proc_dump_build_key(key, key_prefix, "metadata_change_log"); -        gf_proc_dump_write(key, "%d", priv->metadata_change_log); -        gf_proc_dump_build_key(key, key_prefix, "entry_change_log"); -        gf_proc_dump_write(key, "%d", priv->entry_change_log); -        gf_proc_dump_build_key(key, key_prefix, "read_child"); -        gf_proc_dump_write(key, "%d", priv->read_child); -        gf_proc_dump_build_key(key, key_prefix, "favorite_child"); -        gf_proc_dump_write(key, "%u", priv->favorite_child); -        gf_proc_dump_build_key(key, key_prefix, "data_lock_server_count"); -        gf_proc_dump_write(key, "%u", priv->data_lock_server_count); -        gf_proc_dump_build_key(key, key_prefix, "metadata_lock_server_count"); -        gf_proc_dump_write(key, "%u", priv->metadata_lock_server_count); -        gf_proc_dump_build_key(key, key_prefix, "entry_lock_server_count"); -        gf_proc_dump_write(key, "%u", priv->entry_lock_server_count); -        gf_proc_dump_build_key(key, key_prefix, "wait_count"); -        gf_proc_dump_write(key, "%u", priv->wait_count); - -        return 0; -} - - -/** - * find_child_index - find the child's index in the array of subvolumes - * @this: AFR - * @child: child - */ - -static int -find_child_index (xlator_t *this, xlator_t *child) -{ -	afr_private_t *priv = NULL; - -	int i = -1; - -	priv = this->private; - -	for (i = 0; i < priv->child_count; i++) { -		if ((xlator_t *) child == priv->children[i]) -			break; -	} - -	return i; -} - +#include "afr-common.c"  int32_t  notify (xlator_t *this, int32_t event,  	void *data, ...)  { -	afr_private_t *     priv     = NULL; -	unsigned char *     child_up = NULL; - -	int i           = -1; -	int up_children = 0; - -	priv = this->private; - -	if (!priv) -		return 0; - -	child_up = priv->child_up; - -	switch (event) { -	case GF_EVENT_CHILD_UP: -		i = find_child_index (this, data); - -		child_up[i] = 1; - -                LOCK (&priv->lock); -                { -                        priv->up_count++; -                } -                UNLOCK (&priv->lock); - -		/*  -		   if all the children were down, and one child came up,  -		   send notify to parent -		*/ - -		for (i = 0; i < priv->child_count; i++) -			if (child_up[i]) -				up_children++; - -		if (up_children == 1) { -                        gf_log (this->name, GF_LOG_NORMAL, -                                "Subvolume '%s' came back up; " -                                "going online.", ((xlator_t *)data)->name); -                         -			default_notify (this, event, data); -                } +        int ret = -1; -		break; +        ret = afr_notify (this, event, data); -	case GF_EVENT_CHILD_DOWN: -		i = find_child_index (this, data); - -		child_up[i] = 0; - -                LOCK (&priv->lock); -                { -                        priv->down_count++; -                } -                UNLOCK (&priv->lock); -		 -		/*  -		   if all children are down, and this was the last to go down, -		   send notify to parent -		*/ - -		for (i = 0; i < priv->child_count; i++) -			if (child_up[i]) -				up_children++; - -		if (up_children == 0) { -                        gf_log (this->name, GF_LOG_ERROR, -                                "All subvolumes are down. Going offline " -                                "until atleast one of them comes back up."); - -			default_notify (this, event, data); -                } - -		break; - -	default: -		default_notify (this, event, data); -	} - -	return 0; +        return ret;  }  int32_t @@ -2624,7 +50,7 @@ mem_acct_init (xlator_t *this)                  return ret;          ret = xlator_mem_acct_init (this, gf_afr_mt_end + 1); -         +          if (ret != 0) {                  gf_log(this->name, GF_LOG_ERROR, "Memory accounting init"                                  "failed"); @@ -2727,7 +153,7 @@ init (xlator_t *this)  				"Defaulting to data-self-heal as 'on'",  				self_heal);  			priv->data_self_heal = 1; -		}  +		}  	}          priv->data_self_heal_algorithm = ""; @@ -2953,6 +379,10 @@ init (xlator_t *this)  		i++;  	} +        LOCK_INIT (&priv->root_inode_lk); +        priv->first_lookup = 1; +        priv->root_inode = NULL; +  	ret = 0;  out:  	return ret; diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h index 125f5c2a2f4..3fa987ee83d 100644 --- a/xlators/cluster/afr/src/afr.h +++ b/xlators/cluster/afr/src/afr.h @@ -32,6 +32,8 @@  #define AFR_XATTR_PREFIX "trusted.afr" +struct _pump_private; +  typedef struct _afr_private {  	gf_lock_t lock;               /* to guard access to child_count, etc */  	unsigned int child_count;     /* total number of children   */ @@ -41,6 +43,10 @@ typedef struct _afr_private {  	xlator_t **children; +        gf_lock_t root_inode_lk; +        int first_lookup; +        inode_t *root_inode; +  	unsigned char *child_up;          char **pending_key; @@ -73,6 +79,9 @@ typedef struct _afr_private {          uint64_t up_count;      /* number of CHILD_UPs we have seen */          uint64_t down_count;    /* number of CHILD_DOWNs we have seen */ + +	struct _pump_private *pump_private; /* Set if we are loaded as pump */ +        gf_boolean_t pump_loaded;  } afr_private_t;  typedef struct { @@ -205,6 +214,7 @@ typedef struct _afr_local {  	unsigned int success_count;  	unsigned int enoent_count; +  	unsigned int govinda_gOvinda;  	unsigned int read_child_index; @@ -576,6 +586,18 @@ typedef struct {  #define all_tried(i, count)  ((i) == (count) - 1)  int +pump_command_reply (call_frame_t *frame, xlator_t *this); + +int32_t +afr_notify (xlator_t *this, int32_t event, +            void *data, ...); + +void +afr_set_lk_owner (call_frame_t *frame, xlator_t *this); + +int pump_start (call_frame_t *frame, xlator_t *this); + +int  afr_fd_ctx_set (xlator_t *this, fd_t *fd);  uint64_t diff --git a/xlators/cluster/afr/src/pump.c b/xlators/cluster/afr/src/pump.c new file mode 100644 index 00000000000..e69c028450b --- /dev/null +++ b/xlators/cluster/afr/src/pump.c @@ -0,0 +1,1817 @@ +/* +   Copyright (c) 2007-2009 Gluster, Inc. <http://www.gluster.com> +   This file is part of GlusterFS. + +   GlusterFS is free software; you can redistribute it and/or modify +   it under the terms of the GNU General Public License as published +   by the Free Software Foundation; either version 3 of the License, +   or (at your option) any later version. + +   GlusterFS is distributed in the hope that it will be useful, but +   WITHOUT ANY WARRANTY; without even the implied warranty of +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU +   General Public License for more details. + +   You should have received a copy of the GNU General Public License +   along with this program.  If not, see +   <http://www.gnu.org/licenses/>. +*/ + +#include <unistd.h> +#include <sys/time.h> +#include <stdlib.h> + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "afr-common.c" + +pump_state_t +pump_get_state () +{ +        xlator_t *this = NULL; +        afr_private_t *priv = NULL; +        pump_private_t *pump_priv = NULL; + +        pump_state_t ret; + +        this = THIS; +        priv = this->private; +        pump_priv = priv->pump_private; + +        LOCK (&pump_priv->pump_state_lock); +        { +                ret = pump_priv->pump_state; +        } +        UNLOCK (&pump_priv->pump_state_lock); + +        return ret; +} + +int +pump_change_state (xlator_t *this, pump_state_t state) +{ +        afr_private_t *priv = NULL; +        pump_private_t *pump_priv = NULL; + +        pump_state_t state_old; +        pump_state_t state_new; + +	unsigned char *     child_up = NULL; +        int i = 0; + + +        priv = this->private; +        pump_priv = priv->pump_private; + +	child_up = priv->child_up; + +        assert (pump_priv); + +        LOCK (&pump_priv->pump_state_lock); +        { +                state_old = pump_priv->pump_state; +                state_new = state; + +                pump_priv->pump_state = state; + +                switch (pump_priv->pump_state) { +                case PUMP_STATE_RESUME: +                case PUMP_STATE_RUNNING: +                case PUMP_STATE_PAUSE: +                { +                        priv->pump_loaded = _gf_true; +                        i = 1; + +                        child_up[i] = 1; + +                        LOCK (&priv->lock); +                        { +                                priv->up_count++; +                        } +                        UNLOCK (&priv->lock); + +                        break; +                } +                case PUMP_STATE_ABORT: +                { +                        priv->pump_loaded = _gf_false; +                        i = 1; + +                        child_up[i] = 0; + +                        LOCK (&priv->lock); +                        { +                                priv->down_count++; +                        } +                        UNLOCK (&priv->lock); + +                        LOCK (&pump_priv->resume_path_lock); +                        { +                                pump_priv->number_files_pumped = 0; +                        } +                        UNLOCK (&pump_priv->resume_path_lock); + + +                        break; +                } + +                } +        } +        UNLOCK (&pump_priv->pump_state_lock); + +        gf_log (this->name, GF_LOG_DEBUG, +                "Pump changing state from %d to %d", +                state_old, +                state_new); + +        return  0; +} + +static int +pump_set_resume_path (xlator_t *this, const char *path) +{ +        int ret = 0; + +        afr_private_t *priv = NULL; +        pump_private_t *pump_priv = NULL; + +        priv = this->private; +        pump_priv = priv->pump_private; + +        assert (pump_priv); + +        LOCK (&pump_priv->resume_path_lock); +        { +                pump_priv->resume_path = strdup (path); +                if (!pump_priv->resume_path) +                        ret = -1; +        } +        UNLOCK (&pump_priv->resume_path_lock); + +        return ret; +} + +static void +build_child_loc (loc_t *parent, loc_t *child, char *path, char *name) +{ +        child->path = path; +        child->name = name; + +        child->parent = inode_ref (parent->inode); +        child->inode = inode_new (parent->inode->table); +} + +static char * +build_file_path (loc_t *loc, gf_dirent_t *entry) +{ +        xlator_t *this = NULL; +        char *file_path = NULL; +        int pathlen = 0; +        int total_size = 0; + +        this = THIS; + +        pathlen = STRLEN_0 (loc->path); + +        if (IS_ROOT_PATH (loc->path)) { +                total_size = pathlen + entry->d_len; +                file_path = GF_CALLOC (1, total_size, gf_afr_mt_char); +                if (!file_path) { +                        gf_log (this->name, GF_LOG_ERROR, +                                "Out of memory"); +                        return NULL; +                } + +                gf_log (this->name, GF_LOG_TRACE, +                        "constructing file path of size=%d" +                        "pathlen=%d, d_len=%d", +                        total_size, pathlen, +                        entry->d_len); + +                snprintf(file_path, total_size, "%s%s", loc->path, entry->d_name); + +        } else { +                total_size = pathlen + entry->d_len + 1; /* for the extra '/' in the path */ +                file_path = GF_CALLOC (1, total_size + 1, gf_afr_mt_char); +                if (!file_path) { +                        gf_log (this->name, GF_LOG_ERROR, +                                "Out of memory"); +                        return NULL; +                } + +                gf_log (this->name, GF_LOG_TRACE, +                        "constructing file path of size=%d" +                        "pathlen=%d, d_len=%d", +                        total_size, pathlen, +                        entry->d_len); + +                snprintf(file_path, total_size, "%s/%s", loc->path, entry->d_name); +        } + +        gf_log (this->name, GF_LOG_TRACE, +                "path=%s and d_name=%s", loc->path, entry->d_name); +        gf_log (this->name, GF_LOG_TRACE, +                "constructed file_path=%s of size=%d", file_path, total_size); + +        return file_path; +} + +static int +pump_check_and_update_status (xlator_t *this) +{ +        pump_state_t state; +        int ret = -1; + +        state = pump_get_state (); + +        switch (state) { + +        case PUMP_STATE_RESUME: +        case PUMP_STATE_RUNNING: +        { +                ret = 0; +                break; +        } +        case PUMP_STATE_PAUSE: +        case PUMP_STATE_ABORT: +        { +                ret = -1; +                break; +        } +        default: +        { +                gf_log (this->name, GF_LOG_DEBUG, +                        "Unknown pump state"); +                ret = -1; +                break; +        } + +        } + +        return ret; +} + +static const char * +pump_get_resume_path (xlator_t *this) +{ +        afr_private_t *priv = NULL; +        pump_private_t *pump_priv = NULL; + +        const char *resume_path = NULL; + +        priv = this->private; +        pump_priv = priv->pump_private; + +        resume_path = pump_priv->resume_path; + +        return resume_path; +} + +static int +pump_update_resume_state (xlator_t *this, const char *path) +{ +        afr_private_t *priv = NULL; +        pump_private_t *pump_priv = NULL; + +        pump_state_t state; +        const char *resume_path = NULL; + +        priv = this->private; +        pump_priv = priv->pump_private; + +        state = pump_get_state (); + +        if (state == PUMP_STATE_RESUME) { +                resume_path = pump_get_resume_path (this); +                if (strcmp (resume_path, "/") == 0) { +                        gf_log (this->name, GF_LOG_DEBUG, +                                "Reached the resume path (/). Proceeding to change state" +                                " to running"); +                        pump_change_state (this, PUMP_STATE_RUNNING); +                } else if (strcmp (resume_path, path) == 0) { +                        gf_log (this->name, GF_LOG_DEBUG, +                                "Reached the resume path. Proceeding to change state" +                                " to running"); +                        pump_change_state (this, PUMP_STATE_RUNNING); +                } else { +                        gf_log (this->name, GF_LOG_DEBUG, +                                "Not yet hit the resume path:res-path=%s,path=%s", +                                resume_path, path); +                } +        } + +        return 0; +} + +static gf_boolean_t +is_pump_traversal_allowed (xlator_t *this, const char *path) +{ +        afr_private_t *priv = NULL; +        pump_private_t *pump_priv = NULL; + +        pump_state_t state; +        const char *resume_path = NULL; +        gf_boolean_t ret = _gf_true; + +        priv = this->private; +        pump_priv = priv->pump_private; + +        state = pump_get_state (); + +        if (state == PUMP_STATE_RESUME) { +                resume_path = pump_get_resume_path (this); +                if (strstr (resume_path, path)) { +                        gf_log (this->name, GF_LOG_DEBUG, +                                "On the right path to resumption path"); +                        ret = _gf_true; +                } else { +                        gf_log (this->name, GF_LOG_DEBUG, +                                "Not the right path to resuming=> ignoring traverse"); +                        ret = _gf_false; +                } +        } + +        return ret; +} + +static int +pump_update_file_stats (xlator_t *this, long source_blocks, +                               long sink_blocks) +{ +        afr_private_t  *priv        = NULL; +        pump_private_t *pump_priv   = NULL; + +        priv = this->private; +        pump_priv = priv->pump_private; + +        LOCK (&pump_priv->resume_path_lock); +        { +                pump_priv->source_blocks = source_blocks; +                pump_priv->sink_blocks   = sink_blocks; +        } +        UNLOCK (&pump_priv->resume_path_lock); + +        return 0; +} + +static int +pump_save_file_stats (xlator_t *this) +{ +        afr_private_t  *priv        = NULL; +        struct statvfs  source_buf  = {0, }; +        struct statvfs  sink_buf    = {0, }; +        loc_t loc; +        int ret = -1; + +        priv = this->private; + +        assert (priv->root_inode); + +        build_root_loc (priv->root_inode, &loc); + +        ret = syncop_statfs (PUMP_SOURCE_CHILD (this), +                             &loc, &source_buf); +        if (ret < 0) { +                gf_log (this->name, GF_LOG_DEBUG, +                        "source statfs failed"); +        } else { +                gf_log (this->name, GF_LOG_DEBUG, +                        "source statfs succeeded"); +        } + + +        ret = syncop_statfs (PUMP_SOURCE_CHILD (this), +                             &loc, &sink_buf); +        if (ret < 0) { +                gf_log (this->name, GF_LOG_DEBUG, +                        "sink statfs failed"); +        } else { +                gf_log (this->name, GF_LOG_DEBUG, +                        "sink statfs succeeded"); +        } + +        pump_update_file_stats (this, +                                source_buf.f_blocks, +                                sink_buf.f_blocks); + +        return 0; + +} +static int +pump_save_path (xlator_t *this, const char *path) +{ +        afr_private_t *priv = NULL; +        pump_private_t *pump_priv = NULL; +        pump_state_t state; +        dict_t *dict = NULL; +        loc_t  loc; +        int dict_ret = 0; +        int ret = -1; + +        state = pump_get_state (); +        if (state != PUMP_STATE_RUNNING) +                return 0; + +        priv = this->private; +        pump_priv = priv->pump_private; + +        assert (priv->root_inode); + +        build_root_loc (priv->root_inode, &loc); + +        dict = dict_new (); +        dict_ret = dict_set_str (dict, PUMP_PATH, (char *)path); + +//        ret = syncop_setxattr (PUMP_SOURCE_CHILD (this), &loc, dict, 0); +        ret = 0; +        if (ret < 0) { +                gf_log (this->name, GF_LOG_DEBUG, +                        "setxattr failed - could not save path=%s", path); +        } else { +                gf_log (this->name, GF_LOG_DEBUG, +                        "setxattr succeeded - saved path=%s", path); +                gf_log (this->name, GF_LOG_DEBUG, +                        "Saving path for status info"); + +                LOCK (&pump_priv->resume_path_lock); +                { +                        pump_priv->number_files_pumped++; + +                        strncpy (pump_priv->current_file, path, +                                 PATH_MAX); +                } +                UNLOCK (&pump_priv->resume_path_lock); + +        } + +        dict_unref (dict); + +        return 0; +} + +static int +gf_pump_traverse_directory (loc_t *loc) +{ +        xlator_t *this = NULL; +        afr_private_t *priv = NULL; +        fd_t     *fd   = NULL; + +        off_t       offset   = 0; +        loc_t       entry_loc; +        gf_dirent_t *entry = NULL; +        gf_dirent_t *tmp = NULL; +        gf_dirent_t entries; + +	struct iatt iatt, parent; +	dict_t *xattr_rsp; + +        int source = 0; + +        char *file_path = NULL; +        int ret = 0; + +        INIT_LIST_HEAD (&entries.list); +        this = THIS; +        priv = this->private; + +        assert (loc->inode); + +	fd = fd_create (loc->inode, PUMP_PID); +        if (!fd) { +                gf_log (this->name, GF_LOG_ERROR, +                        "Failed to create fd for %s", loc->path); +                goto out; +        } + +        ret = syncop_opendir (priv->children[source], loc, fd); +        if (ret < 0) { +                gf_log (this->name, GF_LOG_DEBUG, +                        "opendir failed on %s", loc->path); +                goto out; +        } + +        gf_log (this->name, GF_LOG_TRACE, +                "pump opendir on %s returned=%d", +                loc->path, ret); + +        while (syncop_readdirp (priv->children[source], fd, 131072, offset, &entries)) { + +                if (list_empty (&entries.list)) { +                        gf_log (this->name, GF_LOG_TRACE, +                                "no more entries in directory"); +                        goto out; +                } + +                list_for_each_entry_safe (entry, tmp, &entries.list, list) { +                        gf_log (this->name, GF_LOG_DEBUG, +                                "found readdir entry=%s", entry->d_name); + +                        file_path = build_file_path (loc, entry); +                        if (!file_path) { +                                gf_log (this->name, GF_LOG_DEBUG, +                                        "file path construction failed"); +                                goto out; +                        } + +                        build_child_loc (loc, &entry_loc, file_path, entry->d_name); + +                        ret = syncop_lookup (this, &entry_loc, NULL, +                                             &iatt, &xattr_rsp, &parent); + +                        entry_loc.ino = iatt.ia_ino; +                        entry_loc.inode->ino = iatt.ia_ino; + +                        gf_log (this->name, GF_LOG_DEBUG, +                                "lookup %s => %"PRId64, +                                entry_loc.path, +                                iatt.ia_ino); + +                        pump_update_resume_state (this, entry_loc.path); + +                        if (!IS_ENTRY_CWD(entry->d_name) && +                            !IS_ENTRY_PARENT (entry->d_name)) { +                                pump_save_path (this, entry_loc.path); +                                pump_save_file_stats (this); +                        } + +                        ret = pump_check_and_update_status (this); +                        if (ret < 0) { +                                gf_log (this->name, GF_LOG_DEBUG, +                                        "Pump beginning to exit out"); +                                goto out; +                        } + +                        gf_log (this->name, GF_LOG_TRACE, +                                "type of file=%d, IFDIR=%d", +                                iatt.ia_type, IA_IFDIR); + +                        if (IA_ISDIR (iatt.ia_type) && !IS_ENTRY_CWD(entry->d_name) && +                            !IS_ENTRY_PARENT (entry->d_name)) { +                                if (is_pump_traversal_allowed (this, entry_loc.path)) { +                                        gf_log (this->name, GF_LOG_TRACE, +                                                "entering dir=%s", +                                                entry->d_name); +                                        gf_pump_traverse_directory (&entry_loc); +                                } +                        } + +                        offset = entry->d_off; +                        loc_wipe (&entry_loc); +                } + +                gf_dirent_free (&entries); +                gf_log (this->name, GF_LOG_TRACE, +                        "offset incremented to %d", +                        (int32_t ) offset); + +        } + +out: +        return 0; + +} + +void +build_root_loc (inode_t *inode, loc_t *loc) +{ +        loc->path = "/"; +        loc->name = ""; +        loc->inode = inode; +        loc->ino = 1; +        loc->inode->ino = 1; + +} + +static int +pump_update_resume_path (xlator_t *this) +{ +        afr_private_t *priv = NULL; +        pump_private_t *pump_priv = NULL; + +        const char *resume_path = NULL; + +        priv = this->private; +        pump_priv = priv->pump_private; + +        resume_path = pump_get_resume_path (this); + +        if (resume_path) { +                gf_log (this->name, GF_LOG_DEBUG, +                        "Found a path to resume from: %s", +                        resume_path); + +        }else { +                gf_log (this->name, GF_LOG_DEBUG, +                        "Did not find a path=> setting to '/'"); +                pump_set_resume_path (this, "/"); +        } + +        pump_change_state (this, PUMP_STATE_RESUME); + +        return 0; +} + +static int +pump_complete_migration (xlator_t *this) +{ +        afr_private_t *priv = NULL; +        pump_private_t *pump_priv = NULL; +        dict_t *dict = NULL; +        pump_state_t state; +        loc_t  loc; +        int dict_ret = 0; +        int ret = -1; + +        priv = this->private; +        pump_priv = priv->pump_private; + +        assert (priv->root_inode); + +        build_root_loc (priv->root_inode, &loc); + +        dict = dict_new (); + +        state = pump_get_state (); +        if (state == PUMP_STATE_RUNNING) { +                gf_log (this->name, GF_LOG_DEBUG, +                        "Pump finished pumping"); + +                pump_priv->pump_finished = _gf_true; + +                dict_ret = dict_set_str (dict, PUMP_SOURCE_COMPLETE, "jargon"); + +                ret = syncop_setxattr (PUMP_SOURCE_CHILD (this), &loc, dict, 0); +                if (ret < 0) { +                        gf_log (this->name, GF_LOG_DEBUG, +                                "setxattr failed - while  notifying source complete"); +                } +                dict_ret = dict_set_str (dict, PUMP_SINK_COMPLETE, "jargon"); + +                ret = syncop_setxattr (PUMP_SINK_CHILD (this), &loc, dict, 0); +                if (ret < 0) { +                        gf_log (this->name, GF_LOG_DEBUG, +                                "setxattr failed - while notifying sink complete"); +                } +        } + +        return 0; +} + +static int +pump_task (void *data) +{ +	xlator_t *this = NULL; +        afr_private_t *priv = NULL; + + +        loc_t loc; +	struct iatt iatt, parent; +	dict_t *xattr_rsp; + +        int ret = -1; + +        this = THIS; +        priv = this->private; + +        assert (priv->root_inode); + +        build_root_loc (priv->root_inode, &loc); + +        ret = syncop_lookup (this, &loc, NULL, +                             &iatt, &xattr_rsp, &parent); + +        gf_log (this->name, GF_LOG_TRACE, +                "lookup: ino=%"PRId64", path=%s", +                loc.ino, +                loc.path); + +        ret = pump_check_and_update_status (this); +        if (ret < 0) { +                goto out; +        } + +        pump_update_resume_path (this); + +        gf_pump_traverse_directory (&loc); + +        pump_complete_migration (this); +out: +	return 0; +} + + +static int +pump_task_completion (int ret, void *data) +{ +        xlator_t *this = NULL; +        call_frame_t *frame = NULL; +        afr_private_t *priv = NULL; +        pump_private_t *pump_priv = NULL; + +        this = THIS; + +        frame = (call_frame_t *) data; + +        priv = this->private; +        pump_priv = priv->pump_private; + +        inode_unref (priv->root_inode); + +        gf_log (this->name, GF_LOG_DEBUG, +                "Pump xlator exiting"); +	return 0; +} + +int +pump_start (call_frame_t *frame, xlator_t *this) +{ +	afr_private_t *priv = NULL; +	pump_private_t *pump_priv = NULL; +        call_frame_t *pump_frame = NULL; + +	int ret = -1; + +	priv = this->private; +        pump_priv = priv->pump_private; + +        pump_frame = copy_frame (frame); + +        if (!pump_frame->root->lk_owner) +                pump_frame->root->lk_owner = PUMP_LK_OWNER; + +	ret = synctask_new (pump_priv->env, pump_task, pump_task_completion, +                            pump_frame); + +        if (ret != -1) { +                gf_log (this->name, GF_LOG_TRACE, +                        "setting pump as started"); +        } else { +                gf_log (this->name, GF_LOG_DEBUG, +                        "starting pump failed"); +                pump_change_state (this, PUMP_STATE_ABORT); +        } + +	return ret; +} + +gf_boolean_t +is_pump_loaded (xlator_t *this) +{ +        afr_private_t *priv = NULL; +        pump_private_t *pump_priv = NULL; + +        gf_boolean_t ret; + +        priv = this->private; +        pump_priv = priv->pump_private; + +        if (priv->pump_loaded) { +                gf_log (this->name, GF_LOG_DEBUG, +                        "Pump is already started"); +                ret = _gf_true; +        } else { +                gf_log (this->name, GF_LOG_DEBUG, +                        "Pump is not started"); +                ret = _gf_false; +        } + +        return ret; + +} + +int32_t +pump_cmd_start_getxattr_cbk (call_frame_t *frame, +                             void *cookie, +                             xlator_t *this, +                             int32_t op_ret, +                             int32_t op_errno, +                             dict_t *dict) +{ +        afr_local_t *local = NULL; +        char *path = NULL; + +        pump_state_t state; +        int ret = 0; +        int dict_ret = -1; + +        local = frame->local; + +        if (op_ret < 0) { +                gf_log (this->name, GF_LOG_DEBUG, +                        "getxattr failed - changing pump state to RUNNING with '/'"); +                path = "/"; +        } else { +                gf_log (this->name, GF_LOG_TRACE, +                        "getxattr succeeded"); + +                dict_ret =  dict_get_str (dict, PUMP_PATH, &path); +                if (dict_ret < 0) +                        path = "/"; +        } + +        state = pump_get_state (); +        if ((state == PUMP_STATE_RUNNING) || +            (state == PUMP_STATE_RESUME)) { +                gf_log (this->name, GF_LOG_ERROR, +                        "Pump is already started"); +                ret = -1; +                goto out; +        } + +        pump_set_resume_path (this, path); +        pump_change_state (this, PUMP_STATE_RUNNING); + +        ret = pump_start (frame, this); + +out: +        local->op_ret = ret; +        pump_command_reply (frame, this); +	return 0; +} + +int +pump_execute_status (call_frame_t *frame, xlator_t *this) +{ +        afr_private_t *priv = NULL; +        pump_private_t *pump_priv = NULL; + +        uint64_t number_files = 0; + +        char filename[PATH_MAX]; +        char *dict_str = NULL; + +        int32_t op_ret = 0; +        int32_t op_errno = 0; + +        dict_t *dict = NULL; +        int ret = -1; + +        priv = this->private; +        pump_priv = priv->pump_private; + +        LOCK (&pump_priv->resume_path_lock); +        { +                number_files  = pump_priv->number_files_pumped; +                strncpy (filename, pump_priv->current_file, PATH_MAX); +        } +        UNLOCK (&pump_priv->resume_path_lock); + +        dict_str     = GF_CALLOC (1, PATH_MAX + 256, gf_afr_mt_char); +        if (!dict_str) { +                gf_log (this->name, GF_LOG_ERROR, +                        "Out of memory"); +                op_ret = -1; +                op_errno = ENOMEM; +                goto out; +        } + +        if (pump_priv->pump_finished) { +        snprintf (dict_str, PATH_MAX + 256, "Number of files migrated = %"PRIu64"        Migration complete ", +                  number_files); +        } else { +        snprintf (dict_str, PATH_MAX + 256, "Number of files migrated = %"PRIu64"       Current file= %s ", +                  number_files, filename); +        } + +        dict = dict_new (); + +        ret = dict_set_str (dict, PUMP_CMD_STATUS, dict_str); +        if (ret < 0) { +                gf_log (this->name, GF_LOG_DEBUG, +                        "dict_set_str returned negative value"); +        } + +        op_ret = 0; + +out: + +        AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict); + +        dict_unref (dict); +        GF_FREE (dict_str); + +        return 0; +} + +int +pump_execute_pause (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t *local = NULL; + +        local = frame->local; + +        pump_change_state (this, PUMP_STATE_PAUSE); + +        local->op_ret = 0; +        pump_command_reply (frame, this); + +        return 0; +} + +int +pump_execute_start (call_frame_t *frame, xlator_t *this) +{ +        afr_private_t *priv = NULL; +        afr_local_t   *local = NULL; + +        int ret = 0; +        loc_t loc; + +        priv = this->private; +        local = frame->local; + +        if (!priv->root_inode) { +                gf_log (this->name, GF_LOG_NORMAL, +                        "Pump xlator cannot be started without an initial lookup"); +                ret = -1; +                goto out; +        } + +        assert (priv->root_inode); + +        build_root_loc (priv->root_inode, &loc); + +	STACK_WIND (frame, +		    pump_cmd_start_getxattr_cbk, +		    PUMP_SOURCE_CHILD(this), +		    PUMP_SOURCE_CHILD(this)->fops->getxattr, +		    &loc, +		    PUMP_PATH); + +        ret = 0; + +out: +        if (ret < 0) { +                local->op_ret = ret; +                pump_command_reply (frame, this); +        } + +	return 0; +} + +int32_t +pump_cmd_abort_removexattr_cbk (call_frame_t *frame, +                                void *cookie, +                                xlator_t *this, +                                int32_t op_ret, +                                int32_t op_errno) +{ +        afr_local_t *local = NULL; + +        local = frame->local; + +        if (op_ret < 0) { +                gf_log (this->name, GF_LOG_ERROR, +                        "Aborting pump failed. Please remove xattr" +                        PUMP_PATH "of the source child's '/'"); +                local->op_ret = -1; +        } else { +                gf_log (this->name, GF_LOG_DEBUG, +                "remove xattr succeeded"); +                local->op_ret = 0; +        } + +        pump_change_state (this, PUMP_STATE_ABORT); + +        pump_command_reply (frame, this); +	return 0; +} + +int +pump_execute_abort (call_frame_t *frame, xlator_t *this) +{ +        afr_private_t *priv = NULL; +        afr_local_t   *local = NULL; +        pump_private_t *pump_priv = NULL; +        loc_t root_loc; +        int ret = 0; + +        priv = this->private; +        local = frame->local; +        pump_priv = priv->pump_private; + +        if (!priv->pump_loaded) { +                gf_log (this->name, GF_LOG_ERROR, +                        "Trying to abort pump xlator which is not loaded"); +                ret = -1; +                goto out; +        } + +        assert (priv->root_inode); + +        build_root_loc (priv->root_inode, &root_loc); + +	STACK_WIND (frame, +		    pump_cmd_abort_removexattr_cbk, +		    PUMP_SOURCE_CHILD (this), +		    PUMP_SOURCE_CHILD (this)->fops->removexattr, +		    &root_loc, +		    PUMP_PATH); + +        ret = 0; + +out: +        if (ret < 0) { +                local->op_ret = ret; +                pump_command_reply (frame, this); +        } + +        return 0; +} + +gf_boolean_t +pump_command_status (xlator_t *this, dict_t *dict) +{ +        char *cmd = NULL; +        int dict_ret = -1; +        int ret = _gf_true; + +        dict_ret = dict_get_str (dict, PUMP_CMD_STATUS, &cmd); +        if (dict_ret < 0) { +                gf_log (this->name, GF_LOG_DEBUG, +                        "Not a pump status command"); +                ret = _gf_false; +                goto out; +        } + +        gf_log (this->name, GF_LOG_DEBUG, +                "Hit a pump command - status"); +        ret = _gf_true; + +out: +        return ret; + +} + +gf_boolean_t +pump_command_pause (xlator_t *this, dict_t *dict) +{ +        char *cmd = NULL; +        int dict_ret = -1; +        int ret = _gf_true; + +        dict_ret = dict_get_str (dict, PUMP_CMD_PAUSE, &cmd); +        if (dict_ret < 0) { +                gf_log (this->name, GF_LOG_DEBUG, +                        "Not a pump pause command"); +                ret = _gf_false; +                goto out; +        } + +        gf_log (this->name, GF_LOG_DEBUG, +                "Hit a pump command - pause"); +        ret = _gf_true; + +out: +        return ret; + +} + +gf_boolean_t +pump_command_abort (xlator_t *this, dict_t *dict) +{ +        char *cmd = NULL; +        int dict_ret = -1; +        int ret = _gf_true; + +        dict_ret = dict_get_str (dict, PUMP_CMD_ABORT, &cmd); +        if (dict_ret < 0) { +                gf_log (this->name, GF_LOG_DEBUG, +                        "Not a pump abort command"); +                ret = _gf_false; +                goto out; +        } + +        gf_log (this->name, GF_LOG_DEBUG, +                "Hit a pump command - abort"); +        ret = _gf_true; + +out: +        return ret; + +} + +gf_boolean_t +pump_command_start (xlator_t *this, dict_t *dict) +{ +        char *cmd = NULL; +        int dict_ret = -1; +        int ret = _gf_true; + +        dict_ret = dict_get_str (dict, PUMP_CMD_START, &cmd); +        if (dict_ret < 0) { +                gf_log (this->name, GF_LOG_DEBUG, +                        "Not a pump start command"); +                ret = _gf_false; +                goto out; +        } + +        gf_log (this->name, GF_LOG_DEBUG, +                "Hit a pump command - start"); +        ret = _gf_true; + +out: +        return ret; + +} + +struct _xattr_key { +        char *key; +        struct list_head list; +}; + +static void +__gather_xattr_keys (dict_t *dict, char *key, data_t *value, +                     void *data) +{ +        struct list_head *  list  = data; +        struct _xattr_key * xkey  = NULL; + +        if (!strncmp (key, AFR_XATTR_PREFIX, +                      strlen (AFR_XATTR_PREFIX))) { + +                xkey = GF_CALLOC (1, sizeof (*xkey), gf_afr_mt_xattr_key); +                if (!xkey) +                        return; + +                xkey->key = key; +                INIT_LIST_HEAD (&xkey->list); + +                list_add_tail (&xkey->list, list); +        } +} + +static void +__filter_xattrs (dict_t *dict) +{ +        struct list_head keys; + +        struct _xattr_key *key; +        struct _xattr_key *tmp; + +        INIT_LIST_HEAD (&keys); + +        dict_foreach (dict, __gather_xattr_keys, +                      (void *) &keys); + +        list_for_each_entry_safe (key, tmp, &keys, list) { +                dict_del (dict, key->key); + +                list_del_init (&key->list); + +                GF_FREE (key); +        } +} + +int32_t +pump_getxattr_cbk (call_frame_t *frame, void *cookie, +		  xlator_t *this, int32_t op_ret, int32_t op_errno, +		  dict_t *dict) +{ +	afr_private_t * priv     = NULL; +	afr_local_t *   local    = NULL; +	xlator_t **     children = NULL; + +	int unwind     = 1; +	int last_tried = -1; +	int this_try = -1; +        int read_child = -1; + +	priv     = this->private; +	children = priv->children; + +	local = frame->local; + +        read_child = (long) cookie; + +	if (op_ret == -1) { +        retry: +		last_tried = local->cont.getxattr.last_tried; + +		if (all_tried (last_tried, priv->child_count)) { +			goto out; +		} +		this_try = ++local->cont.getxattr.last_tried; + +                if (this_try == read_child) { +                        goto retry; +                } + +		unwind = 0; +		STACK_WIND_COOKIE (frame, pump_getxattr_cbk, +				   (void *) (long) read_child, +				   children[this_try], +				   children[this_try]->fops->getxattr, +				   &local->loc, +				   local->cont.getxattr.name); +	} + +out: +	if (unwind) { +                if (op_ret >= 0 && dict) +                        __filter_xattrs (dict); + +		AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict); +	} + +	return 0; +} + +int32_t +pump_getxattr (call_frame_t *frame, xlator_t *this, +	      loc_t *loc, const char *name) +{ +	afr_private_t *   priv       = NULL; +	xlator_t **       children   = NULL; +	int               call_child = 0; +	afr_local_t     * local      = NULL; + +        int               read_child = -1; + +	int32_t op_ret   = -1; +	int32_t op_errno = 0; + + +	VALIDATE_OR_GOTO (frame, out); +	VALIDATE_OR_GOTO (this, out); +	VALIDATE_OR_GOTO (this->private, out); + +	priv     = this->private; +	VALIDATE_OR_GOTO (priv->children, out); + +	children = priv->children; + +	ALLOC_OR_GOTO (local, afr_local_t, out); +	frame->local = local; + +        if (name) { +                if (!strncmp (name, AFR_XATTR_PREFIX, +                              strlen (AFR_XATTR_PREFIX))) { + +                        op_errno = ENODATA; +                        goto out; +                } + +                if (!strcmp (name, PUMP_CMD_STATUS)) { +                        gf_log (this->name, GF_LOG_DEBUG, +                                "Hit pump command - status"); +                        pump_execute_status (frame, this); +                        op_ret = 0; +                        goto out; +                } +        } + +        read_child = afr_read_child (this, loc->inode); + +        if (read_child >= 0) { +                call_child = read_child; + +                local->cont.getxattr.last_tried = -1; +        } else { +                call_child = afr_first_up_child (priv); + +                if (call_child == -1) { +                        op_errno = ENOTCONN; +                        gf_log (this->name, GF_LOG_DEBUG, +                                "no child is up"); +                        goto out; +                } + +                local->cont.getxattr.last_tried = call_child; +        } + +	loc_copy (&local->loc, loc); +	if (name) +	  local->cont.getxattr.name       = gf_strdup (name); + +	STACK_WIND_COOKIE (frame, pump_getxattr_cbk, +			   (void *) (long) call_child, +			   children[call_child], children[call_child]->fops->getxattr, +			   loc, name); + +	op_ret = 0; +out: +	if (op_ret == -1) { +		AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, NULL); +	} +	return 0; +} + +static int +afr_setxattr_unwind (call_frame_t *frame, xlator_t *this) +{ +	afr_local_t *   local = NULL; +	afr_private_t * priv  = NULL; +	call_frame_t   *main_frame = NULL; + +	local = frame->local; +	priv  = this->private; + +	LOCK (&frame->lock); +	{ +		if (local->transaction.main_frame) +			main_frame = local->transaction.main_frame; +		local->transaction.main_frame = NULL; +	} +	UNLOCK (&frame->lock); + +	if (main_frame) { +		AFR_STACK_UNWIND (setxattr, main_frame, +                                  local->op_ret, local->op_errno) +	} +	return 0; +} + +static int +afr_setxattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +		       int32_t op_ret, int32_t op_errno) +{ +	afr_local_t *   local = NULL; +	afr_private_t * priv  = NULL; + +	int call_count  = -1; +	int need_unwind = 0; + +	local = frame->local; +	priv = this->private; + +	LOCK (&frame->lock); +	{ +		if (op_ret != -1) { +			if (local->success_count == 0) { +				local->op_ret = op_ret; +			} +			local->success_count++; + +			if (local->success_count == priv->child_count) { +				need_unwind = 1; +			} +		} + +		local->op_errno = op_errno; +	} +	UNLOCK (&frame->lock); + +	if (need_unwind) +		local->transaction.unwind (frame, this); + +	call_count = afr_frame_return (frame); + +	if (call_count == 0) { +		local->transaction.resume (frame, this); +	} + +	return 0; +} + +static int +afr_setxattr_wind (call_frame_t *frame, xlator_t *this) +{ +	afr_local_t *local = NULL; +	afr_private_t *priv = NULL; + +	int call_count = -1; +	int i = 0; + +	local = frame->local; +	priv = this->private; + +	call_count = afr_up_children_count (priv->child_count, local->child_up); + +	if (call_count == 0) { +		local->transaction.resume (frame, this); +		return 0; +	} + +	local->call_count = call_count; + +	for (i = 0; i < priv->child_count; i++) { +		if (local->child_up[i]) { +			STACK_WIND_COOKIE (frame, afr_setxattr_wind_cbk, +					   (void *) (long) i, +					   priv->children[i], +					   priv->children[i]->fops->setxattr, +					   &local->loc, +					   local->cont.setxattr.dict, +					   local->cont.setxattr.flags); + +			if (!--call_count) +				break; +		} +	} + +	return 0; +} + + +static int +afr_setxattr_done (call_frame_t *frame, xlator_t *this) +{ +	afr_local_t * local = frame->local; + +	local->transaction.unwind (frame, this); + +	AFR_STACK_DESTROY (frame); + +	return 0; +} + +int32_t +pump_setxattr_cbk (call_frame_t *frame, +		      void *cookie, +		      xlator_t *this, +		      int32_t op_ret, +		      int32_t op_errno) +{ +	STACK_UNWIND (frame, +		      op_ret, +		      op_errno); +	return 0; +} + +int +pump_command_reply (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t *local = NULL; + +        local = frame->local; + +        if (local->op_ret < 0) +                gf_log (this->name, GF_LOG_NORMAL, +                        "Command failed"); +        else +                gf_log (this->name, GF_LOG_NORMAL, +                        "Command succeeded"); + +        AFR_STACK_UNWIND (setxattr, +                          frame, +                          local->op_ret, +                          local->op_errno); + +        return 0; +} + +int +pump_parse_command (call_frame_t *frame, xlator_t *this, +                    afr_local_t *local, dict_t *dict) +{ + +        int ret = -1; + +        if (pump_command_start (this, dict)) { +                frame->local = local; +                ret = pump_execute_start (frame, this); + +        } else if (pump_command_pause (this, dict)) { +                frame->local = local; +                ret = pump_execute_pause (frame, this); + +        } else if (pump_command_abort (this, dict)) { +                frame->local = local; +                ret = pump_execute_abort (frame, this); +        } +        return ret; +} + +int +pump_setxattr (call_frame_t *frame, xlator_t *this, +               loc_t *loc, dict_t *dict, int32_t flags) +{ +	afr_private_t * priv  = NULL; +	afr_local_t   * local = NULL; +	call_frame_t   *transaction_frame = NULL; + +	int ret = -1; + +	int op_ret   = -1; +	int op_errno = 0; + +	VALIDATE_OR_GOTO (frame, out); +	VALIDATE_OR_GOTO (this, out); +	VALIDATE_OR_GOTO (this->private, out); + +	priv = this->private; + +	ALLOC_OR_GOTO (local, afr_local_t, out); + +	ret = AFR_LOCAL_INIT (local, priv); +	if (ret < 0) { +		op_errno = -ret; +		goto out; +	} + +        ret = pump_parse_command (frame, this, +                                  local, dict); +        if (ret >= 0) { +                op_ret = 0; +                goto out; +        } + +	transaction_frame = copy_frame (frame); +	if (!transaction_frame) { +		gf_log (this->name, GF_LOG_ERROR, +			"Out of memory."); +		goto out; +	} + +	transaction_frame->local = local; + +	local->op_ret = -1; + +	local->cont.setxattr.dict  = dict_ref (dict); +	local->cont.setxattr.flags = flags; + +	local->transaction.fop    = afr_setxattr_wind; +	local->transaction.done   = afr_setxattr_done; +	local->transaction.unwind = afr_setxattr_unwind; + +	loc_copy (&local->loc, loc); + +	local->transaction.main_frame = frame; +	local->transaction.start   = LLONG_MAX - 1; +	local->transaction.len     = 0; + +	afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); + +	op_ret = 0; +out: +	if (op_ret == -1) { +		if (transaction_frame) +			AFR_STACK_DESTROY (transaction_frame); +		AFR_STACK_UNWIND (setxattr, frame, op_ret, op_errno); +	} + +	return 0; +} + +int32_t +mem_acct_init (xlator_t *this) +{ +        int     ret = -1; + +        if (!this) +                return ret; + +        ret = xlator_mem_acct_init (this, gf_afr_mt_end + 1); + +        if (ret != 0) { +                gf_log(this->name, GF_LOG_ERROR, "Memory accounting init" +                                "failed"); +                return ret; +        } + +        return ret; +} + +int32_t +notify (xlator_t *this, int32_t event, +	void *data, ...) +{ +        int ret = -1; + +	switch (event) { +	case GF_EVENT_CHILD_DOWN: +                pump_change_state (this, PUMP_STATE_ABORT); +                break; +        } + +        ret = afr_notify (this, event, data); + +        return ret; +} + +int32_t +init (xlator_t *this) +{ +	afr_private_t * priv        = NULL; +        pump_private_t *pump_priv = NULL; +	int             child_count = 0; +	xlator_list_t * trav        = NULL; +	int             i           = 0; +	int             ret         = -1; +	int             op_errno    = 0; + +        int source_child = 0; + +	if (!this->children) { +		gf_log (this->name, GF_LOG_ERROR, +			"pump translator needs a source and sink" +                        "subvolumes defined."); +		return -1; +	} + +	if (!this->parents) { +		gf_log (this->name, GF_LOG_WARNING, +			"Volume is dangling."); +	} + +	ALLOC_OR_GOTO (this->private, afr_private_t, out); + +	priv = this->private; + +        priv->read_child = source_child; +        priv->favorite_child = source_child; +        priv->background_self_heal_count = 0; + +	priv->data_self_heal     = 1; +	priv->metadata_self_heal = 1; +	priv->entry_self_heal    = 1; + +        priv->data_self_heal_algorithm = ""; + +        priv->data_self_heal_window_size = 16; + +	priv->data_change_log     = 1; +	priv->metadata_change_log = 1; +	priv->entry_change_log    = 1; + +	/* Locking options */ + +        /* Lock server count infact does not matter. Locks are held +           on all subvolumes, in this case being the source +           and the sink. +        */ + +	priv->data_lock_server_count = 2; +	priv->metadata_lock_server_count = 2; +	priv->entry_lock_server_count = 2; + +	priv->strict_readdir = _gf_false; + +        trav = this->children; +        while (trav) { +                child_count++; +                trav = trav->next; +        } + +	priv->wait_count = 1; + +        if (child_count != 2) { +                gf_log (this->name, GF_LOG_ERROR, +                        "There should be exactly 2 children - one source " +                        "and one sink"); +                return -1; +        } +	priv->child_count = child_count; + +	LOCK_INIT (&priv->lock); +        LOCK_INIT (&priv->read_child_lock); + +	priv->child_up = GF_CALLOC (sizeof (unsigned char), child_count, +                                 gf_afr_mt_char); +	if (!priv->child_up) { +		gf_log (this->name, GF_LOG_ERROR, +			"Out of memory."); +		op_errno = ENOMEM; +		goto out; +	} + +	priv->children = GF_CALLOC (sizeof (xlator_t *), child_count, +                                 gf_afr_mt_xlator_t); +	if (!priv->children) { +		gf_log (this->name, GF_LOG_ERROR, +			"Out of memory."); +		op_errno = ENOMEM; +		goto out; +	} + +        priv->pending_key = GF_CALLOC (sizeof (*priv->pending_key), +                                       child_count, +                                       gf_afr_mt_char); +        if (!priv->pending_key) { +                gf_log (this->name, GF_LOG_ERROR, +                        "Out of memory."); +                op_errno = ENOMEM; +                goto out; +        } + +	trav = this->children; +	i = 0; +	while (i < child_count) { +		priv->children[i] = trav->xlator; + +                ret = asprintf (&priv->pending_key[i], "%s.%s", AFR_XATTR_PREFIX, +                                trav->xlator->name); +                if (-1 == ret) { +                        gf_log (this->name, GF_LOG_ERROR, +                                "asprintf failed to set pending key"); +                        op_errno = ENOMEM; +                        goto out; +                } + +		trav = trav->next; +		i++; +	} + +        priv->first_lookup = 1; +        priv->root_inode = NULL; + +	pump_priv = GF_CALLOC (1, sizeof (*pump_priv), +                            gf_afr_mt_pump_priv); +	if (!pump_priv) { +		gf_log (this->name, GF_LOG_ERROR, +			"Out of memory"); +                op_errno = ENOMEM; +		goto out; +	} + +        LOCK_INIT (&pump_priv->resume_path_lock); +        LOCK_INIT (&pump_priv->pump_state_lock); + +        pump_priv->resume_path = GF_CALLOC (1, PATH_MAX, +                                            gf_afr_mt_char); +        if (!pump_priv->resume_path) { +                gf_log (this->name, GF_LOG_ERROR, +                        "Out of memory"); +                ret = -1; +                goto out; +        } + +	pump_priv->env = syncenv_new (0); +        if (!pump_priv->env) { +                gf_log (this->name, GF_LOG_ERROR, +                        "Could not create new sync-environment"); +                ret = -1; +                goto out; +        } + +	priv->pump_private = pump_priv; + +        pump_change_state (this, PUMP_STATE_ABORT); + +	ret = 0; +out: +	return ret; +} + +int +fini (xlator_t *this) +{ +	return 0; +} + + +struct xlator_fops fops = { +	.lookup      = afr_lookup, +	.open        = afr_open, +	.lk          = afr_lk, +	.flush       = afr_flush, +	.statfs      = afr_statfs, +	.fsync       = afr_fsync, +	.fsyncdir    = afr_fsyncdir, +	.xattrop     = afr_xattrop, +	.fxattrop    = afr_fxattrop, +	.inodelk     = afr_inodelk, +	.finodelk    = afr_finodelk, +	.entrylk     = afr_entrylk, +	.fentrylk    = afr_fentrylk, + +	/* inode read */ +	.access      = afr_access, +	.stat        = afr_stat, +	.fstat       = afr_fstat, +	.readlink    = afr_readlink, +	.getxattr    = pump_getxattr, +	.readv       = afr_readv, + +	/* inode write */ +	.writev      = afr_writev, +	.truncate    = afr_truncate, +	.ftruncate   = afr_ftruncate, +	.setxattr    = pump_setxattr, +        .setattr     = afr_setattr, +	.fsetattr    = afr_fsetattr, +	.removexattr = afr_removexattr, + +	/* dir read */ +	.opendir     = afr_opendir, +	.readdir     = afr_readdir, +	.readdirp    = afr_readdirp, + +	/* dir write */ +	.create      = afr_create, +	.mknod       = afr_mknod, +	.mkdir       = afr_mkdir, +	.unlink      = afr_unlink, +	.rmdir       = afr_rmdir, +	.link        = afr_link, +	.symlink     = afr_symlink, +	.rename      = afr_rename, +}; + +struct xlator_dumpops dumpops = { +        .priv       = afr_priv_dump, +}; + + +struct xlator_cbks cbks = { +	.release     = afr_release, +	.releasedir  = afr_releasedir, +}; + +struct volume_options options[] = { +	{ .key  = {NULL} }, +}; diff --git a/xlators/cluster/afr/src/pump.h b/xlators/cluster/afr/src/pump.h new file mode 100644 index 00000000000..15799002b18 --- /dev/null +++ b/xlators/cluster/afr/src/pump.h @@ -0,0 +1,96 @@ +/* +   Copyright (c) 2007-2009 Gluster, Inc. <http://www.gluster.com> +   This file is part of GlusterFS. + +   GlusterFS is free software; you can redistribute it and/or modify +   it under the terms of the GNU General Public License as published +   by the Free Software Foundation; either version 3 of the License, +   or (at your option) any later version. + +   GlusterFS is distributed in the hope that it will be useful, but +   WITHOUT ANY WARRANTY; without even the implied warranty of +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU +   General Public License for more details. + +   You should have received a copy of the GNU General Public License +   along with this program.  If not, see +   <http://www.gnu.org/licenses/>. +*/ + +#ifndef __PUMP_H__ +#define __PUMP_H__ + +#include "syncop.h" + +#define PUMP_PID 696969 +#define PUMP_LK_OWNER 696969 + +#define IS_ROOT_PATH(path) (!strcmp (path, "/")) +#define IS_ENTRY_CWD(entry) (!strcmp (entry, ".")) +#define IS_ENTRY_PARENT(entry) (!strcmp (entry, "..")) + +#define PUMP_CMD_START  "trusted.glusterfs.pump.start" +#define PUMP_CMD_ABORT  "trusted.glusterfs.pump.abort" +#define PUMP_CMD_PAUSE  "trusted.glusterfs.pump.pause" +#define PUMP_CMD_STATUS "trusted.glusterfs.pump.status" + +#define PUMP_SOURCE_COMPLETE "trusted.glusterfs.pump-source-complete" +#define PUMP_SINK_COMPLETE "trusted.glusterfs.pump-sink-complete" + +#define PUMP_PATH "trusted.glusterfs.pump-path" + +#define PUMP_SOURCE_CHILD(xl) (xl->children->xlator) +#define PUMP_SINK_CHILD(xl) (xl->children->next->xlator) + +typedef enum { +        PUMP_STATE_RUNNING, +        PUMP_STATE_RESUME, +        PUMP_STATE_PAUSE, +        PUMP_STATE_ABORT, +} pump_state_t; + +typedef struct _pump_private { +	struct syncenv *env; +        const char *resume_path; +        gf_lock_t resume_path_lock; +        gf_lock_t pump_state_lock; +        pump_state_t pump_state; +        long source_blocks; +        long sink_blocks; +        char current_file[PATH_MAX]; +        uint64_t number_files_pumped; +        gf_boolean_t pump_finished; +} pump_private_t; + +void +build_root_loc (inode_t *inode, loc_t *loc); +int pump_start (call_frame_t *frame, xlator_t *this); + +gf_boolean_t +pump_command_start (xlator_t *this, dict_t *dict); + +int +pump_execute_start (call_frame_t *frame, xlator_t *this); + +gf_boolean_t +pump_command_pause (xlator_t *this, dict_t *dict); + +int +pump_execute_pause (call_frame_t *frame, xlator_t *this); + +gf_boolean_t +pump_command_abort (xlator_t *this, dict_t *dict); + +int +pump_execute_abort (call_frame_t *frame, xlator_t *this); + +gf_boolean_t +pump_command_status (xlator_t *this, dict_t *dict); + +int +pump_execute_status (call_frame_t *frame, xlator_t *this); + +gf_boolean_t +is_pump_loaded (xlator_t *this); + +#endif /* __PUMP_H__ */ diff --git a/xlators/mgmt/glusterd/src/glusterd-handler.c b/xlators/mgmt/glusterd/src/glusterd-handler.c index 9aa4801e99b..a8b334c3dd9 100644 --- a/xlators/mgmt/glusterd/src/glusterd-handler.c +++ b/xlators/mgmt/glusterd/src/glusterd-handler.c @@ -848,6 +848,44 @@ out:  }  int +glusterd_handle_replace_brick (rpcsvc_request_t *req) +{ +        int32_t                         ret = -1; +        gf1_cli_replace_brick_req          cli_req = {0,}; +        dict_t                          *dict = NULL; + +        GF_ASSERT (req); + +        if (!gf_xdr_to_cli_replace_brick_req (req->msg[0], &cli_req)) { +                //failed to decode msg; +                req->rpc_err = GARBAGE_ARGS; +                goto out; +        } + +        gf_log ("glusterd", GF_LOG_NORMAL, "Received replace brick req"); + +        if (cli_req.bricks.bricks_len) { +                /* Unserialize the dictionary */ +                dict  = dict_new (); + +                ret = dict_unserialize (cli_req.bricks.bricks_val, +                                        cli_req.bricks.bricks_len, +                                        &dict); +                if (ret < 0) { +                        gf_log ("glusterd", GF_LOG_ERROR, +                                "failed to " +                                "unserialize req-buffer to dictionary"); +                        goto out; +                } +        } + +        ret = glusterd_replace_brick (req, dict); + +out: +        return ret; +} + +int  glusterd_handle_remove_brick (rpcsvc_request_t *req)  {          int32_t                         ret = -1; @@ -1657,6 +1695,23 @@ glusterd_add_brick (rpcsvc_request_t *req, dict_t *dict)  }  int32_t +glusterd_replace_brick (rpcsvc_request_t *req, dict_t *dict) +{ +        int32_t      ret       = -1; + +        GF_ASSERT (req); +        GF_ASSERT (dict); + +        glusterd_op_set_op (GD_OP_REPLACE_BRICK); + +        glusterd_op_set_ctx (GD_OP_REPLACE_BRICK, dict); + +        ret = glusterd_op_txn_begin (); + +        return ret; +} + +int32_t  glusterd_remove_brick (rpcsvc_request_t *req, dict_t *dict)  {          int32_t      ret       = -1; diff --git a/xlators/mgmt/glusterd/src/glusterd-op-sm.c b/xlators/mgmt/glusterd/src/glusterd-op-sm.c index 31e2a9e50f2..8f6861417be 100644 --- a/xlators/mgmt/glusterd/src/glusterd-op-sm.c +++ b/xlators/mgmt/glusterd/src/glusterd-op-sm.c @@ -84,6 +84,7 @@ glusterd_op_get_len (glusterd_op_t op)                  case GD_OP_START_BRICK:                          break; +                case GD_OP_REPLACE_BRICK:                  case GD_OP_ADD_BRICK:                          {                                  dict_t *dict = glusterd_op_get_ctx (op); @@ -210,6 +211,20 @@ glusterd_op_build_payload (glusterd_op_t op, gd1_mgmt_stage_op_req **req)                          }                          break; +                case GD_OP_REPLACE_BRICK: +                        { +                                dict_t  *dict = NULL; +                                dict = glusterd_op_get_ctx (op); +                                GF_ASSERT (dict); +                                ret = dict_allocate_and_serialize (dict, +                                                &stage_req->buf.buf_val, +                                        (size_t *)&stage_req->buf.buf_len); +                                if (ret) { +                                        goto out; +                                } +                        } +                        break; +                  case GD_OP_REMOVE_BRICK:                          {                                  dict_t  *dict = NULL; @@ -533,6 +548,55 @@ out:  }  static int +glusterd_op_stage_replace_brick (gd1_mgmt_stage_op_req *req) +{ +        int                                     ret = 0; +        dict_t                                  *dict = NULL; +        char                                    *src_brick = NULL; +        char                                    *dst_brick = NULL; + +        GF_ASSERT (req); + +        dict = dict_new (); +        if (!dict) +                goto out; + +        ret = dict_unserialize (req->buf.buf_val, req->buf.buf_len, &dict); + +        if (ret) { +                gf_log ("", GF_LOG_ERROR, "Unable to unserialize dict"); +                goto out; +        } +        /* Need to do a little more error checking and validation */ +        ret = dict_get_str (dict, "src-brick", &src_brick); + +        if (ret) { +                gf_log ("", GF_LOG_ERROR, "Unable to get src brick"); +                goto out; +        } + +        gf_log ("", GF_LOG_DEBUG, +                "src brick=%s", src_brick); + +        ret = dict_get_str (dict, "dst-brick", &dst_brick); + +        if (ret) { +                gf_log ("", GF_LOG_ERROR, "Unable to get dest brick"); +                goto out; +        } + +        gf_log ("", GF_LOG_DEBUG, +                "dest brick=%s", dst_brick); + +        ret = 0; + +out: +        gf_log ("", GF_LOG_DEBUG, "Returning %d", ret); + +        return ret; +} + +static int  glusterd_op_stage_remove_brick (gd1_mgmt_stage_op_req *req)  {          int                                     ret = 0; @@ -828,6 +892,310 @@ out:  }  static int +replace_brick_start_dst_brick (glusterd_volinfo_t *volinfo, glusterd_brickinfo_t *dst_brickinfo) +{        glusterd_conf_t    *priv = NULL; +        char  cmd_str[8192] = {0,}; +        char filename[PATH_MAX]; +        FILE *file = NULL; +        int ret; + +        priv = THIS->private; + +        snprintf (filename, PATH_MAX, "%s/vols/%s/replace_dst_brick.vol", +                  priv->workdir, volinfo->volname); + + +        file = fopen (filename, "a+"); +        if (!file) { +                gf_log ("", GF_LOG_DEBUG, +                        "Open of volfile failed"); +                return -1; +        } + +        truncate (filename, 0); + +        fprintf (file, "volume src-posix\n"); +        fprintf (file, "type storage/posix\n"); +        fprintf (file, "option directory %s\n", +                 dst_brickinfo->path); +        fprintf (file, "end-volume\n"); +        fprintf (file, "volume %s\n", +                 dst_brickinfo->path); +        fprintf (file, "type features/locks\n"); +        fprintf (file, "subvolumes src-posix\n"); +        fprintf (file, "end-volume\n"); +        fprintf (file, "volume src-server\n"); +        fprintf (file, "type protocol/server\n"); +        fprintf (file, "option auth.addr.%s.allow *\n", +                 dst_brickinfo->path); +        fprintf (file, "option listen-port 34034\n"); +        fprintf (file, "subvolumes %s\n", +                 dst_brickinfo->path); +        fprintf (file, "end-volume\n"); + +        gf_log ("", GF_LOG_DEBUG, +                "starting dst brick"); + +        snprintf (cmd_str, 4096, "glusterfs -f %s/vols/%s/replace_dst_brick.vol", +                  priv->workdir, volinfo->volname); + +        ret = system (cmd_str); + + +        fclose (file); + +        return 0; +} + +static int +replace_brick_spawn_brick (glusterd_volinfo_t *volinfo, dict_t *dict, +                           glusterd_brickinfo_t *dst_brickinfo) + +{ + +        replace_brick_start_dst_brick (volinfo, dst_brickinfo); + +        return 0; +} + +static int +replace_brick_generate_volfile (glusterd_volinfo_t *volinfo, +                                glusterd_brickinfo_t *src_brickinfo) +{ +        glusterd_conf_t    *priv = NULL; +        FILE *file = NULL; +        char filename[PATH_MAX]; +        int ret; + +        priv = THIS->private; + +        gf_log ("", GF_LOG_DEBUG, +                "Creating volfile"); + +        snprintf (filename, PATH_MAX, "%s/vols/%s/replace_brick.vol", +                  priv->workdir, volinfo->volname); + +        file = fopen (filename, "a+"); +        if (!file) { +                gf_log ("", GF_LOG_DEBUG, +                        "Open of volfile failed"); +                return -1; +        } + +        ret = truncate (filename, 0); +        ret = unlink ("/tmp/replace_brick.vol"); + +        fprintf (file, "volume client/protocol\n"); +        fprintf (file, "type protocol/client\n"); +        fprintf (file, "option remote-host %s\n", +                 src_brickinfo->hostname); +        fprintf (file, "option remote-subvolume %s\n", +                 src_brickinfo->path); +        fprintf (file, "option remote-port 34034\n"); +        fprintf (file, "echo end-volume\n"); + +        ret = symlink(filename, "/tmp/replace_brick.vol"); +        if (!ret) { +                gf_log ("", GF_LOG_DEBUG, +                        "symlink call failed"); +                return -1; +        } + +        fclose (file); + +        return 0; +} + +static int +replace_brick_start_source_brick (glusterd_volinfo_t *volinfo, glusterd_brickinfo_t *src_brickinfo, +                                  glusterd_brickinfo_t *dst_brickinfo) +{        glusterd_conf_t    *priv = NULL; +        char filename[PATH_MAX]; +        FILE *file = NULL; +        char  cmd_str[8192] = {0,}; +        int ret; + +        priv = THIS->private; + +        gf_log ("", GF_LOG_DEBUG, +                "Creating volfile"); + +        snprintf (filename, PATH_MAX, "%s/vols/%s/replace_source_brick.vol", +                  priv->workdir, volinfo->volname); + +        file = fopen (filename, "a+"); +        if (!file) { +                gf_log ("", GF_LOG_DEBUG, +                        "Open of volfile failed"); +                return -1; +        } + +        ret = truncate (filename, 0); + +        fprintf (file, "volume src-posix\n"); +        fprintf (file, "type storage/posix\n"); +        fprintf (file, "option directory %s\n", +                 src_brickinfo->path); +        fprintf (file, "end-volume\n"); +        fprintf (file, "volume locks\n"); +        fprintf (file, "type features/locks\n"); +        fprintf (file, "subvolumes src-posix\n"); +        fprintf (file, "end-volume\n"); +        fprintf (file, "volume remote-client\n"); +        fprintf (file, "type protocol/client\n"); +        fprintf (file, "option remote-host %s\n", +                 dst_brickinfo->hostname); +        fprintf (file, "option remote-port 34034\n"); +        fprintf (file, "option remote-subvolume %s\n", +                 dst_brickinfo->path); +        fprintf (file, "end-volume\n"); +        fprintf (file, "volume %s\n", +                 src_brickinfo->path); +        fprintf (file, "type cluster/pump\n"); +        fprintf (file, "subvolumes locks remote-client\n"); +        fprintf (file, "end-volume\n"); +        fprintf (file, "volume src-server\n"); +        fprintf (file, "type protocol/server\n"); +        fprintf (file, "option auth.addr.%s.allow *\n", +                 src_brickinfo->path); +        fprintf (file, "option listen-port 34034\n"); +        fprintf (file, "subvolumes %s\n", +                 src_brickinfo->path); +        fprintf (file, "end-volume\n"); + + +        gf_log ("", GF_LOG_DEBUG, +                "starting source brick"); + +        snprintf (cmd_str, 4096, "glusterfs -f %s/vols/%s/replace_source_brick.vol -l /tmp/b_log -LTRACE", +                  priv->workdir, volinfo->volname); + +        ret = system (cmd_str); + +        fclose (file); + +        return 0; +} + +static int +replace_brick_mount (glusterd_volinfo_t *volinfo, glusterd_brickinfo_t *src_brickinfo, +                     glusterd_brickinfo_t *dst_brickinfo, gf1_cli_replace_op op) +{ + +        gf_log ("", GF_LOG_DEBUG, +                "starting source brick"); + +        replace_brick_start_source_brick (volinfo, src_brickinfo, +                                          dst_brickinfo); + +        return 0; +} + +static int +glusterd_op_replace_brick (gd1_mgmt_stage_op_req *req) +{ +        int                                     ret = 0; +        dict_t                                  *dict = NULL; +        glusterd_volinfo_t                      *volinfo = NULL; +        char                                    *volname = NULL; +        xlator_t                                *this = NULL; +        glusterd_conf_t                         *priv = NULL; +        char                                    *src_brick = NULL; +        char                                    *dst_brick = NULL; +        char                                    *str = NULL; + +        glusterd_brickinfo_t                    *src_brickinfo = NULL; +        glusterd_brickinfo_t                    *dst_brickinfo = NULL; + +        GF_ASSERT (req); + +        this = THIS; +        GF_ASSERT (this); + +        priv = this->private; +        GF_ASSERT (priv); + +        dict = dict_new (); +        if (!dict) +                goto out; + +        ret = dict_unserialize (req->buf.buf_val, req->buf.buf_len, &dict); + +        if (ret) { +                gf_log ("", GF_LOG_ERROR, "Unable to unserialize dict"); +                goto out; +        } + +        /* Need to do a little more error checking and validation */ +        ret = dict_get_str (dict, "src-brick", &src_brick); + +        if (ret) { +                gf_log ("", GF_LOG_ERROR, "Unable to get src brick"); +                goto out; +        } + +        gf_log (this->name, GF_LOG_DEBUG, +                "src brick=%s", src_brick); + +        ret = dict_get_str (dict, "dst-brick", &dst_brick); + +        if (ret) { +                gf_log ("", GF_LOG_ERROR, "Unable to get dest brick"); +                goto out; +        } + +        gf_log (this->name, GF_LOG_DEBUG, +                "dst brick=%s", dst_brick); + +        ret = dict_get_str (dict, "volname", &volname); + +        if (ret) { +                gf_log ("", GF_LOG_ERROR, "Unable to get volume name"); +                goto out; +        } + +        ret = glusterd_volinfo_find (volname, &volinfo); + +        if (ret) { +                gf_log ("", GF_LOG_ERROR, "Unable to allocate memory"); +                goto out; +        } + +        str = strdup (src_brick); + +        ret = glusterd_brickinfo_get (str, volinfo, +                                      &src_brickinfo); +        if (ret) { +                gf_log ("", GF_LOG_DEBUG, "Unable to get src-brickinfo"); +                goto out; +        } + +        ret = glusterd_brickinfo_from_brick (dst_brick, &dst_brickinfo); +        if (ret) { +                gf_log ("", GF_LOG_DEBUG, "Unable to get dst-brickinfo"); +                goto out; +        } + +        replace_brick_generate_volfile (volinfo, src_brickinfo); + +        if (!glusterd_is_local_addr (src_brickinfo->hostname)) { +                gf_log ("", GF_LOG_NORMAL, +                        "I AM THE SOURCE HOST"); +                replace_brick_mount (volinfo, src_brickinfo, dst_brickinfo, +                                     req->op); +        } else if (!glusterd_is_local_addr (dst_brickinfo->hostname)) { +                gf_log ("", GF_LOG_NORMAL, +                        "I AM THE DESTINATION HOST"); +                replace_brick_spawn_brick (volinfo, dict, dst_brickinfo); +        } + +        ret = 0; + +out: +        return ret; +} + +static int  glusterd_op_remove_brick (gd1_mgmt_stage_op_req *req)  {          int                                     ret = 0; @@ -1609,6 +1977,11 @@ glusterd_op_stage_validate (gd1_mgmt_stage_op_req *req)                          ret = glusterd_op_stage_add_brick (req);                          break; +                case GD_OP_REPLACE_BRICK: +                        ret = glusterd_op_stage_replace_brick (req); +                        break; + +                  case GD_OP_REMOVE_BRICK:                          ret = glusterd_op_stage_remove_brick (req);                          break; @@ -1652,6 +2025,10 @@ glusterd_op_commit_perform (gd1_mgmt_stage_op_req *req)                          ret = glusterd_op_add_brick (req);                          break; +                case GD_OP_REPLACE_BRICK: +                        ret = glusterd_op_replace_brick (req); +                        break; +                  case GD_OP_REMOVE_BRICK:                          ret = glusterd_op_remove_brick (req);                          break; diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.c b/xlators/mgmt/glusterd/src/glusterd-utils.c index 6b32bd3a0d8..5ef0ce1e5c1 100644 --- a/xlators/mgmt/glusterd/src/glusterd-utils.c +++ b/xlators/mgmt/glusterd/src/glusterd-utils.c @@ -73,7 +73,7 @@ glusterd_unset_lock_owner (uuid_t owner)          return 0;  } -static int32_t +int32_t  glusterd_is_local_addr (char *hostname)  {          int32_t         ret = -1; @@ -598,11 +598,14 @@ glusterd_brickinfo_from_brick (char *brick,          glusterd_brickinfo_t    *new_brickinfo = NULL;          char                    *hostname = NULL;          char                    *path = NULL; +        char                    *tmp = NULL;          GF_ASSERT (brick);          GF_ASSERT (brickinfo); -        hostname = strtok (brick, ":"); +        tmp = strdup (brick); + +        hostname = strtok (tmp, ":");          path = strtok (NULL, ":");          GF_ASSERT (hostname); diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.h b/xlators/mgmt/glusterd/src/glusterd-utils.h index fc8b33ab1b9..dd97e67a68c 100644 --- a/xlators/mgmt/glusterd/src/glusterd-utils.h +++ b/xlators/mgmt/glusterd/src/glusterd-utils.h @@ -115,4 +115,6 @@ glusterd_is_cli_op_req (int32_t op);  int32_t  glusterd_brickinfo_get (char *brick, glusterd_volinfo_t *volinfo,                          glusterd_brickinfo_t **brickinfo); +int32_t +glusterd_is_local_addr (char *hostname);  #endif diff --git a/xlators/mgmt/glusterd/src/glusterd.h b/xlators/mgmt/glusterd/src/glusterd.h index 9f0986bb62f..368c0a6847a 100644 --- a/xlators/mgmt/glusterd/src/glusterd.h +++ b/xlators/mgmt/glusterd/src/glusterd.h @@ -249,6 +249,13 @@ glusterd_add_brick (rpcsvc_request_t *req, dict_t *dict);  int  glusterd_handle_add_brick (rpcsvc_request_t *req); + +int32_t +glusterd_replace_brick (rpcsvc_request_t *req, dict_t *dict); + +int +glusterd_handle_replace_brick (rpcsvc_request_t *req); +  int  glusterd_handle_remove_brick (rpcsvc_request_t *req); diff --git a/xlators/mgmt/glusterd/src/glusterd3_1-mops.c b/xlators/mgmt/glusterd/src/glusterd3_1-mops.c index 7169121d574..86d09194e11 100644 --- a/xlators/mgmt/glusterd/src/glusterd3_1-mops.c +++ b/xlators/mgmt/glusterd/src/glusterd3_1-mops.c @@ -1164,6 +1164,10 @@ glusterd_handle_rpc_msg (rpcsvc_request_t *req)                          ret = glusterd_handle_add_brick (req);                          break; +                case GD_MGMT_CLI_REPLACE_BRICK: +                        ret = glusterd_handle_replace_brick (req); +                        break; +                  case GD_MGMT_CLI_REMOVE_BRICK:                          ret = glusterd_handle_remove_brick (req);                          break; @@ -1203,6 +1207,7 @@ rpcsvc_actor_t glusterd1_mgmt_actors[] = {          [GD_MGMT_CLI_DELETE_VOLUME] = { "DELETE_VOLUME", GD_MGMT_CLI_DELETE_VOLUME, glusterd_handle_rpc_msg, NULL, NULL},          [GD_MGMT_CLI_GET_VOLUME] = { "GET_VOLUME", GD_MGMT_CLI_GET_VOLUME, glusterd_handle_rpc_msg, NULL, NULL},          [GD_MGMT_CLI_ADD_BRICK] = { "ADD_BRICK", GD_MGMT_CLI_ADD_BRICK, glusterd_handle_rpc_msg, NULL, NULL}, +        [GD_MGMT_CLI_REPLACE_BRICK] = { "REPLACE_BRICK", GD_MGMT_CLI_REPLACE_BRICK, glusterd_handle_rpc_msg, NULL, NULL},          [GD_MGMT_CLI_REMOVE_BRICK] = { "REMOVE_BRICK", GD_MGMT_CLI_REMOVE_BRICK, glusterd_handle_rpc_msg, NULL, NULL},  };  | 
