diff options
| -rw-r--r-- | cli/src/cli-cmd-parser.c | 91 | ||||
| -rw-r--r-- | cli/src/cli-cmd-volume.c | 79 | ||||
| -rw-r--r-- | cli/src/cli-rpc-ops.c | 6 | ||||
| -rw-r--r-- | heal/src/glfs-heal.c | 416 | ||||
| -rw-r--r-- | libglusterfs/src/glusterfs.h | 1 | ||||
| -rw-r--r-- | rpc/rpc-lib/src/protocol-common.h | 2 | ||||
| -rw-r--r-- | tests/basic/afr/split-brain-healing.t | 183 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr-common.c | 76 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr-inode-read.c | 5 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr-self-heal-common.c | 191 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr-self-heal-data.c | 62 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr-self-heal-metadata.c | 34 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr-self-heal.h | 18 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr.h | 4 | ||||
| -rw-r--r-- | xlators/cluster/dht/src/dht-common.c | 12 | 
15 files changed, 1039 insertions, 141 deletions
| diff --git a/cli/src/cli-cmd-parser.c b/cli/src/cli-cmd-parser.c index 28888ba656d..53b14d27708 100644 --- a/cli/src/cli-cmd-parser.c +++ b/cli/src/cli-cmd-parser.c @@ -2929,6 +2929,43 @@ out:          return ret;  } +static int +set_hostname_path_in_dict (const char *token, dict_t *dict, int heal_op) +{ +        char *hostname = NULL; +        char *path     = NULL; +        int   ret      = 0; + +        ret = extract_hostname_path_from_token (token, &hostname, &path); +        if (ret) +                goto out; + +        switch (heal_op) { +        case GF_AFR_OP_SBRAIN_HEAL_FROM_BRICK: +                ret = dict_set_dynstr (dict, "heal-source-hostname", +                                       hostname); +                if (ret) +                        goto out; +                ret = dict_set_dynstr (dict, "heal-source-brickpath", +                                       path); +                break; +        case GF_AFR_OP_STATISTICS_HEAL_COUNT_PER_REPLICA: +                ret = dict_set_dynstr (dict, "per-replica-cmd-hostname", +                                       hostname); +                if (ret) +                        goto out; +                ret = dict_set_dynstr (dict, "per-replica-cmd-path", +                                       path); +                break; +        default: +                ret = -1; +                break; +        } + +out: +        return ret; + +}  int  cli_cmd_volume_heal_options_parse (const char **words, int wordcount, @@ -2936,8 +2973,6 @@ cli_cmd_volume_heal_options_parse (const char **words, int wordcount,  {          int     ret = 0;          dict_t  *dict = NULL; -        char    *hostname = NULL; -        char    *path = NULL;          dict = dict_new ();          if (!dict) @@ -3008,6 +3043,35 @@ cli_cmd_volume_heal_options_parse (const char **words, int wordcount,                  ret = -1;                  goto out;          } +        if (wordcount == 6) { +                if (strcmp (words[3], "split-brain")) { +                        ret = -1; +                        goto out; +                } +                if (!strcmp (words[4], "bigger-file")) { +                        ret = dict_set_int32 (dict, "heal-op", +                                        GF_AFR_OP_SBRAIN_HEAL_FROM_BIGGER_FILE); +                        if (ret) +                                goto out; +                        ret = dict_set_str (dict, "file", (char *)words[5]); +                        if (ret) +                                goto out; +                        goto done; +                } +                if (!strcmp (words[4], "source-brick")) { +                        ret = dict_set_int32 (dict, "heal-op", +                                              GF_AFR_OP_SBRAIN_HEAL_FROM_BRICK); +                        if (ret) +                                goto out; +                        ret = set_hostname_path_in_dict (words[5], dict, +                                              GF_AFR_OP_SBRAIN_HEAL_FROM_BRICK); +                        if (ret) +                                goto out; +                        goto done; +                } +                ret = -1; +                goto out; +        }          if (wordcount == 7) {                  if (!strcmp (words[3], "statistics")                      && !strcmp (words[4], "heal-count") @@ -3017,21 +3081,26 @@ cli_cmd_volume_heal_options_parse (const char **words, int wordcount,                                     GF_AFR_OP_STATISTICS_HEAL_COUNT_PER_REPLICA);                          if (ret)                                  goto out; -                        ret = extract_hostname_path_from_token (words[6], -                                                              &hostname, &path); +                        ret = set_hostname_path_in_dict (words[6], dict, +                                   GF_AFR_OP_STATISTICS_HEAL_COUNT_PER_REPLICA);                          if (ret)                                  goto out; -                        ret = dict_set_dynstr (dict, "per-replica-cmd-hostname", -                                               hostname); +                        goto done; + +                } +                if (!strcmp (words[3], "split-brain") && +                    !strcmp (words[4], "source-brick")) { +                        ret = dict_set_int32 (dict, "heal-op", +                                              GF_AFR_OP_SBRAIN_HEAL_FROM_BRICK); +                        ret = set_hostname_path_in_dict (words[5], dict, +                                              GF_AFR_OP_SBRAIN_HEAL_FROM_BRICK);                          if (ret)                                  goto out; -                        ret = dict_set_dynstr (dict, "per-replica-cmd-path", -                                               path); +                        ret = dict_set_str (dict, "file", +                                            (char *) words[6]);                          if (ret)                                  goto out; -                        else -                                goto done; - +                        goto done;                  }          }          ret = -1; diff --git a/cli/src/cli-cmd-volume.c b/cli/src/cli-cmd-volume.c index 238c8673d75..501b5776dec 100644 --- a/cli/src/cli-cmd-volume.c +++ b/cli/src/cli-cmd-volume.c @@ -1879,6 +1879,60 @@ cli_print_brick_status (cli_volume_status_t *status)          return 0;  } +#define NEEDS_GLFS_HEAL(op) ((op == GF_AFR_OP_SBRAIN_HEAL_FROM_BIGGER_FILE) || \ +                             (op == GF_AFR_OP_SBRAIN_HEAL_FROM_BRICK) ||      \ +                             (op == GF_AFR_OP_INDEX_SUMMARY)) + +int +cli_launch_glfs_heal (int heal_op, dict_t *options) +{ +        char      buff[PATH_MAX] = {0}; +        runner_t  runner         = {0}; +        char      *filename      = NULL; +        char      *hostname      = NULL; +        char      *path          = NULL; +        char      *volname       = NULL; +        char      *out           = NULL; +        int        ret           = 0; + +        runinit (&runner); +        ret = dict_get_str (options, "volname", &volname); +        runner_add_args (&runner, SBIN_DIR"/glfsheal", volname, NULL); +        runner_redir (&runner, STDOUT_FILENO, RUN_PIPE); + +        switch (heal_op) { +        case GF_AFR_OP_INDEX_SUMMARY: +                break; +        case GF_AFR_OP_SBRAIN_HEAL_FROM_BIGGER_FILE: +                ret = dict_get_str (options, "file", &filename); +                runner_add_args (&runner, "bigger-file", filename, NULL); +                break; +        case  GF_AFR_OP_SBRAIN_HEAL_FROM_BRICK: +                ret = dict_get_str (options, "heal-source-hostname", +                                    &hostname); +                ret = dict_get_str (options, "heal-source-brickpath", +                                    &path); +                runner_add_args (&runner, "source-brick", NULL); +                runner_argprintf (&runner, "%s:%s", hostname, path); +                if (dict_get_str (options, "file", &filename) == 0) +                        runner_argprintf (&runner, filename); +                break; +        default: +                ret = -1; +        } +        ret = runner_start (&runner); +        if (ret == -1) +                goto out; +        while ((out = fgets (buff, sizeof(buff), +                             runner_chio (&runner, STDOUT_FILENO)))) { +                printf ("%s", out); +        } +        ret = runner_end (&runner); +        ret = WEXITSTATUS (ret); + +out: +        return ret; +}  int  cli_cmd_volume_heal_cbk (struct cli_state *state, struct cli_cmd_word *word,                            const char **words, int wordcount) @@ -1892,9 +1946,6 @@ cli_cmd_volume_heal_cbk (struct cli_state *state, struct cli_cmd_word *word,          xlator_t                *this = NULL;          cli_local_t             *local = NULL;          int                     heal_op = 0; -        runner_t                runner = {0}; -        char                    buff[PATH_MAX] = {0}; -        char                    *out = NULL;          this = THIS;          frame = create_frame (this, this->ctx->pool); @@ -1916,21 +1967,10 @@ cli_cmd_volume_heal_cbk (struct cli_state *state, struct cli_cmd_word *word,          ret = dict_get_int32 (options, "heal-op", &heal_op);          if (ret < 0)                  goto out; - -        if (heal_op == GF_AFR_OP_INDEX_SUMMARY) { -                runinit (&runner); -                runner_add_args (&runner, SBIN_DIR"/glfsheal", words[2], NULL); -                runner_redir (&runner, STDOUT_FILENO, RUN_PIPE); -                ret = runner_start (&runner); +        if (NEEDS_GLFS_HEAL (heal_op)) { +                ret = cli_launch_glfs_heal (heal_op, options);                  if (ret == -1)                          goto out; -                while ((out = fgets(buff, sizeof(buff), -                                   runner_chio (&runner, STDOUT_FILENO)))) { -                        printf ("%s", out); -                } - -                ret = runner_end (&runner); -                ret = WEXITSTATUS (ret);          }          else {                  proc = &cli_rpc_prog->proctable[GLUSTER_CLI_HEAL_VOLUME]; @@ -1946,7 +1986,7 @@ out:          if (ret) {                  cli_cmd_sent_status_get (&sent);                  if ((sent == 0) && (parse_error == 0)) -                        cli_out ("Volume heal failed"); +                        cli_out ("Volume heal failed.");          }          CLI_STACK_DESTROY (frame); @@ -2316,7 +2356,10 @@ struct cli_cmd volume_cmds[] = {            cli_cmd_volume_status_cbk,            "display status of all or specified volume(s)/brick"}, -        { "volume heal <VOLNAME> [{full | statistics {heal-count {replica <hostname:brickname>}} |info {healed | heal-failed | split-brain}}]", +        { "volume heal <VOLNAME> [full | statistics [heal-count "\ +          "[replica <HOSTNAME:BRICKNAME>]] |info [healed | heal-failed | "\ +          "split-brain]| split-brain {bigger-file <FILE> |source-brick "\ +          "<HOSTNAME:BRICKNAME> [<FILE>]}]",            cli_cmd_volume_heal_cbk,            "self-heal commands on volume specified by <VOLNAME>"}, diff --git a/cli/src/cli-rpc-ops.c b/cli/src/cli-rpc-ops.c index 1d8cf23ff42..72ffaf4129a 100644 --- a/cli/src/cli-rpc-ops.c +++ b/cli/src/cli-rpc-ops.c @@ -7358,6 +7358,12 @@ gf_cli_heal_volume_cbk (struct rpc_req *req, struct iovec *iov,                  case    GF_AFR_OP_STATISTICS_HEAL_COUNT_PER_REPLICA:                          heal_op_str = "count of entries to be healed per replica";                          break; +                /* The below 2 cases are never hit; they're coded only to make +                 * compiler warnings go away.*/ +                case    GF_AFR_OP_SBRAIN_HEAL_FROM_BIGGER_FILE: +                case    GF_AFR_OP_SBRAIN_HEAL_FROM_BRICK: +                        break; +                  case    GF_AFR_OP_INVALID:                          heal_op_str = "invalid heal op";                          break; diff --git a/heal/src/glfs-heal.c b/heal/src/glfs-heal.c index a9baad3ac56..f49f3a58afc 100644 --- a/heal/src/glfs-heal.c +++ b/heal/src/glfs-heal.c @@ -14,11 +14,17 @@  #include "glfs.h"  #include "glfs-handles.h"  #include "glfs-internal.h" +#include "protocol-common.h"  #include "syncop.h"  #include <string.h>  #include <time.h>  #define DEFAULT_HEAL_LOG_FILE_DIRECTORY DATADIR "/log/glusterfs" +#define USAGE_STR "Usage: %s <VOLNAME> [bigger-file <FILE> | "\ +                  "source-brick <HOSTNAME:BRICKNAME> [<FILE>]]\n" + +int glfsh_heal_splitbrain_file (glfs_t *fs, xlator_t *top_subvol, +                                loc_t *rootloc, char *file, dict_t *xattr_req);  int  glfsh_link_inode_update_loc (loc_t *loc, struct iatt *iattr) @@ -83,6 +89,37 @@ out:          return ret;  } +int +glfsh_get_index_dir_fd (xlator_t *xl, loc_t *loc, fd_t **fd) +{ +        int ret = -1; + +        *fd = fd_create (loc->inode, GF_CLIENT_PID_GLFS_HEAL); +        if (!*fd) { +                printf ("fd_create failed: %s", strerror(errno)); +                goto out; +        } +        ret = syncop_opendir (xl, loc, *fd); +        if (ret) { +                fd_unref(*fd); +#ifdef GF_LINUX_HOST_OS /* See comment in afr_shd_index_opendir() */ +                *fd = fd_anonymous (loc->inode); +                if (!*fd) { +                        printf ("fd_anonymous failed: %s", +                                strerror(errno)); +                        goto out; +                } +                ret = 0; +#else +                printf ("opendir failed: %s", strerror(errno)); +                goto out; +#endif +        } + +out: +        return ret; +} +  static xlator_t*  _get_afr_ancestor (xlator_t *xl)  { @@ -185,6 +222,33 @@ glfsh_print_heal_status (dict_t *dict, char *path, uuid_t gfid,  }  static int +glfsh_heal_entries (glfs_t *fs, xlator_t *top_subvol, loc_t *rootloc, +                    gf_dirent_t *entries,  uint64_t *offset, +                    uint64_t *num_entries, dict_t *xattr_req) { + +        gf_dirent_t      *entry          = NULL; +        gf_dirent_t      *tmp            = NULL; +        int               ret            = 0; +        char              file[64]      = {0}; + +        list_for_each_entry_safe (entry, tmp, &entries->list, list) { +                *offset = entry->d_off; +                if ((strcmp (entry->d_name, ".") == 0) || +                    (strcmp (entry->d_name, "..") == 0)) +                        continue; +                memset (file, 0, sizeof(file)); +                snprintf (file, sizeof(file), "gfid:%s", entry->d_name); +                ret = glfsh_heal_splitbrain_file (fs, top_subvol, rootloc, file, +                                                 xattr_req); +                if (ret) +                        continue; +                (*num_entries)++; +        } + +        return ret; +} + +static int  glfsh_process_entries (xlator_t *xl, fd_t *fd, gf_dirent_t *entries,                         uint64_t *offset, uint64_t *num_entries)  { @@ -240,15 +304,21 @@ glfsh_process_entries (xlator_t *xl, fd_t *fd, gf_dirent_t *entries,  }  static int -glfsh_crawl_directory (xlator_t   *readdir_xl, fd_t *fd, loc_t *loc) +glfsh_crawl_directory (glfs_t *fs, xlator_t *top_subvol, loc_t *rootloc, +                       xlator_t *readdir_xl, fd_t *fd, loc_t *loc, +                       dict_t *xattr_req)  {          uint64_t        offset = 0;          gf_dirent_t     entries;          int             ret = 0;          gf_boolean_t    free_entries = _gf_false;          uint64_t        num_entries = 0; +        int             heal_op = -1;          INIT_LIST_HEAD (&entries.list); +        ret = dict_get_int32 (xattr_req, "heal-op", &heal_op); +        if (ret) +                return ret;          while (1) {                  ret = syncop_readdir (readdir_xl, fd, 131072, offset, &entries); @@ -260,11 +330,16 @@ glfsh_crawl_directory (xlator_t   *readdir_xl, fd_t *fd, loc_t *loc)                  if (list_empty (&entries.list))                          goto out; -                ret = glfsh_process_entries (readdir_xl, fd, &entries, &offset, -                                             &num_entries); -                if (ret < 0) -                        goto out; - +                if (heal_op == GF_AFR_OP_INDEX_SUMMARY) { +                        ret = glfsh_process_entries (readdir_xl, fd, &entries, +                                                     &offset, &num_entries); +                        if (ret < 0) +                                goto out; +                } else if (heal_op == GF_AFR_OP_SBRAIN_HEAL_FROM_BRICK) { +                        ret = glfsh_heal_entries (fs, top_subvol, rootloc, +                                                  &entries, &offset, +                                                  &num_entries, xattr_req); +                }                  gf_dirent_free (&entries);                  free_entries = _gf_false;          } @@ -275,9 +350,12 @@ out:          if (ret < 0) {                  printf ("Failed to complete gathering info. "                           "Number of entries so far: %"PRIu64"\n", num_entries); -        } -        else { -                printf ("Number of entries: %"PRIu64"\n", num_entries); +        } else { +                if (heal_op == GF_AFR_OP_INDEX_SUMMARY) +                        printf ("Number of entries: %"PRIu64"\n", num_entries); +                else if (heal_op == GF_AFR_OP_SBRAIN_HEAL_FROM_BRICK) +                        printf ("Number of healed entries: %"PRIu64"\n", +                                num_entries);          }          return ret;  } @@ -333,13 +411,22 @@ out:  }  void -glfsh_print_pending_heals (xlator_t *xl, loc_t *rootloc) +glfsh_print_pending_heals (glfs_t *fs, xlator_t *top_subvol, loc_t *rootloc, +                           xlator_t *xl)  {          int ret = 0;          loc_t   dirloc = {0};          fd_t    *fd = NULL;          int32_t op_errno = 0; +        dict_t *xattr_req = NULL; +        xattr_req = dict_new(); +        if (!xattr_req) +                goto out; + +        ret = dict_set_int32 (xattr_req, "heal-op", GF_AFR_OP_INDEX_SUMMARY); +        if (ret) +                goto out;          ret = glfsh_print_brick (xl, rootloc);          if (ret < 0) {                  glfsh_print_brick_from_xl (xl); @@ -356,30 +443,16 @@ glfsh_print_pending_heals (xlator_t *xl, loc_t *rootloc)                  goto out;          } -        fd = fd_create (dirloc.inode, GF_CLIENT_PID_GLFS_HEAL); -        if (!fd) { -                printf ("fd_create failed: %s", strerror(errno)); -                goto out; -        } -        ret = syncop_opendir (xl, &dirloc, fd); -        if (ret) { -                fd_unref(fd); -#ifdef GF_LINUX_HOST_OS /* See comment in afr_shd_index_opendir() */ -                fd = fd_anonymous (dirloc.inode); -                if (!fd) { -                        printf ("fd_anonymous failed: %s", -                                strerror(errno)); -                        goto out; -                } -#else -                printf ("opendir failed: %s", strerror(errno)); +        ret = glfsh_get_index_dir_fd (xl, &dirloc, &fd); +        if (ret)                  goto out; -#endif -        } -        ret = glfsh_crawl_directory (xl, fd, &dirloc); +        ret = glfsh_crawl_directory (fs, top_subvol, rootloc, xl, fd, &dirloc, +                                     xattr_req);          if (fd)                  fd_unref (fd); +        if (xattr_req) +                dict_unref (xattr_req);          if (ret < 0)                  printf ("Failed to find entries with pending self-heal\n");  out: @@ -411,6 +484,209 @@ glfsh_validate_replicate_volume (xlator_t *xl)          return ret;  } +static xlator_t* +_brick_path_to_client_xlator (xlator_t *top_subvol, char *hostname, +                              char *brickpath) +{ +        int ret             = 0; +        xlator_t *xl        = NULL; +        char *remote_host   = NULL; +        char *remote_subvol = NULL; + +        xl = top_subvol; + +        while (xl->next) +                xl = xl->next; + +        while (xl) { +                if (!strcmp (xl->type, "protocol/client")) { +                        ret = dict_get_str (xl->options, "remote-host", +                                                    &remote_host); +                        if (ret < 0) +                                goto out; +                        ret = dict_get_str (xl->options, +                                            "remote-subvolume", &remote_subvol); +                        if (ret < 0) +                                goto out; +                        if (!strcmp (hostname, remote_host) && +                            !strcmp (brickpath, remote_subvol)) +                                return xl; +                } +                xl = xl->prev; +        } + +out: +        return NULL; +} + + +int +glfsh_gather_heal_info (glfs_t *fs, xlator_t *top_subvol, loc_t *rootloc) +{ +        xlator_t  *xl       = NULL; +        xlator_t  *afr_xl   = NULL; +        xlator_t  *old_THIS = NULL; + +        xl = top_subvol; +        while (xl->next) +                xl = xl->next; +        while (xl) { +                if (strcmp (xl->type, "protocol/client") == 0) { +                        afr_xl = _get_afr_ancestor (xl); +                        if (afr_xl) +                                old_THIS = THIS; +                                THIS = afr_xl; +                                glfsh_print_pending_heals (fs, top_subvol, +                                                           rootloc, xl); +                                THIS = old_THIS; +                                printf ("\n"); +                } + +                xl = xl->prev; +        } + +        return 0; +} + +int +glfsh_heal_splitbrain_file (glfs_t *fs, xlator_t *top_subvol, loc_t *rootloc, +                           char *file, dict_t *xattr_req) +{ +        int          ret        = -1; +        int          reval      = 0; +        loc_t        loc        = {0, }; +        char        *path       = NULL; +        char        *filename   = NULL; +        struct iatt  iatt       = {0, }; +        xlator_t    *xl         = top_subvol; +        dict_t      *xattr_rsp  = NULL; +        char        *sh_fail_msg = NULL; +        int32_t      op_errno   = 0; + +        if (!strncmp (file, "gfid:", 5)) { +                filename = gf_strdup(file); +                path = strtok (filename, ":"); +                path = strtok (NULL, ";"); +                uuid_parse (path, loc.gfid); +                loc.path = gf_strdup (uuid_utoa (loc.gfid)); +                loc.inode = inode_new (rootloc->inode->table); +                ret = syncop_lookup (xl, &loc, xattr_req, 0, &xattr_rsp, 0); +                if (ret) { +                        op_errno = -ret; +                        printf ("Lookup failed on %s:%s.\n", file, +                                strerror(op_errno)); +                        goto out; +                } +        } else { +                if (file[0] != '/') { +                        printf ("<FILE> must be absolute path w.r.t. the " +                                "volume, starting with '/'\n"); +                        ret = -1; +                        goto out; +                } +retry: +                ret = glfs_resolve (fs, xl, file, &loc, &iatt, reval); +                ESTALE_RETRY (ret, errno, reval, &loc, retry); +                if (ret) { +                        printf("Lookup failed on %s:%s\n", +                               file, strerror (errno)); +                        goto out; +                } +        } + +        ret = syncop_getxattr (xl, &loc, &xattr_rsp, GF_AFR_HEAL_SBRAIN, +                               xattr_req); +        if (ret) { +                op_errno = -ret; +                printf ("Healing %s failed:%s.\n", file, strerror(op_errno)); +                goto out; +        } +        ret = dict_get_str (xattr_rsp, "sh-fail-msg", &sh_fail_msg); +        if (!ret) { +                printf ("Healing %s failed: %s.\n", file, sh_fail_msg); +                ret = -1; +                goto out; +        } +        printf ("Healed %s.\n", file); +        ret = 0; +out: +        if (xattr_rsp) +                dict_unref (xattr_rsp); +        return ret; +} + +int +glfsh_heal_from_brick (glfs_t *fs, xlator_t *top_subvol, loc_t *rootloc, +                      char *hostname, char *brickpath, char *file) +{ +        int       ret       = -1; +        dict_t   *xattr_req = NULL; +        xlator_t *client    = NULL; +        fd_t     *fd        = NULL; +        loc_t     dirloc    = {0}; +        int32_t   op_errno  = 0; + +        xattr_req = dict_new(); +        if (!xattr_req) +                goto out; +        ret = dict_set_int32 (xattr_req, "heal-op", +                              GF_AFR_OP_SBRAIN_HEAL_FROM_BRICK); +        if (ret) +                goto out; +        client = _brick_path_to_client_xlator (top_subvol, hostname, brickpath); +        if (!client) { +                printf("\"%s:%s\"- No such brick available in the volume.\n", +                       hostname, brickpath); +                ret = -1; +                goto out; +        } +        ret = dict_set_str (xattr_req, "child-name", client->name); +        if (ret) +                goto out; +        if (file) +                ret = glfsh_heal_splitbrain_file (fs, top_subvol, rootloc, file, +                                                 xattr_req); +        else { +                ret = glfsh_get_index_dir_loc (rootloc, client, &dirloc, +                                               &op_errno); +                ret = glfsh_get_index_dir_fd (client, &dirloc, &fd); +                if (ret) +                        goto out; +                ret = glfsh_crawl_directory (fs, top_subvol, rootloc, client, +                                             fd, &dirloc, xattr_req); +                if (fd) +                        fd_unref (fd); +        } +out: +        if (xattr_req) +                dict_unref (xattr_req); +        loc_wipe (&dirloc); +        return ret; +} + +int +glfsh_heal_from_bigger_file (glfs_t *fs, xlator_t *top_subvol, loc_t *rootloc, +                            char *file) +{ + +        int ret = -1; +        dict_t *xattr_req = NULL; + +        xattr_req = dict_new(); +        if (!xattr_req) +                goto out; +        ret = dict_set_int32 (xattr_req, "heal-op", +                              GF_AFR_OP_SBRAIN_HEAL_FROM_BIGGER_FILE); +        if (ret) +                goto out; +        ret = glfsh_heal_splitbrain_file (fs, top_subvol, rootloc, file, +                                         xattr_req); +out: +        if (xattr_req) +                dict_unref (xattr_req); +        return ret; +} +  int  main (int argc, char **argv)  { @@ -418,18 +694,54 @@ main (int argc, char **argv)          int        ret = 0;          char      *volname = NULL;          xlator_t  *top_subvol = NULL; -        xlator_t  *xl = NULL;          loc_t     rootloc = {0};          char      logfilepath[PATH_MAX] = {0}; -        xlator_t  *old_THIS = NULL; -        xlator_t  *afr_xl = NULL; +        char      *hostname = NULL; +        char      *path = NULL; +        char      *file = NULL; +        gf_xl_afr_op_t heal_op = -1; -        if (argc != 2) { -                printf ("Usage: %s <volname>\n", argv[0]); +        if (argc < 2) { +                printf (USAGE_STR, argv[0]);                  ret = -1;                  goto out;          }          volname = argv[1]; +        switch (argc) { +        case 2: +                heal_op = GF_AFR_OP_INDEX_SUMMARY; +                break; +        case 4: +                if (!strcmp (argv[2], "bigger-file")) { +                        heal_op = GF_AFR_OP_SBRAIN_HEAL_FROM_BIGGER_FILE; +                        file = argv[3]; +                } else if (!strcmp (argv[2], "source-brick")) { +                        heal_op = GF_AFR_OP_SBRAIN_HEAL_FROM_BRICK; +                        hostname = strtok (argv[3], ":"); +                        path = strtok (NULL, ":"); +                } else { +                        printf (USAGE_STR, argv[0]); +                        ret = -1; +                        goto out; +                } +                break; +        case 5: +                if (!strcmp (argv[2], "source-brick")) { +                        heal_op = GF_AFR_OP_SBRAIN_HEAL_FROM_BRICK; +                        hostname = strtok (argv[3], ":"); +                        path = strtok (NULL, ":"); +                        file = argv[4]; +                } else { +                        printf (USAGE_STR, argv[0]); +                        ret = -1; +                        goto out; +                } +                break; +        default: +                printf (USAGE_STR, argv[0]); +                ret = -1; +                goto out; +        }          fs = glfs_new (volname);          if (!fs) { @@ -485,30 +797,28 @@ main (int argc, char **argv)          rootloc.inode = inode_ref (top_subvol->itable->root);          glfs_loc_touchup (&rootloc); -        xl = top_subvol; -        while (xl->next) -                xl = xl->next; - -        while (xl) { -                if (strcmp (xl->type, "protocol/client") == 0) { -                        afr_xl = _get_afr_ancestor (xl); -                        if (afr_xl) { -                                old_THIS = THIS; -                                THIS = afr_xl; -                                glfsh_print_pending_heals (xl, &rootloc); -                                THIS = old_THIS; -                                printf("\n"); -                        } -                } - -                xl = xl->prev; +        switch (heal_op) { +        case GF_AFR_OP_INDEX_SUMMARY: +                ret = glfsh_gather_heal_info (fs, top_subvol, &rootloc); +                break; +        case GF_AFR_OP_SBRAIN_HEAL_FROM_BIGGER_FILE: +                ret = glfsh_heal_from_bigger_file (fs, top_subvol, +                                                   &rootloc, file); +                        break; +        case GF_AFR_OP_SBRAIN_HEAL_FROM_BRICK: +                ret = glfsh_heal_from_brick (fs, top_subvol, &rootloc, +                                             hostname, path, file); +                break; +        default: +                ret = -1; +                break;          }          loc_wipe (&rootloc);          glfs_subvol_done (fs, top_subvol);          glfs_fini (fs); -        return 0; +        return ret;  out:          if (fs && top_subvol)                  glfs_subvol_done (fs, top_subvol); diff --git a/libglusterfs/src/glusterfs.h b/libglusterfs/src/glusterfs.h index 4c213f41576..73945e578fe 100644 --- a/libglusterfs/src/glusterfs.h +++ b/libglusterfs/src/glusterfs.h @@ -138,6 +138,7 @@  #define GF_XATTROP_INDEX_COUNT "glusterfs.xattrop_index_count"  #define GF_AFR_HEAL_INFO "glusterfs.heal-info" +#define GF_AFR_HEAL_SBRAIN "glusterfs.heal-sbrain"  #define GF_GFIDLESS_LOOKUP "gfidless-lookup"  /* replace-brick and pump related internal xattrs */ diff --git a/rpc/rpc-lib/src/protocol-common.h b/rpc/rpc-lib/src/protocol-common.h index 1fd063aec25..f560c103acd 100644 --- a/rpc/rpc-lib/src/protocol-common.h +++ b/rpc/rpc-lib/src/protocol-common.h @@ -231,6 +231,8 @@ typedef enum {          GF_AFR_OP_STATISTICS,          GF_AFR_OP_STATISTICS_HEAL_COUNT,          GF_AFR_OP_STATISTICS_HEAL_COUNT_PER_REPLICA, +        GF_AFR_OP_SBRAIN_HEAL_FROM_BIGGER_FILE, +        GF_AFR_OP_SBRAIN_HEAL_FROM_BRICK,  } gf_xl_afr_op_t ;  struct gf_gsync_detailed_status_ { diff --git a/tests/basic/afr/split-brain-healing.t b/tests/basic/afr/split-brain-healing.t new file mode 100644 index 00000000000..1dc317df8dd --- /dev/null +++ b/tests/basic/afr/split-brain-healing.t @@ -0,0 +1,183 @@ +#!/bin/bash + +#Test the split-brain resolution CLI commands. +. $(dirname $0)/../../include.rc +. $(dirname $0)/../../volume.rc + +function get_replicate_subvol_number { +        local filename=$1 +        #get_backend_paths +        if [ -f $B0/${V0}1/$filename ] +        then +                echo 0 +        elif [ -f $B0/${V0}3/$filename ] +        then    echo 1 +        else +                echo -1 +        fi +} + +cleanup; + +AREQUAL_PATH=$(dirname $0)/../../utils +CFLAGS="" +test "`uname -s`" != "Linux" && { +    CFLAGS="$CFLAGS -I$(dirname $0)/../../../contrib/argp-standalone "; +    CFLAGS="$CFLAGS -L$(dirname $0)/../../../contrib/argp-standalone -largp "; +    CFLAGS="$CFLAGS -lintl"; +} +build_tester $AREQUAL_PATH/arequal-checksum.c $CFLAGS +TEST glusterd +TEST pidof glusterd +TEST $CLI volume create $V0 replica 2 $H0:$B0/${V0}{1,2,3,4} +TEST $CLI volume set $V0 cluster.self-heal-daemon off +TEST $CLI volume start $V0 +TEST glusterfs --volfile-id=/$V0 --volfile-server=$H0 $M0 --attribute-timeout=0 --entry-timeout=0 + +cd $M0 +for i in {1..10} +do +        echo "Initial content">>file$i +done + +replica_0_files_list=(`ls $B0/${V0}1`) +replica_1_files_list=(`ls $B0/${V0}3`) + +############ Create data split-brain in the files. ########################### +TEST kill_brick $V0 $H0 $B0/${V0}1 +for file in ${!replica_0_files_list[*]} +do +        echo "B1 is down">>${replica_0_files_list[$file]} +done +TEST kill_brick $V0 $H0 $B0/${V0}3 +for file in ${!replica_1_files_list[*]} +do +        echo "B3 is down">>${replica_1_files_list[$file]} +done + +SMALLER_FILE_SIZE=$(stat -c %s file1) + +TEST $CLI volume start $V0 force +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 0 +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 2 + +TEST kill_brick $V0 $H0 $B0/${V0}2 +for file in ${!replica_0_files_list[*]} +do +        echo "B2 is down">>${replica_0_files_list[$file]} +        echo "appending more content to make it the bigger file">>${replica_0_files_list[$file]} +done +TEST kill_brick $V0 $H0 $B0/${V0}4 +for file in ${!replica_1_files_list[*]} +do +        echo "B4 is down">>${replica_1_files_list[$file]} +        echo "appending more content to make it the bigger file">>${replica_1_files_list[$file]} +done + +BIGGER_FILE_SIZE=$(stat -c %s file1) + +TEST $CLI volume start $V0 force +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 1 +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 3 + + +############### Acessing the files should now give EIO. ############################### +TEST ! cat file1 +TEST ! cat file2 +TEST ! cat file3 +TEST ! cat file4 +TEST ! cat file5 +TEST ! cat file6 +TEST ! cat file7 +TEST ! cat file8 +TEST ! cat file9 +TEST ! cat file10 +################### +TEST $CLI volume set $V0 cluster.self-heal-daemon on +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Y" glustershd_up_status +EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 0 +EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 1 +EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 2 +EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 3 + +################ Heal file1 using the bigger-file option  ############## +$CLI volume heal $V0 split-brain bigger-file /file1 +EXPECT "0" echo $? +EXPECT $BIGGER_FILE_SIZE stat -c %s file1 + +################ Heal file2 using the bigger-file option and its gfid ############## +subvolume=$(get_replicate_subvol_number file2) +if [ $subvolume == 0 ] +then +        GFID=$(gf_get_gfid_xattr $B0/${V0}1/file2) +elif [ $subvolume == 1 ] +then +        GFID=$(gf_get_gfid_xattr $B0/${V0}3/file2) +fi +GFIDSTR="gfid:$(gf_gfid_xattr_to_str $GFID)" +$CLI volume heal $V0 split-brain bigger-file $GFIDSTR +EXPECT "0" echo $? + +################ Heal file3 using the source-brick option  ############## +################ Use the brick having smaller file size as source ####### +subvolume=$(get_replicate_subvol_number file3) +if [ $subvolume == 0 ] +then +        $CLI volume heal $V0 split-brain source-brick $H0:$B0/${V0}2 /file3 +elif [ $subvolume == 1] +then +        $CLI volume heal $V0 split-brain source-brick $H0:$B0/${V0}4 /file3 +fi +EXPECT "0" echo $? +EXPECT $SMALLER_FILE_SIZE stat -c %s file3 + +################ Heal file4 using the source-brick option and it's gfid ############## +################ Use the brick having smaller file size as source ####### +subvolume=$(get_replicate_subvol_number file4) +if [ $subvolume == 0 ] +then +        GFID=$(gf_get_gfid_xattr $B0/${V0}1/file4) +        GFIDSTR="gfid:$(gf_gfid_xattr_to_str $GFID)" +        $CLI volume heal $V0 split-brain source-brick $H0:$B0/${V0}2 $GFIDSTR +elif [ $subvolume == 1] +then +        GFID=$(gf_get_gfid_xattr $B0/${V0}3/file4) +        GFIDSTR="gfid:$(gf_gfid_xattr_to_str $GFID)" +        $CLI volume heal $V0 split-brain source-brick $H0:$B0/${V0}4 $GFIDSTR +fi +EXPECT "0" echo $? +EXPECT $SMALLER_FILE_SIZE stat -c %s file4 + +################ Heal remaining SB'ed files of replica_0 using B1 as source ############## +$CLI volume heal $V0 split-brain source-brick $H0:$B0/${V0}1 +EXPECT "0" echo $? + +################ Heal remaining SB'ed files of replica_1 using B3 as source ############## +$CLI volume heal $V0 split-brain source-brick $H0:$B0/${V0}3 +EXPECT "0" echo $? + +############### Reading the files should now suceed. ############################### +TEST  cat file1 +TEST  cat file2 +TEST  cat file3 +TEST  cat file4 +TEST  cat file5 +TEST  cat file6 +TEST  cat file7 +TEST  cat file8 +TEST  cat file9 +TEST  cat file10 + +################ File contents on the bricks must be same. ################################ +TEST diff <(arequal-checksum -p $B0/$V01 -i .glusterfs) <(arequal-checksum -p $B0/$V02 -i .glusterfs) +TEST diff <(arequal-checksum -p $B0/$V03 -i .glusterfs) <(arequal-checksum -p $B0/$V04 -i .glusterfs) + +############### Trying to heal files not in SB should fail. ############################### +$CLI volume heal $V0 split-brain bigger-file /file1 +EXPECT "1" echo $? +$CLI volume heal $V0 split-brain source-brick $H0:$B0/${V0}4 /file3 +EXPECT "1" echo $? + +cd - +TEST rm $AREQUAL_PATH/arequal-checksum +cleanup diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c index f39db802588..e6d45add4e8 100644 --- a/xlators/cluster/afr/src/afr-common.c +++ b/xlators/cluster/afr/src/afr-common.c @@ -4471,5 +4471,81 @@ out:          AFR_STACK_UNWIND (getxattr, frame, ret, op_errno, dict, NULL);          if (dict)                 dict_unref (dict); +        if (inode) { +                inode_forget (inode, 1); +                inode_unref (inode); +        } +        return ret; +} + +int32_t +afr_heal_splitbrain_file(call_frame_t *frame, xlator_t *this, loc_t *loc) +{ +        gf_boolean_t    data_selfheal     = _gf_false; +        gf_boolean_t    metadata_selfheal = _gf_false; +        gf_boolean_t    entry_selfheal    = _gf_false; +        dict_t         *dict              = NULL; +        afr_local_t    *local             = NULL; +        inode_t        *inode             = NULL; +        int entry_ret = 0, metadata_ret = 0, data_ret = 0; +        int ret = 0, op_errno = 0; + +        local = frame->local; +        dict = dict_new (); +        if (!dict) { +                op_errno = ENOMEM; +                ret = -1; +                goto out; +        } + +        ret = afr_selfheal_unlocked_inspect (frame, this, loc->gfid, &inode, +                                             &data_selfheal, +                                             &metadata_selfheal, +                                             &entry_selfheal); +        if (ret) { +                op_errno = -ret; +                ret = -1; +                goto out; +        } + +        if (!data_selfheal && !metadata_selfheal && !entry_selfheal) { +                ret = dict_set_str (dict, "sh-fail-msg", +                                    "File not in split-brain"); +                if (ret) +                        gf_log (this->name, GF_LOG_WARNING, +                                "Failed to set sh-fail-msg in dict"); +                ret = 0; +                goto out; +        } + +        if (data_selfheal) +                data_ret = afr_selfheal_data (frame, this, inode); + +        if (metadata_selfheal) +                metadata_ret = afr_selfheal_metadata (frame, this, inode); + +        if (entry_selfheal) +                entry_ret = afr_selfheal_entry (frame, this, inode); + +        ret = (data_ret | metadata_ret | entry_ret); + +        if (local->xdata_rsp) { +                /* 'sh-fail-msg' has been set in the dict during self-heal.*/ +                dict_copy (local->xdata_rsp, dict); +                ret = 0; +        } else if (ret) { +                /*Some other error during self-heal. Just propagate it.*/ +                op_errno = -ret; +                ret = -1; +        } + +out: +        AFR_STACK_UNWIND (getxattr, frame, ret, op_errno, dict, NULL); +        if (dict) +                dict_unref(dict); +        if (inode) { +                inode_forget (inode, 1); +                inode_unref (inode); +        }          return ret;  } diff --git a/xlators/cluster/afr/src/afr-inode-read.c b/xlators/cluster/afr/src/afr-inode-read.c index e64070e1bcd..78dd65f30e7 100644 --- a/xlators/cluster/afr/src/afr-inode-read.c +++ b/xlators/cluster/afr/src/afr-inode-read.c @@ -1380,6 +1380,11 @@ afr_getxattr (call_frame_t *frame, xlator_t *this,                  return 0;          } +        if (!strcmp (name, GF_AFR_HEAL_SBRAIN)) { +                afr_heal_splitbrain_file (frame, this, loc); +                return 0; +        } +          /*           * if we are doing getxattr with pathinfo as the key then we           * collect information from all childs diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c index 6198d4cf72c..e9d853c4ecd 100644 --- a/xlators/cluster/afr/src/afr-self-heal-common.c +++ b/xlators/cluster/afr/src/afr-self-heal-common.c @@ -17,7 +17,7 @@  #include "afr.h"  #include "afr-self-heal.h"  #include "byte-order.h" - +#include "protocol-common.h"  int  afr_selfheal_post_op_cbk (call_frame_t *frame, void *cookie, xlator_t *this, @@ -287,6 +287,39 @@ afr_selfheal_extract_xattr (xlator_t *this, struct afr_reply *replies,  	return 0;  } +/* + * If by chance there are multiple sources with differing sizes, select + * the largest file as the source. + * + * This can happen if data was directly modified in the backend or for snapshots + */ +void +afr_mark_largest_file_as_source (xlator_t *this, unsigned char *sources, +                                 struct afr_reply *replies) +{ +        int i = 0; +        afr_private_t *priv = NULL; +        uint64_t size = 0; + +        /* Find source with biggest file size */ +        priv = this->private; +        for (i = 0; i < priv->child_count; i++) { +                if (!sources[i]) +                        continue; +                if (size <= replies[i].poststat.ia_size) { +                        size = replies[i].poststat.ia_size; +                } +        } + +        /* Mark sources with less size as not source */ +        for (i = 0; i < priv->child_count; i++) { +                if (!sources[i]) +                        continue; +                if (size > replies[i].poststat.ia_size) +                        sources[i] = 0; +        } +} +  void  afr_mark_active_sinks (xlator_t *this, unsigned char *sources,                         unsigned char *locked_on, unsigned char *sinks) @@ -304,6 +337,154 @@ afr_mark_active_sinks (xlator_t *this, unsigned char *sources,  }  gf_boolean_t +afr_dict_contains_heal_op (call_frame_t *frame) +{ +        afr_local_t   *local     = NULL; +        dict_t        *xdata_req = NULL; +        int            ret       = 0; +        int            heal_op   = -1; + +        local = frame->local; +        xdata_req = local->xdata_req; +        ret = dict_get_int32 (xdata_req, "heal-op", &heal_op); +        if (ret) +                return _gf_false; +        if (local->xdata_rsp == NULL) { +                local->xdata_rsp = dict_new(); +                if (!local->xdata_rsp) +                        return _gf_true; +        } +        ret = dict_set_str (local->xdata_rsp, "sh-fail-msg", +                            "File not in split-brain"); + +        return _gf_true; +} + +/* Return a source depending on the type of heal_op, and set sources[source], + * sinks[source] and healed_sinks[source] to 1, 0 and 0 respectively. Do so + * only if the following condition is met: + * ∀i((i ∈ locked_on[] ∧ i=1)==>(sources[i]=0 ∧ sinks[i]=1 ∧ healed_sinks[i]=1)) + * i.e. for each locked node, sources[node] is 0; healed_sinks[node] and + * sinks[node] are 1. This should be the case if the file is in split-brain. + */ +int +afr_mark_split_brain_source_sinks (call_frame_t *frame, xlator_t *this, +                                   unsigned char *sources, +                                   unsigned char *sinks, +                                   unsigned char *healed_sinks, +                                   unsigned char *locked_on, +                                   struct afr_reply *replies, +                                   afr_transaction_type type) +{ +        afr_local_t   *local     = NULL; +        afr_private_t *priv      = NULL; +        dict_t        *xdata_req = NULL; +        dict_t        *xdata_rsp = NULL; +        int            ret       = 0; +        int            heal_op   = -1; +        int            i         = 0; +        char          *name      = NULL; +        int            source     = -1; + +        local = frame->local; +        priv = this->private; +        xdata_req = local->xdata_req; +        ret = dict_get_int32 (xdata_req, "heal-op", &heal_op); +        if (ret) +                goto out; +        for (i = 0; i < priv->child_count; i++) { +                if (locked_on[i]) +                        if (sources[i] || !sinks[i] || !healed_sinks[i]) { +                                ret = -1; +                                goto out; +                        } +        } +        if (local->xdata_rsp == NULL) { +                local->xdata_rsp = dict_new(); +                if (!local->xdata_rsp) { +                        ret = -1; +                        goto out; +                } +        } +        xdata_rsp = local->xdata_rsp; + +        switch (heal_op) { +        case GF_AFR_OP_SBRAIN_HEAL_FROM_BIGGER_FILE: +                if (type == AFR_METADATA_TRANSACTION) { +                        ret = dict_set_str (xdata_rsp, "sh-fail-msg", +                                            "Use source-brick option to" +                                            " heal metadata split-brain"); +                        if (!ret) +                                ret = -1; +                        goto out; +                } +                for (i = 0 ; i < priv->child_count; i++) +                        if (locked_on[i]) +                                sources[i] = 1; +                afr_mark_largest_file_as_source (this, sources, replies); +                if (AFR_COUNT (sources, priv->child_count) != 1) { +                        ret = dict_set_str (xdata_rsp, "sh-fail-msg", +                                            "No bigger file"); +                        if (!ret) +                                ret = -1; +                        goto out; +                } +                for (i = 0 ; i < priv->child_count; i++) +                        if (sources[i]) +                                source = i; +                sinks[source] = 0; +                healed_sinks[source] = 0; +                break; +        case GF_AFR_OP_SBRAIN_HEAL_FROM_BRICK: +                ret = dict_get_str (xdata_req, "child-name", &name); +                if (ret) +                        goto out; +                source = afr_get_child_index_from_name (this, name); +                if (source < 0) { +                        ret = dict_set_str (xdata_rsp, "sh-fail-msg", +                                            "Invalid brick name"); +                        if (!ret) +                                ret = -1; +                        goto out; +                } +                if (locked_on[source] != 1) { +                        ret = dict_set_str (xdata_rsp, "sh-fail-msg", +                                            "Brick is not up"); +                        if (!ret) +                                ret = -1; +                        goto out; +                } +                sources[source] = 1; +                sinks[source] = 0; +                healed_sinks[source] = 0; +                break; +        default: +                ret = -1; +                goto out; +        } +        ret = source; +out: +        return ret; + +} + +int +afr_get_child_index_from_name (xlator_t *this, char *name) +{ +        afr_private_t *priv  = this->private; +        int            index = -1; + +        for (index = 0; index < priv->child_count; index++) { +                if (!strcmp (priv->children[index]->name, name)) +                        goto out; +        } +        index = -1; +out: +        return index; +} + + +gf_boolean_t  afr_does_witness_exist (xlator_t *this, uint64_t *witness)  {          int i = 0; @@ -427,6 +608,14 @@ afr_selfheal_find_direction (call_frame_t *frame, xlator_t *this,                  }          } +         /* If no sources, all locked nodes are sinks - split brain */ +         if (AFR_COUNT (sources, priv->child_count) == 0) { +                for (i = 0; i < priv->child_count; i++) { +                        if (locked_on[i]) +                                sinks[i] = 1; +                } +        } +          /* In afr-v1 if a file is self-accused but didn't have any pending           * operations on others then it is similar to 'dirty' in afr-v2.           * Consider such cases as witness. diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c index a434b9e6ba1..45a099cec86 100644 --- a/xlators/cluster/afr/src/afr-self-heal-data.c +++ b/xlators/cluster/afr/src/afr-self-heal-data.c @@ -17,6 +17,7 @@  #include "afr.h"  #include "afr-self-heal.h"  #include "byte-order.h" +#include "protocol-common.h"  enum {  	AFR_SELFHEAL_DATA_FULL = 0, @@ -426,41 +427,6 @@ afr_does_size_mismatch (xlator_t *this, unsigned char *sources,          return _gf_false;  } -/* - * If by chance there are multiple sources with differing sizes, select - * the largest file as the source. - * - * This can happen if data was directly modified in the backend or for snapshots - */ - -static void -afr_mark_largest_file_as_source (xlator_t *this, unsigned char *sources, -                                 struct afr_reply *replies) -{ -        int i = 0; -        afr_private_t *priv = NULL; -        uint64_t size = 0; - -        /* Find source with biggest file size */ -        priv = this->private; -        for (i = 0; i < priv->child_count; i++) { -                if (!sources[i]) -                        continue; -                if (size <= replies[i].poststat.ia_size) { -                        size = replies[i].poststat.ia_size; -                } -        } - -        /* Mark sources with less size as not source */ -        for (i = 0; i < priv->child_count; i++) { -                if (!sources[i]) -                        continue; -                if (size > replies[i].poststat.ia_size) -                        sources[i] = 0; -        } - -        return; -}  static void  afr_mark_biggest_witness_as_source (xlator_t *this, unsigned char *sources, @@ -518,7 +484,9 @@ afr_mark_newest_file_as_source (xlator_t *this, unsigned char *sources,  }  static int -__afr_selfheal_data_finalize_source (xlator_t *this, unsigned char *sources, +__afr_selfheal_data_finalize_source (call_frame_t *frame, xlator_t *this, +                                     unsigned char *sources, +                                     unsigned char *sinks,  				     unsigned char *healed_sinks,  				     unsigned char *locked_on,  				     struct afr_reply *replies, @@ -528,7 +496,6 @@ __afr_selfheal_data_finalize_source (xlator_t *this, unsigned char *sources,  	afr_private_t *priv = NULL;  	int source = -1;  	int sources_count = 0; -  	priv = this->private;  	sources_count = AFR_COUNT (sources, priv->child_count); @@ -536,9 +503,21 @@ __afr_selfheal_data_finalize_source (xlator_t *this, unsigned char *sources,  	if ((AFR_CMP (locked_on, healed_sinks, priv->child_count) == 0)              || !sources_count) {  		/* split brain */ -		return -EIO; +                source = afr_mark_split_brain_source_sinks (frame, this, +                                                            sources, sinks, +                                                            healed_sinks, +                                                            locked_on, replies, +                                                          AFR_DATA_TRANSACTION); +                if (source < 0) +                        return -EIO; +                return source;  	} +        /* No split brain at this point. If we were called from +         * afr_heal_splitbrain_file(), abort.*/ +        if (afr_dict_contains_heal_op(frame)) +                return -EIO; +          /* If there are no witnesses/size-mismatches on sources we are done*/          if (!afr_does_size_mismatch (this, sources, replies) &&              !afr_has_source_witnesses (this, sources, witness)) @@ -605,9 +584,10 @@ __afr_selfheal_data_prepare (call_frame_t *frame, xlator_t *this,          */          AFR_INTERSECT (healed_sinks, sinks, locked_on, priv->child_count); -	source = __afr_selfheal_data_finalize_source (this, sources, -                                                      healed_sinks, locked_on, -                                                      replies, witness); +	source = __afr_selfheal_data_finalize_source (frame, this, sources, +                                                      sinks, healed_sinks, +                                                      locked_on, replies, +                                                      witness);  	if (source < 0)  		return -EIO; diff --git a/xlators/cluster/afr/src/afr-self-heal-metadata.c b/xlators/cluster/afr/src/afr-self-heal-metadata.c index 0518c1821e3..05d9f2b4917 100644 --- a/xlators/cluster/afr/src/afr-self-heal-metadata.c +++ b/xlators/cluster/afr/src/afr-self-heal-metadata.c @@ -17,6 +17,7 @@  #include "afr.h"  #include "afr-self-heal.h"  #include "byte-order.h" +#include "protocol-common.h"  #define AFR_HEAL_ATTR (GF_SET_ATTR_UID|GF_SET_ATTR_GID|GF_SET_ATTR_MODE) @@ -199,6 +200,7 @@ out:  static int  __afr_selfheal_metadata_finalize_source (call_frame_t *frame, xlator_t *this,                                           unsigned char *sources, +                                         unsigned char *sinks,  					 unsigned char *healed_sinks,  					 unsigned char *locked_on,  					 struct afr_reply *replies) @@ -208,13 +210,26 @@ __afr_selfheal_metadata_finalize_source (call_frame_t *frame, xlator_t *this,  	struct iatt first = {0, };  	int source = -1;  	int sources_count = 0; +        dict_t *xdata_req = NULL; +        afr_local_t *local = NULL;  	priv = this->private; +        local = frame->local; +        xdata_req = local->xdata_req;  	sources_count = AFR_COUNT (sources, priv->child_count);  	if ((AFR_CMP (locked_on, healed_sinks, priv->child_count) == 0)  	    || !sources_count) { + +                source = afr_mark_split_brain_source_sinks (frame, this, +                                                            sources, sinks, +                                                            healed_sinks, +                                                            locked_on, replies, +                                                      AFR_METADATA_TRANSACTION); +                if (source >= 0) +                        return source; +  		/* If this is a directory mtime/ctime only split brain  		   use the most recent */  		source = afr_dirtime_splitbrain_source (frame, this, @@ -224,17 +239,7 @@ __afr_selfheal_metadata_finalize_source (call_frame_t *frame, xlator_t *this,  				"split brain on %s",  				 uuid_utoa (replies[source].poststat.ia_gfid));  			sources[source] = 1; - -			for (i = 0; i < priv->child_count; i++) { -				if (i == source) -					continue; - -				if (!locked_on[i]) -					continue; - -				healed_sinks[i] = 1; -			} - +			healed_sinks[source] = 0;  			return source;  		} @@ -253,6 +258,11 @@ __afr_selfheal_metadata_finalize_source (call_frame_t *frame, xlator_t *this,  		}  	} +        /* No split brain at this point. If we were called from +         * afr_heal_splitbrain_file(), abort.*/ +        if (afr_dict_contains_heal_op(frame)) +                return -EIO; +  	for (i = 0; i < priv->child_count; i++) {  		if (!sources[i])  			continue; @@ -352,7 +362,7 @@ __afr_selfheal_metadata_prepare (call_frame_t *frame, xlator_t *this, inode_t *i          }  	source = __afr_selfheal_metadata_finalize_source (frame, this, sources, -                                                          healed_sinks, +                                                          sinks, healed_sinks,                                                            locked_on, replies);  	if (source < 0) diff --git a/xlators/cluster/afr/src/afr-self-heal.h b/xlators/cluster/afr/src/afr-self-heal.h index 50cff91ccb3..74cc9608cf6 100644 --- a/xlators/cluster/afr/src/afr-self-heal.h +++ b/xlators/cluster/afr/src/afr-self-heal.h @@ -193,10 +193,28 @@ afr_log_selfheal (uuid_t gfid, xlator_t *this, int ret, char *type,                    int source, unsigned char *healed_sinks);  void +afr_mark_largest_file_as_source (xlator_t *this, unsigned char *sources, +                                 struct afr_reply *replies); +void  afr_mark_active_sinks (xlator_t *this, unsigned char *sources,                         unsigned char *locked_on, unsigned char *sinks);  gf_boolean_t +afr_dict_contains_heal_op (call_frame_t *frame); + +int +afr_mark_split_brain_source_sinks (call_frame_t *frame, xlator_t *this, +                                   unsigned char *sources, +                                   unsigned char *sinks, +                                   unsigned char *healed_sinks, +                                   unsigned char *locked_on, +                                   struct afr_reply *replies, +                                   afr_transaction_type type); + +int +afr_get_child_index_from_name (xlator_t *this, char *name); + +gf_boolean_t  afr_does_witness_exist (xlator_t *this, uint64_t *witness);  int diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h index 4fdc5f774cc..09821b724fe 100644 --- a/xlators/cluster/afr/src/afr.h +++ b/xlators/cluster/afr/src/afr.h @@ -1021,4 +1021,8 @@ afr_is_xattr_ignorable (char *key);  int  afr_get_heal_info (call_frame_t *frame, xlator_t *this, loc_t *loc,                     dict_t *xdata); + +int +afr_heal_splitbrain_file(call_frame_t *frame, xlator_t *this, loc_t *loc); +  #endif /* __AFR_H__ */ diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c index 82b527e9141..866e3faf629 100644 --- a/xlators/cluster/dht/src/dht-common.c +++ b/xlators/cluster/dht/src/dht-common.c @@ -2636,8 +2636,10 @@ dht_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,          this_call_cnt = dht_frame_return (frame); -        if (!xattr || (op_ret == -1)) +        if (!xattr || (op_ret == -1)) { +                local->op_ret = op_ret;                  goto out; +        }          if (dict_get (xattr, conf->xattr_name)) {                  dict_del (xattr, conf->xattr_name); @@ -2808,7 +2810,7 @@ dht_getxattr (call_frame_t *frame, xlator_t *this,                          subvol = layout->list[i].xlator;                          STACK_WIND (frame, dht_vgetxattr_dir_cbk,                                      subvol, subvol->fops->getxattr, -                                    loc, key, NULL); +                                    loc, key, xdata);                  }                  return 0;          } @@ -2821,7 +2823,7 @@ dht_getxattr (call_frame_t *frame, xlator_t *this,                  local->call_cnt = 1;                  STACK_WIND (frame, dht_vgetxattr_cbk, cached_subvol, -                            cached_subvol->fops->getxattr, loc, key, NULL); +                            cached_subvol->fops->getxattr, loc, key, xdata);                  return 0;          } @@ -2854,7 +2856,7 @@ dht_getxattr (call_frame_t *frame, xlator_t *this,                  if (hashed_subvol) {                          STACK_WIND (frame, dht_linkinfo_getxattr_cbk, hashed_subvol,                                      hashed_subvol->fops->getxattr, loc, -                                    GF_XATTR_PATHINFO_KEY, NULL); +                                    GF_XATTR_PATHINFO_KEY, xdata);                          return 0;                  }                  op_errno = ENODATA; @@ -2933,7 +2935,7 @@ dht_getxattr (call_frame_t *frame, xlator_t *this,                  subvol = layout->list[i].xlator;                  STACK_WIND (frame, dht_getxattr_cbk,                              subvol, subvol->fops->getxattr, -                            loc, key, NULL); +                            loc, key, xdata);          }          return 0; | 
