diff options
-rw-r--r-- | cli/src/cli-cmd-parser.c | 91 | ||||
-rw-r--r-- | cli/src/cli-cmd-volume.c | 79 | ||||
-rw-r--r-- | cli/src/cli-rpc-ops.c | 6 | ||||
-rw-r--r-- | heal/src/glfs-heal.c | 416 | ||||
-rw-r--r-- | libglusterfs/src/glusterfs.h | 1 | ||||
-rw-r--r-- | rpc/rpc-lib/src/protocol-common.h | 2 | ||||
-rw-r--r-- | tests/basic/afr/split-brain-healing.t | 183 | ||||
-rw-r--r-- | xlators/cluster/afr/src/afr-common.c | 76 | ||||
-rw-r--r-- | xlators/cluster/afr/src/afr-inode-read.c | 5 | ||||
-rw-r--r-- | xlators/cluster/afr/src/afr-self-heal-common.c | 191 | ||||
-rw-r--r-- | xlators/cluster/afr/src/afr-self-heal-data.c | 62 | ||||
-rw-r--r-- | xlators/cluster/afr/src/afr-self-heal-metadata.c | 34 | ||||
-rw-r--r-- | xlators/cluster/afr/src/afr-self-heal.h | 18 | ||||
-rw-r--r-- | xlators/cluster/afr/src/afr.h | 4 | ||||
-rw-r--r-- | xlators/cluster/dht/src/dht-common.c | 12 |
15 files changed, 1039 insertions, 141 deletions
diff --git a/cli/src/cli-cmd-parser.c b/cli/src/cli-cmd-parser.c index 28888ba656d..53b14d27708 100644 --- a/cli/src/cli-cmd-parser.c +++ b/cli/src/cli-cmd-parser.c @@ -2929,6 +2929,43 @@ out: return ret; } +static int +set_hostname_path_in_dict (const char *token, dict_t *dict, int heal_op) +{ + char *hostname = NULL; + char *path = NULL; + int ret = 0; + + ret = extract_hostname_path_from_token (token, &hostname, &path); + if (ret) + goto out; + + switch (heal_op) { + case GF_AFR_OP_SBRAIN_HEAL_FROM_BRICK: + ret = dict_set_dynstr (dict, "heal-source-hostname", + hostname); + if (ret) + goto out; + ret = dict_set_dynstr (dict, "heal-source-brickpath", + path); + break; + case GF_AFR_OP_STATISTICS_HEAL_COUNT_PER_REPLICA: + ret = dict_set_dynstr (dict, "per-replica-cmd-hostname", + hostname); + if (ret) + goto out; + ret = dict_set_dynstr (dict, "per-replica-cmd-path", + path); + break; + default: + ret = -1; + break; + } + +out: + return ret; + +} int cli_cmd_volume_heal_options_parse (const char **words, int wordcount, @@ -2936,8 +2973,6 @@ cli_cmd_volume_heal_options_parse (const char **words, int wordcount, { int ret = 0; dict_t *dict = NULL; - char *hostname = NULL; - char *path = NULL; dict = dict_new (); if (!dict) @@ -3008,6 +3043,35 @@ cli_cmd_volume_heal_options_parse (const char **words, int wordcount, ret = -1; goto out; } + if (wordcount == 6) { + if (strcmp (words[3], "split-brain")) { + ret = -1; + goto out; + } + if (!strcmp (words[4], "bigger-file")) { + ret = dict_set_int32 (dict, "heal-op", + GF_AFR_OP_SBRAIN_HEAL_FROM_BIGGER_FILE); + if (ret) + goto out; + ret = dict_set_str (dict, "file", (char *)words[5]); + if (ret) + goto out; + goto done; + } + if (!strcmp (words[4], "source-brick")) { + ret = dict_set_int32 (dict, "heal-op", + GF_AFR_OP_SBRAIN_HEAL_FROM_BRICK); + if (ret) + goto out; + ret = set_hostname_path_in_dict (words[5], dict, + GF_AFR_OP_SBRAIN_HEAL_FROM_BRICK); + if (ret) + goto out; + goto done; + } + ret = -1; + goto out; + } if 
(wordcount == 7) { if (!strcmp (words[3], "statistics") && !strcmp (words[4], "heal-count") @@ -3017,21 +3081,26 @@ cli_cmd_volume_heal_options_parse (const char **words, int wordcount, GF_AFR_OP_STATISTICS_HEAL_COUNT_PER_REPLICA); if (ret) goto out; - ret = extract_hostname_path_from_token (words[6], - &hostname, &path); + ret = set_hostname_path_in_dict (words[6], dict, + GF_AFR_OP_STATISTICS_HEAL_COUNT_PER_REPLICA); if (ret) goto out; - ret = dict_set_dynstr (dict, "per-replica-cmd-hostname", - hostname); + goto done; + + } + if (!strcmp (words[3], "split-brain") && + !strcmp (words[4], "source-brick")) { + ret = dict_set_int32 (dict, "heal-op", + GF_AFR_OP_SBRAIN_HEAL_FROM_BRICK); + ret = set_hostname_path_in_dict (words[5], dict, + GF_AFR_OP_SBRAIN_HEAL_FROM_BRICK); if (ret) goto out; - ret = dict_set_dynstr (dict, "per-replica-cmd-path", - path); + ret = dict_set_str (dict, "file", + (char *) words[6]); if (ret) goto out; - else - goto done; - + goto done; } } ret = -1; diff --git a/cli/src/cli-cmd-volume.c b/cli/src/cli-cmd-volume.c index 238c8673d75..501b5776dec 100644 --- a/cli/src/cli-cmd-volume.c +++ b/cli/src/cli-cmd-volume.c @@ -1879,6 +1879,60 @@ cli_print_brick_status (cli_volume_status_t *status) return 0; } +#define NEEDS_GLFS_HEAL(op) ((op == GF_AFR_OP_SBRAIN_HEAL_FROM_BIGGER_FILE) || \ + (op == GF_AFR_OP_SBRAIN_HEAL_FROM_BRICK) || \ + (op == GF_AFR_OP_INDEX_SUMMARY)) + +int +cli_launch_glfs_heal (int heal_op, dict_t *options) +{ + char buff[PATH_MAX] = {0}; + runner_t runner = {0}; + char *filename = NULL; + char *hostname = NULL; + char *path = NULL; + char *volname = NULL; + char *out = NULL; + int ret = 0; + + runinit (&runner); + ret = dict_get_str (options, "volname", &volname); + runner_add_args (&runner, SBIN_DIR"/glfsheal", volname, NULL); + runner_redir (&runner, STDOUT_FILENO, RUN_PIPE); + + switch (heal_op) { + case GF_AFR_OP_INDEX_SUMMARY: + break; + case GF_AFR_OP_SBRAIN_HEAL_FROM_BIGGER_FILE: + ret = dict_get_str (options, "file", 
&filename); + runner_add_args (&runner, "bigger-file", filename, NULL); + break; + case GF_AFR_OP_SBRAIN_HEAL_FROM_BRICK: + ret = dict_get_str (options, "heal-source-hostname", + &hostname); + ret = dict_get_str (options, "heal-source-brickpath", + &path); + runner_add_args (&runner, "source-brick", NULL); + runner_argprintf (&runner, "%s:%s", hostname, path); + if (dict_get_str (options, "file", &filename) == 0) + runner_argprintf (&runner, filename); + break; + default: + ret = -1; + } + ret = runner_start (&runner); + if (ret == -1) + goto out; + while ((out = fgets (buff, sizeof(buff), + runner_chio (&runner, STDOUT_FILENO)))) { + printf ("%s", out); + } + ret = runner_end (&runner); + ret = WEXITSTATUS (ret); + +out: + return ret; +} int cli_cmd_volume_heal_cbk (struct cli_state *state, struct cli_cmd_word *word, const char **words, int wordcount) @@ -1892,9 +1946,6 @@ cli_cmd_volume_heal_cbk (struct cli_state *state, struct cli_cmd_word *word, xlator_t *this = NULL; cli_local_t *local = NULL; int heal_op = 0; - runner_t runner = {0}; - char buff[PATH_MAX] = {0}; - char *out = NULL; this = THIS; frame = create_frame (this, this->ctx->pool); @@ -1916,21 +1967,10 @@ cli_cmd_volume_heal_cbk (struct cli_state *state, struct cli_cmd_word *word, ret = dict_get_int32 (options, "heal-op", &heal_op); if (ret < 0) goto out; - - if (heal_op == GF_AFR_OP_INDEX_SUMMARY) { - runinit (&runner); - runner_add_args (&runner, SBIN_DIR"/glfsheal", words[2], NULL); - runner_redir (&runner, STDOUT_FILENO, RUN_PIPE); - ret = runner_start (&runner); + if (NEEDS_GLFS_HEAL (heal_op)) { + ret = cli_launch_glfs_heal (heal_op, options); if (ret == -1) goto out; - while ((out = fgets(buff, sizeof(buff), - runner_chio (&runner, STDOUT_FILENO)))) { - printf ("%s", out); - } - - ret = runner_end (&runner); - ret = WEXITSTATUS (ret); } else { proc = &cli_rpc_prog->proctable[GLUSTER_CLI_HEAL_VOLUME]; @@ -1946,7 +1986,7 @@ out: if (ret) { cli_cmd_sent_status_get (&sent); if ((sent == 0) && 
(parse_error == 0)) - cli_out ("Volume heal failed"); + cli_out ("Volume heal failed."); } CLI_STACK_DESTROY (frame); @@ -2316,7 +2356,10 @@ struct cli_cmd volume_cmds[] = { cli_cmd_volume_status_cbk, "display status of all or specified volume(s)/brick"}, - { "volume heal <VOLNAME> [{full | statistics {heal-count {replica <hostname:brickname>}} |info {healed | heal-failed | split-brain}}]", + { "volume heal <VOLNAME> [full | statistics [heal-count "\ + "[replica <HOSTNAME:BRICKNAME>]] |info [healed | heal-failed | "\ + "split-brain]| split-brain {bigger-file <FILE> |source-brick "\ + "<HOSTNAME:BRICKNAME> [<FILE>]}]", cli_cmd_volume_heal_cbk, "self-heal commands on volume specified by <VOLNAME>"}, diff --git a/cli/src/cli-rpc-ops.c b/cli/src/cli-rpc-ops.c index 1d8cf23ff42..72ffaf4129a 100644 --- a/cli/src/cli-rpc-ops.c +++ b/cli/src/cli-rpc-ops.c @@ -7358,6 +7358,12 @@ gf_cli_heal_volume_cbk (struct rpc_req *req, struct iovec *iov, case GF_AFR_OP_STATISTICS_HEAL_COUNT_PER_REPLICA: heal_op_str = "count of entries to be healed per replica"; break; + /* The below 2 cases are never hit; they're coded only to make + * compiler warnings go away.*/ + case GF_AFR_OP_SBRAIN_HEAL_FROM_BIGGER_FILE: + case GF_AFR_OP_SBRAIN_HEAL_FROM_BRICK: + break; + case GF_AFR_OP_INVALID: heal_op_str = "invalid heal op"; break; diff --git a/heal/src/glfs-heal.c b/heal/src/glfs-heal.c index a9baad3ac56..f49f3a58afc 100644 --- a/heal/src/glfs-heal.c +++ b/heal/src/glfs-heal.c @@ -14,11 +14,17 @@ #include "glfs.h" #include "glfs-handles.h" #include "glfs-internal.h" +#include "protocol-common.h" #include "syncop.h" #include <string.h> #include <time.h> #define DEFAULT_HEAL_LOG_FILE_DIRECTORY DATADIR "/log/glusterfs" +#define USAGE_STR "Usage: %s <VOLNAME> [bigger-file <FILE> | "\ + "source-brick <HOSTNAME:BRICKNAME> [<FILE>]]\n" + +int glfsh_heal_splitbrain_file (glfs_t *fs, xlator_t *top_subvol, + loc_t *rootloc, char *file, dict_t *xattr_req); int glfsh_link_inode_update_loc (loc_t *loc, 
struct iatt *iattr) @@ -83,6 +89,37 @@ out: return ret; } +int +glfsh_get_index_dir_fd (xlator_t *xl, loc_t *loc, fd_t **fd) +{ + int ret = -1; + + *fd = fd_create (loc->inode, GF_CLIENT_PID_GLFS_HEAL); + if (!*fd) { + printf ("fd_create failed: %s", strerror(errno)); + goto out; + } + ret = syncop_opendir (xl, loc, *fd); + if (ret) { + fd_unref(*fd); +#ifdef GF_LINUX_HOST_OS /* See comment in afr_shd_index_opendir() */ + *fd = fd_anonymous (loc->inode); + if (!*fd) { + printf ("fd_anonymous failed: %s", + strerror(errno)); + goto out; + } + ret = 0; +#else + printf ("opendir failed: %s", strerror(errno)); + goto out; +#endif + } + +out: + return ret; +} + static xlator_t* _get_afr_ancestor (xlator_t *xl) { @@ -185,6 +222,33 @@ glfsh_print_heal_status (dict_t *dict, char *path, uuid_t gfid, } static int +glfsh_heal_entries (glfs_t *fs, xlator_t *top_subvol, loc_t *rootloc, + gf_dirent_t *entries, uint64_t *offset, + uint64_t *num_entries, dict_t *xattr_req) { + + gf_dirent_t *entry = NULL; + gf_dirent_t *tmp = NULL; + int ret = 0; + char file[64] = {0}; + + list_for_each_entry_safe (entry, tmp, &entries->list, list) { + *offset = entry->d_off; + if ((strcmp (entry->d_name, ".") == 0) || + (strcmp (entry->d_name, "..") == 0)) + continue; + memset (file, 0, sizeof(file)); + snprintf (file, sizeof(file), "gfid:%s", entry->d_name); + ret = glfsh_heal_splitbrain_file (fs, top_subvol, rootloc, file, + xattr_req); + if (ret) + continue; + (*num_entries)++; + } + + return ret; +} + +static int glfsh_process_entries (xlator_t *xl, fd_t *fd, gf_dirent_t *entries, uint64_t *offset, uint64_t *num_entries) { @@ -240,15 +304,21 @@ glfsh_process_entries (xlator_t *xl, fd_t *fd, gf_dirent_t *entries, } static int -glfsh_crawl_directory (xlator_t *readdir_xl, fd_t *fd, loc_t *loc) +glfsh_crawl_directory (glfs_t *fs, xlator_t *top_subvol, loc_t *rootloc, + xlator_t *readdir_xl, fd_t *fd, loc_t *loc, + dict_t *xattr_req) { uint64_t offset = 0; gf_dirent_t entries; int ret = 0; 
gf_boolean_t free_entries = _gf_false; uint64_t num_entries = 0; + int heal_op = -1; INIT_LIST_HEAD (&entries.list); + ret = dict_get_int32 (xattr_req, "heal-op", &heal_op); + if (ret) + return ret; while (1) { ret = syncop_readdir (readdir_xl, fd, 131072, offset, &entries); @@ -260,11 +330,16 @@ glfsh_crawl_directory (xlator_t *readdir_xl, fd_t *fd, loc_t *loc) if (list_empty (&entries.list)) goto out; - ret = glfsh_process_entries (readdir_xl, fd, &entries, &offset, - &num_entries); - if (ret < 0) - goto out; - + if (heal_op == GF_AFR_OP_INDEX_SUMMARY) { + ret = glfsh_process_entries (readdir_xl, fd, &entries, + &offset, &num_entries); + if (ret < 0) + goto out; + } else if (heal_op == GF_AFR_OP_SBRAIN_HEAL_FROM_BRICK) { + ret = glfsh_heal_entries (fs, top_subvol, rootloc, + &entries, &offset, + &num_entries, xattr_req); + } gf_dirent_free (&entries); free_entries = _gf_false; } @@ -275,9 +350,12 @@ out: if (ret < 0) { printf ("Failed to complete gathering info. " "Number of entries so far: %"PRIu64"\n", num_entries); - } - else { - printf ("Number of entries: %"PRIu64"\n", num_entries); + } else { + if (heal_op == GF_AFR_OP_INDEX_SUMMARY) + printf ("Number of entries: %"PRIu64"\n", num_entries); + else if (heal_op == GF_AFR_OP_SBRAIN_HEAL_FROM_BRICK) + printf ("Number of healed entries: %"PRIu64"\n", + num_entries); } return ret; } @@ -333,13 +411,22 @@ out: } void -glfsh_print_pending_heals (xlator_t *xl, loc_t *rootloc) +glfsh_print_pending_heals (glfs_t *fs, xlator_t *top_subvol, loc_t *rootloc, + xlator_t *xl) { int ret = 0; loc_t dirloc = {0}; fd_t *fd = NULL; int32_t op_errno = 0; + dict_t *xattr_req = NULL; + xattr_req = dict_new(); + if (!xattr_req) + goto out; + + ret = dict_set_int32 (xattr_req, "heal-op", GF_AFR_OP_INDEX_SUMMARY); + if (ret) + goto out; ret = glfsh_print_brick (xl, rootloc); if (ret < 0) { glfsh_print_brick_from_xl (xl); @@ -356,30 +443,16 @@ glfsh_print_pending_heals (xlator_t *xl, loc_t *rootloc) goto out; } - fd = fd_create 
(dirloc.inode, GF_CLIENT_PID_GLFS_HEAL); - if (!fd) { - printf ("fd_create failed: %s", strerror(errno)); - goto out; - } - ret = syncop_opendir (xl, &dirloc, fd); - if (ret) { - fd_unref(fd); -#ifdef GF_LINUX_HOST_OS /* See comment in afr_shd_index_opendir() */ - fd = fd_anonymous (dirloc.inode); - if (!fd) { - printf ("fd_anonymous failed: %s", - strerror(errno)); - goto out; - } -#else - printf ("opendir failed: %s", strerror(errno)); + ret = glfsh_get_index_dir_fd (xl, &dirloc, &fd); + if (ret) goto out; -#endif - } - ret = glfsh_crawl_directory (xl, fd, &dirloc); + ret = glfsh_crawl_directory (fs, top_subvol, rootloc, xl, fd, &dirloc, + xattr_req); if (fd) fd_unref (fd); + if (xattr_req) + dict_unref (xattr_req); if (ret < 0) printf ("Failed to find entries with pending self-heal\n"); out: @@ -411,6 +484,209 @@ glfsh_validate_replicate_volume (xlator_t *xl) return ret; } +static xlator_t* +_brick_path_to_client_xlator (xlator_t *top_subvol, char *hostname, + char *brickpath) +{ + int ret = 0; + xlator_t *xl = NULL; + char *remote_host = NULL; + char *remote_subvol = NULL; + + xl = top_subvol; + + while (xl->next) + xl = xl->next; + + while (xl) { + if (!strcmp (xl->type, "protocol/client")) { + ret = dict_get_str (xl->options, "remote-host", + &remote_host); + if (ret < 0) + goto out; + ret = dict_get_str (xl->options, + "remote-subvolume", &remote_subvol); + if (ret < 0) + goto out; + if (!strcmp (hostname, remote_host) && + !strcmp (brickpath, remote_subvol)) + return xl; + } + xl = xl->prev; + } + +out: + return NULL; +} + + +int +glfsh_gather_heal_info (glfs_t *fs, xlator_t *top_subvol, loc_t *rootloc) +{ + xlator_t *xl = NULL; + xlator_t *afr_xl = NULL; + xlator_t *old_THIS = NULL; + + xl = top_subvol; + while (xl->next) + xl = xl->next; + while (xl) { + if (strcmp (xl->type, "protocol/client") == 0) { + afr_xl = _get_afr_ancestor (xl); + if (afr_xl) + old_THIS = THIS; + THIS = afr_xl; + glfsh_print_pending_heals (fs, top_subvol, + rootloc, xl); + THIS = 
old_THIS; + printf ("\n"); + } + + xl = xl->prev; + } + + return 0; +} + +int +glfsh_heal_splitbrain_file (glfs_t *fs, xlator_t *top_subvol, loc_t *rootloc, + char *file, dict_t *xattr_req) +{ + int ret = -1; + int reval = 0; + loc_t loc = {0, }; + char *path = NULL; + char *filename = NULL; + struct iatt iatt = {0, }; + xlator_t *xl = top_subvol; + dict_t *xattr_rsp = NULL; + char *sh_fail_msg = NULL; + int32_t op_errno = 0; + + if (!strncmp (file, "gfid:", 5)) { + filename = gf_strdup(file); + path = strtok (filename, ":"); + path = strtok (NULL, ";"); + uuid_parse (path, loc.gfid); + loc.path = gf_strdup (uuid_utoa (loc.gfid)); + loc.inode = inode_new (rootloc->inode->table); + ret = syncop_lookup (xl, &loc, xattr_req, 0, &xattr_rsp, 0); + if (ret) { + op_errno = -ret; + printf ("Lookup failed on %s:%s.\n", file, + strerror(op_errno)); + goto out; + } + } else { + if (file[0] != '/') { + printf ("<FILE> must be absolute path w.r.t. the " + "volume, starting with '/'\n"); + ret = -1; + goto out; + } +retry: + ret = glfs_resolve (fs, xl, file, &loc, &iatt, reval); + ESTALE_RETRY (ret, errno, reval, &loc, retry); + if (ret) { + printf("Lookup failed on %s:%s\n", + file, strerror (errno)); + goto out; + } + } + + ret = syncop_getxattr (xl, &loc, &xattr_rsp, GF_AFR_HEAL_SBRAIN, + xattr_req); + if (ret) { + op_errno = -ret; + printf ("Healing %s failed:%s.\n", file, strerror(op_errno)); + goto out; + } + ret = dict_get_str (xattr_rsp, "sh-fail-msg", &sh_fail_msg); + if (!ret) { + printf ("Healing %s failed: %s.\n", file, sh_fail_msg); + ret = -1; + goto out; + } + printf ("Healed %s.\n", file); + ret = 0; +out: + if (xattr_rsp) + dict_unref (xattr_rsp); + return ret; +} + +int +glfsh_heal_from_brick (glfs_t *fs, xlator_t *top_subvol, loc_t *rootloc, + char *hostname, char *brickpath, char *file) +{ + int ret = -1; + dict_t *xattr_req = NULL; + xlator_t *client = NULL; + fd_t *fd = NULL; + loc_t dirloc = {0}; + int32_t op_errno = 0; + + xattr_req = dict_new(); + if 
(!xattr_req) + goto out; + ret = dict_set_int32 (xattr_req, "heal-op", + GF_AFR_OP_SBRAIN_HEAL_FROM_BRICK); + if (ret) + goto out; + client = _brick_path_to_client_xlator (top_subvol, hostname, brickpath); + if (!client) { + printf("\"%s:%s\"- No such brick available in the volume.\n", + hostname, brickpath); + ret = -1; + goto out; + } + ret = dict_set_str (xattr_req, "child-name", client->name); + if (ret) + goto out; + if (file) + ret = glfsh_heal_splitbrain_file (fs, top_subvol, rootloc, file, + xattr_req); + else { + ret = glfsh_get_index_dir_loc (rootloc, client, &dirloc, + &op_errno); + ret = glfsh_get_index_dir_fd (client, &dirloc, &fd); + if (ret) + goto out; + ret = glfsh_crawl_directory (fs, top_subvol, rootloc, client, + fd, &dirloc, xattr_req); + if (fd) + fd_unref (fd); + } +out: + if (xattr_req) + dict_unref (xattr_req); + loc_wipe (&dirloc); + return ret; +} + +int +glfsh_heal_from_bigger_file (glfs_t *fs, xlator_t *top_subvol, loc_t *rootloc, + char *file) +{ + + int ret = -1; + dict_t *xattr_req = NULL; + + xattr_req = dict_new(); + if (!xattr_req) + goto out; + ret = dict_set_int32 (xattr_req, "heal-op", + GF_AFR_OP_SBRAIN_HEAL_FROM_BIGGER_FILE); + if (ret) + goto out; + ret = glfsh_heal_splitbrain_file (fs, top_subvol, rootloc, file, + xattr_req); +out: + if (xattr_req) + dict_unref (xattr_req); + return ret; +} + int main (int argc, char **argv) { @@ -418,18 +694,54 @@ main (int argc, char **argv) int ret = 0; char *volname = NULL; xlator_t *top_subvol = NULL; - xlator_t *xl = NULL; loc_t rootloc = {0}; char logfilepath[PATH_MAX] = {0}; - xlator_t *old_THIS = NULL; - xlator_t *afr_xl = NULL; + char *hostname = NULL; + char *path = NULL; + char *file = NULL; + gf_xl_afr_op_t heal_op = -1; - if (argc != 2) { - printf ("Usage: %s <volname>\n", argv[0]); + if (argc < 2) { + printf (USAGE_STR, argv[0]); ret = -1; goto out; } volname = argv[1]; + switch (argc) { + case 2: + heal_op = GF_AFR_OP_INDEX_SUMMARY; + break; + case 4: + if (!strcmp (argv[2], 
"bigger-file")) { + heal_op = GF_AFR_OP_SBRAIN_HEAL_FROM_BIGGER_FILE; + file = argv[3]; + } else if (!strcmp (argv[2], "source-brick")) { + heal_op = GF_AFR_OP_SBRAIN_HEAL_FROM_BRICK; + hostname = strtok (argv[3], ":"); + path = strtok (NULL, ":"); + } else { + printf (USAGE_STR, argv[0]); + ret = -1; + goto out; + } + break; + case 5: + if (!strcmp (argv[2], "source-brick")) { + heal_op = GF_AFR_OP_SBRAIN_HEAL_FROM_BRICK; + hostname = strtok (argv[3], ":"); + path = strtok (NULL, ":"); + file = argv[4]; + } else { + printf (USAGE_STR, argv[0]); + ret = -1; + goto out; + } + break; + default: + printf (USAGE_STR, argv[0]); + ret = -1; + goto out; + } fs = glfs_new (volname); if (!fs) { @@ -485,30 +797,28 @@ main (int argc, char **argv) rootloc.inode = inode_ref (top_subvol->itable->root); glfs_loc_touchup (&rootloc); - xl = top_subvol; - while (xl->next) - xl = xl->next; - - while (xl) { - if (strcmp (xl->type, "protocol/client") == 0) { - afr_xl = _get_afr_ancestor (xl); - if (afr_xl) { - old_THIS = THIS; - THIS = afr_xl; - glfsh_print_pending_heals (xl, &rootloc); - THIS = old_THIS; - printf("\n"); - } - } - - xl = xl->prev; + switch (heal_op) { + case GF_AFR_OP_INDEX_SUMMARY: + ret = glfsh_gather_heal_info (fs, top_subvol, &rootloc); + break; + case GF_AFR_OP_SBRAIN_HEAL_FROM_BIGGER_FILE: + ret = glfsh_heal_from_bigger_file (fs, top_subvol, + &rootloc, file); + break; + case GF_AFR_OP_SBRAIN_HEAL_FROM_BRICK: + ret = glfsh_heal_from_brick (fs, top_subvol, &rootloc, + hostname, path, file); + break; + default: + ret = -1; + break; } loc_wipe (&rootloc); glfs_subvol_done (fs, top_subvol); glfs_fini (fs); - return 0; + return ret; out: if (fs && top_subvol) glfs_subvol_done (fs, top_subvol); diff --git a/libglusterfs/src/glusterfs.h b/libglusterfs/src/glusterfs.h index 4c213f41576..73945e578fe 100644 --- a/libglusterfs/src/glusterfs.h +++ b/libglusterfs/src/glusterfs.h @@ -138,6 +138,7 @@ #define GF_XATTROP_INDEX_COUNT "glusterfs.xattrop_index_count" #define 
GF_AFR_HEAL_INFO "glusterfs.heal-info" +#define GF_AFR_HEAL_SBRAIN "glusterfs.heal-sbrain" #define GF_GFIDLESS_LOOKUP "gfidless-lookup" /* replace-brick and pump related internal xattrs */ diff --git a/rpc/rpc-lib/src/protocol-common.h b/rpc/rpc-lib/src/protocol-common.h index 1fd063aec25..f560c103acd 100644 --- a/rpc/rpc-lib/src/protocol-common.h +++ b/rpc/rpc-lib/src/protocol-common.h @@ -231,6 +231,8 @@ typedef enum { GF_AFR_OP_STATISTICS, GF_AFR_OP_STATISTICS_HEAL_COUNT, GF_AFR_OP_STATISTICS_HEAL_COUNT_PER_REPLICA, + GF_AFR_OP_SBRAIN_HEAL_FROM_BIGGER_FILE, + GF_AFR_OP_SBRAIN_HEAL_FROM_BRICK, } gf_xl_afr_op_t ; struct gf_gsync_detailed_status_ { diff --git a/tests/basic/afr/split-brain-healing.t b/tests/basic/afr/split-brain-healing.t new file mode 100644 index 00000000000..1dc317df8dd --- /dev/null +++ b/tests/basic/afr/split-brain-healing.t @@ -0,0 +1,183 @@ +#!/bin/bash + +#Test the split-brain resolution CLI commands. +. $(dirname $0)/../../include.rc +. $(dirname $0)/../../volume.rc + +function get_replicate_subvol_number { + local filename=$1 + #get_backend_paths + if [ -f $B0/${V0}1/$filename ] + then + echo 0 + elif [ -f $B0/${V0}3/$filename ] + then echo 1 + else + echo -1 + fi +} + +cleanup; + +AREQUAL_PATH=$(dirname $0)/../../utils +CFLAGS="" +test "`uname -s`" != "Linux" && { + CFLAGS="$CFLAGS -I$(dirname $0)/../../../contrib/argp-standalone "; + CFLAGS="$CFLAGS -L$(dirname $0)/../../../contrib/argp-standalone -largp "; + CFLAGS="$CFLAGS -lintl"; +} +build_tester $AREQUAL_PATH/arequal-checksum.c $CFLAGS +TEST glusterd +TEST pidof glusterd +TEST $CLI volume create $V0 replica 2 $H0:$B0/${V0}{1,2,3,4} +TEST $CLI volume set $V0 cluster.self-heal-daemon off +TEST $CLI volume start $V0 +TEST glusterfs --volfile-id=/$V0 --volfile-server=$H0 $M0 --attribute-timeout=0 --entry-timeout=0 + +cd $M0 +for i in {1..10} +do + echo "Initial content">>file$i +done + +replica_0_files_list=(`ls $B0/${V0}1`) +replica_1_files_list=(`ls $B0/${V0}3`) + +############ Create 
data split-brain in the files. ########################### +TEST kill_brick $V0 $H0 $B0/${V0}1 +for file in ${!replica_0_files_list[*]} +do + echo "B1 is down">>${replica_0_files_list[$file]} +done +TEST kill_brick $V0 $H0 $B0/${V0}3 +for file in ${!replica_1_files_list[*]} +do + echo "B3 is down">>${replica_1_files_list[$file]} +done + +SMALLER_FILE_SIZE=$(stat -c %s file1) + +TEST $CLI volume start $V0 force +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 0 +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 2 + +TEST kill_brick $V0 $H0 $B0/${V0}2 +for file in ${!replica_0_files_list[*]} +do + echo "B2 is down">>${replica_0_files_list[$file]} + echo "appending more content to make it the bigger file">>${replica_0_files_list[$file]} +done +TEST kill_brick $V0 $H0 $B0/${V0}4 +for file in ${!replica_1_files_list[*]} +do + echo "B4 is down">>${replica_1_files_list[$file]} + echo "appending more content to make it the bigger file">>${replica_1_files_list[$file]} +done + +BIGGER_FILE_SIZE=$(stat -c %s file1) + +TEST $CLI volume start $V0 force +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 1 +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 3 + + +############### Accessing the files should now give EIO. ############################### +TEST ! cat file1 +TEST ! cat file2 +TEST ! cat file3 +TEST ! cat file4 +TEST ! cat file5 +TEST ! cat file6 +TEST ! cat file7 +TEST ! cat file8 +TEST ! cat file9 +TEST ! 
cat file10 +################### +TEST $CLI volume set $V0 cluster.self-heal-daemon on +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Y" glustershd_up_status +EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 0 +EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 1 +EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 2 +EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 3 + +################ Heal file1 using the bigger-file option ############## +$CLI volume heal $V0 split-brain bigger-file /file1 +EXPECT "0" echo $? +EXPECT $BIGGER_FILE_SIZE stat -c %s file1 + +################ Heal file2 using the bigger-file option and its gfid ############## +subvolume=$(get_replicate_subvol_number file2) +if [ $subvolume == 0 ] +then + GFID=$(gf_get_gfid_xattr $B0/${V0}1/file2) +elif [ $subvolume == 1 ] +then + GFID=$(gf_get_gfid_xattr $B0/${V0}3/file2) +fi +GFIDSTR="gfid:$(gf_gfid_xattr_to_str $GFID)" +$CLI volume heal $V0 split-brain bigger-file $GFIDSTR +EXPECT "0" echo $? + +################ Heal file3 using the source-brick option ############## +################ Use the brick having smaller file size as source ####### +subvolume=$(get_replicate_subvol_number file3) +if [ $subvolume == 0 ] +then + $CLI volume heal $V0 split-brain source-brick $H0:$B0/${V0}2 /file3 +elif [ $subvolume == 1] +then + $CLI volume heal $V0 split-brain source-brick $H0:$B0/${V0}4 /file3 +fi +EXPECT "0" echo $? 
+EXPECT $SMALLER_FILE_SIZE stat -c %s file3 + +################ Heal file4 using the source-brick option and its gfid ############## +################ Use the brick having smaller file size as source ####### +subvolume=$(get_replicate_subvol_number file4) +if [ $subvolume == 0 ] +then + GFID=$(gf_get_gfid_xattr $B0/${V0}1/file4) + GFIDSTR="gfid:$(gf_gfid_xattr_to_str $GFID)" + $CLI volume heal $V0 split-brain source-brick $H0:$B0/${V0}2 $GFIDSTR +elif [ $subvolume == 1] +then + GFID=$(gf_get_gfid_xattr $B0/${V0}3/file4) + GFIDSTR="gfid:$(gf_gfid_xattr_to_str $GFID)" + $CLI volume heal $V0 split-brain source-brick $H0:$B0/${V0}4 $GFIDSTR +fi +EXPECT "0" echo $? +EXPECT $SMALLER_FILE_SIZE stat -c %s file4 + +################ Heal remaining SB'ed files of replica_0 using B1 as source ############## +$CLI volume heal $V0 split-brain source-brick $H0:$B0/${V0}1 +EXPECT "0" echo $? + +################ Heal remaining SB'ed files of replica_1 using B3 as source ############## +$CLI volume heal $V0 split-brain source-brick $H0:$B0/${V0}3 +EXPECT "0" echo $? + +############### Reading the files should now succeed. ############################### +TEST cat file1 +TEST cat file2 +TEST cat file3 +TEST cat file4 +TEST cat file5 +TEST cat file6 +TEST cat file7 +TEST cat file8 +TEST cat file9 +TEST cat file10 + +################ File contents on the bricks must be same. ################################ +TEST diff <(arequal-checksum -p $B0/$V01 -i .glusterfs) <(arequal-checksum -p $B0/$V02 -i .glusterfs) +TEST diff <(arequal-checksum -p $B0/$V03 -i .glusterfs) <(arequal-checksum -p $B0/$V04 -i .glusterfs) + +############### Trying to heal files not in SB should fail. ############################### +$CLI volume heal $V0 split-brain bigger-file /file1 +EXPECT "1" echo $? +$CLI volume heal $V0 split-brain source-brick $H0:$B0/${V0}4 /file3 +EXPECT "1" echo $? 
+ +cd - +TEST rm $AREQUAL_PATH/arequal-checksum +cleanup diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c index f39db802588..e6d45add4e8 100644 --- a/xlators/cluster/afr/src/afr-common.c +++ b/xlators/cluster/afr/src/afr-common.c @@ -4471,5 +4471,81 @@ out: AFR_STACK_UNWIND (getxattr, frame, ret, op_errno, dict, NULL); if (dict) dict_unref (dict); + if (inode) { + inode_forget (inode, 1); + inode_unref (inode); + } + return ret; +} + +int32_t +afr_heal_splitbrain_file(call_frame_t *frame, xlator_t *this, loc_t *loc) +{ + gf_boolean_t data_selfheal = _gf_false; + gf_boolean_t metadata_selfheal = _gf_false; + gf_boolean_t entry_selfheal = _gf_false; + dict_t *dict = NULL; + afr_local_t *local = NULL; + inode_t *inode = NULL; + int entry_ret = 0, metadata_ret = 0, data_ret = 0; + int ret = 0, op_errno = 0; + + local = frame->local; + dict = dict_new (); + if (!dict) { + op_errno = ENOMEM; + ret = -1; + goto out; + } + + ret = afr_selfheal_unlocked_inspect (frame, this, loc->gfid, &inode, + &data_selfheal, + &metadata_selfheal, + &entry_selfheal); + if (ret) { + op_errno = -ret; + ret = -1; + goto out; + } + + if (!data_selfheal && !metadata_selfheal && !entry_selfheal) { + ret = dict_set_str (dict, "sh-fail-msg", + "File not in split-brain"); + if (ret) + gf_log (this->name, GF_LOG_WARNING, + "Failed to set sh-fail-msg in dict"); + ret = 0; + goto out; + } + + if (data_selfheal) + data_ret = afr_selfheal_data (frame, this, inode); + + if (metadata_selfheal) + metadata_ret = afr_selfheal_metadata (frame, this, inode); + + if (entry_selfheal) + entry_ret = afr_selfheal_entry (frame, this, inode); + + ret = (data_ret | metadata_ret | entry_ret); + + if (local->xdata_rsp) { + /* 'sh-fail-msg' has been set in the dict during self-heal.*/ + dict_copy (local->xdata_rsp, dict); + ret = 0; + } else if (ret) { + /*Some other error during self-heal. 
Just propagate it.*/ + op_errno = -ret; + ret = -1; + } + +out: + AFR_STACK_UNWIND (getxattr, frame, ret, op_errno, dict, NULL); + if (dict) + dict_unref(dict); + if (inode) { + inode_forget (inode, 1); + inode_unref (inode); + } return ret; } diff --git a/xlators/cluster/afr/src/afr-inode-read.c b/xlators/cluster/afr/src/afr-inode-read.c index e64070e1bcd..78dd65f30e7 100644 --- a/xlators/cluster/afr/src/afr-inode-read.c +++ b/xlators/cluster/afr/src/afr-inode-read.c @@ -1380,6 +1380,11 @@ afr_getxattr (call_frame_t *frame, xlator_t *this, return 0; } + if (!strcmp (name, GF_AFR_HEAL_SBRAIN)) { + afr_heal_splitbrain_file (frame, this, loc); + return 0; + } + /* * if we are doing getxattr with pathinfo as the key then we * collect information from all childs diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c index 6198d4cf72c..e9d853c4ecd 100644 --- a/xlators/cluster/afr/src/afr-self-heal-common.c +++ b/xlators/cluster/afr/src/afr-self-heal-common.c @@ -17,7 +17,7 @@ #include "afr.h" #include "afr-self-heal.h" #include "byte-order.h" - +#include "protocol-common.h" int afr_selfheal_post_op_cbk (call_frame_t *frame, void *cookie, xlator_t *this, @@ -287,6 +287,39 @@ afr_selfheal_extract_xattr (xlator_t *this, struct afr_reply *replies, return 0; } +/* + * If by chance there are multiple sources with differing sizes, select + * the largest file as the source. 
+ * + * This can happen if data was directly modified in the backend or for snapshots + */ +void +afr_mark_largest_file_as_source (xlator_t *this, unsigned char *sources, + struct afr_reply *replies) +{ + int i = 0; + afr_private_t *priv = NULL; + uint64_t size = 0; + + /* Find source with biggest file size */ + priv = this->private; + for (i = 0; i < priv->child_count; i++) { + if (!sources[i]) + continue; + if (size <= replies[i].poststat.ia_size) { + size = replies[i].poststat.ia_size; + } + } + + /* Mark sources with less size as not source */ + for (i = 0; i < priv->child_count; i++) { + if (!sources[i]) + continue; + if (size > replies[i].poststat.ia_size) + sources[i] = 0; + } +} + void afr_mark_active_sinks (xlator_t *this, unsigned char *sources, unsigned char *locked_on, unsigned char *sinks) @@ -304,6 +337,154 @@ afr_mark_active_sinks (xlator_t *this, unsigned char *sources, } gf_boolean_t +afr_dict_contains_heal_op (call_frame_t *frame) +{ + afr_local_t *local = NULL; + dict_t *xdata_req = NULL; + int ret = 0; + int heal_op = -1; + + local = frame->local; + xdata_req = local->xdata_req; + ret = dict_get_int32 (xdata_req, "heal-op", &heal_op); + if (ret) + return _gf_false; + if (local->xdata_rsp == NULL) { + local->xdata_rsp = dict_new(); + if (!local->xdata_rsp) + return _gf_true; + } + ret = dict_set_str (local->xdata_rsp, "sh-fail-msg", + "File not in split-brain"); + + return _gf_true; +} + +/* Return a source depending on the type of heal_op, and set sources[source], + * sinks[source] and healed_sinks[source] to 1, 0 and 0 respectively. Do so + * only if the following condition is met: + * ∀i((i ∈ locked_on[] ∧ i=1)==>(sources[i]=0 ∧ sinks[i]=1 ∧ healed_sinks[i]=1)) + * i.e. for each locked node, sources[node] is 0; healed_sinks[node] and + * sinks[node] are 1. This should be the case if the file is in split-brain. 
+ */ +int +afr_mark_split_brain_source_sinks (call_frame_t *frame, xlator_t *this, + unsigned char *sources, + unsigned char *sinks, + unsigned char *healed_sinks, + unsigned char *locked_on, + struct afr_reply *replies, + afr_transaction_type type) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + dict_t *xdata_req = NULL; + dict_t *xdata_rsp = NULL; + int ret = 0; + int heal_op = -1; + int i = 0; + char *name = NULL; + int source = -1; + + local = frame->local; + priv = this->private; + xdata_req = local->xdata_req; + ret = dict_get_int32 (xdata_req, "heal-op", &heal_op); + if (ret) + goto out; + for (i = 0; i < priv->child_count; i++) { + if (locked_on[i]) + if (sources[i] || !sinks[i] || !healed_sinks[i]) { + ret = -1; + goto out; + } + } + if (local->xdata_rsp == NULL) { + local->xdata_rsp = dict_new(); + if (!local->xdata_rsp) { + ret = -1; + goto out; + } + } + xdata_rsp = local->xdata_rsp; + + switch (heal_op) { + case GF_AFR_OP_SBRAIN_HEAL_FROM_BIGGER_FILE: + if (type == AFR_METADATA_TRANSACTION) { + ret = dict_set_str (xdata_rsp, "sh-fail-msg", + "Use source-brick option to" + " heal metadata split-brain"); + if (!ret) + ret = -1; + goto out; + } + for (i = 0 ; i < priv->child_count; i++) + if (locked_on[i]) + sources[i] = 1; + afr_mark_largest_file_as_source (this, sources, replies); + if (AFR_COUNT (sources, priv->child_count) != 1) { + ret = dict_set_str (xdata_rsp, "sh-fail-msg", + "No bigger file"); + if (!ret) + ret = -1; + goto out; + } + for (i = 0 ; i < priv->child_count; i++) + if (sources[i]) + source = i; + sinks[source] = 0; + healed_sinks[source] = 0; + break; + case GF_AFR_OP_SBRAIN_HEAL_FROM_BRICK: + ret = dict_get_str (xdata_req, "child-name", &name); + if (ret) + goto out; + source = afr_get_child_index_from_name (this, name); + if (source < 0) { + ret = dict_set_str (xdata_rsp, "sh-fail-msg", + "Invalid brick name"); + if (!ret) + ret = -1; + goto out; + } + if (locked_on[source] != 1) { + ret = dict_set_str (xdata_rsp, 
"sh-fail-msg", + "Brick is not up"); + if (!ret) + ret = -1; + goto out; + } + sources[source] = 1; + sinks[source] = 0; + healed_sinks[source] = 0; + break; + default: + ret = -1; + goto out; + } + ret = source; +out: + return ret; + +} + +int +afr_get_child_index_from_name (xlator_t *this, char *name) +{ + afr_private_t *priv = this->private; + int index = -1; + + for (index = 0; index < priv->child_count; index++) { + if (!strcmp (priv->children[index]->name, name)) + goto out; + } + index = -1; +out: + return index; +} + + +gf_boolean_t afr_does_witness_exist (xlator_t *this, uint64_t *witness) { int i = 0; @@ -427,6 +608,14 @@ afr_selfheal_find_direction (call_frame_t *frame, xlator_t *this, } } + /* If no sources, all locked nodes are sinks - split brain */ + if (AFR_COUNT (sources, priv->child_count) == 0) { + for (i = 0; i < priv->child_count; i++) { + if (locked_on[i]) + sinks[i] = 1; + } + } + /* In afr-v1 if a file is self-accused but didn't have any pending * operations on others then it is similar to 'dirty' in afr-v2. * Consider such cases as witness. diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c index a434b9e6ba1..45a099cec86 100644 --- a/xlators/cluster/afr/src/afr-self-heal-data.c +++ b/xlators/cluster/afr/src/afr-self-heal-data.c @@ -17,6 +17,7 @@ #include "afr.h" #include "afr-self-heal.h" #include "byte-order.h" +#include "protocol-common.h" enum { AFR_SELFHEAL_DATA_FULL = 0, @@ -426,41 +427,6 @@ afr_does_size_mismatch (xlator_t *this, unsigned char *sources, return _gf_false; } -/* - * If by chance there are multiple sources with differing sizes, select - * the largest file as the source. 
- * - * This can happen if data was directly modified in the backend or for snapshots - */ - -static void -afr_mark_largest_file_as_source (xlator_t *this, unsigned char *sources, - struct afr_reply *replies) -{ - int i = 0; - afr_private_t *priv = NULL; - uint64_t size = 0; - - /* Find source with biggest file size */ - priv = this->private; - for (i = 0; i < priv->child_count; i++) { - if (!sources[i]) - continue; - if (size <= replies[i].poststat.ia_size) { - size = replies[i].poststat.ia_size; - } - } - - /* Mark sources with less size as not source */ - for (i = 0; i < priv->child_count; i++) { - if (!sources[i]) - continue; - if (size > replies[i].poststat.ia_size) - sources[i] = 0; - } - - return; -} static void afr_mark_biggest_witness_as_source (xlator_t *this, unsigned char *sources, @@ -518,7 +484,9 @@ afr_mark_newest_file_as_source (xlator_t *this, unsigned char *sources, } static int -__afr_selfheal_data_finalize_source (xlator_t *this, unsigned char *sources, +__afr_selfheal_data_finalize_source (call_frame_t *frame, xlator_t *this, + unsigned char *sources, + unsigned char *sinks, unsigned char *healed_sinks, unsigned char *locked_on, struct afr_reply *replies, @@ -528,7 +496,6 @@ __afr_selfheal_data_finalize_source (xlator_t *this, unsigned char *sources, afr_private_t *priv = NULL; int source = -1; int sources_count = 0; - priv = this->private; sources_count = AFR_COUNT (sources, priv->child_count); @@ -536,9 +503,21 @@ __afr_selfheal_data_finalize_source (xlator_t *this, unsigned char *sources, if ((AFR_CMP (locked_on, healed_sinks, priv->child_count) == 0) || !sources_count) { /* split brain */ - return -EIO; + source = afr_mark_split_brain_source_sinks (frame, this, + sources, sinks, + healed_sinks, + locked_on, replies, + AFR_DATA_TRANSACTION); + if (source < 0) + return -EIO; + return source; } + /* No split brain at this point. 
If we were called from + * afr_heal_splitbrain_file(), abort.*/ + if (afr_dict_contains_heal_op(frame)) + return -EIO; + /* If there are no witnesses/size-mismatches on sources we are done*/ if (!afr_does_size_mismatch (this, sources, replies) && !afr_has_source_witnesses (this, sources, witness)) @@ -605,9 +584,10 @@ __afr_selfheal_data_prepare (call_frame_t *frame, xlator_t *this, */ AFR_INTERSECT (healed_sinks, sinks, locked_on, priv->child_count); - source = __afr_selfheal_data_finalize_source (this, sources, - healed_sinks, locked_on, - replies, witness); + source = __afr_selfheal_data_finalize_source (frame, this, sources, + sinks, healed_sinks, + locked_on, replies, + witness); if (source < 0) return -EIO; diff --git a/xlators/cluster/afr/src/afr-self-heal-metadata.c b/xlators/cluster/afr/src/afr-self-heal-metadata.c index 0518c1821e3..05d9f2b4917 100644 --- a/xlators/cluster/afr/src/afr-self-heal-metadata.c +++ b/xlators/cluster/afr/src/afr-self-heal-metadata.c @@ -17,6 +17,7 @@ #include "afr.h" #include "afr-self-heal.h" #include "byte-order.h" +#include "protocol-common.h" #define AFR_HEAL_ATTR (GF_SET_ATTR_UID|GF_SET_ATTR_GID|GF_SET_ATTR_MODE) @@ -199,6 +200,7 @@ out: static int __afr_selfheal_metadata_finalize_source (call_frame_t *frame, xlator_t *this, unsigned char *sources, + unsigned char *sinks, unsigned char *healed_sinks, unsigned char *locked_on, struct afr_reply *replies) @@ -208,13 +210,26 @@ __afr_selfheal_metadata_finalize_source (call_frame_t *frame, xlator_t *this, struct iatt first = {0, }; int source = -1; int sources_count = 0; + dict_t *xdata_req = NULL; + afr_local_t *local = NULL; priv = this->private; + local = frame->local; + xdata_req = local->xdata_req; sources_count = AFR_COUNT (sources, priv->child_count); if ((AFR_CMP (locked_on, healed_sinks, priv->child_count) == 0) || !sources_count) { + + source = afr_mark_split_brain_source_sinks (frame, this, + sources, sinks, + healed_sinks, + locked_on, replies, + 
AFR_METADATA_TRANSACTION); + if (source >= 0) + return source; + /* If this is a directory mtime/ctime only split brain use the most recent */ source = afr_dirtime_splitbrain_source (frame, this, @@ -224,17 +239,7 @@ __afr_selfheal_metadata_finalize_source (call_frame_t *frame, xlator_t *this, "split brain on %s", uuid_utoa (replies[source].poststat.ia_gfid)); sources[source] = 1; - - for (i = 0; i < priv->child_count; i++) { - if (i == source) - continue; - - if (!locked_on[i]) - continue; - - healed_sinks[i] = 1; - } - + healed_sinks[source] = 0; return source; } @@ -253,6 +258,11 @@ __afr_selfheal_metadata_finalize_source (call_frame_t *frame, xlator_t *this, } } + /* No split brain at this point. If we were called from + * afr_heal_splitbrain_file(), abort.*/ + if (afr_dict_contains_heal_op(frame)) + return -EIO; + for (i = 0; i < priv->child_count; i++) { if (!sources[i]) continue; @@ -352,7 +362,7 @@ __afr_selfheal_metadata_prepare (call_frame_t *frame, xlator_t *this, inode_t *i } source = __afr_selfheal_metadata_finalize_source (frame, this, sources, - healed_sinks, + sinks, healed_sinks, locked_on, replies); if (source < 0) diff --git a/xlators/cluster/afr/src/afr-self-heal.h b/xlators/cluster/afr/src/afr-self-heal.h index 50cff91ccb3..74cc9608cf6 100644 --- a/xlators/cluster/afr/src/afr-self-heal.h +++ b/xlators/cluster/afr/src/afr-self-heal.h @@ -193,10 +193,28 @@ afr_log_selfheal (uuid_t gfid, xlator_t *this, int ret, char *type, int source, unsigned char *healed_sinks); void +afr_mark_largest_file_as_source (xlator_t *this, unsigned char *sources, + struct afr_reply *replies); +void afr_mark_active_sinks (xlator_t *this, unsigned char *sources, unsigned char *locked_on, unsigned char *sinks); gf_boolean_t +afr_dict_contains_heal_op (call_frame_t *frame); + +int +afr_mark_split_brain_source_sinks (call_frame_t *frame, xlator_t *this, + unsigned char *sources, + unsigned char *sinks, + unsigned char *healed_sinks, + unsigned char *locked_on, + struct 
afr_reply *replies, + afr_transaction_type type); + +int +afr_get_child_index_from_name (xlator_t *this, char *name); + +gf_boolean_t afr_does_witness_exist (xlator_t *this, uint64_t *witness); int diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h index 4fdc5f774cc..09821b724fe 100644 --- a/xlators/cluster/afr/src/afr.h +++ b/xlators/cluster/afr/src/afr.h @@ -1021,4 +1021,8 @@ afr_is_xattr_ignorable (char *key); int afr_get_heal_info (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata); + +int +afr_heal_splitbrain_file(call_frame_t *frame, xlator_t *this, loc_t *loc); + #endif /* __AFR_H__ */ diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c index 82b527e9141..866e3faf629 100644 --- a/xlators/cluster/dht/src/dht-common.c +++ b/xlators/cluster/dht/src/dht-common.c @@ -2636,8 +2636,10 @@ dht_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, this_call_cnt = dht_frame_return (frame); - if (!xattr || (op_ret == -1)) + if (!xattr || (op_ret == -1)) { + local->op_ret = op_ret; goto out; + } if (dict_get (xattr, conf->xattr_name)) { dict_del (xattr, conf->xattr_name); @@ -2808,7 +2810,7 @@ dht_getxattr (call_frame_t *frame, xlator_t *this, subvol = layout->list[i].xlator; STACK_WIND (frame, dht_vgetxattr_dir_cbk, subvol, subvol->fops->getxattr, - loc, key, NULL); + loc, key, xdata); } return 0; } @@ -2821,7 +2823,7 @@ dht_getxattr (call_frame_t *frame, xlator_t *this, local->call_cnt = 1; STACK_WIND (frame, dht_vgetxattr_cbk, cached_subvol, - cached_subvol->fops->getxattr, loc, key, NULL); + cached_subvol->fops->getxattr, loc, key, xdata); return 0; } @@ -2854,7 +2856,7 @@ dht_getxattr (call_frame_t *frame, xlator_t *this, if (hashed_subvol) { STACK_WIND (frame, dht_linkinfo_getxattr_cbk, hashed_subvol, hashed_subvol->fops->getxattr, loc, - GF_XATTR_PATHINFO_KEY, NULL); + GF_XATTR_PATHINFO_KEY, xdata); return 0; } op_errno = ENODATA; @@ -2933,7 +2935,7 @@ dht_getxattr (call_frame_t 
*frame, xlator_t *this, subvol = layout->list[i].xlator; STACK_WIND (frame, dht_getxattr_cbk, subvol, subvol->fops->getxattr, - loc, key, NULL); + loc, key, xdata); } return 0; |