diff options
| -rw-r--r-- | xlators/cluster/dht/src/dht-common.c | 147 | ||||
| -rw-r--r-- | xlators/cluster/dht/src/dht-common.h | 11 | ||||
| -rw-r--r-- | xlators/cluster/dht/src/dht-rebalance.c | 401 | ||||
| -rw-r--r-- | xlators/cluster/dht/src/dht-shared.c | 4 |
4 files changed, 562 insertions, 1 deletions
diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c
index 65c9c0b0a31..be92236e3bd 100644
--- a/xlators/cluster/dht/src/dht-common.c
+++ b/xlators/cluster/dht/src/dht-common.c
@@ -11465,3 +11465,150 @@ dht_dir_layout_error_check(xlator_t *this, inode_t *inode)
     /* Returning the first xlator error as all xlators have errors */
     return layout->list[0].err;
 }
+
+/* Get brick paths from all the local subvols and store for use.
+ *
+ * For every local subvol the "glusterfs.pathinfo" xattr of @loc is fetched and
+ * the brick path parsed out of it is stored in defrag->local_brick_paths[].
+ * While defrag->is_pure_distribute is still false, a reply that mentions
+ * REPLICATE or ERASURE ends the scan; otherwise the flag is set.
+ *
+ * Returns 0 on success and -1 on failure. On failure every stored path is
+ * freed and reset to NULL (the array itself is left for the caller to free)
+ * and is_pure_distribute is cleared, so the crawl falls back to the old way.
+ *
+ * TODO: Make sure newly added brick is not picked for migration.
+ * Otherwise there will be no rebalance as directory entries won't be present
+ * on a newly added brick */
+int
+dht_get_brick_paths(xlator_t *this, dht_conf_t *conf, loc_t *loc)
+{
+    dict_t *dict = NULL;
+    gf_defrag_info_t *defrag = conf->defrag;
+    char *key = NULL;
+    char *str = NULL;
+    char *token = NULL;
+    char *saveptr = NULL;
+    int i = 0;
+    int j = 0;
+    int ret = 0;
+
+    key = gf_strdup("glusterfs.pathinfo");
+    if (!key) {
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, 0,
+               "failed to allocate "
+               "memory");
+        ret = -1;
+        goto out;
+    }
+
+    defrag->local_brick_paths = GF_CALLOC(conf->local_subvols_cnt,
+                                          sizeof(*defrag->local_brick_paths),
+                                          gf_common_mt_pointer);
+    if (!defrag->local_brick_paths) {
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, 0,
+               "failed to allocate "
+               "memory");
+        ret = -1;
+        goto out;
+    }
+
+    for (j = 0; j < conf->local_subvols_cnt; j++) {
+        /* syncop calls report failure as a negative errno, not as -1 */
+        ret = syncop_getxattr(conf->local_subvols[j], loc, &dict, key, NULL,
+                              NULL);
+        if (ret < 0) {
+            gf_msg(this->name, GF_LOG_WARNING, -ret, 0,
+                   "failed to get path,"
+                   " errno %d",
+                   -ret);
+            /* TODO: We need not break out from here and can resume operation.
+             * We need a place holder in gf_defrag_info_t to mark which
+             * local_brick_paths we are working on. Right now, we blindly
+             * take defrag->local_brick_path[0]. This can be dynamic based on
+             * need */
+            ret = -1;
+            goto out;
+        }
+
+        str = NULL;
+        ret = dict_get_str(dict, key, &str);
+        if (ret != 0) {
+            gf_msg(this->name, GF_LOG_ERROR, -ret, 0, "dict get failed for :%s",
+                   key);
+            ret = -1;
+            goto out;
+        }
+        if (str == NULL) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, 0, "key:%s not found", key);
+            ret = -1;
+            goto out;
+        }
+
+        if (!defrag->is_pure_distribute) {
+            /*TODO: fetching glusterfs.pathinfo on erasure volume is failing.
+             *Function the old way till we get it resolved */
+            if (strstr(str, "REPLICATE") || strstr(str, "ERASURE")) {
+                defrag->is_pure_distribute = _gf_false;
+                break;
+            }
+
+            defrag->is_pure_distribute = _gf_true;
+        }
+
+        /* The brick path is the third ':'-separated field of the pathinfo
+         * string, cut at the first '>'. */
+        saveptr = NULL;
+        token = strtok_r(str, ":", &saveptr);
+        for (i = 2; token && i <= 3; i++) {
+            token = strtok_r(NULL, ":", &saveptr);
+        }
+        if (token) {
+            token = strtok_r(token, ">", &saveptr);
+        }
+        if (!token) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, 0,
+                   "no brick path found in %s", key);
+            ret = -1;
+            goto out;
+        }
+
+        /* str (and so token) lives inside dict: copy it before the unref */
+        defrag->local_brick_paths[j] = gf_strdup(token);
+        if (!defrag->local_brick_paths[j]) {
+            gf_msg(this->name, GF_LOG_ERROR, ENOMEM, 0,
+                   "failed to allocate "
+                   "memory");
+            ret = -1;
+            goto out;
+        }
+
+        dict_unref(dict);
+        dict = NULL;
+    }
+
+out:
+    if (dict) {
+        dict_unref(dict);
+    }
+
+    if (ret == -1) {
+        gf_msg(this->name, GF_LOG_INFO, 0, 0,
+               "failed to get brick path. "
+               "Will operate old way");
+        if (defrag->local_brick_paths) {
+            for (j = 0; j < conf->local_subvols_cnt; j++) {
+                GF_FREE(defrag->local_brick_paths[j]);
+                defrag->local_brick_paths[j] = NULL;
+            }
+        }
+        defrag->is_pure_distribute = _gf_false;
+    }
+
+    if (defrag->is_pure_distribute) {
+        gf_msg(this->name, GF_LOG_INFO, 0, 0, "volume type : pure distribute");
+    }
+
+    GF_FREE(key);
+    return ret;
+}
diff --git a/xlators/cluster/dht/src/dht-common.h b/xlators/cluster/dht/src/dht-common.h
index 028c6ac6b9f..84891406c71 100644
--- a/xlators/cluster/dht/src/dht-common.h
+++ b/xlators/cluster/dht/src/dht-common.h
@@ -598,6 +598,15 @@ struct gf_defrag_info_ {
     gf_boolean_t stats;
     /* lock migration flag */
     gf_boolean_t lock_migration_enabled;
+
+    /* local system crawl */
+    char **local_brick_paths;
+
+    /* whether the volume is pure distribute */
+    gf_boolean_t is_pure_distribute;
+
+    /*TODO: Introduce a glusterd option to tune this behaviour*/
+    gf_boolean_t operate_dist;
 };
 
 typedef struct gf_defrag_info_ gf_defrag_info_t;
@@ -1482,4 +1491,6 @@ dht_set_parent_layout_in_dict(loc_t *loc, xlator_t *this, dht_local_t *local);
 int
 dht_dir_layout_error_check(xlator_t *this, inode_t *inode);
 
+int
+dht_get_brick_paths(xlator_t *this, dht_conf_t *conf, loc_t *loc);
 #endif /* _DHT_H */
diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c
index c141ffce90d..d850eef62ab 100644
--- a/xlators/cluster/dht/src/dht-rebalance.c
+++ b/xlators/cluster/dht/src/dht-rebalance.c
@@ -14,6 +14,7 @@
 #include <signal.h>
 #include <glusterfs/events.h>
 #include "glusterfs/compat-errno.h"  // for ENODATA on BSD
+#include <string.h>
 
 #define GF_DISK_SECTOR_SIZE 512
 #define DHT_REBALANCE_PID 4242        /* Change it if required */
@@ -4052,6 +4053,382 @@ out:
 }
 
+/* Fix the layout of @loc and, recursively, of its sub-directories on a pure
+ * distribute volume.
+ *
+ * Sub-directories are found by reading the directory directly from the first
+ * local brick (defrag->local_brick_paths[0]) with opendir/readdir; each one is
+ * looked up and processed before the layout xattr is set on @loc itself.
+ *
+ * Returns 0 on success, 2 when the work was done but the hash must not be
+ * committed for this directory, and any other non-zero value on failure or
+ * when the rebalance is no longer in the started state.
+ */
 int
+gf_defrag_fix_layout_puredist(xlator_t *this, gf_defrag_info_t *defrag,
+                              loc_t *loc, dict_t *fix_layout,
+                              dict_t *migrate_data)
+{
+    int ret = -1;
+    /* snprintf result, used to detect truncated paths */
+    int len = 0;
+    loc_t entry_loc = {
+        0,
+    };
+    inode_t *linked_inode = NULL, *inode = NULL;
+    dht_conf_t *conf = NULL;
+    int should_commit_hash = 1;
+    int perrno = 0;
+    struct dirent *entry = NULL;
+    struct dirent scratch[2] = {{
+        0,
+    }};
+    DIR *dirp = NULL;
+    char full_entry_path[4096] = {
+        0,
+    };
+    char full_dir_path[4096] = {
+        0,
+    };
+    ssize_t size = 0;
+    uuid_t tmp_gfid;
+    struct stat tmpbuf = {
+        0,
+    };
+    struct iatt iatt = {
+        0,
+    };
+
+    struct stat lstatbuf = {
+        0,
+    };
+    struct iatt stbuf = {
+        0,
+    };
+
+    conf = this->private;
+    if (!conf) {
+        ret = -1;
+        goto out;
+    }
+
+    if (!defrag->local_brick_paths || !defrag->local_brick_paths[0]) {
+        gf_log(this->name, GF_LOG_ERROR, "local brick path not available");
+        ret = -1;
+        goto out;
+    }
+
+    /*
+     * Since the primary target for the following lookup is to figure out if the
+     * entry still exists, going to do a direct stat call rather than going
+     * through the whole gluster stack. There are some benefits of doing gluster
+     * lookup, but this is redundant since we have done already one gluster
+     * lookup in the parent function.
+     *
+     * Randomly selecting the first local subvol to read, since it is expected
+     * that the directory structure is present in all the subvols identically
+     */
+
+    /* loc->path + 1 discards the first "/"; the format adds one at the end */
+    len = snprintf(full_dir_path, sizeof(full_dir_path), "%s%s/",
+                   defrag->local_brick_paths[0], loc->path + 1);
+    if (len < 0 || (size_t)len >= sizeof(full_dir_path)) {
+        gf_log(this->name, GF_LOG_ERROR, "path too long for directory %s",
+               loc->path);
+        ret = -1;
+        goto out;
+    }
+
+    ret = sys_lstat(full_dir_path, &tmpbuf);
+    if (ret == -1) {
+        gf_log(this->name, GF_LOG_ERROR,
+               "[absolutepath %s] directory "
+               "not found, path %s error %d",
+               full_dir_path, loc->path, errno);
+        goto out;
+    }
+
+    dirp = sys_opendir(full_dir_path);
+    if (!dirp) {
+        ret = -1;
+        gf_msg(this->name, GF_LOG_ERROR, errno, 0, "failed to open dir : %s",
+               loc->path);
+        if (conf->decommission_subvols_cnt) {
+            defrag->total_failures++;
+        }
+        goto out;
+    }
+
+    while ((entry = sys_readdir(dirp, scratch)) != NULL) {
+        if (defrag->defrag_status != GF_DEFRAG_STATUS_STARTED) {
+            ret = 1;
+            goto out;
+        }
+        if (!strcmp(entry->d_name, ".") || !strcmp(entry->d_name, "..") ||
+            !strcmp(entry->d_name, ".glusterfs"))
+            continue;
+
+        /* TODO: Need to add a check for _DIRENT_HAVE_D_TYPE flag to fall back
+           to stat in case d_type is not defined */
+        if (entry->d_type != DT_DIR) {
+            continue;
+        }
+
+        len = snprintf(full_entry_path, sizeof(full_entry_path), "%s%s/",
+                       full_dir_path, entry->d_name);
+        if (len < 0 || (size_t)len >= sizeof(full_entry_path)) {
+            gf_log(this->name, GF_LOG_ERROR, "path too long for entry %s/%s",
+                   loc->path, entry->d_name);
+            /* the subdirectory is skipped, so do not commit the hash */
+            should_commit_hash = 0;
+            continue;
+        }
+
+        size = sys_lgetxattr(full_entry_path, GFID_XATTR_KEY, tmp_gfid, 16);
+        if (size != 16) {
+            gf_log(this->name, GF_LOG_ERROR, "gfid not found, path %s",
+                   full_entry_path);
+            continue;
+        }
+
+        loc_wipe(&entry_loc);
+
+        ret = dht_build_child_loc(this, &entry_loc, loc, entry->d_name);
+        if (ret) {
+            gf_log(this->name, GF_LOG_ERROR,
+                   "Child loc"
+                   " build failed for entry: %s",
+                   entry->d_name);
+
+            if (conf->decommission_in_progress) {
+                defrag->defrag_status = GF_DEFRAG_STATUS_FAILED;
+
+                goto out;
+            } else {
+                should_commit_hash = 0;
+
+                continue;
+            }
+        }
+
+        if (gf_uuid_is_null(tmp_gfid)) {
+            gf_log(this->name, GF_LOG_ERROR,
+                   "%s/%s"
+                   " gfid not present",
+                   loc->path, entry->d_name);
+            continue;
+        }
+
+        gf_uuid_copy(entry_loc.gfid, tmp_gfid);
+
+        /*In case the gfid stored in the inode by inode_link
+         *and the gfid obtained in the lookup differs, then
+         *client3_3_lookup_cbk will return ESTALE and proper
+         *error will be captured.
+         */
+        memset(&lstatbuf, 0, sizeof(struct stat));
+        ret = sys_lstat(full_entry_path, &lstatbuf);
+        if (ret == -1) {
+            gf_msg(this->name, GF_LOG_ERROR, errno, 0, "lstat failed for %s",
+                   entry->d_name);
+            /* never link an inode built from an all-zero stat */
+            continue;
+        }
+
+        memset(&stbuf, 0, sizeof(struct iatt));
+        iatt_from_stat(&stbuf, &lstatbuf);
+        gf_uuid_copy(stbuf.ia_gfid, entry_loc.gfid);
+        linked_inode = inode_link(entry_loc.inode, loc->inode, entry->d_name,
+                                  &stbuf);
+
+        inode = entry_loc.inode;
+        entry_loc.inode = linked_inode;
+        inode_unref(inode);
+
+        if (gf_uuid_is_null(loc->gfid)) {
+            gf_log(this->name, GF_LOG_ERROR,
+                   "%s/%s"
+                   " gfid not present",
+                   loc->path, entry->d_name);
+            continue;
+        }
+
+        gf_uuid_copy(entry_loc.pargfid, loc->gfid);
+
+        ret = syncop_lookup(this, &entry_loc, &iatt, NULL, NULL, NULL);
+        if (ret) {
+            if (-ret == ENOENT || -ret == ESTALE) {
+                gf_msg(this->name, GF_LOG_INFO, -ret, DHT_MSG_DIR_LOOKUP_FAILED,
+                       "Dir:%s renamed or removed. "
+                       "Skipping",
+                       loc->path);
+                ret = 0;
+                if (conf->decommission_subvols_cnt) {
+                    defrag->total_failures++;
+                }
+                continue;
+            } else {
+                gf_msg(this->name, GF_LOG_ERROR, -ret,
+                       DHT_MSG_DIR_LOOKUP_FAILED, "lookup failed for:%s",
+                       entry_loc.path);
+
+                defrag->total_failures++;
+
+                if (conf->decommission_in_progress) {
+                    defrag->defrag_status = GF_DEFRAG_STATUS_FAILED;
+                    ret = -1;
+                    goto out;
+                } else {
+                    should_commit_hash = 0;
+                    continue;
+                }
+            }
+        }
+
+        /* A return value of 2 means, either process_dir or
+         * lookup of a dir failed. Hence, don't commit hash
+         * for the current directory*/
+
+        ret = gf_defrag_fix_layout_puredist(this, defrag, &entry_loc,
+                                            fix_layout, migrate_data);
+
+        if (defrag->defrag_status == GF_DEFRAG_STATUS_STOPPED) {
+            goto out;
+        }
+
+        if (ret && ret != 2) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LAYOUT_FIX_FAILED,
+                   "Fix layout failed for %s", entry_loc.path);
+
+            defrag->total_failures++;
+
+            if (conf->decommission_in_progress) {
+                defrag->defrag_status = GF_DEFRAG_STATUS_FAILED;
+
+                goto out;
+            } else {
+                /* Let's not commit-hash if
+                 * gf_defrag_fix_layout failed*/
+                continue;
+            }
+        }
+    }
+
+    ret = sys_closedir(dirp);
+    if (ret) {
+        gf_msg_debug(this->name, 0,
+                     "Failed to close dir %s. Reason :"
+                     " %s",
+                     full_dir_path, strerror(errno));
+        ret = 0;
+    }
+
+    dirp = NULL;
+
+    /* A directory layout is fixed only after its subdirs are healed to
+     * any newly added bricks. If the layout is fixed before subdirs are
+     * healed, the newly added brick will get a non-null layout.
+     * Any subdirs which hash to that layout will no longer show up
+     * in a directory listing until they are healed.
+     */
+
+    ret = syncop_setxattr(this, loc, fix_layout, 0, NULL, NULL);
+
+    /* In case of a race where the directory is deleted just before
+     * layout setxattr, the errors are updated in the layout structure.
+     * We can use this information to make a decision whether the directory
+     * is deleted entirely.
+     */
+    if (ret == 0) {
+        ret = dht_dir_layout_error_check(this, loc->inode);
+        ret = -ret;
+    }
+
+    if (ret) {
+        if (-ret == ENOENT || -ret == ESTALE) {
+            gf_msg(this->name, GF_LOG_INFO, -ret, DHT_MSG_LAYOUT_FIX_FAILED,
+                   "Setxattr failed. Dir %s "
+                   "renamed or removed",
+                   loc->path);
+            if (conf->decommission_subvols_cnt) {
+                defrag->total_failures++;
+            }
+            ret = 0;
+            goto out;
+        } else {
+            gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_LAYOUT_FIX_FAILED,
+                   "Setxattr failed for %s", loc->path);
+
+            defrag->total_failures++;
+
+            if (conf->decommission_in_progress) {
+                defrag->defrag_status = GF_DEFRAG_STATUS_FAILED;
+                ret = -1;
+                goto out;
+            }
+        }
+    }
+
+    if ((defrag->cmd != GF_DEFRAG_CMD_START_TIER) &&
+        (defrag->cmd != GF_DEFRAG_CMD_START_LAYOUT_FIX)) {
+        ret = gf_defrag_process_dir(this, defrag, loc, migrate_data, &perrno);
+
+        if (ret && (ret != 2)) {
+            if (perrno == ENOENT || perrno == ESTALE) {
+                ret = 0;
+                goto out;
+            } else {
+                defrag->total_failures++;
+
+                gf_msg(this->name, GF_LOG_ERROR, 0,
+                       DHT_MSG_DEFRAG_PROCESS_DIR_FAILED,
+                       "gf_defrag_process_dir failed for "
+                       "directory: %s",
+                       loc->path);
+
+                if (conf->decommission_in_progress) {
+                    goto out;
+                }
+
+                should_commit_hash = 0;
+            }
+        } else if (ret == 2) {
+            should_commit_hash = 0;
+        }
+    }
+
+    gf_msg_trace(this->name, 0, "fix layout called on %s", loc->path);
+
+    if (should_commit_hash &&
+        gf_defrag_settle_hash(this, defrag, loc, fix_layout) != 0) {
+        defrag->total_failures++;
+
+        gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_SETTLE_HASH_FAILED,
+               "Settle hash failed for %s", loc->path);
+
+        ret = -1;
+
+        if (conf->decommission_in_progress) {
+            defrag->defrag_status = GF_DEFRAG_STATUS_FAILED;
+            goto out;
+        }
+    }
+
+    ret = 0;
+out:
+    loc_wipe(&entry_loc);
+
+    if (ret == 0 && should_commit_hash == 0) {
+        ret = 2;
+    }
+
+    if (dirp) {
+        sys_closedir(dirp);
+    }
+
+    return ret;
+}
+
+int
 dht_init_local_subvols_and_nodeuuids(xlator_t *this, dht_conf_t *conf,
                                      loc_t *loc)
 {
@@ -4405,6 +4782,7 @@ gf_defrag_start_crawl(void *data)
     pthread_t *tid = NULL;
     pthread_t filecnt_thread;
     gf_boolean_t fc_thread_started = _gf_false;
+    int i = 0;
 
     this = data;
     if (!this)
@@ -4539,6 +4917,12 @@ gf_defrag_start_crawl(void *data)
             goto out;
         }
 
+        ret = dht_get_brick_paths(this, conf, &loc);
+        if (ret) {
+            gf_log(this->name, GF_LOG_WARNING, "could not get brick path");
+            ret = 0;
+        }
+
         /* Initialise the structures required for parallel migration */
         ret = gf_defrag_parallel_migration_init(this, defrag, &tid,
                                                 &thread_index);
@@ -4556,7 +4940,14 @@ gf_defrag_start_crawl(void *data)
         }
     }
 
-    ret = gf_defrag_fix_layout(this, defrag, &loc, fix_layout, migrate_data);
+    /* TODO: Need to introduce a flag to safely operate in the old way */
+    if (defrag->operate_dist && defrag->is_pure_distribute) {
+        ret = gf_defrag_fix_layout_puredist(this, defrag, &loc, fix_layout,
+                                            migrate_data);
+    } else {
+        ret = gf_defrag_fix_layout(this, defrag, &loc, fix_layout,
+                                   migrate_data);
+    }
     if (ret && ret != 2) {
         defrag->total_failures++;
         ret = -1;
@@ -4606,6 +4997,14 @@ out:
     }
     UNLOCK(&defrag->lock);
 
+    /* local_brick_paths is only allocated by dht_get_brick_paths() */
+    if (defrag->local_brick_paths) {
+        for (i = 0; i < conf->local_subvols_cnt; i++) {
+            GF_FREE(defrag->local_brick_paths[i]);
+        }
+        GF_FREE(defrag->local_brick_paths);
+    }
+
     GF_FREE(defrag);
     conf->defrag = NULL;
 
diff --git a/xlators/cluster/dht/src/dht-shared.c b/xlators/cluster/dht/src/dht-shared.c
index d85b4d1ce13..811bb55925f 100644
--- a/xlators/cluster/dht/src/dht-shared.c
+++ b/xlators/cluster/dht/src/dht-shared.c
@@ -700,6 +700,10 @@ dht_init(xlator_t *this)
         pthread_cond_init(&defrag->fc_wakeup_cond, 0);
 
         defrag->global_error = 0;
+
+        defrag->is_pure_distribute = _gf_false;
+
+        defrag->operate_dist = _gf_true;
     }
 
     conf->use_fallocate = 1;
