 tests/basic/fencing/afr-lock-heal-advanced.c | 227
 tests/basic/fencing/afr-lock-heal-advanced.t | 104
 tests/basic/fencing/afr-lock-heal-basic.c    | 182
 tests/basic/fencing/afr-lock-heal-basic.t    |  99
 xlators/cluster/afr/src/afr-common.c         | 818
 xlators/cluster/afr/src/afr-inode-read.c     |   4
 xlators/cluster/afr/src/afr-inode-write.c    |  10
 xlators/cluster/afr/src/afr-mem-types.h      |   2
 xlators/cluster/afr/src/afr-messages.h       |   2
 xlators/cluster/afr/src/afr.c                |   3
 xlators/cluster/afr/src/afr.h                |  46
 11 files changed, 1461 insertions(+), 36 deletions(-)
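
This change teaches AFR to heal mandatory locks taken through gfapi (GF_LK_MANDATORY in xdata): when a brick that missed a lock comes back up, a synctask re-acquires the lock on it under the new afr.lock-heal.domain, and when a lock no longer survives on a quorum of bricks, the fd is marked bad so that further fd-based fops fail with EBADF. The two new test programs drive this through gfapi, using SIGUSR1 and pause() as break-points. As a reading aid, here is a minimal sketch (not part of the patch) of the client-side sequence they use to take a mandatory write lock; the helper name take_mandatory_wrlck and its host/volume/path parameters are illustrative, and error-path cleanup is omitted, much as the tests themselves skip glfs_fini() because that path is racy.

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <glusterfs/api/glfs.h>
#include <glusterfs/api/glfs-handles.h>

int
take_mandatory_wrlck(const char *host, const char *volname, const char *path)
{
    struct flock lock = {0};
    char value[8] = "set"; /* the tests pass value "set" with size 8 */
    glfs_t *fs = glfs_new(volname);
    glfs_fd_t *fd = NULL;

    if (!fs)
        return -1;
    if (glfs_set_volfile_server(fs, "tcp", host, 24007) < 0)
        return -1;
    if (glfs_init(fs) < 0)
        return -1;

    fd = glfs_creat(fs, path, O_CREAT | O_RDWR, 0644);
    if (!fd)
        return -1;

    /* Opt the fd in to mandatory-lock enforcement via the xattr the
     * tests set... */
    if (glfs_fsetxattr(fd, "trusted.glusterfs.enforce-mandatory-lock", value,
                       sizeof(value), 0) < 0)
        return -1;

    /* ...then take a blocking mandatory write lock on the first 100
     * bytes, exactly as acquire_mandatory_lock() below does. */
    lock.l_type = F_WRLCK;
    lock.l_whence = SEEK_SET;
    lock.l_start = 0;
    lock.l_len = 100;
    if (glfs_file_lock(fd, F_SETLKW, &lock, GLFS_LK_MANDATORY) != 0) {
        fprintf(stderr, "glfs_file_lock: %s\n", strerror(errno));
        return -1;
    }
    return 0;
}
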
diff --git a/tests/basic/fencing/afr-lock-heal-advanced.c b/tests/basic/fencing/afr-lock-heal-advanced.c new file mode 100644 index 00000000000..e202ccd5b29 --- /dev/null +++ b/tests/basic/fencing/afr-lock-heal-advanced.c @@ -0,0 +1,227 @@ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <errno.h> +#include <signal.h> +#include <unistd.h> +#include <glusterfs/api/glfs.h> +#include <glusterfs/api/glfs-handles.h> + +#define GF_ENFORCE_MANDATORY_LOCK "trusted.glusterfs.enforce-mandatory-lock" + +FILE *logfile_fp; + +#define LOG_ERR(func, err)                                                     \ +    do {                                                                       \ +        if (!logfile_fp) {                                                     \ +            fprintf(stderr, "%\n%d %s : returned error (%s)\n", __LINE__,      \ +                    func, strerror(err));                                      \ +            fflush(stderr);                                                    \ +        } else {                                                               \ +            fprintf(logfile_fp, "\n%d %s : returned error (%s)\n", __LINE__,   \ +                    func, strerror(err));                                      \ +            fflush(logfile_fp);                                                \ +        }                                                                      \ +    } while (0) + +glfs_t * +setup_client(char *hostname, char *volname, char *log_file) +{ +    int ret = 0; +    glfs_t *fs = NULL; + +    fs = glfs_new(volname); +    if (!fs) { +        fprintf(logfile_fp, "\nglfs_new: returned NULL (%s)\n", +                strerror(errno)); +        goto error; +    } + +    ret = glfs_set_volfile_server(fs, "tcp", hostname, 24007); +    if (ret < 0) { +        fprintf(logfile_fp, "\nglfs_set_volfile_server failed ret:%d (%s)\n", +                ret, strerror(errno)); +        goto error; +    } + +    ret = glfs_set_logging(fs, log_file, 7); +    if (ret < 0) { +        fprintf(logfile_fp, "\nglfs_set_logging failed with ret: %d (%s)\n", +                ret, strerror(errno)); +        goto error; +    } + +    ret = glfs_init(fs); +    if (ret < 0) { +        fprintf(logfile_fp, "\nglfs_init failed with ret: %d (%s)\n", ret, +                strerror(errno)); +        goto error; +    } + +out: +    return fs; +error: +    return NULL; +} + +glfs_fd_t * +open_file(glfs_t *fs, char *fname) +{ +    glfs_fd_t *fd = NULL; + +    fd = glfs_creat(fs, fname, O_CREAT, 0644); +    if (!fd) { +        LOG_ERR("glfs_creat", errno); +        goto out; +    } +out: +    return fd; +} + +int +acquire_mandatory_lock(glfs_t *fs, glfs_fd_t *fd) +{ +    struct flock lock; +    int ret = 0; + +    /* initialize lock */ +    lock.l_type = F_WRLCK; +    lock.l_whence = SEEK_SET; +    lock.l_start = 0; +    lock.l_len = 100; + +    ret = glfs_fsetxattr(fd, GF_ENFORCE_MANDATORY_LOCK, "set", 8, 0); +    if (ret < 0) { +        LOG_ERR("glfs_fsetxattr", errno); +        ret = -1; +        goto out; +    } + +    /* take a write mandatory lock */ +    ret = glfs_file_lock(fd, F_SETLKW, &lock, GLFS_LK_MANDATORY); +    if (ret) { +        LOG_ERR("glfs_file_lock", errno); +        ret = -1; +        goto out; +    } + +out: +    return ret; +} + +int +perform_test(glfs_t *fs, char *file1, char *file2) +{ +    int ret = 0; +    glfs_fd_t *fd1 = NULL; +    glfs_fd_t *fd2 = NULL; +    char *buf = "0123456789"; + +    fd1 = open_file(fs, file1); +    if (!fd1) { 
+        ret = -1; +        goto out; +    } +    fd2 = open_file(fs, file2); +    if (!fd2) { +        ret = -1; +        goto out; +    } + +    /* Kill one brick from the .t.*/ +    pause(); + +    ret = acquire_mandatory_lock(fs, fd1); +    if (ret) { +        goto out; +    } +    ret = acquire_mandatory_lock(fs, fd2); +    if (ret) { +        goto out; +    } + +    /* Bring the brick up and let the locks heal. */ +    pause(); +    /*At this point, the .t would have killed and brought back 2 bricks, marking +     * the fd bad.*/ + +    ret = glfs_write(fd1, buf, 10, 0); +    if (ret > 0) { +        /* Write is supposed to fail with EBADFD*/ +        LOG_ERR("glfs_write", ret); +        goto out; +    } + +    ret = 0; +out: +    if (fd1) +        glfs_close(fd1); +    if (fd2) +        glfs_close(fd2); +    return ret; +} + +static void +sigusr1_handler(int signo) +{ +    /*Signal caught. Just continue with the execution.*/ +} + +int +main(int argc, char *argv[]) +{ +    int ret = 0; +    glfs_t *fs = NULL; +    char *volname = NULL; +    char log_file[100]; +    char *hostname = NULL; +    char *fname1 = NULL; +    char *fname2 = NULL; + +    if (argc != 7) { +        fprintf(stderr, +                "Expect following args %s <host> <volname> <file1> <file2> " +                "<log file " +                "location> <log_file_suffix>\n", +                argv[0]); +        return -1; +    } + +    hostname = argv[1]; +    volname = argv[2]; +    fname1 = argv[3]; +    fname2 = argv[4]; + +    /*Use SIGUSR1 and pause()as a means of hitting break-points this program +     *when signalled from the .t test case.*/ +    if (signal(SIGUSR1, sigusr1_handler) == SIG_ERR) { +        LOG_ERR("SIGUSR1 handler error", errno); +        exit(EXIT_FAILURE); +    } + +    sprintf(log_file, "%s/%s.%s.%s", argv[5], "lock-heal.c", argv[6], "log"); +    logfile_fp = fopen(log_file, "w"); +    if (!logfile_fp) { +        fprintf(stderr, "\nfailed to open %s\n", log_file); +        fflush(stderr); +        return -1; +    } + +    sprintf(log_file, "%s/%s.%s.%s", argv[5], "glfs-client", argv[6], "log"); +    fs = setup_client(hostname, volname, log_file); +    if (!fs) { +        LOG_ERR("setup_client", errno); +        return -1; +    } + +    ret = perform_test(fs, fname1, fname2); + +error: +    if (fs) { +        /*glfs_fini(fs)*/;  // glfs fini path is racy and crashes the program +    } + +    fclose(logfile_fp); + +    return ret; +} diff --git a/tests/basic/fencing/afr-lock-heal-advanced.t b/tests/basic/fencing/afr-lock-heal-advanced.t new file mode 100644 index 00000000000..8a7a208db29 --- /dev/null +++ b/tests/basic/fencing/afr-lock-heal-advanced.t @@ -0,0 +1,104 @@ +#!/bin/bash + +. $(dirname $0)/../../include.rc +. $(dirname $0)/../../volume.rc + +cleanup; + +function is_gfapi_program_alive() +{ +        pid=$1 +        ps -p $pid +        if [ $? 
-eq 0 ] +        then +                echo "Y" +        else +                echo "N" +        fi +} + +function get_active_lock_count { +    brick=$1 +    sdump=$(generate_brick_statedump $V0 $H0 $brick) +    lock_count="$(grep ACTIVE $sdump| wc -l)" +    echo "$lock_count" +} + +TEST glusterd +TEST pidof glusterd +TEST $CLI volume info; + +TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2} +EXPECT 'Created' volinfo_field $V0 'Status'; +TEST $CLI volume set $V0 performance.write-behind off +TEST $CLI volume set $V0 performance.open-behind off +TEST $CLI volume set $V0 locks.mandatory-locking forced +TEST $CLI volume set $V0 enforce-mandatory-lock on +TEST $CLI volume start $V0; +EXPECT 'Started' volinfo_field $V0 'Status'; + +logdir=`gluster --print-logdir` +TEST build_tester $(dirname $0)/afr-lock-heal-advanced.c -lgfapi -ggdb + +#------------------------------------------------------------------------------ +# Use more than 1 fd from same client so that list_for_each_* loops are executed more than once. +$(dirname $0)/afr-lock-heal-advanced $H0 $V0 "/FILE1" "/FILE2" $logdir C1& +client_pid=$! +TEST [ $client_pid ] + +TEST sleep 5 # By now, the client would  have opened an fd on FILE1 and FILE2 and waiting for a SIGUSR1. +EXPECT "Y" is_gfapi_program_alive $client_pid + +# Kill brick-3 and let client-1 take lock on both files. +TEST kill_brick $V0 $H0 $B0/${V0}2 +TEST kill -SIGUSR1 $client_pid +# If program is still alive, glfs_file_lock() was a success. +EXPECT "Y" is_gfapi_program_alive $client_pid + +# Check lock is present on brick-1 and brick-2 +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "2" get_active_lock_count $B0/${V0}0 +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "2" get_active_lock_count $B0/${V0}1 + +# Restart brick-3 and check that the lock has healed on it. +TEST $CLI volume start $V0 force +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}2 +TEST sleep 10 #Needed for client to re-open fd? Otherwise client_pre_lk_v2() fails with EBADFD for remote-fd. + +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "2" get_active_lock_count $B0/${V0}2 + +#------------------------------------------------------------------------------ +# Kill same brick before heal completes the first time and check it completes the second time. +TEST $CLI volume set $V0 delay-gen locks +TEST $CLI volume set $V0 delay-gen.delay-duration 5000000 +TEST $CLI volume set $V0 delay-gen.delay-percentage 100 +TEST $CLI volume set $V0 delay-gen.enable finodelk + +TEST kill_brick $V0 $H0 $B0/${V0}0 +TEST $CLI volume start $V0 force +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}0 +TEST kill_brick $V0 $H0 $B0/${V0}0 +TEST $CLI volume reset $V0 delay-gen +TEST $CLI volume start $V0 force +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}0 +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "2" get_active_lock_count $B0/${V0}0 + +#------------------------------------------------------------------------------ +# Kill 2 bricks and bring it back. The fds must be marked bad. +TEST kill_brick $V0 $H0 $B0/${V0}0 +TEST kill_brick $V0 $H0 $B0/${V0}1 +TEST $CLI volume start $V0 force +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}0 +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}1 + +# TODO: `gluster v statedump $V0 client localhost:$client_pid` is not working, +# so sleep for 20 seconds for the client to connect to connect to the bricks. +TEST sleep $CHILD_UP_TIMEOUT + +# Try to write to FILE1 from the .c; it must fail. 
+TEST kill -SIGUSR1 $client_pid +wait $client_pid +ret=$? +TEST [ $ret == 0 ] + +cleanup_tester $(dirname $0)/afr-lock-heal-advanced +cleanup; diff --git a/tests/basic/fencing/afr-lock-heal-basic.c b/tests/basic/fencing/afr-lock-heal-basic.c new file mode 100644 index 00000000000..768c9e57181 --- /dev/null +++ b/tests/basic/fencing/afr-lock-heal-basic.c @@ -0,0 +1,182 @@ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <errno.h> +#include <signal.h> +#include <unistd.h> +#include <glusterfs/api/glfs.h> +#include <glusterfs/api/glfs-handles.h> + +#define GF_ENFORCE_MANDATORY_LOCK "trusted.glusterfs.enforce-mandatory-lock" + +FILE *logfile_fp; + +#define LOG_ERR(func, err)                                                     \ +    do {                                                                       \ +        if (!logfile_fp) {                                                     \ +            fprintf(stderr, "%\n%d %s : returned error (%s)\n", __LINE__,      \ +                    func, strerror(err));                                      \ +            fflush(stderr);                                                    \ +        } else {                                                               \ +            fprintf(logfile_fp, "\n%d %s : returned error (%s)\n", __LINE__,   \ +                    func, strerror(err));                                      \ +            fflush(logfile_fp);                                                \ +        }                                                                      \ +    } while (0) + +glfs_t * +setup_client(char *hostname, char *volname, char *log_file) +{ +    int ret = 0; +    glfs_t *fs = NULL; + +    fs = glfs_new(volname); +    if (!fs) { +        fprintf(logfile_fp, "\nglfs_new: returned NULL (%s)\n", +                strerror(errno)); +        goto error; +    } + +    ret = glfs_set_volfile_server(fs, "tcp", hostname, 24007); +    if (ret < 0) { +        fprintf(logfile_fp, "\nglfs_set_volfile_server failed ret:%d (%s)\n", +                ret, strerror(errno)); +        goto error; +    } + +    ret = glfs_set_logging(fs, log_file, 7); +    if (ret < 0) { +        fprintf(logfile_fp, "\nglfs_set_logging failed with ret: %d (%s)\n", +                ret, strerror(errno)); +        goto error; +    } + +    ret = glfs_init(fs); +    if (ret < 0) { +        fprintf(logfile_fp, "\nglfs_init failed with ret: %d (%s)\n", ret, +                strerror(errno)); +        goto error; +    } + +out: +    return fs; +error: +    return NULL; +} + +int +acquire_mandatory_lock(glfs_t *fs, char *fname) +{ +    struct flock lock; +    int ret = 0; +    glfs_fd_t *fd = NULL; + +    fd = glfs_creat(fs, fname, O_CREAT, 0644); +    if (!fd) { +        if (errno != EEXIST) { +            LOG_ERR("glfs_creat", errno); +            ret = -1; +            goto out; +        } +        fd = glfs_open(fs, fname, O_RDWR | O_NONBLOCK); +        if (!fd) { +            LOG_ERR("glfs_open", errno); +            ret = -1; +            goto out; +        } +    } + +    /* initialize lock */ +    lock.l_type = F_WRLCK; +    lock.l_whence = SEEK_SET; +    lock.l_start = 0; +    lock.l_len = 100; + +    ret = glfs_fsetxattr(fd, GF_ENFORCE_MANDATORY_LOCK, "set", 8, 0); +    if (ret < 0) { +        LOG_ERR("glfs_fsetxattr", errno); +        ret = -1; +        goto out; +    } + +    pause(); + +    /* take a write mandatory lock */ +    ret = glfs_file_lock(fd, F_SETLKW, &lock, GLFS_LK_MANDATORY); +    if (ret) { +        
LOG_ERR("glfs_file_lock", errno); +        goto out; +    } + +    pause(); + +out: +    if (fd) { +        glfs_close(fd); +    } + +    return ret; +} + +static void +sigusr1_handler(int signo) +{ +    /*Signal caught. Just continue with the execution.*/ +} + +int +main(int argc, char *argv[]) +{ +    int ret = 0; +    glfs_t *fs = NULL; +    char *volname = NULL; +    char log_file[100]; +    char *hostname = NULL; +    char *fname = NULL; + +    if (argc != 6) { +        fprintf(stderr, +                "Expect following args %s <host> <volname> <file> <log file " +                "location> <log_file_suffix>\n", +                argv[0]); +        return -1; +    } + +    hostname = argv[1]; +    volname = argv[2]; +    fname = argv[3]; + +    /*Use SIGUSR1 and pause()as a means of hitting break-points this program +     *when signalled from the .t test case.*/ +    if (signal(SIGUSR1, sigusr1_handler) == SIG_ERR) { +        LOG_ERR("SIGUSR1 handler error", errno); +        exit(EXIT_FAILURE); +    } + +    sprintf(log_file, "%s/%s.%s.%s", argv[4], "lock-heal-basic.c", argv[5], +            "log"); +    logfile_fp = fopen(log_file, "w"); +    if (!logfile_fp) { +        fprintf(stderr, "\nfailed to open %s\n", log_file); +        fflush(stderr); +        return -1; +    } + +    sprintf(log_file, "%s/%s.%s.%s", argv[4], "glfs-client", argv[5], "log"); +    fs = setup_client(hostname, volname, log_file); +    if (!fs) { +        LOG_ERR("setup_client", errno); +        return -1; +    } + +    ret = acquire_mandatory_lock(fs, fname); + +error: +    if (fs) { +        /*glfs_fini(fs)*/;  // glfs fini path is racy and crashes the program +    } + +    fclose(logfile_fp); + +    return ret; +} diff --git a/tests/basic/fencing/afr-lock-heal-basic.t b/tests/basic/fencing/afr-lock-heal-basic.t new file mode 100644 index 00000000000..5ac05c7aec6 --- /dev/null +++ b/tests/basic/fencing/afr-lock-heal-basic.t @@ -0,0 +1,99 @@ +#!/bin/bash + +. $(dirname $0)/../../include.rc +. $(dirname $0)/../../volume.rc + +cleanup; + +function is_gfapi_program_alive() +{ +        pid=$1 +        ps -p $pid +        if [ $? -eq 0 ] +        then +                echo "Y" +        else +                echo "N" +        fi +} + +TEST glusterd +TEST pidof glusterd +TEST $CLI volume info; + +TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2} +EXPECT 'Created' volinfo_field $V0 'Status'; +TEST $CLI volume set $V0 performance.write-behind off +TEST $CLI volume set $V0 performance.open-behind off +TEST $CLI volume set $V0 locks.mandatory-locking forced +TEST $CLI volume set $V0 enforce-mandatory-lock on +TEST $CLI volume start $V0; +EXPECT 'Started' volinfo_field $V0 'Status'; + +logdir=`gluster --print-logdir` +TEST build_tester $(dirname $0)/afr-lock-heal-basic.c -lgfapi -ggdb + +$(dirname $0)/afr-lock-heal-basic $H0 $V0 "/FILE" $logdir C1& +client1_pid=$! +TEST [ $client1_pid ] + +$(dirname $0)/afr-lock-heal-basic $H0 $V0 "/FILE" $logdir C2& +client2_pid=$! +TEST [ $client2_pid ] + +TEST sleep 5 # By now, the 2 clients would  have opened an fd on FILE and waiting for a SIGUSR1. +EXPECT "Y" is_gfapi_program_alive $client1_pid +EXPECT "Y" is_gfapi_program_alive $client2_pid + +# Kill brick-3 and let client-1 take lock on the file. +TEST kill_brick $V0 $H0 $B0/${V0}2 +TEST kill -SIGUSR1 $client1_pid +# If program is still alive, glfs_file_lock() was a success. 
+EXPECT "Y" is_gfapi_program_alive $client1_pid + +# Check lock is present on brick-1 and brick-2 +b1_sdump=$(generate_brick_statedump $V0 $H0 $B0/${V0}0) +b2_sdump=$(generate_brick_statedump $V0 $H0 $B0/${V0}1) +c1_lock_on_b1="$(grep ACTIVE $b1_sdump| awk '{print $1,$2,$3,S4,$5,$6,$7,$8}'|tr -d '(,), ,')" +c1_lock_on_b2="$(grep ACTIVE $b2_sdump| awk '{print $1,$2,$3,S4,$5,$6,$7,$8}'|tr -d '(,), ,')" +TEST [ "$c1_lock_on_b1" == "$c1_lock_on_b2" ] + +# Restart brick-3 and check that the lock has healed on it. +TEST $CLI volume start $V0 force +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}2 +TEST sleep 10 #Needed for client to re-open fd? Otherwise client_pre_lk_v2() fails with EBADFD for remote-fd. Also wait for lock heal. + +b3_sdump=$(generate_brick_statedump $V0 $H0 $B0/${V0}2) +c1_lock_on_b3="$(grep ACTIVE $b3_sdump| awk '{print $1,$2,$3,S4,$5,$6,$7,$8}'|tr -d '(,), ,')" +TEST [ "$c1_lock_on_b1" == "$c1_lock_on_b3" ] + +# Kill brick-1 and let client-2 preempt the lock on bricks 2 and 3. +TEST kill_brick $V0 $H0 $B0/${V0}0 +TEST kill -SIGUSR1 $client2_pid +# If program is still alive, glfs_file_lock() was a success. +EXPECT "Y" is_gfapi_program_alive $client2_pid + +# Restart brick-1 and let lock healing complete. +TEST $CLI volume start $V0 force +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}0 +TEST sleep 10 #Needed for client to re-open fd? Otherwise client_pre_lk_v2() fails with EBADFD for remote-fd. Also wait for lock heal. + +# Check that all bricks now have locks from client 2 only. +b1_sdump=$(generate_brick_statedump $V0 $H0 $B0/${V0}0) +b2_sdump=$(generate_brick_statedump $V0 $H0 $B0/${V0}1) +b3_sdump=$(generate_brick_statedump $V0 $H0 $B0/${V0}2) +c2_lock_on_b1="$(grep ACTIVE $b1_sdump| awk '{print $1,$2,$3,S4,$5,$6,$7,$8}'|tr -d '(,), ,')" +c2_lock_on_b2="$(grep ACTIVE $b2_sdump| awk '{print $1,$2,$3,S4,$5,$6,$7,$8}'|tr -d '(,), ,')" +c2_lock_on_b3="$(grep ACTIVE $b3_sdump| awk '{print $1,$2,$3,S4,$5,$6,$7,$8}'|tr -d '(,), ,')" +TEST [ "$c2_lock_on_b1" == "$c2_lock_on_b2" ] +TEST [ "$c2_lock_on_b1" == "$c2_lock_on_b3" ] +TEST [ "$c2_lock_on_b1" != "$c1_lock_on_b1" ] + +#Let the client programs run and exit. 
+TEST kill -SIGUSR1 $client1_pid +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "N" is_gfapi_program_alive $client1_pid +TEST kill -SIGUSR1 $client2_pid +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "N" is_gfapi_program_alive $client2_pid + +cleanup_tester $(dirname $0)/afr-lock-heal-basic +cleanup; diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c index 4b22af7cb3f..07bf53a1941 100644 --- a/xlators/cluster/afr/src/afr-common.c +++ b/xlators/cluster/afr/src/afr-common.c @@ -45,6 +45,21 @@ afr_quorum_errno(afr_private_t *priv)      return ENOTCONN;  } +static void +afr_fill_success_replies(afr_local_t *local, afr_private_t *priv, +                         unsigned char *replies) +{ +    int i = 0; + +    for (i = 0; i < priv->child_count; i++) { +        if (local->replies[i].valid && local->replies[i].op_ret == 0) { +            replies[i] = 1; +        } else { +            replies[i] = 0; +        } +    } +} +  int  afr_fav_child_reset_sink_xattrs(void *opaque); @@ -54,6 +69,581 @@ afr_fav_child_reset_sink_xattrs_cbk(int ret, call_frame_t *frame, void *opaque);  static void  afr_discover_done(call_frame_t *frame, xlator_t *this); +int +afr_dom_lock_acquire_cbk(call_frame_t *frame, void *cookie, xlator_t *this, +                         int op_ret, int op_errno, dict_t *xdata) +{ +    afr_local_t *local = frame->local; +    afr_private_t *priv = this->private; +    int i = (long)cookie; + +    local->cont.lk.dom_lock_op_ret[i] = op_ret; +    local->cont.lk.dom_lock_op_errno[i] = op_errno; +    if (op_ret < 0) { +        gf_msg(this->name, GF_LOG_ERROR, op_errno, AFR_MSG_LK_HEAL_DOM, +               "%s: Failed to acquire %s on %s", +               uuid_utoa(local->fd->inode->gfid), AFR_LK_HEAL_DOM, +               priv->children[i]->name); +    } else { +        local->cont.lk.dom_locked_nodes[i] = 1; +    } + +    syncbarrier_wake(&local->barrier); + +    return 0; +} + +int +afr_dom_lock_acquire(call_frame_t *frame) +{ +    afr_local_t *local = NULL; +    afr_private_t *priv = NULL; +    struct gf_flock flock = { +        0, +    }; +    int i = 0; + +    priv = frame->this->private; +    local = frame->local; +    local->cont.lk.dom_locked_nodes = GF_CALLOC( +        priv->child_count, sizeof(*local->cont.lk.locked_nodes), +        gf_afr_mt_char); +    if (!local->cont.lk.dom_locked_nodes) { +        return -ENOMEM; +    } +    local->cont.lk.dom_lock_op_ret = GF_CALLOC( +        priv->child_count, sizeof(*local->cont.lk.dom_lock_op_ret), +        gf_afr_mt_int32_t); +    if (!local->cont.lk.dom_lock_op_ret) { +        return -ENOMEM; /* CALLOC'd members are freed in afr_local_cleanup. */ +    } +    local->cont.lk.dom_lock_op_errno = GF_CALLOC( +        priv->child_count, sizeof(*local->cont.lk.dom_lock_op_errno), +        gf_afr_mt_int32_t); +    if (!local->cont.lk.dom_lock_op_errno) { +        return -ENOMEM; /* CALLOC'd members are freed in afr_local_cleanup. 
*/ +    } +    flock.l_type = F_WRLCK; + +    AFR_ONALL(frame, afr_dom_lock_acquire_cbk, finodelk, AFR_LK_HEAL_DOM, +              local->fd, F_SETLK, &flock, NULL); + +    if (!afr_has_quorum(local->cont.lk.dom_locked_nodes, frame->this, NULL)) +        goto blocking_lock; + +    /*If any of the bricks returned EAGAIN, we still need blocking locks.*/ +    if (AFR_COUNT(local->cont.lk.dom_locked_nodes, priv->child_count) != +        priv->child_count) { +        for (i = 0; i < priv->child_count; i++) { +            if (local->cont.lk.dom_lock_op_ret[i] == -1 && +                local->cont.lk.dom_lock_op_errno[i] == EAGAIN) +                goto blocking_lock; +        } +    } + +    return 0; + +blocking_lock: +    afr_dom_lock_release(frame); +    AFR_ONALL(frame, afr_dom_lock_acquire_cbk, finodelk, AFR_LK_HEAL_DOM, +              local->fd, F_SETLKW, &flock, NULL); +    if (!afr_has_quorum(local->cont.lk.dom_locked_nodes, frame->this, NULL)) { +        afr_dom_lock_release(frame); +        return -afr_quorum_errno(priv); +    } + +    return 0; +} + +int +afr_dom_lock_release_cbk(call_frame_t *frame, void *cookie, xlator_t *this, +                         int op_ret, int op_errno, dict_t *xdata) +{ +    afr_local_t *local = frame->local; +    afr_private_t *priv = this->private; +    int i = (long)cookie; + +    if (op_ret < 0) { +        gf_msg(this->name, GF_LOG_ERROR, op_errno, AFR_MSG_LK_HEAL_DOM, +               "%s: Failed to release %s on %s", local->loc.path, +               AFR_LK_HEAL_DOM, priv->children[i]->name); +    } +    local->cont.lk.dom_locked_nodes[i] = 0; + +    syncbarrier_wake(&local->barrier); + +    return 0; +} + +void +afr_dom_lock_release(call_frame_t *frame) +{ +    afr_local_t *local = NULL; +    afr_private_t *priv = NULL; +    unsigned char *locked_on = NULL; +    struct gf_flock flock = { +        0, +    }; + +    local = frame->local; +    priv = frame->this->private; +    locked_on = local->cont.lk.dom_locked_nodes; +    if (AFR_COUNT(locked_on, priv->child_count) == 0) +        return; +    flock.l_type = F_UNLCK; + +    AFR_ONLIST(locked_on, frame, afr_dom_lock_release_cbk, finodelk, +               AFR_LK_HEAL_DOM, local->fd, F_SETLK, &flock, NULL); + +    return; +} + +static void +afr_lk_heal_info_cleanup(afr_lk_heal_info_t *info) +{ +    if (!info) +        return; +    if (info->xdata_req) +        dict_unref(info->xdata_req); +    if (info->fd) +        fd_unref(info->fd); +    GF_FREE(info->locked_nodes); +    GF_FREE(info->child_up_event_gen); +    GF_FREE(info->child_down_event_gen); +    GF_FREE(info); +} + +static int +afr_add_lock_to_saved_locks(call_frame_t *frame, xlator_t *this) +{ +    afr_private_t *priv = this->private; +    afr_local_t *local = frame->local; +    afr_lk_heal_info_t *info = NULL; +    afr_fd_ctx_t *fd_ctx = NULL; +    int ret = -ENOMEM; + +    info = GF_CALLOC(sizeof(*info), 1, gf_afr_mt_lk_heal_info_t); +    if (!info) { +        goto cleanup; +    } +    INIT_LIST_HEAD(&info->pos); +    info->fd = fd_ref(local->fd); +    info->cmd = local->cont.lk.cmd; +    info->pid = frame->root->pid; +    info->flock = local->cont.lk.user_flock; +    info->xdata_req = dict_copy_with_ref(local->xdata_req, NULL); +    if (!info->xdata_req) { +        goto cleanup; +    } +    info->lk_owner = frame->root->lk_owner; +    info->locked_nodes = GF_MALLOC( +        sizeof(*info->locked_nodes) * priv->child_count, gf_afr_mt_char); +    if (!info->locked_nodes) { +        goto cleanup; +    } +    memcpy(info->locked_nodes, 
local->cont.lk.locked_nodes, +           sizeof(*info->locked_nodes) * priv->child_count); +    info->child_up_event_gen = GF_CALLOC(sizeof(*info->child_up_event_gen), +                                         priv->child_count, gf_afr_mt_int32_t); +    if (!info->child_up_event_gen) { +        goto cleanup; +    } +    info->child_down_event_gen = GF_CALLOC(sizeof(*info->child_down_event_gen), +                                           priv->child_count, +                                           gf_afr_mt_int32_t); +    if (!info->child_down_event_gen) { +        goto cleanup; +    } + +    LOCK(&local->fd->lock); +    { +        fd_ctx = __afr_fd_ctx_get(local->fd, this); +        if (fd_ctx) +            fd_ctx->lk_heal_info = info; +    } +    UNLOCK(&local->fd->lock); +    if (!fd_ctx) { +        goto cleanup; +    } + +    LOCK(&priv->lock); +    { +        list_add_tail(&info->pos, &priv->saved_locks); +    } +    UNLOCK(&priv->lock); + +    return 0; +cleanup: +    gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_LK_HEAL_DOM, +           "%s: Failed to add lock to healq", +           uuid_utoa(local->fd->inode->gfid)); +    if (info) { +        afr_lk_heal_info_cleanup(info); +        if (fd_ctx) { +            LOCK(&local->fd->lock); +            { +                fd_ctx->lk_heal_info = NULL; +            } +            UNLOCK(&local->fd->lock); +        } +    } +    return ret; +} + +static int +afr_remove_lock_from_saved_locks(afr_local_t *local, xlator_t *this) +{ +    afr_private_t *priv = this->private; +    struct gf_flock flock = local->cont.lk.user_flock; +    afr_lk_heal_info_t *info = NULL; +    afr_fd_ctx_t *fd_ctx = NULL; +    int ret = -EINVAL; + +    fd_ctx = afr_fd_ctx_get(local->fd, this); +    if (!fd_ctx || !fd_ctx->lk_heal_info) { +        goto out; +    } + +    info = fd_ctx->lk_heal_info; +    if ((info->flock.l_start != flock.l_start) || +        (info->flock.l_whence != flock.l_whence) || +        (info->flock.l_len != flock.l_len)) { +        /*TODO: Compare lkowners too.*/ +        goto out; +    } + +    LOCK(&priv->lock); +    { +        list_del(&fd_ctx->lk_heal_info->pos); +    } +    UNLOCK(&priv->lock); + +    afr_lk_heal_info_cleanup(info); +    fd_ctx->lk_heal_info = NULL; +    ret = 0; +out: +    if (ret) +        gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_LK_HEAL_DOM, +               "%s: Failed to remove lock from healq", +               uuid_utoa(local->fd->inode->gfid)); +    return ret; +} + +int +afr_lock_heal_cbk(call_frame_t *frame, void *cookie, xlator_t *this, +                  int32_t op_ret, int32_t op_errno, struct gf_flock *lock, +                  dict_t *xdata) +{ +    afr_local_t *local = frame->local; +    int i = (long)cookie; + +    local->replies[i].valid = 1; +    local->replies[i].op_ret = op_ret; +    local->replies[i].op_errno = op_errno; +    if (op_ret != 0) { +        gf_msg(this->name, GF_LOG_ERROR, op_errno, AFR_MSG_LK_HEAL_DOM, +               "Failed to heal lock on child %d for %s", i, +               uuid_utoa(local->fd->inode->gfid)); +    } +    syncbarrier_wake(&local->barrier); +    return 0; +} + +int +afr_getlk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, +              int32_t op_errno, struct gf_flock *lock, dict_t *xdata) +{ +    afr_local_t *local = frame->local; +    int i = (long)cookie; + +    local->replies[i].valid = 1; +    local->replies[i].op_ret = op_ret; +    local->replies[i].op_errno = op_errno; +    if (op_ret != 0) { +        gf_msg(this->name, 
GF_LOG_ERROR, op_errno, AFR_MSG_LK_HEAL_DOM, +               "Failed getlk for %s", uuid_utoa(local->fd->inode->gfid)); +    } else { +        local->cont.lk.getlk_rsp[i] = *lock; +    } + +    syncbarrier_wake(&local->barrier); +    return 0; +} + +static gf_boolean_t +afr_does_lk_owner_match(call_frame_t *frame, afr_private_t *priv, +                        afr_lk_heal_info_t *info) +{ +    int i = 0; +    afr_local_t *local = frame->local; +    struct gf_flock flock = { +        0, +    }; +    gf_boolean_t ret = _gf_true; +    char *wind_on = alloca0(priv->child_count); +    unsigned char *success_replies = alloca0(priv->child_count); +    local->cont.lk.getlk_rsp = GF_CALLOC(sizeof(*local->cont.lk.getlk_rsp), +                                         priv->child_count, gf_afr_mt_gf_lock); + +    flock = info->flock; +    for (i = 0; i < priv->child_count; i++) { +        if (info->locked_nodes[i]) +            wind_on[i] = 1; +    } + +    AFR_ONLIST(wind_on, frame, afr_getlk_cbk, lk, info->fd, F_GETLK, &flock, +               info->xdata_req); + +    afr_fill_success_replies(local, priv, success_replies); +    if (AFR_COUNT(success_replies, priv->child_count) == 0) { +        ret = _gf_false; +        goto out; +    } + +    for (i = 0; i < priv->child_count; i++) { +        if (!local->replies[i].valid || local->replies[i].op_ret != 0) +            continue; +        if (local->cont.lk.getlk_rsp[i].l_type == F_UNLCK) +            continue; +        /*TODO: Do we really need to compare lkowner if F_UNLCK is true?*/ +        if (!is_same_lkowner(&local->cont.lk.getlk_rsp[i].l_owner, +                             &info->lk_owner)) { +            ret = _gf_false; +            break; +        } +    } +out: +    afr_local_replies_wipe(local, priv); +    GF_FREE(local->cont.lk.getlk_rsp); +    local->cont.lk.getlk_rsp = NULL; +    return ret; +} + +static void +afr_mark_fd_bad(fd_t *fd, xlator_t *this) +{ +    afr_fd_ctx_t *fd_ctx = NULL; + +    if (!fd) +        return; +    LOCK(&fd->lock); +    { +        fd_ctx = __afr_fd_ctx_get(fd, this); +        if (fd_ctx) { +            fd_ctx->is_fd_bad = _gf_true; +            fd_ctx->lk_heal_info = NULL; +        } +    } +    UNLOCK(&fd->lock); +} + +static void +afr_add_lock_to_lkhealq(afr_private_t *priv, afr_lk_heal_info_t *info) +{ +    LOCK(&priv->lock); +    { +        list_del(&info->pos); +        list_add_tail(&info->pos, &priv->lk_healq); +    } +    UNLOCK(&priv->lock); +} + +static void +afr_lock_heal_do(call_frame_t *frame, afr_private_t *priv, +                 afr_lk_heal_info_t *info) +{ +    int i = 0; +    int op_errno = 0; +    int32_t *current_event_gen = NULL; +    afr_local_t *local = frame->local; +    xlator_t *this = frame->this; +    char *wind_on = alloca0(priv->child_count); +    gf_boolean_t retry = _gf_true; + +    frame->root->pid = info->pid; +    lk_owner_copy(&frame->root->lk_owner, &info->lk_owner); + +    op_errno = -afr_dom_lock_acquire(frame); +    if ((op_errno != 0)) { +        goto release; +    } + +    if (!afr_does_lk_owner_match(frame, priv, info)) { +        gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_LK_HEAL_DOM, +               "Ignoring lock heal for %s since lk-onwers mismatch. 
" +               "Lock possibly pre-empted by another client.", +               uuid_utoa(info->fd->inode->gfid)); +        goto release; +    } + +    for (i = 0; i < priv->child_count; i++) { +        if (info->locked_nodes[i]) +            continue; +        wind_on[i] = 1; +    } + +    current_event_gen = alloca(priv->child_count); +    memcpy(current_event_gen, info->child_up_event_gen, +           priv->child_count * sizeof *current_event_gen); +    AFR_ONLIST(wind_on, frame, afr_lock_heal_cbk, lk, info->fd, info->cmd, +               &info->flock, info->xdata_req); + +    LOCK(&priv->lock); +    { +        for (i = 0; i < priv->child_count; i++) { +            if (!wind_on[i]) +                continue; +            if ((!local->replies[i].valid) || (local->replies[i].op_ret != 0)) { +                continue; +            } + +            if ((current_event_gen[i] == info->child_up_event_gen[i]) && +                (current_event_gen[i] > info->child_down_event_gen[i])) { +                info->locked_nodes[i] = 1; +                retry = _gf_false; +                list_del_init(&info->pos); +                list_add_tail(&info->pos, &priv->saved_locks); +            } else { +                /*We received subsequent child up/down events while heal was in +                 * progress; don't mark child as healed. Attempt again on the +                 * new child up*/ +                gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_LK_HEAL_DOM, +                       "Event gen mismatch: skipped healing lock on child %d " +                       "for %s.", +                       i, uuid_utoa(info->fd->inode->gfid)); +            } +        } +    } +    UNLOCK(&priv->lock); + +release: +    afr_dom_lock_release(frame); +    if (retry) +        afr_add_lock_to_lkhealq(priv, info); +    return; +} + +static int +afr_lock_heal_done(int ret, call_frame_t *frame, void *opaque) +{ +    STACK_DESTROY(frame->root); +    return 0; +} + +static int +afr_lock_heal(void *opaque) +{ +    call_frame_t *frame = (call_frame_t *)opaque; +    call_frame_t *iter_frame = NULL; +    xlator_t *this = frame->this; +    afr_private_t *priv = this->private; +    afr_lk_heal_info_t *info = NULL; +    afr_lk_heal_info_t *tmp = NULL; +    struct list_head healq = { +        0, +    }; +    int ret = 0; + +    iter_frame = afr_copy_frame(frame); +    if (!iter_frame) { +        return ENOMEM; +    } + +    INIT_LIST_HEAD(&healq); +    LOCK(&priv->lock); +    { +        list_splice_init(&priv->lk_healq, &healq); +    } +    UNLOCK(&priv->lock); + +    list_for_each_entry_safe(info, tmp, &healq, pos) +    { +        GF_ASSERT((AFR_COUNT(info->locked_nodes, priv->child_count) < +                   priv->child_count)); +        ((afr_local_t *)(iter_frame->local))->fd = fd_ref(info->fd); +        afr_lock_heal_do(iter_frame, priv, info); +        AFR_STACK_RESET(iter_frame); +        if (iter_frame->local == NULL) { +            ret = ENOTCONN; +            gf_msg(frame->this->name, GF_LOG_ERROR, ENOTCONN, +                   AFR_MSG_LK_HEAL_DOM, +                   "Aborting processing of lk_healq." 
+                   "Healing will be reattempted on next child up for locks " +                   "that are still in quorum."); +            LOCK(&priv->lock); +            { +                list_add_tail(&healq, &priv->lk_healq); +            } +            UNLOCK(&priv->lock); +            break; +        } +    } + +    AFR_STACK_DESTROY(iter_frame); +    return ret; +} + +static int +__afr_lock_heal_synctask(xlator_t *this, afr_private_t *priv, int child) +{ +    int ret = 0; +    call_frame_t *frame = NULL; +    afr_lk_heal_info_t *info = NULL; +    afr_lk_heal_info_t *tmp = NULL; + +    if (priv->shd.iamshd) +        return 0; + +    list_for_each_entry_safe(info, tmp, &priv->saved_locks, pos) +    { +        info->child_up_event_gen[child] = priv->event_generation; +        list_del_init(&info->pos); +        list_add_tail(&info->pos, &priv->lk_healq); +    } + +    frame = create_frame(this, this->ctx->pool); +    if (!frame) +        return -1; + +    ret = synctask_new(this->ctx->env, afr_lock_heal, afr_lock_heal_done, frame, +                       frame); +    if (ret) +        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_LK_HEAL_DOM, +               "Failed to launch lock heal synctask"); + +    return ret; +} + +static int +__afr_mark_pending_lk_heal(xlator_t *this, afr_private_t *priv, int child) +{ +    afr_lk_heal_info_t *info = NULL; +    afr_lk_heal_info_t *tmp = NULL; + +    if (priv->shd.iamshd) +        return 0; +    list_for_each_entry_safe(info, tmp, &priv->saved_locks, pos) +    { +        info->child_down_event_gen[child] = priv->event_generation; +        if (info->locked_nodes[child] == 1) +            info->locked_nodes[child] = 0; +        if (!afr_has_quorum(info->locked_nodes, this, NULL)) { +            /* Since the lock was lost on quorum no. of nodes, we should +             * not attempt to heal it anymore. Some other client could have +             * acquired the lock, modified data and released it and this +             * client wouldn't know about it if we heal it.*/ +            afr_mark_fd_bad(info->fd, this); +            list_del(&info->pos); +            afr_lk_heal_info_cleanup(info); +            /* We're not winding an unlock on the node where the lock is still +             * present because when fencing logic switches over to the new +             * client (since we marked the fd bad), it should preempt any +             * existing lock. 
*/ +        } +    } +    return 0; +} +  gf_boolean_t  afr_is_consistent_io_possible(afr_local_t *local, afr_private_t *priv,                                int32_t *op_errno) @@ -68,6 +658,19 @@ afr_is_consistent_io_possible(afr_local_t *local, afr_private_t *priv,      return _gf_true;  } +gf_boolean_t +afr_is_lock_mode_mandatory(dict_t *xdata) +{ +    int ret = 0; +    uint32_t lk_mode = GF_LK_ADVISORY; + +    ret = dict_get_uint32(xdata, GF_LOCK_MODE, &lk_mode); +    if (!ret && lk_mode == GF_LK_MANDATORY) +        return _gf_true; + +    return _gf_false; +} +  call_frame_t *  afr_copy_frame(call_frame_t *base)  { @@ -1224,18 +1827,6 @@ refresh_done:      return 0;  } -static void -afr_fill_success_replies(afr_local_t *local, afr_private_t *priv, -                         unsigned char *replies) -{ -    int i = 0; - -    for (i = 0; i < priv->child_count; i++) { -        if (local->replies[i].valid && local->replies[i].op_ret == 0) -            replies[i] = 1; -    } -} -  int  afr_inode_refresh_done(call_frame_t *frame, xlator_t *this, int error)  { @@ -2049,6 +2640,9 @@ afr_local_cleanup(afr_local_t *local, xlator_t *this)      { /* lk */          GF_FREE(local->cont.lk.locked_nodes); +        GF_FREE(local->cont.lk.dom_locked_nodes); +        GF_FREE(local->cont.lk.dom_lock_op_ret); +        GF_FREE(local->cont.lk.dom_lock_op_errno);      }      { /* create */ @@ -3451,8 +4045,18 @@ out:  }  void -_afr_cleanup_fd_ctx(afr_fd_ctx_t *fd_ctx) +_afr_cleanup_fd_ctx(xlator_t *this, afr_fd_ctx_t *fd_ctx)  { +    afr_private_t *priv = this->private; + +    if (fd_ctx->lk_heal_info) { +        LOCK(&priv->lock); +        { +            list_del(&fd_ctx->lk_heal_info->pos); +        } +        afr_lk_heal_info_cleanup(fd_ctx->lk_heal_info); +        fd_ctx->lk_heal_info = NULL; +    }      GF_FREE(fd_ctx->opened_on);      GF_FREE(fd_ctx);      return; @@ -3472,7 +4076,7 @@ afr_cleanup_fd_ctx(xlator_t *this, fd_t *fd)      fd_ctx = (afr_fd_ctx_t *)(long)ctx;      if (fd_ctx) { -        _afr_cleanup_fd_ctx(fd_ctx); +        _afr_cleanup_fd_ctx(this, fd_ctx);      }  out: @@ -3565,13 +4169,14 @@ __afr_fd_ctx_set(xlator_t *this, fd_t *fd)      }      fd_ctx->readdir_subvol = -1; +    fd_ctx->lk_heal_info = NULL;      ret = __fd_ctx_set(fd, this, (uint64_t)(long)fd_ctx);      if (ret)          gf_msg_debug(this->name, 0, "failed to set fd ctx (%p)", fd);  out:      if (ret && fd_ctx) -        _afr_cleanup_fd_ctx(fd_ctx); +        _afr_cleanup_fd_ctx(this, fd_ctx);      return ret;  } @@ -3694,6 +4299,7 @@ afr_flush(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)      call_stub_t *stub = NULL;      int op_errno = ENOMEM; +    AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);      local = AFR_FRAME_INIT(frame, op_errno);      if (!local)          goto out; @@ -4230,9 +4836,9 @@ out:  }  static int32_t -afr_handle_inodelk(call_frame_t *frame, glusterfs_fop_t fop, const char *volume, -                   loc_t *loc, fd_t *fd, int32_t cmd, struct gf_flock *flock, -                   dict_t *xdata) +afr_handle_inodelk(call_frame_t *frame, xlator_t *this, glusterfs_fop_t fop, +                   const char *volume, loc_t *loc, fd_t *fd, int32_t cmd, +                   struct gf_flock *flock, dict_t *xdata)  {      afr_local_t *local = NULL;      int32_t op_errno = ENOMEM; @@ -4244,8 +4850,10 @@ afr_handle_inodelk(call_frame_t *frame, glusterfs_fop_t fop, const char *volume,      local->op = fop;      if (loc)          loc_copy(&local->loc, loc); -    if (fd) +    if (fd && 
(flock->l_type != F_UNLCK)) { +        AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);          local->fd = fd_ref(fd); +    }      local->cont.inodelk.volume = gf_strdup(volume);      if (!local->cont.inodelk.volume) { @@ -4274,8 +4882,8 @@ int32_t  afr_inodelk(call_frame_t *frame, xlator_t *this, const char *volume, loc_t *loc,              int32_t cmd, struct gf_flock *flock, dict_t *xdata)  { -    afr_handle_inodelk(frame, GF_FOP_INODELK, volume, loc, NULL, cmd, flock, -                       xdata); +    afr_handle_inodelk(frame, this, GF_FOP_INODELK, volume, loc, NULL, cmd, +                       flock, xdata);      return 0;  } @@ -4283,15 +4891,16 @@ int32_t  afr_finodelk(call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd,               int32_t cmd, struct gf_flock *flock, dict_t *xdata)  { -    afr_handle_inodelk(frame, GF_FOP_FINODELK, volume, NULL, fd, cmd, flock, -                       xdata); +    afr_handle_inodelk(frame, this, GF_FOP_FINODELK, volume, NULL, fd, cmd, +                       flock, xdata);      return 0;  }  static int -afr_handle_entrylk(call_frame_t *frame, glusterfs_fop_t fop, const char *volume, -                   loc_t *loc, fd_t *fd, const char *basename, entrylk_cmd cmd, -                   entrylk_type type, dict_t *xdata) +afr_handle_entrylk(call_frame_t *frame, xlator_t *this, glusterfs_fop_t fop, +                   const char *volume, loc_t *loc, fd_t *fd, +                   const char *basename, entrylk_cmd cmd, entrylk_type type, +                   dict_t *xdata)  {      afr_local_t *local = NULL;      int32_t op_errno = ENOMEM; @@ -4303,8 +4912,10 @@ afr_handle_entrylk(call_frame_t *frame, glusterfs_fop_t fop, const char *volume,      local->op = fop;      if (loc)          loc_copy(&local->loc, loc); -    if (fd) +    if (fd && (cmd != ENTRYLK_UNLOCK)) { +        AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);          local->fd = fd_ref(fd); +    }      local->cont.entrylk.cmd = cmd;      local->cont.entrylk.in_cmd = cmd;      local->cont.entrylk.type = type; @@ -4331,8 +4942,8 @@ afr_entrylk(call_frame_t *frame, xlator_t *this, const char *volume, loc_t *loc,              const char *basename, entrylk_cmd cmd, entrylk_type type,              dict_t *xdata)  { -    afr_handle_entrylk(frame, GF_FOP_ENTRYLK, volume, loc, NULL, basename, cmd, -                       type, xdata); +    afr_handle_entrylk(frame, this, GF_FOP_ENTRYLK, volume, loc, NULL, basename, +                       cmd, type, xdata);      return 0;  } @@ -4341,8 +4952,8 @@ afr_fentrylk(call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd,               const char *basename, entrylk_cmd cmd, entrylk_type type,               dict_t *xdata)  { -    afr_handle_entrylk(frame, GF_FOP_FENTRYLK, volume, NULL, fd, basename, cmd, -                       type, xdata); +    afr_handle_entrylk(frame, this, GF_FOP_FENTRYLK, volume, NULL, fd, basename, +                       cmd, type, xdata);      return 0;  } @@ -4460,9 +5071,10 @@ afr_lk_unlock_cbk(call_frame_t *frame, void *cookie, xlator_t *this,      }      call_count = afr_frame_return(frame); -    if (call_count == 0) +    if (call_count == 0) {          AFR_STACK_UNWIND(lk, frame, local->op_ret, local->op_errno, NULL,                           local->xdata_rsp); +    }      return 0;  } @@ -4561,11 +5173,133 @@ afr_lk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,  }  int +afr_lk_transaction_cbk(int ret, call_frame_t *frame, void *opaque) +{ +    return 0; +} 
+ +int +afr_lk_txn_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this, +                    int32_t op_ret, int32_t op_errno, struct gf_flock *lock, +                    dict_t *xdata) +{ +    afr_local_t *local = NULL; +    int child_index = -1; + +    local = frame->local; +    child_index = (long)cookie; +    afr_common_lock_cbk(frame, cookie, this, op_ret, op_errno, xdata); +    if (op_ret == 0) { +        local->op_ret = 0; +        local->op_errno = 0; +        local->cont.lk.locked_nodes[child_index] = 1; +        local->cont.lk.ret_flock = *lock; +    } +    syncbarrier_wake(&local->barrier); +    return 0; +} + +int +afr_lk_txn_unlock_cbk(call_frame_t *frame, void *cookie, xlator_t *this, +                      int32_t op_ret, int32_t op_errno, struct gf_flock *lock, +                      dict_t *xdata) +{ +    afr_local_t *local = frame->local; +    afr_private_t *priv = this->private; +    int child_index = (long)cookie; + +    if (op_ret < 0 && op_errno != ENOTCONN && op_errno != EBADFD) { +        gf_msg(this->name, GF_LOG_ERROR, op_errno, AFR_MSG_UNLOCK_FAIL, +               "gfid=%s: unlock failed on subvolume %s " +               "with lock owner %s", +               uuid_utoa(local->fd->inode->gfid), +               priv->children[child_index]->name, +               lkowner_utoa(&frame->root->lk_owner)); +    } +    return 0; +} +int +afr_lk_transaction(void *opaque) +{ +    call_frame_t *frame = NULL; +    xlator_t *this = NULL; +    afr_private_t *priv = NULL; +    afr_local_t *local = NULL; +    char *wind_on = NULL; +    int op_errno = 0; +    int i = 0; +    int ret = 0; + +    frame = (call_frame_t *)opaque; +    local = frame->local; +    this = frame->this; +    priv = this->private; +    wind_on = alloca0(priv->child_count); + +    if (priv->arbiter_count || priv->child_count != 3) { +        op_errno = ENOTSUP; +        gf_msg(frame->this->name, GF_LOG_ERROR, op_errno, AFR_MSG_LK_HEAL_DOM, +               "%s: Lock healing supported only for replica 3 volumes.", +               uuid_utoa(local->fd->inode->gfid)); +        goto err; +    } + +    op_errno = -afr_dom_lock_acquire(frame);  // Released during +                                              // AFR_STACK_UNWIND +    if (op_errno != 0) { +        goto err; +    } +    if (priv->quorum_count && +        !afr_has_quorum(local->cont.lk.dom_locked_nodes, this, NULL)) { +        op_errno = afr_final_errno(local, priv); +        goto err; +    } + +    for (i = 0; i < priv->child_count; i++) { +        if (priv->child_up[i] && local->cont.lk.dom_locked_nodes[i]) +            wind_on[i] = 1; +    } +    AFR_ONLIST(wind_on, frame, afr_lk_txn_wind_cbk, lk, local->fd, +               local->cont.lk.cmd, &local->cont.lk.user_flock, +               local->xdata_req); + +    if (priv->quorum_count && +        !afr_has_quorum(local->cont.lk.locked_nodes, this, NULL)) { +        local->op_ret = -1; +        local->op_errno = afr_final_errno(local, priv); +        goto unlock; +    } else { +        if (local->cont.lk.user_flock.l_type == F_UNLCK) +            ret = afr_remove_lock_from_saved_locks(local, this); +        else +            ret = afr_add_lock_to_saved_locks(frame, this); +        if (ret) { +            local->op_ret = -1; +            local->op_errno = -ret; +            goto unlock; +        } +        AFR_STACK_UNWIND(lk, frame, local->op_ret, local->op_errno, +                         &local->cont.lk.ret_flock, local->xdata_rsp); +    } + +    return 0; + +unlock: +    
local->cont.lk.user_flock.l_type = F_UNLCK; +    AFR_ONLIST(local->cont.lk.locked_nodes, frame, afr_lk_txn_unlock_cbk, lk, +               local->fd, F_SETLK, &local->cont.lk.user_flock, NULL); +err: +    AFR_STACK_UNWIND(lk, frame, -1, op_errno, NULL, NULL); +    return -1; +} + +int  afr_lk(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd,         struct gf_flock *flock, dict_t *xdata)  {      afr_private_t *priv = NULL;      afr_local_t *local = NULL; +    int ret = 0;      int i = 0;      int32_t op_errno = ENOMEM; @@ -4576,9 +5310,11 @@ afr_lk(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd,          goto out;      local->op = GF_FOP_LK; -    if (!afr_lk_is_unlock(cmd, flock) && -        !afr_is_consistent_io_possible(local, priv, &op_errno)) -        goto out; +    if (!afr_lk_is_unlock(cmd, flock)) { +        AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); +        if (!afr_is_consistent_io_possible(local, priv, &op_errno)) +            goto out; +    }      local->cont.lk.locked_nodes = GF_CALLOC(          priv->child_count, sizeof(*local->cont.lk.locked_nodes), @@ -4596,6 +5332,16 @@ afr_lk(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd,      if (xdata)          local->xdata_req = dict_ref(xdata); +    if (afr_is_lock_mode_mandatory(xdata)) { +        ret = synctask_new(this->ctx->env, afr_lk_transaction, +                           afr_lk_transaction_cbk, frame, frame); +        if (ret) { +            op_errno = ENOMEM; +            goto out; +        } +        return 0; +    } +      STACK_WIND_COOKIE(frame, afr_lk_cbk, (void *)(long)0, priv->children[i],                        priv->children[i]->fops->lk, fd, cmd, flock,                        local->xdata_req); @@ -5593,6 +6339,7 @@ afr_notify(xlator_t *this, int32_t event, void *data, void *data2)                  __afr_handle_child_up_event(this, child_xlator, idx,                                              child_latency_msec, &event,                                              &call_psh, &up_child); +                __afr_lock_heal_synctask(this, priv, idx);                  break;              case GF_EVENT_CHILD_DOWN: @@ -5606,6 +6353,7 @@ afr_notify(xlator_t *this, int32_t event, void *data, void *data2)                  __afr_handle_child_down_event(this, child_xlator, idx,                                                child_latency_msec, &event,                                                &call_psh, &up_child); +                __afr_mark_pending_lk_heal(this, priv, idx);                  break;              case GF_EVENT_CHILD_CONNECTING: diff --git a/xlators/cluster/afr/src/afr-inode-read.c b/xlators/cluster/afr/src/afr-inode-read.c index e8894a62620..c5521704de2 100644 --- a/xlators/cluster/afr/src/afr-inode-read.c +++ b/xlators/cluster/afr/src/afr-inode-read.c @@ -302,6 +302,7 @@ afr_fstat(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)      afr_local_t *local = NULL;      int op_errno = 0; +    AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);      local = AFR_FRAME_INIT(frame, op_errno);      if (!local)          goto out; @@ -1698,6 +1699,7 @@ afr_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name,      int32_t op_errno = 0;      fop_fgetxattr_cbk_t cbk = NULL; +    AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);      local = AFR_FRAME_INIT(frame, op_errno);      if (!local)          goto out; @@ -1791,6 +1793,7 @@ afr_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,      afr_local_t *local = 
NULL;      int32_t op_errno = 0; +    AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);      local = AFR_FRAME_INIT(frame, op_errno);      if (!local)          goto out; @@ -1866,6 +1869,7 @@ afr_seek(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,      afr_local_t *local = NULL;      int32_t op_errno = 0; +    AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);      local = AFR_FRAME_INIT(frame, op_errno);      if (!local)          goto out; diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c index 9acb4d0e053..a3d2150efe2 100644 --- a/xlators/cluster/afr/src/afr-inode-write.c +++ b/xlators/cluster/afr/src/afr-inode-write.c @@ -491,6 +491,7 @@ afr_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector,      int op_errno = ENOMEM;      int ret = -1; +    AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);      local = AFR_FRAME_INIT(frame, op_errno);      if (!local)          goto out; @@ -730,6 +731,7 @@ afr_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,      int ret = -1;      int op_errno = ENOMEM; +    AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);      transaction_frame = copy_frame(frame);      if (!transaction_frame)          goto out; @@ -940,6 +942,7 @@ afr_fsetattr(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iatt *buf,      int ret = -1;      int op_errno = ENOMEM; +    AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);      transaction_frame = copy_frame(frame);      if (!transaction_frame)          goto out; @@ -1690,6 +1693,7 @@ afr_fsetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict,      GF_IF_INTERNAL_XATTR_GOTO("trusted.glusterfs.afr.*", dict, op_errno, out); +    AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);      transaction_frame = copy_frame(frame);      if (!transaction_frame)          goto out; @@ -1898,6 +1902,7 @@ afr_fremovexattr(call_frame_t *frame, xlator_t *this, fd_t *fd,      GF_IF_NATIVE_XATTR_GOTO("trusted.glusterfs.afr.*", name, op_errno, out); +    AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);      transaction_frame = copy_frame(frame);      if (!transaction_frame)          goto out; @@ -1998,6 +2003,7 @@ afr_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode,      int ret = -1;      int op_errno = ENOMEM; +    AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);      transaction_frame = copy_frame(frame);      if (!transaction_frame)          goto out; @@ -2107,6 +2113,7 @@ afr_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,      int ret = -1;      int op_errno = ENOMEM; +    AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);      transaction_frame = copy_frame(frame);      if (!transaction_frame)          goto out; @@ -2213,6 +2220,7 @@ afr_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,      int ret = -1;      int op_errno = ENOMEM; +    AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);      transaction_frame = copy_frame(frame);      if (!transaction_frame)          goto out; @@ -2412,6 +2420,7 @@ afr_fxattrop(call_frame_t *frame, xlator_t *this, fd_t *fd,      int ret = -1;      int op_errno = ENOMEM; +    AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);      transaction_frame = copy_frame(frame);      if (!transaction_frame)          goto out; @@ -2507,6 +2516,7 @@ afr_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync,      int ret = -1;      int32_t op_errno = 
ENOMEM; +    AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);      transaction_frame = copy_frame(frame);      if (!transaction_frame)          goto out; diff --git a/xlators/cluster/afr/src/afr-mem-types.h b/xlators/cluster/afr/src/afr-mem-types.h index b0fb00641a0..816065fb57a 100644 --- a/xlators/cluster/afr/src/afr-mem-types.h +++ b/xlators/cluster/afr/src/afr-mem-types.h @@ -31,6 +31,8 @@ enum gf_afr_mem_types_ {      gf_afr_mt_empty_brick_t,      gf_afr_mt_child_latency_t,      gf_afr_mt_atomic_t, +    gf_afr_mt_lk_heal_info_t, +    gf_afr_mt_gf_lock,      gf_afr_mt_end  };  #endif diff --git a/xlators/cluster/afr/src/afr-messages.h b/xlators/cluster/afr/src/afr-messages.h index c9c99270e98..8e59c51b993 100644 --- a/xlators/cluster/afr/src/afr-messages.h +++ b/xlators/cluster/afr/src/afr-messages.h @@ -42,6 +42,6 @@ GLFS_MSGID(AFR, AFR_MSG_QUORUM_FAIL, AFR_MSG_QUORUM_MET,             AFR_MSG_SPLIT_BRAIN_STATUS, AFR_MSG_ADD_BRICK_STATUS,             AFR_MSG_NO_CHANGELOG, AFR_MSG_TIMER_CREATE_FAIL,             AFR_MSG_SBRAIN_FAV_CHILD_POLICY, AFR_MSG_INODE_CTX_GET_FAILED, -           AFR_MSG_THIN_ARB); +           AFR_MSG_THIN_ARB, AFR_MSG_LK_HEAL_DOM);  #endif /* !_AFR_MESSAGES_H_ */ diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c index f8db3c5653f..13b5ca2fce9 100644 --- a/xlators/cluster/afr/src/afr.c +++ b/xlators/cluster/afr/src/afr.c @@ -417,6 +417,8 @@ init(xlator_t *this)          goto out;      priv = this->private; +    INIT_LIST_HEAD(&priv->saved_locks); +    INIT_LIST_HEAD(&priv->lk_healq);      LOCK_INIT(&priv->lock);      child_count = xlator_subvolume_count(this); @@ -684,6 +686,7 @@ fini(xlator_t *this)      priv = this->private;      afr_selfheal_daemon_fini(this); +    GF_ASSERT(list_empty(&priv->saved_locks));      LOCK(&priv->lock);      if (priv->timer != NULL) { diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h index f86f019e637..28be839ad68 100644 --- a/xlators/cluster/afr/src/afr.h +++ b/xlators/cluster/afr/src/afr.h @@ -39,6 +39,8 @@  #define AFR_TA_DOM_NOTIFY "afr.ta.dom-notify"  #define AFR_TA_DOM_MODIFY "afr.ta.dom-modify" +#define AFR_LK_HEAL_DOM "afr.lock-heal.domain" +  #define AFR_HALO_MAX_LATENCY 99999  #define PFLAG_PENDING (1 << 0) @@ -95,6 +97,16 @@ typedef int (*afr_changelog_resume_t)(call_frame_t *frame, xlator_t *this);                     gf_fop_list[local->op], uuid_utoa(local->inode->gfid));     \      } while (0) +#define AFR_ERROR_OUT_IF_FDCTX_INVALID(__fd, __this, __error, __label)         \ +    do {                                                                       \ +        afr_fd_ctx_t *__fd_ctx = NULL;                                         \ +        __fd_ctx = afr_fd_ctx_get(__fd, __this);                               \ +        if (__fd_ctx && __fd_ctx->is_fd_bad) {                                 \ +            __error = EBADF;                                                   \ +            goto __label;                                                      \ +        }                                                                      \ +    } while (0) +  typedef enum {      AFR_READ_POLICY_FIRST_UP,      AFR_READ_POLICY_GFID_HASH, @@ -143,6 +155,19 @@ struct afr_nfsd {      gf_boolean_t iamnfsd;  }; +typedef struct _afr_lk_heal_info { +    fd_t *fd; +    int32_t cmd; +    struct gf_flock flock; +    dict_t *xdata_req; +    unsigned char *locked_nodes; +    struct list_head pos; +    gf_lkowner_t lk_owner; +    pid_t pid; +    int32_t *child_up_event_gen; +    
int32_t *child_down_event_gen; +} afr_lk_heal_info_t; +  typedef struct _afr_private {      gf_lock_t lock;             /* to guard access to child_count, etc */      unsigned int child_count;   /* total number of children   */ @@ -249,6 +274,10 @@ typedef struct _afr_private {      gf_boolean_t esh_granular;      gf_boolean_t consistent_io;      gf_boolean_t data_self_heal; /* on/off */ + +    /*For lock healing.*/ +    struct list_head saved_locks; +    struct list_head lk_healq;  } afr_private_t;  typedef enum { @@ -371,6 +400,10 @@ typedef struct {         arrives, we continue to read off this subvol.      */      int readdir_subvol; +    /* lock-healing related members. */ +    gf_boolean_t is_fd_bad; +    afr_lk_heal_info_t *lk_heal_info; +  } afr_fd_ctx_t;  typedef enum { @@ -572,6 +605,11 @@ typedef struct _afr_local {              struct gf_flock ret_flock;              unsigned char *locked_nodes;              int32_t cmd; +            /*For lock healing only.*/ +            unsigned char *dom_locked_nodes; +            int32_t *dom_lock_op_ret; +            int32_t *dom_lock_op_errno; +            struct gf_flock *getlk_rsp;          } lk;          /* inode read */ @@ -1074,6 +1112,8 @@ afr_cleanup_fd_ctx(xlator_t *this, fd_t *fd);              if (__local && __local->is_read_txn)                               \                  afr_pending_read_decrement(__this->private,                    \                                             __local->read_subvol);              \ +            if (__local && afr_is_lock_mode_mandatory(__local->xdata_req))     \ +                afr_dom_lock_release(frame);                                   \              frame->local = NULL;                                               \          }                                                                      \                                                                                 \ @@ -1354,4 +1394,10 @@ afr_ta_dict_contains_pending_xattr(dict_t *dict, afr_private_t *priv,  void  afr_selfheal_childup(xlator_t *this, afr_private_t *priv); + +gf_boolean_t +afr_is_lock_mode_mandatory(dict_t *xdata); + +void +afr_dom_lock_release(call_frame_t *frame);  #endif /* __AFR_H__ */  | 
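
A note on the fencing behavior visible at the end of this diff: once __afr_mark_pending_lk_heal() finds that a saved lock no longer holds on a quorum of bricks, it calls afr_mark_fd_bad(), and the AFR_ERROR_OUT_IF_FDCTX_INVALID checks added across the fd-based fops make subsequent writev/fsync/lk/etc. fail with EBADF instead of proceeding without the lock. afr-lock-heal-advanced.t relies on this: it kills two bricks while locks are held and then expects the helper program's glfs_write() to fail. A minimal sketch (not from the patch; the helper name is invented) of how a gfapi client could detect the fenced fd:

#include <errno.h>
#include <glusterfs/api/glfs.h>

/* Returns 0 if the write went through, 1 if the fd appears fenced (AFR
 * refused the fop with EBADF after the lock lost quorum), -1 otherwise. */
static int
write_or_detect_fence(glfs_fd_t *fd, const void *buf, size_t len)
{
    if (glfs_write(fd, buf, len, 0) >= 0)
        return 0;
    return (errno == EBADF) ? 1 : -1;
}

On detecting the fence, the client must re-open the file and re-acquire the lock; healing is deliberately not attempted in this case because another client may have taken the lock and modified the data in the meantime (see the comment in __afr_mark_pending_lk_heal()).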
