diff options
-rw-r--r-- | tests/basic/ec/ec-fix-openfd.t | 109 | ||||
-rwxr-xr-x | tests/bugs/core/bug-908146.t | 12 | ||||
-rw-r--r-- | tests/volume.rc | 12 | ||||
-rw-r--r-- | xlators/cluster/ec/src/ec-common.c | 113 | ||||
-rw-r--r-- | xlators/cluster/ec/src/ec-common.h | 3 | ||||
-rw-r--r-- | xlators/cluster/ec/src/ec-dir-read.c | 8 | ||||
-rw-r--r-- | xlators/cluster/ec/src/ec-dir-write.c | 1 | ||||
-rw-r--r-- | xlators/cluster/ec/src/ec-helpers.c | 29 | ||||
-rw-r--r-- | xlators/cluster/ec/src/ec-inode-read.c | 3 | ||||
-rw-r--r-- | xlators/cluster/ec/src/ec-types.h | 177 |
10 files changed, 365 insertions, 102 deletions
diff --git a/tests/basic/ec/ec-fix-openfd.t b/tests/basic/ec/ec-fix-openfd.t new file mode 100644 index 00000000000..b62fbf429c8 --- /dev/null +++ b/tests/basic/ec/ec-fix-openfd.t @@ -0,0 +1,109 @@ +#!/bin/bash + +. $(dirname $0)/../../include.rc +. $(dirname $0)/../../volume.rc +. $(dirname $0)/../../fileio.rc + +# This test checks for open fd heal on EC + +#Create Volume +cleanup +TEST glusterd +TEST pidof glusterd +TEST $CLI volume create $V0 disperse 3 redundancy 1 $H0:$B0/${V0}{0..2} +TEST $CLI volume set $V0 performance.read-after-open yes +TEST $CLI volume set $V0 performance.lazy-open no +TEST $CLI volume set $V0 performance.open-behind off +TEST $CLI volume set $V0 disperse.background-heals 0 +TEST $CLI volume heal $V0 disable +TEST $CLI volume start $V0 + +#Mount the volume +TEST $GFS --volfile-id=/$V0 --volfile-server=$H0 $M0; +EXPECT_WITHIN $CHILD_UP_TIMEOUT "3" ec_child_up_count $V0 0 + +#Touch a file +TEST touch "$M0/test_file" + +#Kill a brick +TEST kill_brick $V0 $H0 $B0/${V0}0 +EXPECT_WITHIN $CHILD_UP_TIMEOUT "2" ec_child_up_count $V0 0 + +#Open the file in write mode +TEST fd=`fd_available` +TEST fd_open $fd 'rw' "$M0/test_file" + +#Bring up the killed brick +TEST $CLI volume start $V0 force +EXPECT_WITHIN $CHILD_UP_TIMEOUT "3" ec_child_up_count $V0 0 + +#Test the fd count +EXPECT "0" get_fd_count $V0 $H0 $B0/${V0}0 test_file +EXPECT "1" get_fd_count $V0 $H0 $B0/${V0}1 test_file +EXPECT "1" get_fd_count $V0 $H0 $B0/${V0}2 test_file + +#Write to file +dd iflag=fullblock if=/dev/random bs=1024 count=2 >&$fd 2>/dev/null + +#Test the fd count +EXPECT "1" get_fd_count $V0 $H0 $B0/${V0}0 test_file + +#Close fd +TEST fd_close $fd + +#Stop the volume +TEST $CLI volume stop $V0 + +#Start the volume +TEST $CLI volume start $V0 + +#Kill brick1 +TEST kill_brick $V0 $H0 $B0/${V0}0 +EXPECT_WITHIN $CHILD_UP_TIMEOUT "2" ec_child_up_count $V0 0 + +#Unmount and mount +EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0; +TEST $GFS --volfile-id=/$V0 --volfile-server=$H0 $M0; +EXPECT_WITHIN $CHILD_UP_TIMEOUT "2" ec_child_up_count $V0 0 + +#Calculate md5 sum +md5sum0=`get_md5_sum "$M0/test_file"` + +#Bring up the brick +TEST $CLI volume start $V0 force +EXPECT_WITHIN $CHILD_UP_TIMEOUT "3" ec_child_up_count $V0 0 + +#Kill brick2 +TEST kill_brick $V0 $H0 $B0/${V0}1 +EXPECT_WITHIN $CHILD_UP_TIMEOUT "2" ec_child_up_count $V0 0 + +#Unmount and mount +EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0 +TEST $GFS --volfile-id=/$V0 --volfile-server=$H0 $M0; +EXPECT_WITHIN $CHILD_UP_TIMEOUT "2" ec_child_up_count $V0 0 + +#Calculate md5 sum +md5sum1=`get_md5_sum "$M0/test_file"` + +#Bring up the brick +TEST $CLI volume start $V0 force +EXPECT_WITHIN $CHILD_UP_TIMEOUT "3" ec_child_up_count $V0 0 + +#Kill brick3 +TEST kill_brick $V0 $H0 $B0/${V0}2 +EXPECT_WITHIN $CHILD_UP_TIMEOUT "2" ec_child_up_count $V0 0 + +#Unmount and mount +EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0 +TEST $GFS --volfile-id=/$V0 --volfile-server=$H0 $M0; +EXPECT_WITHIN $CHILD_UP_TIMEOUT "2" ec_child_up_count $V0 0 + +#Calculate md5 sum +md5sum2=`get_md5_sum "$M0/test_file"` + +#compare the md5sum +EXPECT "$md5sum0" echo $md5sum1 +EXPECT "$md5sum0" echo $md5sum2 +EXPECT "$md5sum1" echo $md5sum2 + +cleanup diff --git a/tests/bugs/core/bug-908146.t b/tests/bugs/core/bug-908146.t index bf34992fee5..327be6e54bc 100755 --- a/tests/bugs/core/bug-908146.t +++ b/tests/bugs/core/bug-908146.t @@ -2,18 +2,8 @@ . $(dirname $0)/../../include.rc . $(dirname $0)/../../volume.rc +. $(dirname $0)/../../fileio.rc -function get_fd_count { - local vol=$1 - local host=$2 - local brick=$3 - local fname=$4 - local gfid_str=$(gf_gfid_xattr_to_str $(gf_get_gfid_xattr $brick/$fname)) - local statedump=$(generate_brick_statedump $vol $host $brick) - local count=$(grep "gfid=$gfid_str" $statedump -A2 | grep fd-count | cut -f2 -d'=' | tail -1) - rm -f $statedump - echo $count -} cleanup; TEST glusterd diff --git a/tests/volume.rc b/tests/volume.rc index 1cee648993b..1ca17ab3456 100644 --- a/tests/volume.rc +++ b/tests/volume.rc @@ -796,3 +796,15 @@ function count_sh_entries() { ls $1/.glusterfs/indices/xattrop | grep -v "xattrop-" | wc -l } + +function get_fd_count { + local vol=$1 + local host=$2 + local brick=$3 + local fname=$4 + local gfid_str=$(gf_gfid_xattr_to_str $(gf_get_gfid_xattr $brick/$fname)) + local statedump=$(generate_brick_statedump $vol $host $brick) + local count=$(grep "gfid=$gfid_str" $statedump -A2 | grep fd-count | cut -f2 -d'=' | tail -1) + rm -f $statedump + echo $count +} diff --git a/xlators/cluster/ec/src/ec-common.c b/xlators/cluster/ec/src/ec-common.c index b7088e54724..cb627a92c9c 100644 --- a/xlators/cluster/ec/src/ec-common.c +++ b/xlators/cluster/ec/src/ec-common.c @@ -26,6 +26,114 @@ EC_FLAG_WAITING_DATA_DIRTY |\ EC_FLAG_WAITING_METADATA_DIRTY) +void +ec_update_fd_status (fd_t *fd, xlator_t *xl, int idx, + int32_t ret_status) +{ + ec_fd_t *fd_ctx; + + if (fd == NULL) + return; + + LOCK (&fd->lock); + { + fd_ctx = __ec_fd_get(fd, xl); + if (fd_ctx) { + if (ret_status >= 0) + fd_ctx->fd_status[idx] = EC_FD_OPENED; + else + fd_ctx->fd_status[idx] = EC_FD_NOT_OPENED; + } + } + UNLOCK (&fd->lock); +} + +static int +ec_fd_ctx_need_open (fd_t *fd, xlator_t *this, uintptr_t *need_open) +{ + int i = 0; + int count = 0; + ec_t *ec = NULL; + ec_fd_t *fd_ctx = NULL; + + ec = this->private; + *need_open = 0; + + fd_ctx = ec_fd_get (fd, this); + if (!fd_ctx) + return count; + + LOCK (&fd->lock); + { + for (i = 0; i < ec->nodes; i++) { + if ((fd_ctx->fd_status[i] == EC_FD_NOT_OPENED) && + (ec->xl_up & (1<<i))) { + fd_ctx->fd_status[i] = EC_FD_OPENING; + *need_open |= (1<<i); + count++; + } + } + } + UNLOCK (&fd->lock); + + /* If fd needs to open on minimum number of nodes + * then ignore fixing the fd as it has been + * requested from heal operation. + */ + if (count >= ec->fragments) + count = 0; + + return count; +} + +static gf_boolean_t +ec_is_fd_fixable (fd_t *fd) +{ + if (!fd || !fd->inode) + return _gf_false; + else if (fd_is_anonymous (fd)) + return _gf_false; + else if (gf_uuid_is_null (fd->inode->gfid)) + return _gf_false; + + return _gf_true; +} + +static void +ec_fix_open (ec_fop_data_t *fop) +{ + int call_count = 0; + uintptr_t need_open = 0; + int ret = 0; + loc_t loc = {0, }; + + if (!ec_is_fd_fixable (fop->fd)) + goto out; + + /* Evaluate how many remote fd's to be opened */ + call_count = ec_fd_ctx_need_open (fop->fd, fop->xl, &need_open); + if (!call_count) + goto out; + + loc.inode = inode_ref (fop->fd->inode); + gf_uuid_copy (loc.gfid, fop->fd->inode->gfid); + ret = loc_path (&loc, NULL); + if (ret < 0) { + goto out; + } + + if (IA_IFDIR == fop->fd->inode->ia_type) { + ec_opendir(fop->frame, fop->xl, need_open, EC_MINIMUM_ONE, + NULL, NULL, &fop->loc[0], fop->fd, NULL); + } else{ + ec_open(fop->frame, fop->xl, need_open, EC_MINIMUM_ONE, + NULL, NULL, &loc, fop->fd->flags, fop->fd, NULL); + } + +out: + loc_wipe (&loc); +} + off_t ec_range_end_get (off_t fl_start, size_t fl_size) { @@ -1677,6 +1785,11 @@ void ec_lock_acquired(ec_lock_link_t *link) ec_lock_apply(link); + if (fop->use_fd && + (link->update[EC_DATA_TXN] || link->update[EC_METADATA_TXN])) { + ec_fix_open(fop); + } + ec_lock_resume_shared(&list); } diff --git a/xlators/cluster/ec/src/ec-common.h b/xlators/cluster/ec/src/ec-common.h index 35c6a8107c2..c3e291585ef 100644 --- a/xlators/cluster/ec/src/ec-common.h +++ b/xlators/cluster/ec/src/ec-common.h @@ -140,4 +140,7 @@ int32_t ec_lock_unlocked(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata); +void +ec_update_fd_status (fd_t *fd, xlator_t *xl, + int child_index, int32_t ret_status); #endif /* __EC_COMMON_H__ */ diff --git a/xlators/cluster/ec/src/ec-dir-read.c b/xlators/cluster/ec/src/ec-dir-read.c index 48afe54460f..b44bb4239b1 100644 --- a/xlators/cluster/ec/src/ec-dir-read.c +++ b/xlators/cluster/ec/src/ec-dir-read.c @@ -19,7 +19,11 @@ #include "ec-method.h" #include "ec-fops.h" -/* FOP: opendir */ +/**************************************************************** + * + * File Operation: opendir + * + ***************************************************************/ int32_t ec_combine_opendir(ec_fop_data_t * fop, ec_cbk_data_t * dst, ec_cbk_data_t * src) @@ -88,6 +92,8 @@ int32_t ec_opendir_cbk(call_frame_t * frame, void * cookie, xlator_t * this, } ec_combine(cbk, ec_combine_opendir); + + ec_update_fd_status (fd, this, idx, op_ret); } out: diff --git a/xlators/cluster/ec/src/ec-dir-write.c b/xlators/cluster/ec/src/ec-dir-write.c index 150dc66f21b..7779d4849f3 100644 --- a/xlators/cluster/ec/src/ec-dir-write.c +++ b/xlators/cluster/ec/src/ec-dir-write.c @@ -71,6 +71,7 @@ ec_dir_write_cbk (call_frame_t *frame, xlator_t *this, out: if (cbk) ec_combine (cbk, ec_combine_write); + if (fop) ec_complete (fop); return 0; diff --git a/xlators/cluster/ec/src/ec-helpers.c b/xlators/cluster/ec/src/ec-helpers.c index fe610748f0f..83f96ba6cb2 100644 --- a/xlators/cluster/ec/src/ec-helpers.c +++ b/xlators/cluster/ec/src/ec-helpers.c @@ -764,27 +764,32 @@ ec_inode_t * ec_inode_get(inode_t * inode, xlator_t * xl) ec_fd_t * __ec_fd_get(fd_t * fd, xlator_t * xl) { + int i = 0; ec_fd_t * ctx = NULL; uint64_t value = 0; + ec_t *ec = xl->private; - if ((__fd_ctx_get(fd, xl, &value) != 0) || (value == 0)) - { - ctx = GF_MALLOC(sizeof(*ctx), ec_mt_ec_fd_t); - if (ctx != NULL) - { + if ((__fd_ctx_get(fd, xl, &value) != 0) || (value == 0)) { + ctx = GF_MALLOC(sizeof(*ctx) + (sizeof (ec_fd_status_t) * ec->nodes), + ec_mt_ec_fd_t); + if (ctx != NULL) { memset(ctx, 0, sizeof(*ctx)); - value = (uint64_t)(uintptr_t)ctx; - if (__fd_ctx_set(fd, xl, value) != 0) - { - GF_FREE(ctx); + for (i = 0; i < ec->nodes; i++) { + if (fd_is_anonymous (fd)) { + ctx->fd_status[i] = EC_FD_OPENED; + } else { + ctx->fd_status[i] = EC_FD_NOT_OPENED; + } + } + value = (uint64_t)(uintptr_t)ctx; + if (__fd_ctx_set(fd, xl, value) != 0) { + GF_FREE (ctx); return NULL; } } - } - else - { + } else { ctx = (ec_fd_t *)(uintptr_t)value; } diff --git a/xlators/cluster/ec/src/ec-inode-read.c b/xlators/cluster/ec/src/ec-inode-read.c index 03690ab8e96..d58ed9e5795 100644 --- a/xlators/cluster/ec/src/ec-inode-read.c +++ b/xlators/cluster/ec/src/ec-inode-read.c @@ -749,6 +749,9 @@ int32_t ec_open_cbk(call_frame_t * frame, void * cookie, xlator_t * this, } ec_combine(cbk, ec_combine_open); + + ec_update_fd_status (fd, this, idx, op_ret); + } out: diff --git a/xlators/cluster/ec/src/ec-types.h b/xlators/cluster/ec/src/ec-types.h index 7b2b7b8247d..23dc434bc42 100644 --- a/xlators/cluster/ec/src/ec-types.h +++ b/xlators/cluster/ec/src/ec-types.h @@ -145,6 +145,13 @@ enum _ec_stripe_part { EC_STRIPE_TAIL }; +/* Enumartions to indicate FD status. */ +typedef enum { + EC_FD_NOT_OPENED, + EC_FD_OPENED, + EC_FD_OPENING +} ec_fd_status_t; + struct _ec_config { uint32_t version; uint8_t algorithm; @@ -158,6 +165,7 @@ struct _ec_fd { loc_t loc; uintptr_t open; int32_t flags; + ec_fd_status_t fd_status[0]; }; struct _ec_stripe { @@ -309,75 +317,82 @@ struct _ec_fragment_range { the bricks (offset on brick) */ }; +/* EC xlator data structure to collect all the data required to perform + * the file operation.*/ struct _ec_fop_data { - int32_t id; - int32_t refs; - int32_t state; - int32_t minimum; - int32_t expected; - int32_t winds; - int32_t jobs; - int32_t error; - ec_fop_data_t *parent; - xlator_t *xl; - call_frame_t *req_frame; /* frame of the calling xlator */ - call_frame_t *frame; /* frame used by this fop */ - struct list_head cbk_list; /* sorted list of groups of answers */ - struct list_head answer_list; /* list of answers */ - struct list_head pending_list; /* member of ec_t.pending_fops */ - ec_cbk_data_t *answer; /* accepted answer */ - int32_t lock_count; - int32_t locked; - ec_lock_link_t locks[2]; - int32_t first_lock; - gf_lock_t lock; - - uint32_t flags; - uint32_t first; - uintptr_t mask; - uintptr_t healing; /*Dispatch is done but call is successful - only if fop->minimum number of subvolumes - succeed which are not healing*/ - uintptr_t remaining; - uintptr_t received; /* Mask of responses */ - uintptr_t good; - - uid_t uid; - gid_t gid; - - ec_wind_f wind; - ec_handler_f handler; - ec_resume_f resume; - ec_cbk_t cbks; - void *data; - ec_heal_t *heal; - struct list_head healer; - - uint64_t user_size; - uint32_t head; - - int32_t use_fd; - - dict_t *xdata; - dict_t *dict; - int32_t int32; - uint32_t uint32; - uint64_t size; - off_t offset; - mode_t mode[2]; - entrylk_cmd entrylk_cmd; - entrylk_type entrylk_type; - gf_xattrop_flags_t xattrop_flags; - dev_t dev; - inode_t *inode; - fd_t *fd; - struct iatt iatt; - char *str[2]; - loc_t loc[2]; - struct gf_flock flock; - struct iovec *vector; - struct iobref *buffers; - gf_seek_what_t seek; + int32_t id; /* ID of the file operation */ + int32_t refs; + int32_t state; + int32_t minimum; /* Mininum number of successful + operation required to conclude a + fop as successful */ + int32_t expected; + int32_t winds; + int32_t jobs; + int32_t error; + ec_fop_data_t *parent; + xlator_t *xl; /* points to EC xlator */ + call_frame_t *req_frame; /* frame of the calling xlator */ + call_frame_t *frame; /* frame used by this fop */ + struct list_head cbk_list; /* sorted list of groups of answers */ + struct list_head answer_list; /* list of answers */ + struct list_head pending_list; /* member of ec_t.pending_fops */ + ec_cbk_data_t *answer; /* accepted answer */ + int32_t lock_count; + int32_t locked; + ec_lock_link_t locks[2]; + int32_t first_lock; + gf_lock_t lock; + + uint32_t flags; + uint32_t first; + uintptr_t mask; + uintptr_t healing; /*Dispatch is done but call is successful only + if fop->minimum number of subvolumes succeed + which are not healing*/ + uintptr_t remaining; + uintptr_t received; /* Mask of responses */ + uintptr_t good; + + uid_t uid; + gid_t gid; + + ec_wind_f wind; /* Function to wind to */ + ec_handler_f handler; /* FOP manager function */ + ec_resume_f resume; + ec_cbk_t cbks; /* Callback function for this FOP */ + void *data; + ec_heal_t *heal; + struct list_head healer; + + uint64_t user_size; + uint32_t head; + + int32_t use_fd; /* Indicates whether this FOP uses FD or + not */ + + dict_t *xdata; + dict_t *dict; + int32_t int32; + uint32_t uint32; + uint64_t size; + off_t offset; + mode_t mode[2]; + entrylk_cmd entrylk_cmd; + entrylk_type entrylk_type; + gf_xattrop_flags_t xattrop_flags; + dev_t dev; + inode_t *inode; + fd_t *fd; /* FD of the file on which FOP is + being carried upon */ + struct iatt iatt; + char *str[2]; + loc_t loc[2]; /* Holds the location details for + the file */ + struct gf_flock flock; + struct iovec *vector; + struct iobref *buffers; + gf_seek_what_t seek; ec_fragment_range_t frag_range; /* This will hold the range of stripes affected by the fop. */ }; @@ -623,18 +638,24 @@ struct _ec { xlator_t *xl; int32_t healers; int32_t heal_waiters; - int32_t nodes; + int32_t nodes; /* Total number of bricks(n) */ int32_t bits_for_nodes; - int32_t fragments; - int32_t redundancy; - uint32_t fragment_size; - uint32_t stripe_size; - int32_t up; + int32_t fragments; /* Data bricks(k) */ + int32_t redundancy; /* Redundant bricks(m) */ + uint32_t fragment_size; /* Size of fragment/chunk on a + brick. */ + uint32_t stripe_size; /* (fragment_size * fragments) + maximum size of user data + stored in one stripe. */ + int32_t up; /* Represents whether EC volume is + up or not. */ uint32_t idx; - uint32_t xl_up_count; - uintptr_t xl_up; - uint32_t xl_notify_count; - uintptr_t xl_notify; + uint32_t xl_up_count; /* Number of UP bricks. */ + uintptr_t xl_up; /* Bit flag representing UP + bricks */ + uint32_t xl_notify_count; /* Number of notifications. */ + uintptr_t xl_notify; /* Bit flag representing + notification for bricks. */ uintptr_t node_mask; xlator_t **xl_list; gf_lock_t lock; |