From d2b7e65ec0c35c4ee16f28f449553d90fb88fa6a Mon Sep 17 00:00:00 2001 From: Vikas Gorur Date: Thu, 17 Sep 2009 05:56:31 +0000 Subject: cluster/afr: Add the "diff" self-heal algorithm. The "diff" self-heal algorithm works as follows: For each block: Compute MD5 checksum on source and all sinks If checksum on a sink differs from source: Read block from source and write to sinks Signed-off-by: Anand V. Avati --- xlators/cluster/afr/src/afr-self-heal-algorithm.c | 352 ++++++++++++++++++++++ xlators/cluster/afr/src/afr-self-heal-algorithm.h | 10 +- xlators/cluster/afr/src/afr-self-heal-data.c | 2 +- 3 files changed, 362 insertions(+), 2 deletions(-) diff --git a/xlators/cluster/afr/src/afr-self-heal-algorithm.c b/xlators/cluster/afr/src/afr-self-heal-algorithm.c index bc3917cac..f2d80c3e9 100644 --- a/xlators/cluster/afr/src/afr-self-heal-algorithm.c +++ b/xlators/cluster/afr/src/afr-self-heal-algorithm.c @@ -228,6 +228,358 @@ afr_sh_algo_full (call_frame_t *frame, xlator_t *this) } +/* + * The "diff" algorithm. Copies only those blocks whose checksums + * don't match with those of source. + */ + + +static int +sh_diff_number_of_writes_needed (unsigned char *write_needed, int child_count) +{ + int writes = 0; + int i; + + for (i = 0; i < child_count; i++) { + if (write_needed[i]) + writes++; + } + + return writes; +} + + +static int +sh_diff_iter (call_frame_t *frame, xlator_t *this); + + +static int +sh_diff_write_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct stat *buf) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + afr_self_heal_t * sh = NULL; + + int child_index = (long) cookie; + int call_count = 0; + + priv = this->private; + local = frame->local; + sh = &local->self_heal; + + gf_log (this->name, GF_LOG_TRACE, + "wrote %d bytes of data from %s to child %d, offset %"PRId64"", + op_ret, local->loc.path, child_index, sh->offset - op_ret); + + LOCK (&frame->lock); + { + if (op_ret == -1) { + gf_log (this->name, GF_LOG_DEBUG, + "write to %s failed on subvolume %s (%s)", + local->loc.path, + priv->children[child_index]->name, + strerror (op_errno)); + + sh->op_failed = 1; + } + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + sh_diff_iter (frame, this); + } + + return 0; +} + + +static int +sh_diff_read_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + struct iovec *vector, int32_t count, struct stat *buf, + struct iobref *iobref) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + afr_self_heal_t * sh = NULL; + + afr_sh_algo_diff_private_t *sh_priv = NULL; + + int child_index = (long) cookie; + + int i = 0; + int call_count = 0; + + off_t offset; + + priv = this->private; + local = frame->local; + sh = &local->self_heal; + sh_priv = sh->private; + + call_count = sh_diff_number_of_writes_needed (sh_priv->write_needed, + priv->child_count); + + local->call_count = call_count; + + gf_log (this->name, GF_LOG_TRACE, + "read %d bytes of data from %s on child %d, offset %"PRId64"", + op_ret, local->loc.path, child_index, sh->offset); + + if (op_ret <= 0) { + local->self_heal.algo_completion_cbk (frame, this); + return 0; + } + + /* what if we read less than block size? */ + offset = sh->offset; + sh->offset += sh_priv->block_size; + + if (sh->file_has_holes) { + if (iov_0filled (vector, count) == 0) { + /* + the iter function depends on the + sh->offset already being updated + above + */ + + sh_diff_iter (frame, this); + goto out; + } + } + + for (i = 0; i < priv->child_count; i++) { + if (sh_priv->write_needed[i]) { + STACK_WIND_COOKIE (frame, sh_diff_write_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->writev, + sh->healing_fd, vector, count, offset, + iobref); + + sh_priv->write_needed[i] = 0; + + if (!--call_count) + break; + } + } + +out: + return 0; +} + + +static int +sh_diff_read (call_frame_t *frame, xlator_t *this) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + afr_self_heal_t * sh = NULL; + + afr_sh_algo_diff_private_t * sh_priv = NULL; + + priv = this->private; + local = frame->local; + sh = &local->self_heal; + sh_priv = sh->private; + + STACK_WIND_COOKIE (frame, sh_diff_read_cbk, + (void *) (long) sh->source, + priv->children[sh->source], + priv->children[sh->source]->fops->readv, + sh->healing_fd, sh_priv->block_size, + sh->offset); + + return 0; +} + + + +static int +sh_diff_checksum_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + uint32_t weak_checksum, uint8_t *strong_checksum) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + afr_self_heal_t * sh = NULL; + afr_sh_algo_diff_private_t * sh_priv = NULL; + + int child_index = (long) cookie; + int call_count = 0; + int i = 0; + int write_needed = 0; + + priv = this->private; + local = frame->local; + sh = &local->self_heal; + + sh_priv = sh->private; + + if (op_ret < 0) { + gf_log (this->name, GF_LOG_ERROR, + "checksum on %s failed on subvolume %s (%s)", + local->loc.path, priv->children[child_index]->name, + strerror (op_errno)); + + sh->op_failed = 1; + + local->self_heal.algo_abort_cbk (frame, this); + return 0; + } + + memcpy ((void *) sh_priv->checksum + (child_index * MD5_DIGEST_LEN), + strong_checksum, + MD5_DIGEST_LEN); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + for (i = 0; i < priv->child_count; i++) { + if (sh->sources[i] || !local->child_up[i]) + continue; + + if (memcmp ((const void *) sh_priv->checksum + (i * MD5_DIGEST_LEN), + (const void *) sh_priv->checksum + (sh->source * MD5_DIGEST_LEN), + MD5_DIGEST_LEN)) { + /* + Checksums differ, so this block + must be written to this sink + */ + + gf_log (this->name, GF_LOG_TRACE, + "checksum on subvolume %s at offset %" + PRId64" differs from that on source", + priv->children[i]->name, sh->offset); + + write_needed = sh_priv->write_needed[i] = 1; + } + } + + if (write_needed) { + sh_diff_read (frame, this); + } else { + sh->offset += sh_priv->block_size; + + sh_diff_iter (frame, this); + } + } + + return 0; +} + + +static int +sh_diff_checksum (call_frame_t *frame, xlator_t *this) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + afr_self_heal_t * sh = NULL; + + afr_sh_algo_diff_private_t * sh_priv = NULL; + + priv = this->private; + local = frame->local; + sh = &local->self_heal; + sh_priv = sh->private; + + int call_count = 0; + int i = 0; + + call_count = sh->active_sinks + 1; /* sinks and source */ + + local->call_count = call_count; + + STACK_WIND_COOKIE (frame, sh_diff_checksum_cbk, + (void *) (long) i, + priv->children[sh->source], + priv->children[sh->source]->fops->rchecksum, + sh->healing_fd, + sh->offset, sh_priv->block_size); + + for (i = 0; i < priv->child_count; i++) { + if (sh->sources[i] || !local->child_up[i]) + continue; + + STACK_WIND_COOKIE (frame, sh_diff_checksum_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->rchecksum, + sh->healing_fd, + sh->offset, sh_priv->block_size); + if (!--call_count) + break; + } + + return 0; +} + + +static int +sh_diff_iter (call_frame_t *frame, xlator_t *this) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + afr_self_heal_t * sh = NULL; + + priv = this->private; + local = frame->local; + sh = &local->self_heal; + + if (sh->op_failed) { + local->self_heal.algo_abort_cbk (frame, this); + goto out; + } + + if (sh->offset >= sh->file_size) { + gf_log (this->name, GF_LOG_TRACE, + "closing fd's of %s", + local->loc.path); + + local->self_heal.algo_completion_cbk (frame, this); + + goto out; + } + + sh_diff_checksum (frame, this); +out: + return 0; +} + + +int +afr_sh_algo_diff (call_frame_t *frame, xlator_t *this) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + afr_self_heal_t * sh = NULL; + afr_sh_algo_diff_private_t *sh_priv = NULL; + + priv = this->private; + local = frame->local; + sh = &local->self_heal; + + sh_priv = CALLOC (1, sizeof (*sh_priv)); + + sh_priv->write_needed = CALLOC (priv->child_count, + sizeof (unsigned char)); + + sh_priv->checksum = CALLOC (priv->child_count, MD5_DIGEST_LEN); + + sh_priv->block_size = this->ctx->page_size; + + sh->private = sh_priv; + + sh_diff_checksum (frame, this); + + return 0; +} + + struct afr_sh_algorithm afr_self_heal_algorithms[] = { {.name = "full", .fn = afr_sh_algo_full}, + {.name = "diff", .fn = afr_sh_algo_diff}, }; diff --git a/xlators/cluster/afr/src/afr-self-heal-algorithm.h b/xlators/cluster/afr/src/afr-self-heal-algorithm.h index 646fd2ee7..8998ce888 100644 --- a/xlators/cluster/afr/src/afr-self-heal-algorithm.h +++ b/xlators/cluster/afr/src/afr-self-heal-algorithm.h @@ -29,6 +29,14 @@ struct afr_sh_algorithm { afr_sh_algo_fn fn; }; -struct afr_sh_algorithm afr_self_heal_algorithms[1]; +struct afr_sh_algorithm afr_self_heal_algorithms[2]; + +typedef struct { + uint8_t *checksum; /* array of MD5 checksums for each child + Each checksum is MD5_DIGEST_LEN bytes long */ + + unsigned char *write_needed; + size_t block_size; +} afr_sh_algo_diff_private_t; #endif /* __AFR_SELF_HEAL_ALGORITHM_H__ */ diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c index 5f08a005c..4ed6071ab 100644 --- a/xlators/cluster/afr/src/afr-self-heal-data.c +++ b/xlators/cluster/afr/src/afr-self-heal-data.c @@ -476,7 +476,7 @@ afr_sh_data_trim_sinks (call_frame_t *frame, xlator_t *this) struct afr_sh_algorithm * afr_sh_data_pick_algo (call_frame_t *frame, xlator_t *this) { - return &afr_self_heal_algorithms[0]; /* full */ + return &afr_self_heal_algorithms[1]; /* full */ } -- cgit