summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorVikas Gorur <vikas@gluster.com>2009-09-17 05:56:31 +0000
committerAnand V. Avati <avati@dev.gluster.com>2009-09-22 06:13:34 -0700
commitd2b7e65ec0c35c4ee16f28f449553d90fb88fa6a (patch)
treed215e4680f9872a9da25d145699f32616076f911
parent8c224de82b9b3e75f2dd9c264d5d3726dd1ef379 (diff)
cluster/afr: Add the "diff" self-heal algorithm.
The "diff" self-heal algorithm works as follows: For each block: Compute MD5 checksum on source and all sinks If checksum on a sink differs from source: Read block from source and write to sinks Signed-off-by: Anand V. Avati <avati@dev.gluster.com>
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-algorithm.c352
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-algorithm.h10
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-data.c2
3 files changed, 362 insertions, 2 deletions
diff --git a/xlators/cluster/afr/src/afr-self-heal-algorithm.c b/xlators/cluster/afr/src/afr-self-heal-algorithm.c
index bc3917cac..f2d80c3e9 100644
--- a/xlators/cluster/afr/src/afr-self-heal-algorithm.c
+++ b/xlators/cluster/afr/src/afr-self-heal-algorithm.c
@@ -228,6 +228,358 @@ afr_sh_algo_full (call_frame_t *frame, xlator_t *this)
}
+/*
+ * The "diff" algorithm. Copies only those blocks whose checksums
+ * don't match with those of source.
+ */
+
+
+static int
+sh_diff_number_of_writes_needed (unsigned char *write_needed, int child_count)
+{
+ int writes = 0;
+ int i;
+
+ for (i = 0; i < child_count; i++) {
+ if (write_needed[i])
+ writes++;
+ }
+
+ return writes;
+}
+
+
+static int
+sh_diff_iter (call_frame_t *frame, xlator_t *this);
+
+
+static int
+sh_diff_write_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct stat *buf)
+{
+ afr_private_t * priv = NULL;
+ afr_local_t * local = NULL;
+ afr_self_heal_t * sh = NULL;
+
+ int child_index = (long) cookie;
+ int call_count = 0;
+
+ priv = this->private;
+ local = frame->local;
+ sh = &local->self_heal;
+
+ gf_log (this->name, GF_LOG_TRACE,
+ "wrote %d bytes of data from %s to child %d, offset %"PRId64"",
+ op_ret, local->loc.path, child_index, sh->offset - op_ret);
+
+ LOCK (&frame->lock);
+ {
+ if (op_ret == -1) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "write to %s failed on subvolume %s (%s)",
+ local->loc.path,
+ priv->children[child_index]->name,
+ strerror (op_errno));
+
+ sh->op_failed = 1;
+ }
+ }
+ UNLOCK (&frame->lock);
+
+ call_count = afr_frame_return (frame);
+
+ if (call_count == 0) {
+ sh_diff_iter (frame, this);
+ }
+
+ return 0;
+}
+
+
+static int
+sh_diff_read_cbk (call_frame_t *frame, void *cookie,
+ xlator_t *this, int32_t op_ret, int32_t op_errno,
+ struct iovec *vector, int32_t count, struct stat *buf,
+ struct iobref *iobref)
+{
+ afr_private_t * priv = NULL;
+ afr_local_t * local = NULL;
+ afr_self_heal_t * sh = NULL;
+
+ afr_sh_algo_diff_private_t *sh_priv = NULL;
+
+ int child_index = (long) cookie;
+
+ int i = 0;
+ int call_count = 0;
+
+ off_t offset;
+
+ priv = this->private;
+ local = frame->local;
+ sh = &local->self_heal;
+ sh_priv = sh->private;
+
+ call_count = sh_diff_number_of_writes_needed (sh_priv->write_needed,
+ priv->child_count);
+
+ local->call_count = call_count;
+
+ gf_log (this->name, GF_LOG_TRACE,
+ "read %d bytes of data from %s on child %d, offset %"PRId64"",
+ op_ret, local->loc.path, child_index, sh->offset);
+
+ if (op_ret <= 0) {
+ local->self_heal.algo_completion_cbk (frame, this);
+ return 0;
+ }
+
+ /* what if we read less than block size? */
+ offset = sh->offset;
+ sh->offset += sh_priv->block_size;
+
+ if (sh->file_has_holes) {
+ if (iov_0filled (vector, count) == 0) {
+ /*
+ the iter function depends on the
+ sh->offset already being updated
+ above
+ */
+
+ sh_diff_iter (frame, this);
+ goto out;
+ }
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (sh_priv->write_needed[i]) {
+ STACK_WIND_COOKIE (frame, sh_diff_write_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->writev,
+ sh->healing_fd, vector, count, offset,
+ iobref);
+
+ sh_priv->write_needed[i] = 0;
+
+ if (!--call_count)
+ break;
+ }
+ }
+
+out:
+ return 0;
+}
+
+
+static int
+sh_diff_read (call_frame_t *frame, xlator_t *this)
+{
+ afr_private_t * priv = NULL;
+ afr_local_t * local = NULL;
+ afr_self_heal_t * sh = NULL;
+
+ afr_sh_algo_diff_private_t * sh_priv = NULL;
+
+ priv = this->private;
+ local = frame->local;
+ sh = &local->self_heal;
+ sh_priv = sh->private;
+
+ STACK_WIND_COOKIE (frame, sh_diff_read_cbk,
+ (void *) (long) sh->source,
+ priv->children[sh->source],
+ priv->children[sh->source]->fops->readv,
+ sh->healing_fd, sh_priv->block_size,
+ sh->offset);
+
+ return 0;
+}
+
+
+
+static int
+sh_diff_checksum_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ uint32_t weak_checksum, uint8_t *strong_checksum)
+{
+ afr_private_t * priv = NULL;
+ afr_local_t * local = NULL;
+ afr_self_heal_t * sh = NULL;
+ afr_sh_algo_diff_private_t * sh_priv = NULL;
+
+ int child_index = (long) cookie;
+ int call_count = 0;
+ int i = 0;
+ int write_needed = 0;
+
+ priv = this->private;
+ local = frame->local;
+ sh = &local->self_heal;
+
+ sh_priv = sh->private;
+
+ if (op_ret < 0) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "checksum on %s failed on subvolume %s (%s)",
+ local->loc.path, priv->children[child_index]->name,
+ strerror (op_errno));
+
+ sh->op_failed = 1;
+
+ local->self_heal.algo_abort_cbk (frame, this);
+ return 0;
+ }
+
+ memcpy ((void *) sh_priv->checksum + (child_index * MD5_DIGEST_LEN),
+ strong_checksum,
+ MD5_DIGEST_LEN);
+
+ call_count = afr_frame_return (frame);
+
+ if (call_count == 0) {
+ for (i = 0; i < priv->child_count; i++) {
+ if (sh->sources[i] || !local->child_up[i])
+ continue;
+
+ if (memcmp ((const void *) sh_priv->checksum + (i * MD5_DIGEST_LEN),
+ (const void *) sh_priv->checksum + (sh->source * MD5_DIGEST_LEN),
+ MD5_DIGEST_LEN)) {
+ /*
+ Checksums differ, so this block
+ must be written to this sink
+ */
+
+ gf_log (this->name, GF_LOG_TRACE,
+ "checksum on subvolume %s at offset %"
+ PRId64" differs from that on source",
+ priv->children[i]->name, sh->offset);
+
+ write_needed = sh_priv->write_needed[i] = 1;
+ }
+ }
+
+ if (write_needed) {
+ sh_diff_read (frame, this);
+ } else {
+ sh->offset += sh_priv->block_size;
+
+ sh_diff_iter (frame, this);
+ }
+ }
+
+ return 0;
+}
+
+
+static int
+sh_diff_checksum (call_frame_t *frame, xlator_t *this)
+{
+ afr_private_t * priv = NULL;
+ afr_local_t * local = NULL;
+ afr_self_heal_t * sh = NULL;
+
+ afr_sh_algo_diff_private_t * sh_priv = NULL;
+
+ priv = this->private;
+ local = frame->local;
+ sh = &local->self_heal;
+ sh_priv = sh->private;
+
+ int call_count = 0;
+ int i = 0;
+
+ call_count = sh->active_sinks + 1; /* sinks and source */
+
+ local->call_count = call_count;
+
+ STACK_WIND_COOKIE (frame, sh_diff_checksum_cbk,
+ (void *) (long) i,
+ priv->children[sh->source],
+ priv->children[sh->source]->fops->rchecksum,
+ sh->healing_fd,
+ sh->offset, sh_priv->block_size);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (sh->sources[i] || !local->child_up[i])
+ continue;
+
+ STACK_WIND_COOKIE (frame, sh_diff_checksum_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->rchecksum,
+ sh->healing_fd,
+ sh->offset, sh_priv->block_size);
+ if (!--call_count)
+ break;
+ }
+
+ return 0;
+}
+
+
+static int
+sh_diff_iter (call_frame_t *frame, xlator_t *this)
+{
+ afr_private_t * priv = NULL;
+ afr_local_t * local = NULL;
+ afr_self_heal_t * sh = NULL;
+
+ priv = this->private;
+ local = frame->local;
+ sh = &local->self_heal;
+
+ if (sh->op_failed) {
+ local->self_heal.algo_abort_cbk (frame, this);
+ goto out;
+ }
+
+ if (sh->offset >= sh->file_size) {
+ gf_log (this->name, GF_LOG_TRACE,
+ "closing fd's of %s",
+ local->loc.path);
+
+ local->self_heal.algo_completion_cbk (frame, this);
+
+ goto out;
+ }
+
+ sh_diff_checksum (frame, this);
+out:
+ return 0;
+}
+
+
+int
+afr_sh_algo_diff (call_frame_t *frame, xlator_t *this)
+{
+ afr_private_t * priv = NULL;
+ afr_local_t * local = NULL;
+ afr_self_heal_t * sh = NULL;
+ afr_sh_algo_diff_private_t *sh_priv = NULL;
+
+ priv = this->private;
+ local = frame->local;
+ sh = &local->self_heal;
+
+ sh_priv = CALLOC (1, sizeof (*sh_priv));
+
+ sh_priv->write_needed = CALLOC (priv->child_count,
+ sizeof (unsigned char));
+
+ sh_priv->checksum = CALLOC (priv->child_count, MD5_DIGEST_LEN);
+
+ sh_priv->block_size = this->ctx->page_size;
+
+ sh->private = sh_priv;
+
+ sh_diff_checksum (frame, this);
+
+ return 0;
+}
+
+
struct afr_sh_algorithm afr_self_heal_algorithms[] = {
{.name = "full", .fn = afr_sh_algo_full},
+ {.name = "diff", .fn = afr_sh_algo_diff},
};
diff --git a/xlators/cluster/afr/src/afr-self-heal-algorithm.h b/xlators/cluster/afr/src/afr-self-heal-algorithm.h
index 646fd2ee7..8998ce888 100644
--- a/xlators/cluster/afr/src/afr-self-heal-algorithm.h
+++ b/xlators/cluster/afr/src/afr-self-heal-algorithm.h
@@ -29,6 +29,14 @@ struct afr_sh_algorithm {
afr_sh_algo_fn fn;
};
-struct afr_sh_algorithm afr_self_heal_algorithms[1];
+struct afr_sh_algorithm afr_self_heal_algorithms[2];
+
+typedef struct {
+ uint8_t *checksum; /* array of MD5 checksums for each child
+ Each checksum is MD5_DIGEST_LEN bytes long */
+
+ unsigned char *write_needed;
+ size_t block_size;
+} afr_sh_algo_diff_private_t;
#endif /* __AFR_SELF_HEAL_ALGORITHM_H__ */
diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c
index 5f08a005c..4ed6071ab 100644
--- a/xlators/cluster/afr/src/afr-self-heal-data.c
+++ b/xlators/cluster/afr/src/afr-self-heal-data.c
@@ -476,7 +476,7 @@ afr_sh_data_trim_sinks (call_frame_t *frame, xlator_t *this)
struct afr_sh_algorithm *
afr_sh_data_pick_algo (call_frame_t *frame, xlator_t *this)
{
- return &afr_self_heal_algorithms[0]; /* full */
+ return &afr_self_heal_algorithms[1]; /* full */
}