diff options
-rw-r--r-- | xlators/cluster/afr/src/afr-self-heal-metadata.c | 116 |
1 files changed, 115 insertions, 1 deletions
diff --git a/xlators/cluster/afr/src/afr-self-heal-metadata.c b/xlators/cluster/afr/src/afr-self-heal-metadata.c index 68dc568bc65..87600df3bad 100644 --- a/xlators/cluster/afr/src/afr-self-heal-metadata.c +++ b/xlators/cluster/afr/src/afr-self-heal-metadata.c @@ -99,6 +99,96 @@ out: return ret; } +static inline uint64_t +mtime_ns(struct iatt *ia) +{ + uint64_t ret; + + ret = (((uint64_t)(ia->ia_mtime)) * 1000000000) + + (uint64_t)(ia->ia_mtime_nsec); + + return ret; +} + +/* + * When directory content is modified, [mc]time is updated. On + * Linux, the filesystem does it, while at least on NetBSD, the + * kernel file-system independant code does it. This means that + * when entries are added while bricks are down, the kernel sends + * a SETATTR [mc]time which will cause metadata split brain for + * the directory. In this case, clear the split brain by finding + * the source with the most recent modification date. + */ +static int +afr_dirtime_splitbrain_source (call_frame_t *frame, xlator_t *this, + struct afr_reply *replies, + unsigned char *locked_on) +{ + afr_private_t *priv = NULL; + int source = -1; + struct iatt source_ia; + struct iatt child_ia; + uint64_t mtime = 0; + int i; + int ret = -1; + + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + if (!locked_on[i]) + continue; + + if (!replies[i].valid) + continue; + + if (replies[i].op_ret != 0) + continue; + + if (mtime_ns(&replies[i].poststat) <= mtime) + continue; + + mtime = mtime_ns(&replies[i].poststat); + source = i; + } + + if (source == -1) + goto out; + + source_ia = replies[source].poststat; + if (source_ia.ia_type != IA_IFDIR) + goto out; + + for (i = 0; i < priv->child_count; i++) { + if (i == source) + continue; + + if (!replies[i].valid) + continue; + + if (replies[i].op_ret != 0) + continue; + + child_ia = replies[i].poststat; + + if (!IA_EQUAL(source_ia, child_ia, gfid) || + !IA_EQUAL(source_ia, child_ia, type) || + !IA_EQUAL(source_ia, child_ia, prot) || + !IA_EQUAL(source_ia, child_ia, uid) || + !IA_EQUAL(source_ia, child_ia, gid) || + !afr_xattrs_are_equal (replies[source].xdata, + replies[i].xdata)) + goto out; + } + + /* + * Metadata split brain is just about [amc]time + * We return our source. + */ + ret = source; +out: + return ret; +} + /* * Look for mismatching uid/gid or mode or user xattrs even if @@ -122,10 +212,34 @@ __afr_selfheal_metadata_finalize_source (call_frame_t *frame, xlator_t *this, sources_count = AFR_COUNT (sources, priv->child_count); if ((AFR_CMP (locked_on, healed_sinks, priv->child_count) == 0) - || !sources_count) { + || !sources_count) { + /* If this is a directory mtime/ctime only split brain + use the most recent */ + source = afr_dirtime_splitbrain_source (frame, this, + replies, locked_on); + if (source != -1) { + gf_log (this->name, GF_LOG_NOTICE, "clear time " + "split brain on %s", + uuid_utoa (replies[source].poststat.ia_gfid)); + sources[source] = 1; + + for (i = 0; i < priv->child_count; i++) { + if (i == source) + continue; + + if (!locked_on[i]) + continue; + + healed_sinks[i] = 1; + } + + return source; + } + if (!priv->metadata_splitbrain_forced_heal) { return -EIO; } + /* Metadata split brain, select one subvol arbitrarily */ for (i = 0; i < priv->child_count; i++) { |