summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorKotresh HR <khiremat@redhat.com>2017-11-03 09:57:08 -0400
committerAmar Tumballi <amarts@redhat.com>2018-05-06 01:32:30 +0000
commit3e24848f1e568ed3307683a9786f33d1ee15209b (patch)
tree44ea76321366fba4f2d2c702e07cf676140770c8
parent80262e2984b44609e9f572cf11dbc24fb6aea4cc (diff)
posix: APIs in posix to get and set time attributes
This is part of the effort to provide consistent time across distribute and replica set for time attributes (ctime, atime, mtime) of the object. This patch contains the APIs to set and get the attributes from on disk and in inode context. Credits: Rafi KC <rkavunga@redhat.com> Updates: #208 Change-Id: I5d3cba53eef90ac252cb8299c0da42ebab3bde9f Signed-off-by: Kotresh HR <khiremat@redhat.com>
-rw-r--r--doc/features/ctime.md68
-rw-r--r--libglusterfs/src/glusterfs.h1
-rw-r--r--xlators/storage/posix/src/Makefile.am5
-rw-r--r--xlators/storage/posix/src/posix-entry-ops.c18
-rw-r--r--xlators/storage/posix/src/posix-mem-types.h1
-rw-r--r--xlators/storage/posix/src/posix-messages.h4
-rw-r--r--xlators/storage/posix/src/posix-metadata-disk.h31
-rw-r--r--xlators/storage/posix/src/posix-metadata.c510
-rw-r--r--xlators/storage/posix/src/posix-metadata.h49
-rw-r--r--xlators/storage/posix/src/posix.h17
10 files changed, 683 insertions, 21 deletions
diff --git a/doc/features/ctime.md b/doc/features/ctime.md
new file mode 100644
index 00000000000..6c6cad90055
--- /dev/null
+++ b/doc/features/ctime.md
@@ -0,0 +1,68 @@
+#Consistent time attributes in gluster across replica/distribute
+
+
+####Problem:
+Traditionally gluster has been using time attributes (ctime, atime, mtime) of files/dirs from bricks. The problem with this approach is that, it is not consisteant across replica and distribute bricks. And applications which depend on it breaks as replica might not always return time attributes from same brick.
+
+Tar especially gives "file changed as we read it" whenever it detects ctime differences when stat is served from different bricks. The way we have been trying to solve it is to serve the stat structures from same brick in afr, max-time in dht. But it doesn't avoid the problem completely. Because there is no way to change ctime at the moment(lutimes() only allows mtime, atime), there is little we can do to make sure ctimes match after self-heals/xattr updates/rebalance.
+
+####Solution Proposed:
+Store time attribues (ctime, mtime, atime) as an xattr of the file. The xattr is updated based
+on the fop. If a filesystem fop changes only mtime and ctime, update only those in xattr for
+that file.
+
+####Design Overview:
+1) As part of each fop, top layer will generate a time stamp and pass it to the down along
+ with other information
+> - This will bring a dependency for NTP synced clients along with servers
+> - There can be a diff in time if the fop stuck in the xlator for various reason,
+for ex: because of locks.
+
+ 2) On the server, posix layer stores the value in the memory (inode ctx) and will sync the data periodically to the disk as an extended attr
+> - Of course sync call also will force it. And fop comes for an inode which is not linked, we do the sync immediately.
+
+ 3) Each time when inodes are created or initialized it read the data from disk and store in inode ctx.
+
+ 4) Before setting to inode_ctx we compare the timestamp stored and the timestamp received, and only store if the stored value is lesser than the current value.
+
+ 5) So in best case data will be stored and retrieved from the memory. We replace the values in iatt with the values in inode_ctx.
+
+ 6) File ops that changes the parent directory attr time need to be consistent across all the distributed directories across the subvolumes. (for eg: a create call will change ctime and mtime of parent dir)
+
+> - This has to handle separately because we only send the fop to the hashed subvolume.
+> - We can asynchronously send the timeupdate setattr fop to the other subvoumes and change the values for parent directory if the file fops is successful on hashed subvolume.
+> - This will have a window where the times are inconsistent across dht subvolume (Please provide your suggestions)
+
+7) Currently we have couple of mount options for time attributes like noatime, relatime , nodiratime etc. But we are not explicitly handled those options even if it is given as mount option when gluster mount.
+
+
+####Implementation Overview:
+This features involves changes in following xlators.
+> - utime xlator
+> - posix xlator
+
+#####utime xlator:
+This is a new client side xlator which does following tasks.
+
+1. It will generate a time stamp and passes it down in frame->root->ctime and over the network.
+2. Based on fop, it also decides the time attributes to be updated and this passed using "frame->root->flags"
+
+ Patches:
+ 1. https://review.gluster.org/#/c/19857/
+
+#####posix xlator:
+Following tasks are done in posix xlator:
+
+1. Provides APIs to set and get the xattr from backend. It also caches the xattr in inode context. During get, it updates time attributes stored in xattr into iatt structure.
+2. Based on the flags from utime xlator, relevant fops update the time attributes in the xattr.
+
+ Patches:
+ 1. https://review.gluster.org/#/c/19267/
+ 2. https://review.gluster.org/#/c/19795/
+ 3. https://review.gluster.org/#/c/19796/
+
+####Pending Work:
+1. Handling of time related mount options (noatime, realatime,etc)
+2. flag based create (depending on flags in open, create behaviour might change)
+3. Changes in dht for direcotory sync acrosss multiple subvolumes
+4. readdirp stat need to be worked on.
diff --git a/libglusterfs/src/glusterfs.h b/libglusterfs/src/glusterfs.h
index 90f2762b990..24f08a05f40 100644
--- a/libglusterfs/src/glusterfs.h
+++ b/libglusterfs/src/glusterfs.h
@@ -143,6 +143,7 @@
#define GFID2PATH_XATTR_KEY_PREFIX_LENGTH 18
#define VIRTUAL_GFID_XATTR_KEY_STR "glusterfs.gfid.string"
#define VIRTUAL_GFID_XATTR_KEY "glusterfs.gfid"
+#define GF_XATTR_MDATA_KEY "trusted.glusterfs.mdata"
#define UUID_CANONICAL_FORM_LEN 36
#define GET_ANCESTRY_PATH_KEY "glusterfs.ancestry.path"
diff --git a/xlators/storage/posix/src/Makefile.am b/xlators/storage/posix/src/Makefile.am
index 59d462336d1..d8af6221e4e 100644
--- a/xlators/storage/posix/src/Makefile.am
+++ b/xlators/storage/posix/src/Makefile.am
@@ -7,12 +7,13 @@ posix_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS)
posix_la_SOURCES = posix.c posix-helpers.c posix-handle.c posix-aio.c \
posix-gfid-path.c posix-entry-ops.c posix-inode-fd-ops.c \
- posix-common.c
+ posix-common.c posix-metadata.c
posix_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la $(LIBAIO) \
$(ACL_LIBS)
noinst_HEADERS = posix.h posix-mem-types.h posix-handle.h posix-aio.h \
- posix-messages.h posix-gfid-path.h posix-inode-handle.h
+ posix-messages.h posix-gfid-path.h posix-inode-handle.h \
+ posix-metadata.h posix-metadata-disk.h
AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \
-I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src \
diff --git a/xlators/storage/posix/src/posix-entry-ops.c b/xlators/storage/posix/src/posix-entry-ops.c
index 0abe380ee43..438fbe509e4 100644
--- a/xlators/storage/posix/src/posix-entry-ops.c
+++ b/xlators/storage/posix/src/posix-entry-ops.c
@@ -88,24 +88,6 @@ extern char *marker_xattrs[];
#endif
-/* Setting microseconds or nanoseconds depending on what's supported:
- The passed in `tv` can be
- struct timespec
- if supported (better, because it supports nanosecond resolution) or
- struct timeval
- otherwise. */
-#if HAVE_UTIMENSAT
-#define SET_TIMESPEC_NSEC_OR_TIMEVAL_USEC(tv, nanosecs) \
- tv.tv_nsec = nanosecs
-#define PATH_SET_TIMESPEC_OR_TIMEVAL(path, tv) \
- (sys_utimensat (AT_FDCWD, path, tv, AT_SYMLINK_NOFOLLOW))
-#else
-#define SET_TIMESPEC_NSEC_OR_TIMEVAL_USEC(tv, nanosecs) \
- tv.tv_usec = nanosecs / 1000
-#define PATH_SET_TIMESPEC_OR_TIMEVAL(path, tv) \
- (lutimes (path, tv))
-#endif
-
gf_boolean_t
posix_symlinks_match (xlator_t *this, loc_t *loc, uuid_t gfid)
{
diff --git a/xlators/storage/posix/src/posix-mem-types.h b/xlators/storage/posix/src/posix-mem-types.h
index b463c086be5..0180900ee8e 100644
--- a/xlators/storage/posix/src/posix-mem-types.h
+++ b/xlators/storage/posix/src/posix-mem-types.h
@@ -22,6 +22,7 @@ enum gf_posix_mem_types_ {
gf_posix_mt_trash_path,
gf_posix_mt_paiocb,
gf_posix_mt_inode_ctx_t,
+ gf_posix_mt_mdata_attr,
gf_posix_mt_end
};
#endif
diff --git a/xlators/storage/posix/src/posix-messages.h b/xlators/storage/posix/src/posix-messages.h
index a05f6739958..6b5332b6d09 100644
--- a/xlators/storage/posix/src/posix-messages.h
+++ b/xlators/storage/posix/src/posix-messages.h
@@ -136,7 +136,9 @@ GLFS_MSGID(POSIX,
P_MSG_LEASE_DISABLED,
P_MSG_ANCESTORY_FAILED,
P_MSG_DISK_SPACE_CHECK_FAILED,
- P_MSG_FALLOCATE_FAILED
+ P_MSG_FALLOCATE_FAILED,
+ P_MSG_STOREMDATA_FAILED,
+ P_MSG_FETCHMDATA_FAILED
);
#endif /* !_GLUSTERD_MESSAGES_H_ */
diff --git a/xlators/storage/posix/src/posix-metadata-disk.h b/xlators/storage/posix/src/posix-metadata-disk.h
new file mode 100644
index 00000000000..b25ad04a633
--- /dev/null
+++ b/xlators/storage/posix/src/posix-metadata-disk.h
@@ -0,0 +1,31 @@
+/*
+ Copyright (c) 2018 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef _POSIX_METADATA_DISK_H
+#define _POSIX_METADATA_DISK_H
+
+typedef struct gf_timespec_disk {
+ uint64_t tv_sec;
+ uint64_t tv_nsec;
+} gf_timespec_disk_t;
+
+/* posix_mdata_t on disk structure */
+
+typedef struct __attribute__ ((__packed__)) posix_mdata_disk {
+ /* version of structure, bumped up if any new member is added */
+ uint8_t version;
+ /* flags indicates valid fields in the structure */
+ uint64_t flags;
+ gf_timespec_disk_t ctime;
+ gf_timespec_disk_t mtime;
+ gf_timespec_disk_t atime;
+} posix_mdata_disk_t;
+
+#endif /* _POSIX_METADATA_DISK_H */
diff --git a/xlators/storage/posix/src/posix-metadata.c b/xlators/storage/posix/src/posix-metadata.c
new file mode 100644
index 00000000000..4e75a4f1411
--- /dev/null
+++ b/xlators/storage/posix/src/posix-metadata.c
@@ -0,0 +1,510 @@
+/*
+ Copyright (c) 2018 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#include "xlator.h"
+#include "posix-metadata.h"
+#include "posix-metadata-disk.h"
+#include "posix-handle.h"
+#include "posix-messages.h"
+#include "syscall.h"
+#include "compat-errno.h"
+#include "compat.h"
+
+static int gf_posix_xattr_enotsup_log;
+
+/* posix_mdata_to_disk converts posix_mdata_t into network byte order to
+ * save it on disk in machine independant format
+ */
+static inline void
+posix_mdata_to_disk (posix_mdata_disk_t *out, posix_mdata_t *in)
+{
+ out->version = in->version;
+ out->flags = htobe64(in->flags);
+
+ out->ctime.tv_sec = htobe64(in->ctime.tv_sec);
+ out->ctime.tv_nsec = htobe64(in->ctime.tv_nsec);
+
+ out->mtime.tv_sec = htobe64(in->mtime.tv_sec);
+ out->mtime.tv_nsec = htobe64(in->mtime.tv_nsec);
+
+ out->atime.tv_sec = htobe64(in->atime.tv_sec);
+ out->atime.tv_nsec = htobe64(in->atime.tv_nsec);
+}
+
+/* posix_mdata_from_disk converts posix_mdata_disk_t into host byte order
+ */
+static inline void
+posix_mdata_from_disk (posix_mdata_t *out, posix_mdata_disk_t *in)
+{
+ out->version = in->version;
+ out->flags = be64toh(in->flags);
+
+ out->ctime.tv_sec = be64toh(in->ctime.tv_sec);
+ out->ctime.tv_nsec = be64toh(in->ctime.tv_nsec);
+
+ out->mtime.tv_sec = be64toh(in->mtime.tv_sec);
+ out->mtime.tv_nsec = be64toh(in->mtime.tv_nsec);
+
+ out->atime.tv_sec = be64toh(in->atime.tv_sec);
+ out->atime.tv_nsec = be64toh(in->atime.tv_nsec);
+}
+
+/* posix_fetch_mdata_xattr fetches the posix_mdata_t from disk */
+static int
+posix_fetch_mdata_xattr (xlator_t *this, const char *real_path_arg, int _fd,
+ inode_t *inode, posix_mdata_t *metadata)
+{
+ size_t size = -1;
+ int op_errno = 0;
+ int op_ret = -1;
+ char *value = NULL;
+ gf_boolean_t fd_based_fop = _gf_false;
+ char gfid_str[64] = {0};
+ char *real_path = NULL;
+
+ char *key = GF_XATTR_MDATA_KEY;
+
+ if (!metadata) {
+ op_ret = -1;
+ goto out;
+ }
+
+ if (_fd != -1) {
+ fd_based_fop = _gf_true;
+ }
+ if (!(fd_based_fop || real_path_arg)) {
+ MAKE_HANDLE_PATH (real_path, this, inode->gfid, NULL);
+ if (!real_path) {
+ uuid_utoa_r (inode->gfid, gfid_str);
+ gf_msg (this->name, GF_LOG_WARNING, op_errno,
+ P_MSG_LSTAT_FAILED, "lstat on gfid %s failed",
+ gfid_str);
+ op_ret = -1;
+ goto out;
+ }
+ }
+
+ if (fd_based_fop) {
+ size = sys_fgetxattr (_fd, key, NULL, 0);
+ } else if (real_path_arg) {
+ size = sys_lgetxattr (real_path_arg, key, NULL, 0);
+ } else if (real_path) {
+ size = sys_lgetxattr (real_path, key, NULL, 0);
+ }
+
+ if (size == -1) {
+ op_errno = errno;
+ if ((op_errno == ENOTSUP) || (op_errno == ENOSYS)) {
+ GF_LOG_OCCASIONALLY (gf_posix_xattr_enotsup_log,
+ this->name, GF_LOG_WARNING,
+ "Extended attributes not "
+ "supported (try remounting"
+ " brick with 'user_xattr' "
+ "flag)");
+ } else if (op_errno == ENOATTR ||
+ op_errno == ENODATA) {
+ gf_msg_debug (this->name, 0,
+ "No such attribute:%s for file %s "
+ "gfid: %s",
+ key, real_path ? real_path : (real_path_arg ? real_path_arg : "null"),
+ uuid_utoa(inode->gfid));
+ } else {
+ gf_msg (this->name, GF_LOG_DEBUG, op_errno,
+ P_MSG_XATTR_FAILED, "getxattr failed"
+ " on %s gfid: %s key: %s ",
+ real_path ? real_path : (real_path_arg ? real_path_arg : "null"),
+ uuid_utoa(inode->gfid), key);
+ }
+ op_ret = -1;
+ goto out;
+ }
+
+ value = GF_CALLOC (size + 1, sizeof(char), gf_posix_mt_char);
+ if (!value) {
+ op_ret = -1;
+ op_errno = ENOMEM;
+ goto out;
+ }
+
+ if (fd_based_fop) {
+ size = sys_fgetxattr (_fd, key, value, size);
+ } else if (real_path_arg) {
+ size = sys_lgetxattr (real_path_arg, key, value, size);
+ } else if (real_path) {
+ size = sys_lgetxattr (real_path, key, value, size);
+ }
+ if (size == -1) {
+ op_ret = -1;
+ op_errno = errno;
+ gf_msg (this->name, GF_LOG_ERROR, errno,
+ P_MSG_XATTR_FAILED, "getxattr failed on "
+ " on %s gfid: %s key: %s ",
+ real_path ? real_path : (real_path_arg ? real_path_arg : "null"),
+ uuid_utoa(inode->gfid), key);
+ goto out;
+ }
+
+ posix_mdata_from_disk (metadata, (posix_mdata_disk_t*)value);
+
+ op_ret = 0;
+out:
+ GF_FREE (value);
+ return op_ret;
+}
+
+/* posix_store_mdata_xattr stores the posix_mdata_t on disk */
+static int
+posix_store_mdata_xattr (xlator_t *this, const char *real_path_arg, int fd,
+ inode_t *inode, posix_mdata_t *metadata)
+{
+ char *real_path = NULL;
+ int op_ret = 0;
+ gf_boolean_t fd_based_fop = _gf_false;
+ char *key = GF_XATTR_MDATA_KEY;
+ char gfid_str[64] = {0};
+ posix_mdata_disk_t disk_metadata;
+
+ if (!metadata) {
+ op_ret = -1;
+ goto out;
+ }
+
+ if (fd != -1) {
+ fd_based_fop = _gf_true;
+ }
+ if (!(fd_based_fop || real_path_arg)) {
+ MAKE_HANDLE_PATH (real_path, this, inode->gfid, NULL);
+ if (!real_path) {
+ uuid_utoa_r (inode->gfid, gfid_str);
+ gf_msg (this->name, GF_LOG_DEBUG, errno,
+ P_MSG_LSTAT_FAILED, "lstat on gfid %s failed",
+ gfid_str);
+ op_ret = -1;
+ goto out;
+ }
+ }
+
+ /* Set default version as 1 */
+ posix_mdata_to_disk (&disk_metadata, metadata);
+
+ if (fd_based_fop) {
+ op_ret = sys_fsetxattr (fd, key,
+ (void *) &disk_metadata,
+ sizeof (posix_mdata_disk_t), 0);
+ } else if (real_path_arg) {
+ op_ret = sys_lsetxattr (real_path_arg, key,
+ (void *) &disk_metadata,
+ sizeof (posix_mdata_disk_t), 0);
+ } else if (real_path) {
+ op_ret = sys_lsetxattr (real_path, key,
+ (void *) &disk_metadata,
+ sizeof (posix_mdata_disk_t), 0);
+ }
+
+#ifdef GF_DARWIN_HOST_OS
+ if (real_path_arg) {
+ posix_dump_buffer(this, real_path_arg, key, value, 0);
+ } else if (real_path) {
+ posix_dump_buffer(this, real_path, key, value, 0);
+ }
+#endif
+out:
+ if (op_ret < 0) {
+ gf_msg (this->name, GF_LOG_ERROR, errno, P_MSG_XATTR_FAILED,
+ "file: %s: gfid: %s key:%s ",
+ real_path ? real_path : (real_path_arg ? real_path_arg : "null"),
+ uuid_utoa(inode->gfid), key);
+ }
+ return op_ret;
+}
+
+/* _posix_get_mdata_xattr gets posix_mdata_t from inode context. If it fails
+ * to get it from inode context, gets it from disk. This is with out inode lock.
+ */
+int
+__posix_get_mdata_xattr (xlator_t *this, const char *real_path, int _fd,
+ inode_t *inode, struct iatt *stbuf)
+{
+ posix_mdata_t *mdata = NULL;
+ int ret = -1;
+
+ GF_VALIDATE_OR_GOTO (this->name, inode, out);
+
+ ret = __inode_ctx_get1 (inode, this,
+ (uint64_t *)&mdata);
+ if (ret == -1 || !mdata) {
+ mdata = GF_CALLOC (1, sizeof (posix_mdata_t),
+ gf_posix_mt_mdata_attr);
+ if (!mdata) {
+ ret = -1;
+ goto out;
+ }
+
+ ret = posix_fetch_mdata_xattr (this, real_path, _fd, inode,
+ mdata);
+
+ if (ret == 0) {
+ /* Got mdata from disk, set it in inode ctx. This case
+ * is hit when in-memory status is lost due to brick
+ * down scenario
+ */
+ __inode_ctx_set1 (inode, this, (uint64_t *)&mdata);
+ } else {
+ /* Failed to get mdata from disk, xattr missing
+ * Even new file creation hits here first as posix_pstat
+ * is generally done before posix_set_ctime
+ */
+ if (stbuf) {
+ mdata->version = 1;
+ mdata->flags = 0;
+ mdata->ctime.tv_sec = stbuf->ia_ctime;
+ mdata->ctime.tv_nsec = stbuf->ia_ctime_nsec;
+ mdata->mtime.tv_sec = stbuf->ia_mtime;
+ mdata->mtime.tv_nsec = stbuf->ia_mtime_nsec;
+ mdata->atime.tv_sec = stbuf->ia_atime;
+ mdata->atime.tv_nsec = stbuf->ia_atime_nsec;
+ ret = posix_store_mdata_xattr (this, real_path,
+ _fd, inode,
+ mdata);
+ if (ret) {
+ gf_msg (this->name, GF_LOG_ERROR, errno,
+ P_MSG_STOREMDATA_FAILED,
+ "file: %s: gfid: %s key:%s ",
+ real_path ? real_path : "null",
+ uuid_utoa(inode->gfid),
+ GF_XATTR_MDATA_KEY);
+ goto out;
+ }
+ __inode_ctx_set1 (inode, this, (uint64_t *)&mdata);
+ } else {
+ /* This case should not be hit. If it hits, don't
+ * fail, log warning, free mdata and move on
+ */
+ gf_msg (this->name, GF_LOG_WARNING, errno,
+ P_MSG_FETCHMDATA_FAILED,
+ "file: %s: gfid: %s key:%s ",
+ real_path ? real_path : "null",
+ uuid_utoa(inode->gfid),
+ GF_XATTR_MDATA_KEY);
+ GF_FREE (mdata);
+ ret = 0;
+ goto out;
+ }
+ }
+ }
+
+ ret = 0;
+
+ if (ret == 0 && stbuf) {
+ stbuf->ia_ctime = mdata->ctime.tv_sec;
+ stbuf->ia_ctime_nsec = mdata->ctime.tv_nsec;
+ stbuf->ia_mtime = mdata->mtime.tv_sec;
+ stbuf->ia_mtime_nsec = mdata->mtime.tv_nsec;
+ stbuf->ia_atime = mdata->atime.tv_sec;
+ stbuf->ia_atime_nsec = mdata->atime.tv_nsec;
+ }
+
+out:
+ return ret;
+}
+
+/* posix_get_mdata_xattr gets posix_mdata_t from inode context. If it fails
+ * to get it from inode context, gets it from disk. This is with inode lock.
+ */
+int
+posix_get_mdata_xattr (xlator_t *this, const char *real_path, int _fd,
+ inode_t *inode, struct iatt *stbuf)
+{
+ int ret = -1;
+
+ GF_VALIDATE_OR_GOTO (this->name, inode, out);
+
+ LOCK (&inode->lock);
+ {
+ ret = __posix_get_mdata_xattr (this, real_path, _fd, inode, stbuf);
+ }
+ UNLOCK (&inode->lock);
+
+out:
+ return ret;
+}
+
+static int
+posix_compare_timespec (struct timespec *first, struct timespec *second)
+{
+ if (first->tv_sec == second->tv_sec)
+ return first->tv_nsec - second->tv_nsec;
+ else
+ return first->tv_sec - second->tv_sec;
+}
+
+/* posix_update_mdata_xattr updates the posix_mdata_t based on the flag
+ * in inode context and stores it on disk
+ */
+int
+posix_set_mdata_xattr (xlator_t *this, const char *real_path, int fd,
+ inode_t *inode, struct timespec *time,
+ struct iatt *stbuf, posix_mdata_flag_t *flag)
+{
+ posix_mdata_t *mdata = NULL;
+ int ret = -1;
+
+ GF_VALIDATE_OR_GOTO ("posix", this, out);
+ GF_VALIDATE_OR_GOTO (this->name, inode, out);
+ GF_VALIDATE_OR_GOTO (this->name, inode->gfid, out);
+
+ LOCK (&inode->lock);
+ {
+ ret = __inode_ctx_get1 (inode, this,
+ (uint64_t *)&mdata);
+ if (ret == -1 || !mdata) {
+ /*
+ * Do we need to fetch the data from xattr
+ * If we does we can compare the value and store
+ * the largest data in inode ctx.
+ */
+ mdata = GF_CALLOC (1, sizeof (posix_mdata_t),
+ gf_posix_mt_mdata_attr);
+ if (!mdata) {
+ ret = -1;
+ goto unlock;
+ }
+
+ ret = posix_fetch_mdata_xattr (this, real_path, fd,
+ inode,
+ (void *)mdata);
+ if (ret == 0) {
+ /* Got mdata from disk, set it in inode ctx. This case
+ * is hit when in-memory status is lost due to brick
+ * down scenario
+ */
+ __inode_ctx_set1 (inode, this,
+ (uint64_t *)&mdata);
+ } else if (ret && stbuf) {
+ /*
+ * This is the first time creating the time
+ * attr. This happens when you activate this
+ * feature, and the legacy file will not have
+ * any xattr set.
+ *
+ * New files will create extended attributes.
+ */
+
+ /*
+ * TODO: This is wrong approach, because before
+ * creating fresh xattr, we should consult
+ * to all replica and/or distribution set.
+ *
+ * We should contact the time management
+ * xlators, and ask them to create an xattr.
+ */
+ mdata->version = 1;
+ mdata->flags = 0;
+ mdata->ctime.tv_sec = stbuf->ia_ctime;
+ mdata->ctime.tv_nsec = stbuf->ia_ctime_nsec;
+ mdata->atime.tv_sec = stbuf->ia_atime;
+ mdata->atime.tv_nsec = stbuf->ia_atime_nsec;
+ mdata->mtime.tv_sec = stbuf->ia_mtime;
+ mdata->mtime.tv_nsec = stbuf->ia_mtime_nsec;
+
+ __inode_ctx_set1 (inode, this,
+ (uint64_t *)&mdata);
+ }
+ }
+ if (flag->ctime &&
+ posix_compare_timespec (time, &mdata->ctime) > 0) {
+ mdata->ctime = *time;
+ }
+ if (flag->mtime &&
+ posix_compare_timespec (time, &mdata->mtime) > 0) {
+ mdata->mtime = *time;
+ }
+ if (flag->atime &&
+ posix_compare_timespec (time, &mdata->atime) > 0) {
+ mdata->atime = *time;
+ }
+
+ if (inode->ia_type == IA_INVAL) {
+ /*
+ * TODO: This is non-linked inode. So we have to sync the
+ * data into backend. Because inode_link may return
+ * a different inode.
+ */
+ /* ret = posix_store_mdata_xattr (this, loc, fd,
+ * mdata); */
+ }
+ /*
+ * With this patch set, we are setting the xattr for each update
+ * We should evaluate the performance, and based on that we can
+ * decide on asynchronous updation.
+ */
+ ret = posix_store_mdata_xattr (this, real_path, fd, inode,
+ mdata);
+ if (ret) {
+ gf_msg (this->name, GF_LOG_ERROR, errno,
+ P_MSG_STOREMDATA_FAILED,
+ "file: %s: gfid: %s key:%s ",
+ real_path ? real_path : "null",
+ uuid_utoa(inode->gfid), GF_XATTR_MDATA_KEY);
+ goto out;
+ }
+ }
+unlock:
+ UNLOCK (&inode->lock);
+out:
+ if (ret == 0 && stbuf) {
+ stbuf->ia_ctime = mdata->ctime.tv_sec;
+ stbuf->ia_ctime_nsec = mdata->ctime.tv_nsec;
+ stbuf->ia_mtime = mdata->mtime.tv_sec;
+ stbuf->ia_mtime_nsec = mdata->mtime.tv_nsec;
+ stbuf->ia_atime = mdata->atime.tv_sec;
+ stbuf->ia_atime_nsec = mdata->atime.tv_nsec;
+ }
+
+ return ret;
+}
+
+/* posix_update_utime_in_mdata updates the posix_mdata_t when mtime/atime
+ * is modified using syscall
+ */
+int
+posix_update_utime_in_mdata (xlator_t *this, const char *real_path, int fd,
+ inode_t *inode,
+ struct iatt *stbuf, int valid)
+{
+ int32_t ret = -1;
+#if defined(HAVE_UTIMENSAT)
+ struct timespec tv = {0, };
+#else
+ struct timeval tv = {0, };
+#endif
+ posix_mdata_flag_t flag = {0, };
+
+ if ((valid & GF_SET_ATTR_ATIME) == GF_SET_ATTR_ATIME) {
+ tv.tv_sec = stbuf->ia_atime;
+ SET_TIMESPEC_NSEC_OR_TIMEVAL_USEC(tv, stbuf->ia_atime_nsec);
+
+ flag.ctime = 0;
+ flag.mtime = 0;
+ flag.atime = 1;
+ }
+
+ if ((valid & GF_SET_ATTR_MTIME) == GF_SET_ATTR_MTIME) {
+ tv.tv_sec = stbuf->ia_mtime;
+ SET_TIMESPEC_NSEC_OR_TIMEVAL_USEC(tv, stbuf->ia_mtime_nsec);
+ flag.ctime = 1;
+ flag.mtime = 1;
+ flag.atime = 0;
+ }
+
+ ret = posix_set_mdata_xattr (this, real_path, -1, inode, &tv, NULL,
+ &flag);
+ return ret;
+}
diff --git a/xlators/storage/posix/src/posix-metadata.h b/xlators/storage/posix/src/posix-metadata.h
new file mode 100644
index 00000000000..b654c83230c
--- /dev/null
+++ b/xlators/storage/posix/src/posix-metadata.h
@@ -0,0 +1,49 @@
+/*
+ Copyright (c) 2018 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef _POSIX_METADATA_H
+#define _POSIX_METADATA_H
+
+#include "posix-metadata-disk.h"
+
+/* In memory representation posix metadata xattr */
+typedef struct {
+ /* version of structure, bumped up if any new member is added */
+ uint8_t version;
+ /* flags indicates valid fields in the structure */
+ uint64_t flags;
+ struct timespec ctime;
+ struct timespec mtime;
+ struct timespec atime;
+} posix_mdata_t;
+
+typedef struct {
+ unsigned short ctime : 1;
+ unsigned short mtime : 1;
+ unsigned short atime : 1;
+} posix_mdata_flag_t;
+
+/* With inode lock*/
+int
+posix_get_mdata_xattr (xlator_t *this, const char *real_path, int _fd,
+ inode_t *inode, struct iatt *stbuf);
+/* With out inode lock*/
+int
+__posix_get_mdata_xattr (xlator_t *this, const char *real_path, int _fd,
+ inode_t *inode, struct iatt *stbuf);
+int
+posix_set_mdata_xattr (xlator_t *this, const char *real_path, int fd,
+ inode_t *inode, struct timespec *time,
+ struct iatt *stbuf, posix_mdata_flag_t *flag);
+int
+posix_update_utime_in_mdata (xlator_t *this, const char *real_path, int fd,
+ inode_t *inode, struct iatt *stbuf, int valid);
+
+#endif /* _POSIX_METADATA_H */
diff --git a/xlators/storage/posix/src/posix.h b/xlators/storage/posix/src/posix.h
index 5f28be9e414..60a7132ddcb 100644
--- a/xlators/storage/posix/src/posix.h
+++ b/xlators/storage/posix/src/posix.h
@@ -77,6 +77,23 @@
} \
} while (0)
+/* Setting microseconds or nanoseconds depending on what's supported:
+ The passed in `tv` can be
+ struct timespec
+ if supported (better, because it supports nanosecond resolution) or
+ struct timeval
+ otherwise. */
+#if HAVE_UTIMENSAT
+#define SET_TIMESPEC_NSEC_OR_TIMEVAL_USEC(tv, nanosecs) \
+ tv.tv_nsec = nanosecs
+#define PATH_SET_TIMESPEC_OR_TIMEVAL(path, tv) \
+ (sys_utimensat (AT_FDCWD, path, tv, AT_SYMLINK_NOFOLLOW))
+#else
+#define SET_TIMESPEC_NSEC_OR_TIMEVAL_USEC(tv, nanosecs) \
+ tv.tv_usec = nanosecs / 1000
+#define PATH_SET_TIMESPEC_OR_TIMEVAL(path, tv) \
+ (lutimes (path, tv))
+#endif
#define GFID_NULL_CHECK_AND_GOTO(frame, this, loc, xattr_req, op_ret, \
op_errno, out) \