diff options
24 files changed, 1667 insertions, 205 deletions
diff --git a/api/src/glfs-fops.c b/api/src/glfs-fops.c index f0c769def29..94b31ef076a 100644 --- a/api/src/glfs-fops.c +++ b/api/src/glfs-fops.c @@ -2839,27 +2839,6 @@ out: GFAPI_SYMVER_PUBLIC_DEFAULT(glfs_flistxattr, 3.4.0); - -dict_t * -dict_for_key_value (const char *name, const char *value, size_t size) -{ - dict_t *xattr = NULL; - int ret = 0; - - xattr = dict_new (); - if (!xattr) - return NULL; - - ret = dict_set_static_bin (xattr, (char *)name, (void *)value, size); - if (ret) { - dict_destroy (xattr); - xattr = NULL; - } - - return xattr; -} - - int glfs_setxattr_common (struct glfs *fs, const char *path, const char *name, const void *value, size_t size, int flags, int follow) diff --git a/configure.ac b/configure.ac index ee89ce99167..89ea35ce6f1 100644 --- a/configure.ac +++ b/configure.ac @@ -165,6 +165,7 @@ AC_CONFIG_FILES([Makefile xlators/features/bit-rot/Makefile xlators/features/bit-rot/src/Makefile xlators/features/bit-rot/src/stub/Makefile + xlators/features/bit-rot/src/bitd/Makefile xlators/playground/Makefile xlators/playground/template/Makefile xlators/playground/template/src/Makefile diff --git a/libglusterfs/src/Makefile.am b/libglusterfs/src/Makefile.am index 818de91cf36..33de0a287c7 100644 --- a/libglusterfs/src/Makefile.am +++ b/libglusterfs/src/Makefile.am @@ -11,6 +11,7 @@ libglusterfs_la_LIBADD = @LEXLIB@ $(ZLIB_LIBS) $(MATH_LIB) libglusterfs_la_LDFLAGS = -version-info $(LIBGLUSTERFS_LT_VERSION) lib_LTLIBRARIES = libglusterfs.la +libgfchangelogdir = $(includedir)/glusterfs/gfchangelog CONTRIB_BUILDDIR = $(top_builddir)/contrib @@ -53,6 +54,8 @@ noinst_HEADERS = common-utils.h defaults.h dict.h glusterfs.h hashfn.h timespec. 
unittest/unittest.h quota-common-utils.h rot-buffs.h \ $(CONTRIBDIR)/timer-wheel/timer-wheel.h +libgfchangelog_HEADERS = changelog.h + EXTRA_DIST = graph.l graph.y graph.lex.c: graph.l y.tab.h diff --git a/xlators/features/changelog/lib/src/changelog.h b/libglusterfs/src/changelog.h index 08307810704..08307810704 100644 --- a/xlators/features/changelog/lib/src/changelog.h +++ b/libglusterfs/src/changelog.h diff --git a/libglusterfs/src/common-utils.c b/libglusterfs/src/common-utils.c index 751dc8a2e50..1adfdaa1673 100644 --- a/libglusterfs/src/common-utils.c +++ b/libglusterfs/src/common-utils.c @@ -3245,6 +3245,29 @@ gf_set_log_ident (cmd_args_t *cmd_args) } int +gf_thread_cleanup_xint (pthread_t thread) +{ + int ret = 0; + void *res = NULL; + + ret = pthread_cancel (thread); + if (ret != 0) + goto error_return; + + ret = pthread_join (thread, &res); + if (ret != 0) + goto error_return; + + if (res != PTHREAD_CANCELED) + goto error_return; + + ret = 0; + + error_return: + return ret; +} + +int gf_thread_create (pthread_t *thread, const pthread_attr_t *attr, void *(*start_routine)(void *), void *arg) { diff --git a/libglusterfs/src/common-utils.h b/libglusterfs/src/common-utils.h index c1deeef3c9d..6ac1442b0bf 100644 --- a/libglusterfs/src/common-utils.h +++ b/libglusterfs/src/common-utils.h @@ -707,4 +707,8 @@ gf_get_index_by_elem (char **array, char *elem); int glusterfs_is_local_pathinfo (char *pathinfo, gf_boolean_t *local); + +int +gf_thread_cleanup_xint (pthread_t thread); + #endif /* _COMMON_UTILS_H */ diff --git a/libglusterfs/src/dict.c b/libglusterfs/src/dict.c index 81db64dfd40..b8b6aeab248 100644 --- a/libglusterfs/src/dict.c +++ b/libglusterfs/src/dict.c @@ -2926,3 +2926,22 @@ dict_dump_to_statedump (dict_t *dict, char *dict_name, char *domain) return; } + +dict_t * +dict_for_key_value (const char *name, const char *value, size_t size) +{ + dict_t *xattr = NULL; + int ret = 0; + + xattr = dict_new (); + if (!xattr) + return NULL; + + ret = 
dict_set_static_bin (xattr, (char *)name, (void *)value, size); + if (ret) { + dict_destroy (xattr); + xattr = NULL; + } + + return xattr; +} diff --git a/libglusterfs/src/dict.h b/libglusterfs/src/dict.h index a1a4c85f711..3708eede06d 100644 --- a/libglusterfs/src/dict.h +++ b/libglusterfs/src/dict.h @@ -260,4 +260,8 @@ int dict_dump_to_str (dict_t *dict, char *dump, int dumpsize, char *format); gf_boolean_t dict_match_everything (dict_t *d, char *k, data_t *v, void *data); + +dict_t * +dict_for_key_value (const char *name, const char *value, size_t size); + #endif diff --git a/libglusterfs/src/mem-types.h b/libglusterfs/src/mem-types.h index a24e5731114..fc06d52239b 100644 --- a/libglusterfs/src/mem-types.h +++ b/libglusterfs/src/mem-types.h @@ -148,6 +148,8 @@ enum gf_common_mem_types_ { /* glusterd can load the nfs-xlator dynamically and needs these two */ gf_common_mt_nfs_netgroups = 130, gf_common_mt_nfs_exports = 131, + gf_common_mt_gf_brick_spec_t = 132, + gf_common_mt_gf_timer_entry_t = 133, gf_common_mt_end }; #endif diff --git a/libglusterfs/src/syncop-utils.c b/libglusterfs/src/syncop-utils.c index 53768acd0ac..2fc95fa3e70 100644 --- a/libglusterfs/src/syncop-utils.c +++ b/libglusterfs/src/syncop-utils.c @@ -133,6 +133,92 @@ out: return ret; } +/** + * Syncop_ftw_throttle can be used in a configurable way to control + * the speed at which crawling is done. It takes 2 more arguments + * compared to syncop_ftw. + * After @count entries are finished in a directory (to be + * precise, @count files) sleep for @sleep_time seconds. + * If either @count or @sleep_time is <=0, then it behaves similar to + * syncop_ftw. 
+ */ +int +syncop_ftw_throttle (xlator_t *subvol, loc_t *loc, int pid, void *data, + int (*fn) (xlator_t *subvol, gf_dirent_t *entry, + loc_t *parent, void *data), + int count, int sleep_time) +{ + loc_t child_loc = {0, }; + fd_t *fd = NULL; + uint64_t offset = 0; + gf_dirent_t *entry = NULL; + int ret = 0; + gf_dirent_t entries; + int tmp = 0; + + /* Throttling needs both knobs to be positive; otherwise fall back + * to the plain, unthrottled walk. */ + if (count <= 0 || sleep_time <= 0) { + ret = syncop_ftw (subvol, loc, pid, data, fn); + goto out; + } + + ret = syncop_dirfd (subvol, loc, &fd, pid); + if (ret) + goto out; + + INIT_LIST_HEAD (&entries.list); + + while ((ret = syncop_readdirp (subvol, fd, 131072, offset, 0, + &entries))) { + if (ret < 0) + break; + + if (ret > 0) { + /* If the entries are only '.', and '..' then ret + * value will be non-zero. so set it to zero here. */ + ret = 0; + } + + tmp = 0; + + list_for_each_entry (entry, &entries.list, list) { + offset = entry->d_off; + + if (!strcmp (entry->d_name, ".") || + !strcmp (entry->d_name, "..")) + continue; + + if (++tmp >= count) { + /* restart the budget: pause once per @count + * entries, not on every entry afterwards */ + tmp = 0; + sleep (sleep_time); + } + + gf_link_inode_from_dirent (NULL, fd->inode, entry); + + ret = fn (subvol, entry, loc, data); + if (ret) + continue; + + if (entry->d_stat.ia_type == IA_IFDIR) { + child_loc.inode = inode_ref (entry->inode); + uuid_copy (child_loc.gfid, entry->inode->gfid); + ret = syncop_ftw_throttle (subvol, &child_loc, + pid, data, fn, count, + sleep_time); + loc_wipe (&child_loc); + if (ret) + continue; + } + } + + gf_dirent_free (&entries); + if (ret) + break; + } + +out: + if (fd) + fd_unref (fd); + return ret; +} + int syncop_dir_scan (xlator_t *subvol, loc_t *loc, int pid, void *data, int (*fn) (xlator_t *subvol, gf_dirent_t *entry, loc_t *parent, diff --git a/libglusterfs/src/syncop-utils.h b/libglusterfs/src/syncop-utils.h index 918b3b7c666..7a9ccacb285 100644 --- a/libglusterfs/src/syncop-utils.h +++ b/libglusterfs/src/syncop-utils.h @@ -30,4 +30,10 @@ syncop_is_subvol_local (xlator_t *this, loc_t *loc, gf_boolean_t *is_local); int syncop_gfid_to_path (inode_table_t 
*itable, xlator_t *subvol, uuid_t gfid, char **path_p); + +int +syncop_ftw_throttle (xlator_t *subvol, loc_t *loc, int pid, void *data, + int (*fn) (xlator_t *subvol, gf_dirent_t *entry, + loc_t *parent, void *data), + int count, int sleep_time); #endif /* _SYNCOP_H */ diff --git a/libglusterfs/src/xlator.c b/libglusterfs/src/xlator.c index cc4726e0ea5..00f411e275b 100644 --- a/libglusterfs/src/xlator.c +++ b/libglusterfs/src/xlator.c @@ -1024,3 +1024,13 @@ glusterd_check_log_level (const char *value) return log_level; } +int +xlator_subvolume_count (xlator_t *this) +{ + int i = 0; + xlator_list_t *list = NULL; + + for (list = this->children; list; list = list->next) + i++; + return i; +} diff --git a/libglusterfs/src/xlator.h b/libglusterfs/src/xlator.h index 5a0b114d6a8..9bea950d720 100644 --- a/libglusterfs/src/xlator.h +++ b/libglusterfs/src/xlator.h @@ -989,4 +989,7 @@ glusterfs_leaf_position(xlator_t *tgt); int glusterfs_reachable_leaves(xlator_t *base, dict_t *leaves); +int +xlator_subvolume_count (xlator_t *this); + #endif /* _XLATOR_H */ diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c index 0af46993a34..6c06fd9b7b5 100644 --- a/xlators/cluster/afr/src/afr-common.c +++ b/xlators/cluster/afr/src/afr-common.c @@ -4164,18 +4164,6 @@ out: return; } -int -xlator_subvolume_count (xlator_t *this) -{ - int i = 0; - xlator_list_t *list = NULL; - - for (list = this->children; list; list = list->next) - i++; - return i; -} - - void afr_handle_open_fd_count (call_frame_t *frame, xlator_t *this) { diff --git a/xlators/features/bit-rot/src/Makefile.am b/xlators/features/bit-rot/src/Makefile.am index 1f59a71ebea..b5e4a7d62a0 100644 --- a/xlators/features/bit-rot/src/Makefile.am +++ b/xlators/features/bit-rot/src/Makefile.am @@ -1,18 +1 @@ - -SUBDIRS = stub - -xlator_LTLIBRARIES = bit-rot.la -xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features - -bit_rot_la_LDFLAGS = -module -avoid-version - -bit_rot_la_SOURCES = 
bit-rot.c -bit_rot_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la - -noinst_HEADERS = bit-rot.h bit-rot-mem-types.h - -AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src - -AM_CFLAGS = -Wall $(GF_CFLAGS) - -CLEANFILES = +SUBDIRS = stub bitd diff --git a/xlators/features/bit-rot/src/bit-rot-mem-types.h b/xlators/features/bit-rot/src/bit-rot-mem-types.h deleted file mode 100644 index 19c2aca0f8a..00000000000 --- a/xlators/features/bit-rot/src/bit-rot-mem-types.h +++ /dev/null @@ -1,24 +0,0 @@ -/* - Copyright (c) 2014 Red Hat, Inc. <http://www.redhat.com> - This file is part of GlusterFS. - - This file is licensed to you under your choice of the GNU Lesser - General Public License, version 3 or any later version (LGPLv3 or - later), or the GNU General Public License, version 2 (GPLv2), in all - cases as published by the Free Software Foundation. -*/ - -#ifndef _BR_MEM_TYPES_H -#define _BR_MEM_TYPES_H - -#include "mem-types.h" - -enum br_mem_types { - gf_br_mt_br_private_t = gf_common_mt_end + 1, - gf_br_mt_br_local_t, - gf_br_mt_br_inode_t, - gf_br_mt_br_fd_t, - gf_br_mt_end -}; - -#endif diff --git a/xlators/features/bit-rot/src/bit-rot.c b/xlators/features/bit-rot/src/bit-rot.c deleted file mode 100644 index 0ba8b80825b..00000000000 --- a/xlators/features/bit-rot/src/bit-rot.c +++ /dev/null @@ -1,89 +0,0 @@ -/* - Copyright (c) 2014 Red Hat, Inc. <http://www.redhat.com> - This file is part of GlusterFS. - - This file is licensed to you under your choice of the GNU Lesser - General Public License, version 3 or any later version (LGPLv3 or - later), or the GNU General Public License, version 2 (GPLv2), in all - cases as published by the Free Software Foundation. 
-*/ - -#include <ctype.h> -#include <sys/uio.h> - -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "glusterfs.h" -#include "xlator.h" -#include "logging.h" - -#include "bit-rot.h" -#include "bit-rot-mem-types.h" - -int32_t -mem_acct_init (xlator_t *this) -{ - int32_t ret = -1; - - if (!this) - return ret; - - ret = xlator_mem_acct_init (this, gf_br_mt_end + 1); - - if (ret != 0) { - gf_log (this->name, GF_LOG_WARNING, "Memory accounting" - " init failed"); - return ret; - } - - return ret; -} - -int32_t -init (xlator_t *this) -{ - br_private_t *priv = NULL; - int32_t ret = -1; - - if (!this->children) { - gf_log (this->name, GF_LOG_ERROR, - "FATAL: no children"); - goto out; - } - - priv = GF_CALLOC (1, sizeof (*priv), gf_br_mt_br_private_t); - if (!priv) - goto out; - - this->private = priv; - - ret = 0; - -out: - gf_log (this->name, GF_LOG_DEBUG, "bit-rot xlator loaded"); - return ret; -} - -void -fini (xlator_t *this) -{ - br_private_t *priv = this->private; - - if (!priv) - return; - this->private = NULL; - GF_FREE (priv); - - return; -} - -struct xlator_fops fops; - -struct xlator_cbks cbks; - -struct volume_options options[] = { - { .key = {NULL} }, -}; diff --git a/xlators/features/bit-rot/src/bit-rot.h b/xlators/features/bit-rot/src/bit-rot.h deleted file mode 100644 index b275c0e9535..00000000000 --- a/xlators/features/bit-rot/src/bit-rot.h +++ /dev/null @@ -1,33 +0,0 @@ - /* - Copyright (c) 2014 Red Hat, Inc. <http://www.redhat.com> - This file is part of GlusterFS. - - This file is licensed to you under your choice of the GNU Lesser - General Public License, version 3 or any later version (LGPLv3 or - later), or the GNU General Public License, version 2 (GPLv2), in all - cases as published by the Free Software Foundation. 
-*/ -#ifndef __BIT_ROT_H__ -#define __BIT_ROT_H__ - -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "glusterfs.h" -#include "logging.h" -#include "dict.h" -#include "xlator.h" -#include "defaults.h" -#include "bit-rot-mem-types.h" -#include "syncop.h" - -struct br_private { - xlator_t *xl; - gf_lock_t lock; -}; - -typedef struct br_private br_private_t; - -#endif /* __BIR_ROT_H__ */ diff --git a/xlators/features/bit-rot/src/bitd/Makefile.am b/xlators/features/bit-rot/src/bitd/Makefile.am new file mode 100644 index 00000000000..d94a70dc97f --- /dev/null +++ b/xlators/features/bit-rot/src/bitd/Makefile.am @@ -0,0 +1,20 @@ +xlator_LTLIBRARIES = bit-rot.la +xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features + +bit_rot_la_LDFLAGS = -module -avoid-version + +AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \ + -I$(top_srcdir)/rpc/xdr/src/ \ + -I$(top_srcdir)/rpc/rpc-lib/src \ + -I$(CONTRIBDIR)/timer-wheel \ + -I$(top_srcdir)/xlators/features/bit-rot/src/stub + +bit_rot_la_SOURCES = bit-rot.c +bit_rot_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la \ + $(top_builddir)/xlators/features/changelog/lib/src/libgfchangelog.la + +noinst_HEADERS = bit-rot.h + +AM_CFLAGS = -Wall $(GF_CFLAGS) + +CLEANFILES = diff --git a/xlators/features/bit-rot/src/bitd/bit-rot.c b/xlators/features/bit-rot/src/bitd/bit-rot.c new file mode 100644 index 00000000000..6234dd83864 --- /dev/null +++ b/xlators/features/bit-rot/src/bitd/bit-rot.c @@ -0,0 +1,1351 @@ +/* + Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
+*/ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include <ctype.h> +#include <sys/uio.h> + +#include "glusterfs.h" +#include "xlator.h" +#include "logging.h" +#include "compat-errno.h" + +#include "bit-rot.h" +#include <pthread.h> + +static int +br_find_child_index (xlator_t *this, xlator_t *child) +{ + br_private_t *priv = NULL; + int i = -1; + int index = -1; + + GF_VALIDATE_OR_GOTO ("bit-rot", this, out); + GF_VALIDATE_OR_GOTO (this->name, this->private, out); + GF_VALIDATE_OR_GOTO (this->name, child, out); + + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + if (child == priv->children[i].xl) { + index = i; + break; + } + } + +out: + return index; +} + +static void +br_free_children (xlator_t *this) +{ + br_private_t *priv = NULL; + int32_t i = 0; + br_child_t *child = NULL; + + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + child = &priv->children[i]; + mem_pool_destroy (child->timer_pool); + list_del_init (&priv->children[i].list); + } + + GF_FREE (priv->children); + + priv->children = NULL; +} + +br_child_t * +br_get_child_from_brick_path (xlator_t *this, char *brick_path) +{ + br_private_t *priv = NULL; + br_child_t *child = NULL; + br_child_t *tmp = NULL; + int i = 0; + + GF_VALIDATE_OR_GOTO ("bit-rot", this, out); + GF_VALIDATE_OR_GOTO (this->name, this->private, out); + GF_VALIDATE_OR_GOTO (this->name, brick_path, out); + + priv = this->private; + + pthread_mutex_lock (&priv->lock); + { + for (i = 0; i < priv->child_count; i++) { + tmp = &priv->children[i]; + if (!strcmp (tmp->brick_path, brick_path)) { + child = tmp; + break; + } + } + } + pthread_mutex_unlock (&priv->lock); + +out: + return child; +} + +/** + * probably we'll encapsulate brick inside our own structure when + * needed -- later. + */ +void * +br_brick_init (void *xl, struct gf_brick_spec *brick) +{ + return brick; +} + +/** + * and cleanup things here when allocated br_brick_init(). 
+ */ +void +br_brick_fini (void *xl, char *brick, void *data) +{ + return; +} + +/** + * TODO: Signature can contain null terminators which causes bitrot + * stub to store truncated hash as it depends on string length of + * the hash. + * + * FIX: Send the string length as part of the signature struct and + * change stub to handle this change. + */ +static inline br_isignature_t * +br_prepare_signature (const unsigned char *sign, + unsigned long hashlen, + int8_t hashtype, br_object_t *object) +{ + br_isignature_t *signature = NULL; + + /* TODO: use mem-pool */ + signature = GF_CALLOC (1, signature_size (hashlen + 1), + gf_br_stub_mt_signature_t); + if (!signature) + return NULL; + + signature->signedversion = object->signedversion; + signature->signaturetype = hashtype; + memcpy (signature->signature, (char *)sign, hashlen); + signature->signature[hashlen] = '\0'; /* terminator sits right after the hashlen copied bytes; hashlen+1 is past the hashlen + 1 bytes requested */ + + return signature; +} + +/** + * Do a lookup on the gfid present within the object. + */ +static inline int32_t +br_object_lookup (xlator_t *this, br_object_t *object, + struct iatt *iatt, inode_t **linked_inode) +{ + int ret = -EINVAL; + loc_t loc = {0, }; + inode_t *inode = NULL; + + GF_VALIDATE_OR_GOTO ("bit-rot", this, out); + GF_VALIDATE_OR_GOTO (this->name, object, out); + + inode = inode_find (object->child->table, object->gfid); + + if (inode) + loc.inode = inode; + else + loc.inode = inode_new (object->child->table); + + if (!loc.inode) { + ret = -ENOMEM; + goto out; + } + + uuid_copy (loc.gfid, object->gfid); + + ret = syncop_lookup (object->child->xl, &loc, NULL, iatt, NULL, NULL); + if (ret < 0) + goto out; + + /* + * The file might have been deleted by the application + * after getting the event, but before doing a lookup. + * So use linked_inode after inode_link is done. 
+ */ + *linked_inode = inode_link (loc.inode, NULL, NULL, iatt); + if (*linked_inode) + inode_lookup (*linked_inode); + +out: + loc_wipe (&loc); + return ret; +} + +/** + * open the object with O_RDONLY flags and return the fd. How to let brick + * know that open is being done by bitd because syncop framework does not allow + * passing xdata -- may be use frame->root->pid itself. + */ +static inline int32_t +br_object_open (xlator_t *this, + br_object_t *object, inode_t *inode, fd_t **openfd) +{ + int32_t ret = -1; + fd_t *fd = NULL; + loc_t loc = {0, }; + + GF_VALIDATE_OR_GOTO ("bit-rot", this, out); + GF_VALIDATE_OR_GOTO (this->name, object, out); + GF_VALIDATE_OR_GOTO (this->name, inode, out); + + ret = -EINVAL; + fd = fd_create (inode, 0); + if (!fd) { + gf_log (this->name, GF_LOG_ERROR, "failed to create fd for the " + "inode %s", uuid_utoa (inode->gfid)); + goto out; + } + + loc.inode = inode_ref (inode); + uuid_copy (loc.gfid, inode->gfid); + + ret = syncop_open (object->child->xl, &loc, O_RDONLY, fd); + if (ret) { + fd_unref (fd); + fd = NULL; + } else { + fd_bind (fd); + *openfd = fd; + } + + loc_wipe (&loc); + +out: + return ret; +} + +/** + * read 128k block from the object @object from the offset @offset + * and return the buffer. 
+ */ +static int32_t +br_object_read_block_and_sign (xlator_t *this, fd_t *fd, br_child_t *child, + off_t offset, size_t size, SHA256_CTX *sha256) +{ + int32_t ret = -1; + struct iovec *iovec = NULL; + struct iobref *iobref = NULL; + int count = 0; + int i = 0; + + GF_VALIDATE_OR_GOTO ("bit-rot", this, out); + GF_VALIDATE_OR_GOTO (this->name, fd, out); + GF_VALIDATE_OR_GOTO (this->name, fd->inode, out); + GF_VALIDATE_OR_GOTO (this->name, child, out); + + ret = syncop_readv (child->xl, fd, + size, offset, 0, &iovec, &count, &iobref); + + if (ret < 0) { + gf_log (this->name, GF_LOG_ERROR, "readv on %s failed (%s)", + uuid_utoa (fd->inode->gfid), strerror (errno)); + ret = -1; + goto out; + } + + if (ret == 0) + goto out; + + for (i = 0; i < count; i++) { + SHA256_Update (sha256, + (const unsigned char *) (iovec[i].iov_base), + iovec[i].iov_len); + } + + out: + if (iovec) + GF_FREE (iovec); + + if (iobref) + iobref_unref (iobref); + + return ret; +} + +int32_t +br_object_checksum (unsigned char *md, + br_object_t *object, fd_t *fd, struct iatt *iatt) +{ + int32_t ret = -1; + off_t offset = 0; + size_t block = 128 * 1024; /* 128K block size */ + xlator_t *this = NULL; + + SHA256_CTX sha256; + + GF_VALIDATE_OR_GOTO ("bit-rot", object, out); + GF_VALIDATE_OR_GOTO ("bit-rot", iatt, out); + GF_VALIDATE_OR_GOTO ("bit-rot", fd, out); + + this = object->this; + + SHA256_Init (&sha256); + + while (1) { + ret = br_object_read_block_and_sign (this, fd, object->child, + offset, block, &sha256); + if (ret < 0) { + gf_log (this->name, GF_LOG_ERROR, "reading block with " + "offset %lu of object %s failed", offset, + uuid_utoa (fd->inode->gfid)); + break; + } + + if (ret == 0) + break; + + offset += ret; + } + + if (ret == 0) + SHA256_Final (md, &sha256); + + out: + return ret; +} + +static inline int32_t +br_object_read_sign (inode_t *linked_inode, fd_t *fd, br_object_t *object, + struct iatt *iatt) +{ + int32_t ret = -1; + xlator_t *this = NULL; + dict_t *xattr = NULL; + unsigned 
char *md = NULL; + br_isignature_t *sign = NULL; + + GF_VALIDATE_OR_GOTO ("bit-rot", object, out); + GF_VALIDATE_OR_GOTO ("bit-rot", linked_inode, out); + GF_VALIDATE_OR_GOTO ("bit-rot", fd, out); + + this = object->this; + + md = GF_CALLOC (SHA256_DIGEST_LENGTH, sizeof (*md), gf_common_mt_char); + if (!md) { + gf_log (this->name, GF_LOG_ERROR, "failed to allocate memory " + "for saving hash of the object %s", + uuid_utoa (fd->inode->gfid)); + goto out; + } + + ret = br_object_checksum (md, object, fd, iatt); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "calculating checksum for " + "the object %s failed", uuid_utoa (linked_inode->gfid)); + goto free_signature; + } + + sign = br_prepare_signature (md, SHA256_DIGEST_LENGTH, + BR_SIGNATURE_TYPE_SHA256, object); + if (!sign) { + gf_log (this->name, GF_LOG_ERROR, "failed to get the signature " + "for the object %s", uuid_utoa (fd->inode->gfid)); + ret = -1; /* checksum left ret at 0; do not report success */ + goto free_signature; + } + + xattr = dict_for_key_value + (GLUSTERFS_SET_OBJECT_SIGNATURE, + (void *)sign, signature_size (SHA256_DIGEST_LENGTH)); + + if (!xattr) { + gf_log (this->name, GF_LOG_ERROR, "dict allocation for signing" + " failed for the object %s", + uuid_utoa (fd->inode->gfid)); + ret = -1; /* ret is still 0 here; the object was not signed */ + goto free_isign; + } + + ret = syncop_fsetxattr (object->child->xl, fd, xattr, 0); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "fsetxattr of signature to " + "the object %s failed", uuid_utoa (fd->inode->gfid)); + goto unref_dict; + } + + ret = 0; + + unref_dict: + dict_unref (xattr); + free_isign: + GF_FREE (sign); + free_signature: + GF_FREE (md); + out: + return ret; +} + +/* Non-zero only for ENOENT and ESTALE; callers use this to log such errors at DEBUG and to skip the signing-failure report. */ +static inline int br_object_sign_softerror (int32_t op_errno) +{ + return ((op_errno == ENOENT) || (op_errno == ESTALE)); /* compare, do not assign: '=' made every error look soft */ +} + +void +br_log_object (xlator_t *this, char *op, uuid_t gfid, int32_t op_errno) +{ + int softerror = br_object_sign_softerror (op_errno); + gf_log (this->name, (softerror) ? 
GF_LOG_DEBUG : GF_LOG_ERROR, + "%s() failed on object %s [reason: %s]", + op, uuid_utoa (gfid), strerror (op_errno)); +} + +void +br_log_object_path (xlator_t *this, char *op, + const char *path, int32_t op_errno) +{ + int softerror = br_object_sign_softerror (op_errno); + gf_log (this->name, (softerror) ? GF_LOG_DEBUG : GF_LOG_ERROR, + "%s() failed on object %s [reason: %s]", + op, path, strerror (op_errno)); +} + +/** + * Sign a given object. This routine runs full throttle. There needs to be + * some form of priority scheduling and/or read burstness to avoid starving + * (or kicking) client I/O's. + */ +static inline int32_t br_sign_object (br_object_t *object) +{ + int32_t ret = -1; + inode_t *linked_inode = NULL; + xlator_t *this = NULL; + fd_t *fd = NULL; + struct iatt iatt = {0, }; + pid_t pid = GF_CLIENT_PID_BITD; + + GF_VALIDATE_OR_GOTO ("bit-rot", object, out); + + this = object->this; + + /** + * FIXME: This is required as signing an object is restricted to + * clients with special frame->root->pid. Change the way client + * pid is set. + */ + syncopctx_setfspid (&pid); + + ret = br_object_lookup (this, object, &iatt, &linked_inode); + if (ret) { + br_log_object (this, "lookup", object->gfid, -ret); + goto out; + } + + ret = br_object_open (this, object, linked_inode, &fd); + if (!fd) { + br_log_object (this, "open", object->gfid, -ret); + goto unref_inode; + } + + /** + * we have an open file descriptor on the object. from here on, + * do not be generous to file operation errors. 
+ */ + + /* change this to DEBUG log level later */ + gf_log (this->name, GF_LOG_DEBUG, + "Signing object [%s]", uuid_utoa (linked_inode->gfid)); + + ret = br_object_read_sign (linked_inode, fd, object, &iatt); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "reading and signing of the " + "object %s failed", uuid_utoa (linked_inode->gfid)); + goto unref_fd; + } + + ret = 0; + + unref_fd: + fd_unref (fd); + unref_inode: + inode_unref (linked_inode); + out: + return ret; +} + +static inline br_object_t *__br_pick_object (br_private_t *priv) +{ + br_object_t *object = NULL; + + while (list_empty (&priv->obj_queue->objects)) { + pthread_cond_wait (&priv->object_cond, &priv->lock); + } + + object = list_first_entry + (&priv->obj_queue->objects, br_object_t, list); + list_del_init (&object->list); + + return object; +} + +/** + * This is the place where the signing of the objects is triggered. + */ +void * +br_process_object (void *arg) +{ + xlator_t *this = NULL; + br_object_t *object = NULL; + br_private_t *priv = NULL; + int32_t ret = -1; + + this = arg; + priv = this->private; + + THIS = this; + + for (;;) { + pthread_mutex_lock (&priv->lock); + { + object = __br_pick_object (priv); + } + pthread_mutex_unlock (&priv->lock); + + ret = br_sign_object (object); + if (ret && !br_object_sign_softerror (-ret)) + gf_log (this->name, GF_LOG_ERROR, + "SIGNING FAILURE [%s]", + uuid_utoa (object->gfid)); + GF_FREE (object); + } + + return NULL; +} + +/** + * This function gets kicked in once the object is expired from the + * timer wheel. This actually adds the object received via notification + * from the changelog to the queue from where the objects gets picked + * up for signing. + * + * This routine can be made lightweight by introducing an alternate + * timer-wheel API that dispatches _all_ expired objects in one-shot + * rather than an object at-a-time. This routine can then just simply + * be a call to list_splice_tail(). 
+ * + * NOTE: use call_time to instrument signing time in br_sign_object(). + */ +void +br_add_object_to_queue (struct gf_tw_timer_list *timer, + void *data, unsigned long call_time) +{ + br_object_t *object = NULL; + xlator_t *this = NULL; + br_private_t *priv = NULL; + + object = data; + this = object->this; + priv = this->private; + + pthread_mutex_lock (&priv->lock); + { + list_add_tail (&object->list, &priv->obj_queue->objects); + pthread_cond_broadcast (&priv->object_cond); + } + pthread_mutex_unlock (&priv->lock); + + mem_put (timer); + return; +} + +static inline br_object_t * +br_initialize_object (xlator_t *this, br_child_t *child, changelog_event_t *ev) +{ + br_object_t *object = NULL; + + object = GF_CALLOC (1, sizeof (*object), gf_br_mt_br_object_t); + if (!object) + goto out; + INIT_LIST_HEAD (&object->list); + + object->this = this; + object->child = child; + uuid_copy (object->gfid, ev->u.releasebr.gfid); + + /* NOTE: it's BE, but no worry */ + object->signedversion = ev->u.releasebr.version; + +out: + return object; +} + +static inline struct gf_tw_timer_list * +br_initialize_timer (xlator_t *this, br_object_t *object, br_child_t *child, + changelog_event_t *ev) +{ + br_private_t *priv = NULL; + struct gf_tw_timer_list *timer = NULL; + + priv = this->private; + + timer = mem_get0 (child->timer_pool); + if (!timer) + goto out; + INIT_LIST_HEAD (&timer->entry); + + timer->data = object; + timer->expires = priv->expiry_time; + timer->function = br_add_object_to_queue; + gf_tw_add_timer (priv->timer_wheel, timer); + +out: + return timer; +} + +/** + * This callback function registered with the changelog is executed + * whenever a notification from the changelog is received. This should + * add the object (or the gfid) on which the notification has come to + * the timer-wheel with some expiry time. + * + * TODO: use mem-pool for allocations and maybe allocate timer and + * object as a single alloc and bifurcate their respective pointers. 
+ */ +void +br_brick_callback (void *xl, char *brick, + void *data, changelog_event_t *ev) +{ + uuid_t gfid = {0,}; + xlator_t *this = NULL; + br_object_t *object = NULL; + br_child_t *child = NULL; + int32_t flags = 0; + struct gf_tw_timer_list *timer = NULL; + + this = xl; + + /* validate @this first: the checks below dereference this->name */ + GF_VALIDATE_OR_GOTO ("bit-rot", this, out); + GF_VALIDATE_OR_GOTO (this->name, this->private, out); + GF_VALIDATE_OR_GOTO (this->name, ev, out); + + GF_ASSERT (ev->ev_type == CHANGELOG_OP_TYPE_BR_RELEASE); + GF_ASSERT (!uuid_is_null (ev->u.releasebr.gfid)); + + uuid_copy (gfid, ev->u.releasebr.gfid); + + gf_log (this->name, GF_LOG_DEBUG, + "RELEASE EVENT [GFID %s]", uuid_utoa (gfid)); + + flags = (int32_t)ntohl (ev->u.releasebr.flags); + if (flags == O_RDONLY) { + gf_log (this->name, GF_LOG_DEBUG, + "Read only fd [GFID: %s], ignoring signing..", + uuid_utoa (gfid)); + goto out; + } + + child = br_get_child_from_brick_path (this, brick); + if (!child) { + gf_log (this->name, GF_LOG_ERROR, "failed to get the subvolume " + "for the brick %s", brick); + goto out; + } + + object = br_initialize_object (this, child, ev); + if (!object) { + gf_log (this->name, GF_LOG_ERROR, "failed to allocate " + "object memory [GFID: %s]", uuid_utoa (gfid)); + goto out; + } + + timer = br_initialize_timer (this, object, child, ev); + if (!timer) { + gf_log (this->name, GF_LOG_ERROR, "failed to allocate " + "object expiry timer [GFID: %s]", uuid_utoa (gfid)); + goto free_object; + } + + gf_log (this->name, GF_LOG_DEBUG, "->callback: brick [%s], type [%d]\n", + brick, ev->ev_type); + + return; + + free_object: + GF_FREE (object); +out: + return; +} + +void +br_fill_brick_spec (struct gf_brick_spec *brick, char *path) +{ + brick->brick_path = gf_strdup (path); + brick->filter = CHANGELOG_OP_TYPE_BR_RELEASE; + + brick->init = br_brick_init; + brick->fini = br_brick_fini; + brick->callback = br_brick_callback; + brick->connected = NULL; + brick->disconnected = NULL; +} + +static inline gf_boolean_t +br_time_equal 
(br_child_t *child, struct timeval *tv) +{ + if ((child->tv.tv_sec == tv->tv_sec) && + (child->tv.tv_usec == tv->tv_usec)) + return _gf_true; + + return _gf_false; +} + +static inline gf_boolean_t +br_check_object_need_sign (xlator_t *this, dict_t *xattr, br_child_t *child) +{ + int32_t ret = -1; + gf_boolean_t need_sign = _gf_false; + struct timeval tv = {0,}; + br_isignature_out_t *sign = NULL; + + GF_VALIDATE_OR_GOTO ("bit-rot", this, out); + GF_VALIDATE_OR_GOTO (this->name, xattr, out); + GF_VALIDATE_OR_GOTO (this->name, child, out); + + ret = dict_get_ptr (xattr, GLUSTERFS_GET_OBJECT_SIGNATURE, + (void **)&sign); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "failed to get object signature info"); + goto out; + } + + tv.tv_sec = ntohl (sign->time[0]); + tv.tv_usec = ntohl (sign->time[1]); + + /* Object has been opened and hence dirty. Do not sign it */ + if (sign->stale && !br_time_equal (child, &tv)) + need_sign = _gf_true; + +out: + return need_sign; +} + +static inline void +br_trigger_sign (xlator_t *this, br_child_t *child, inode_t *linked_inode, + loc_t *loc) +{ + fd_t *fd = NULL; + int32_t ret = -1; + + fd = fd_create (linked_inode, 0); + if (!fd) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to create fd [GFID %s]", + uuid_utoa (linked_inode->gfid)); + goto out; + } + + ret = syncop_open (child->xl, loc, O_RDWR, fd); + if (ret) { + br_log_object (this, "open", linked_inode->gfid, -ret); + fd_unref (fd); + fd = NULL; + } else { + fd_bind (fd); + } + + if (fd) + syncop_close (fd); + +out: + return; +} + +int32_t +br_prepare_loc (xlator_t *this, br_child_t *child, loc_t *parent, + gf_dirent_t *entry, loc_t *loc) +{ + int32_t ret = -1; + inode_t *inode = NULL; + + inode = inode_grep (child->table, parent->inode, entry->d_name); + if (!inode) + loc->inode = inode_new (child->table); + else { + loc->inode = inode; + if (loc->inode->ia_type != IA_IFREG) { + gf_log (this->name, GF_LOG_DEBUG, "%s is not a regular " + "file", entry->d_name); + ret = 0; + 
goto out; + } + } + + loc->parent = inode_ref (parent->inode); + uuid_copy (loc->pargfid, parent->inode->gfid); + + ret = inode_path (parent->inode, entry->d_name, (char **)&loc->path); + if (ret < 0 || !loc->path) { + gf_log (this->name, GF_LOG_ERROR, "inode_path on %s " + "(parent: %s) failed", entry->d_name, + uuid_utoa (parent->inode->gfid)); + goto out; + } + + loc->name = strrchr (loc->path, '/'); + if (loc->name) + loc->name++; + + ret = 1; + +out: + return ret; +} + +/** + * Oneshot crawler + * --------------- + * This is a catchup mechanism. Objects that remained unsigned from the + * last run for whatever reason (node crashes, reboots, etc..) become + * candidates for signing. This allows the signature to "catch up" with + * the current state of the object. Triggering signing is easy: perform + * an open() followed by a close() therby resulting in call boomerang. + * (though not back to itself :)) + */ +int +bitd_oneshot_crawl (xlator_t *subvol, + gf_dirent_t *entry, loc_t *parent, void *data) +{ + int op_errno = 0; + br_child_t *child = NULL; + xlator_t *this = NULL; + loc_t loc = {0, }; + struct iatt iatt = {0, }; + struct iatt parent_buf = {0, }; + dict_t *xattr = NULL; + int32_t ret = -1; + inode_t *linked_inode = NULL; + gf_boolean_t need_signing = _gf_false; + + GF_VALIDATE_OR_GOTO ("bit-rot", subvol, out); + GF_VALIDATE_OR_GOTO ("bit-rot", data, out); + + child = data; + this = child->this; + + ret = br_prepare_loc (this, child, parent, entry, &loc); + if (!ret) + goto out; + + ret = syncop_lookup (child->xl, &loc, NULL, &iatt, NULL, &parent_buf); + if (ret) { + br_log_object_path (this, "lookup", loc.path, -ret); + goto out; + } + + linked_inode = inode_link (loc.inode, parent->inode, loc.name, &iatt); + if (linked_inode) + inode_lookup (linked_inode); + + if (iatt.ia_type != IA_IFREG) { + gf_log (this->name, GF_LOG_DEBUG, + "%s is not a regular file, skipping..", entry->d_name); + ret = 0; + goto unref_inode; + } + + /** + * As of now, 2 cases 
are possible and handled. + * 1) GlusterFS is upgraded from a previous version which does not + * have any idea about bit-rot and have data in the filesystem. + * In this case syncop_getxattr fails with ENODATA and the object + * is signed. (In real, when crawler sends lookup, bit-rot-stub + * creates the xattrs before returning lookup reply) + * 2) Bit-rot was not enabled or BitD was dows for some reasons, during + * which some files were created, but since BitD was down, were not + * signed. + * If the file was just created and was being written some data when + * the down BitD came up, then bit-rot stub should be intelligent to + * identify this case (by comparing the ongoing version or by checking + * if there are any fds present for that inode) and handle properly. + */ + + ret = syncop_getxattr (child->xl, &loc, &xattr, + GLUSTERFS_GET_OBJECT_SIGNATURE, NULL); + if (ret < 0) { + op_errno = -ret; + br_log_object (this, "getxattr", linked_inode->gfid, op_errno); + + if (op_errno == ENODATA) + need_signing = _gf_true; + if (op_errno == EINVAL) + gf_log (this->name, GF_LOG_WARNING, "Partial version " + "xattr presence detected, ignoring [GFID: %s]", + uuid_utoa (linked_inode->gfid)); + } else { + need_signing = br_check_object_need_sign (this, xattr, child); + } + + if (!need_signing) + goto unref_dict; + + gf_log (this->name, GF_LOG_INFO, + "Triggering signing for %s [GFID: %s | Brick: %s]", + loc.path, uuid_utoa (linked_inode->gfid), child->brick_path); + br_trigger_sign (this, child, linked_inode, &loc); + + ret = 0; + + unref_dict: + if (xattr) + dict_unref (xattr); + unref_inode: + inode_unref (linked_inode); + out: + loc_wipe (&loc); + + return ret; +} + +#define BR_CRAWL_THROTTLE_COUNT 50 +#define BR_CRAWL_THROTTLE_ZZZ 5 + +void * +br_oneshot_signer (void *arg) +{ + loc_t loc = {0,}; + xlator_t *this = NULL; + br_child_t *child = NULL; + + child = arg; + this = child->this; + + THIS = this; + + gf_log (this->name, GF_LOG_INFO, "Crawling brick [%s], 
scanning " + "for unsigned objects", child->brick_path); + + loc.inode = child->table->root; + (void) syncop_ftw_throttle + (child->xl, &loc, + GF_CLIENT_PID_BITD, child, bitd_oneshot_crawl, + BR_CRAWL_THROTTLE_COUNT, BR_CRAWL_THROTTLE_ZZZ); + + gf_log (this->name, GF_LOG_INFO, + "Completed crawling brick [%s]", child->brick_path); + + return NULL; +} + +/** + * At this point a thread is spawned to crawl the filesystem (in + * tortoise pace) to sign objects that were not signed in previous run(s). + * Such objects are identified by examining it's dirtyness and timestamp. + * + * pick object: + * signature_is_stale() && (object_timestamp() <= stub_init_time()) + * + * Also, we register to the changelog library to subscribe for event + * notifications. + */ +static inline int32_t +br_enact_signer (xlator_t *this, br_child_t *child, br_stub_init_t *stub) +{ + int32_t ret = 0; + struct gf_brick_spec *brick = NULL; + + brick = GF_CALLOC (1, sizeof (struct gf_brick_spec), + gf_common_mt_gf_brick_spec_t); + if (!brick) + goto error_return; + + br_fill_brick_spec (brick, stub->export); + ret = gf_changelog_register_generic + (brick, 1, 1, this->ctx->cmd_args.log_file, -1, this); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Register to changelog failed" + " [Reason: %s]", strerror (errno)); + goto dealloc; + } + + child->threadrunning = 0; + ret = gf_thread_create (&child->thread, NULL, br_oneshot_signer, child); + if (ret) + gf_log (this->name, GF_LOG_WARNING, + "failed to spawn FS crawler thread"); + else + child->threadrunning = 1; + + /* it's OK to continue, "old" objects would be signed when modified */ + return 0; + + dealloc: + GF_FREE (brick); + error_return: + return -1; +} + +/** + * This routine fetches various attributes associated with a child which + * is basically a subvolume. Attributes include brick path and the stub + * birth time. This is done by performing a lookup on the root followed + * by getxattr() on a virtual key. 
+ */ +static inline int32_t +br_brick_connect (xlator_t *this, br_child_t *child) +{ + int32_t ret = -1; + loc_t loc = {0, }; + struct iatt buf = {0, }; + struct iatt parent = {0, }; + br_stub_init_t *stub = NULL; + dict_t *xattr = NULL; + int op_errno = 0; + + GF_VALIDATE_OR_GOTO ("bit-rot", this, out); + GF_VALIDATE_OR_GOTO (this->name, child, out); + GF_VALIDATE_OR_GOTO (this->name, this->private, out); + + loc.inode = inode_ref (child->table->root); + uuid_copy (loc.gfid, loc.inode->gfid); + loc.path = gf_strdup ("/"); + + ret = syncop_lookup (child->xl, &loc, NULL, &buf, NULL, &parent); + if (ret) { + op_errno = -ret; + ret = -1; + gf_log (this->name, GF_LOG_ERROR, "lookup on root failed " + "[Reason: %s]", strerror (op_errno)); + goto wipeloc; + } + + ret = syncop_getxattr (child->xl, &loc, &xattr, + GLUSTERFS_GET_BR_STUB_INIT_TIME, NULL); + if (ret) { + op_errno = -ret; + ret = -1; + gf_log (this->name, GF_LOG_ERROR, "failed to get stub info " + "[Reason: %s]", strerror (op_errno)); + goto wipeloc; + } + + ret = dict_get_ptr (xattr, GLUSTERFS_GET_BR_STUB_INIT_TIME, + (void **)&stub); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "failed to extract stub information"); + goto free_dict; + } + + memcpy (child->brick_path, stub->export, strlen (stub->export) + 1); + child->tv.tv_sec = ntohl (stub->timebuf[0]); + child->tv.tv_usec = ntohl (stub->timebuf[0]); + + ret = br_enact_signer (this, child, stub); + + free_dict: + dict_unref (xattr); + wipeloc: + loc_wipe (&loc); + out: + return ret; +} + +/** + * This function is executed in a separate thread. The thread gets the + * brick from where CHILD_UP has received from the queue and gets the + * information regarding that brick (such as brick path). 
+ */ +void * +br_handle_events (void *arg) +{ + xlator_t *this = NULL; + br_private_t *priv = NULL; + br_child_t *child = NULL; + int32_t ret = -1; + + this = arg; + priv = this->private; + + /* + * Since, this is the topmost xlator, THIS has to be set by bit-rot + * xlator itself (STACK_WIND wont help in this case). Also it has + * to be done for each thread that gets spawned. Otherwise, a new + * thread will get global_xlator's pointer when it does "THIS". + */ + THIS = this; + + while (1) { + pthread_mutex_lock (&priv->lock); + { + while (list_empty (&priv->bricks)) { + pthread_cond_wait (&priv->cond, + &priv->lock); + } + + child = list_entry (priv->bricks.next, br_child_t, + list); + if (child && child->child_up) { + ret = br_brick_connect (this, child); + if (ret == -1) + gf_log (this->name, GF_LOG_ERROR, + "failed to connect to the " + "child (subvolume: %s)", + child->xl->name); + else + list_del_init (&child->list); + } + + } + pthread_mutex_unlock (&priv->lock); + } + + return NULL; +} + +int32_t +mem_acct_init (xlator_t *this) +{ + int32_t ret = -1; + + if (!this) + return ret; + + ret = xlator_mem_acct_init (this, gf_br_stub_mt_end + 1); + + if (ret != 0) { + gf_log (this->name, GF_LOG_WARNING, "Memory accounting" + " init failed"); + return ret; + } + + return ret; +} + +int +notify (xlator_t *this, int32_t event, void *data, ...) +{ + xlator_t *subvol = NULL; + br_private_t *priv = NULL; + int idx = -1; + br_child_t *child = NULL; + + subvol = (xlator_t *)data; + priv = this->private; + + gf_log (this->name, GF_LOG_TRACE, "Notification received: %d", + event); + + switch (event) { + case GF_EVENT_CHILD_UP: + /* should this be done under lock? or is it ok to do it + without lock? 
*/ + idx = br_find_child_index (this, subvol); + + pthread_mutex_lock (&priv->lock); + { + if (idx < 0) { + gf_log (this->name, GF_LOG_ERROR, "got child " + "up from invalid subvolume"); + } else { + child = &priv->children[idx]; + if (child->child_up != 1) + child->child_up = 1; + if (!child->xl) + child->xl = subvol; + if (!child->table) + child->table = inode_table_new (4096, + subvol); + priv->up_children++; + list_add_tail (&child->list, &priv->bricks); + pthread_cond_signal (&priv->cond); + } + } + pthread_mutex_unlock (&priv->lock); + break; + + case GF_EVENT_CHILD_MODIFIED: + idx = br_find_child_index (this, subvol); + if (idx < 0) { + gf_log (this->name, GF_LOG_ERROR, "received child up " + "from invalid subvolume"); + goto out; + } + priv = this->private; + /* ++(priv->generation); */ + break; + case GF_EVENT_CHILD_DOWN: + idx = br_find_child_index (this, subvol); + if (idx < 0) { + gf_log (this->name, GF_LOG_ERROR, "received child down " + "from invalid subvolume"); + goto out; + } + + pthread_mutex_lock (&priv->lock); + { + if (priv->children[idx].child_up == 1) { + priv->children[idx].child_up = 0; + priv->up_children--; + } + } + pthread_mutex_unlock (&priv->lock); + break; + case GF_EVENT_PARENT_UP: + default_notify (this, GF_EVENT_PARENT_UP, data); + break; + } + +out: + return 0; +} + +int32_t +init (xlator_t *this) +{ + int i = 0; + int32_t ret = -1; + br_private_t *priv = NULL; + xlator_list_t *trav = NULL; + + if (!this->children) { + gf_log (this->name, GF_LOG_ERROR, "FATAL: no children"); + goto out; + } + + priv = GF_CALLOC (1, sizeof (*priv), gf_br_mt_br_private_t); + if (!priv) { + gf_log (this->name, GF_LOG_ERROR, + "failed to allocate memory (->priv)"); + goto out; + } + + /* initialize gfchangelog xlator context */ + ret = gf_changelog_init (this); + if (ret) + goto out; + + GF_OPTION_INIT ("expiry-time", priv->expiry_time, int32, out); + + priv->child_count = xlator_subvolume_count (this); + priv->children = GF_CALLOC 
(priv->child_count, sizeof (*priv->children), + gf_br_mt_br_child_t); + if (!priv->children) + goto out; + + trav = this->children; + while (trav) { + priv->children[i].this = this; + priv->children[i].xl = trav->xlator; + + priv->children[i].timer_pool = + mem_pool_new (struct gf_tw_timer_list, 4096); + if (!priv->children[i].timer_pool) { + gf_log (this->name, GF_LOG_ERROR, + "failed to allocate mem-pool for timer"); + errno = ENOMEM; + goto out; + } + + i++; + trav = trav->next; + } + + pthread_mutex_init (&priv->lock, NULL); + pthread_cond_init (&priv->cond, NULL); + + for (i = 0; i < priv->child_count; i++) + INIT_LIST_HEAD (&priv->children[i].list); + INIT_LIST_HEAD (&priv->bricks); + + this->private = priv; + + ret = gf_thread_create (&priv->thread, NULL, br_handle_events, + this); + if (ret != 0) { + gf_log (this->name, GF_LOG_ERROR, + "thread creation failed (%s)", strerror (errno)); + goto out; + } + + priv->timer_wheel = gf_tw_init_timers (); + if (!priv->timer_wheel) { + gf_log (this->name, GF_LOG_ERROR, "failed to initialize the " + "timer wheel"); + goto out; + } + + pthread_cond_init (&priv->object_cond, NULL); + priv->obj_queue = GF_CALLOC (1, sizeof (*priv->obj_queue), + gf_br_mt_br_ob_n_wk_t); + if (!priv->obj_queue) { + gf_log (this->name, GF_LOG_ERROR, "memory allocation failed"); + goto out; + } + + INIT_LIST_HEAD (&priv->obj_queue->objects); + + for (i = 0; i < BR_WORKERS; i++) { + gf_thread_create (&priv->obj_queue->workers[i], NULL, + br_process_object, this); + if (ret != 0) { + gf_log (this->name, GF_LOG_ERROR, + "thread creation failed (%s)", + strerror (errno)); + goto out; + } + } + + ret = 0; + +out: + if (ret) { + if (priv->children) + GF_FREE (priv->children); + if (priv->timer_wheel) + gf_tw_cleanup_timers (priv->timer_wheel); + GF_FREE (priv); + } + + gf_log (this->name, GF_LOG_DEBUG, "bit-rot xlator loaded"); + return ret; +} + +void +fini (xlator_t *this) +{ + br_private_t *priv = this->private; + + if (!priv) + return; + + 
br_free_children (this); + if (priv->timer_wheel) + gf_tw_cleanup_timers (priv->timer_wheel); + this->private = NULL; + GF_FREE (priv); + + return; +} + +struct xlator_fops fops; + +struct xlator_cbks cbks; + +struct volume_options options[] = { + { .key = {"expiry-time"}, + .type = GF_OPTION_TYPE_INT, + .default_value = "120", + .description = "default time duration for which an object waits " + "before it is signed", + }, + { .key = {NULL} }, +}; diff --git a/xlators/features/bit-rot/src/bitd/bit-rot.h b/xlators/features/bit-rot/src/bitd/bit-rot.h new file mode 100644 index 00000000000..ab9fd806232 --- /dev/null +++ b/xlators/features/bit-rot/src/bitd/bit-rot.h @@ -0,0 +1,126 @@ +/* + Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
+*/ +#ifndef __BIT_ROT_H__ +#define __BIT_ROT_H__ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include "logging.h" +#include "dict.h" +#include "xlator.h" +#include "defaults.h" +#include "syncop.h" +#include "syncop-utils.h" +#include "changelog.h" +#include "timer-wheel.h" + +#include "bit-rot-common.h" +#include "bit-rot-stub-mem-types.h" + +#include <openssl/sha.h> + +/* TODO: make this configurable */ +#define BR_WORKERS 8 + +#define signature_size(hl) (sizeof (br_isignature_t) + hl + 1) + +struct br_child { + char child_up; /* Indicates whether this child is + up or not */ + xlator_t *xl; /* client xlator corresponding to + this child */ + inode_table_t *table; /* inode table for this child */ + char brick_path[PATH_MAX]; /* brick export directory of this + child */ + struct list_head list; /* hook to attach to the list of + UP children */ + xlator_t *this; /* Bit rot xlator */ + + pthread_t thread; /* initial crawler for unsigned + object(s) */ + int threadrunning; /* active thread */ + + struct mem_pool *timer_pool; /* timer-wheel's timer mem-pool */ + + struct timeval tv; +}; + +typedef struct br_child br_child_t; + +struct br_obj_n_workers { + struct list_head objects; /* queue of objects expired from the + timer wheel and ready to be picked + up for signing */ + pthread_t workers[BR_WORKERS]; /* Threads which pick up the objects + from the above queue and start + signing each object */ +}; + +typedef struct br_obj_n_workers br_obj_n_workers_t; + +struct br_private { + pthread_mutex_t lock; + + struct list_head bricks; /* list of bricks from which CHILD_UP + has been received */ + + pthread_cond_t cond; /* handling CHILD_UP notifications */ + pthread_cond_t object_cond; /* handling signing of objects */ + int child_count; + br_child_t *children; /* list of subvolumes */ + int up_children; + pthread_t thread; /* thread for connecting each UP + child with changelog */ + struct tvec_base *timer_wheel; /* 
timer wheel where the objects which + changelog has sent sits and waits + for expiry */ + br_obj_n_workers_t *obj_queue; /* place holder for all the objects + that are expired from timer wheel + and ready to be picked up for + signing and the workers which sign + the objects */ + int32_t expiry_time; /* objects "wait" time */ +}; + +typedef struct br_private br_private_t; + +struct br_object { + xlator_t *this; + + uuid_t gfid; + + unsigned long signedversion; /* version aginst which this object will + be signed */ + br_child_t *child; /* object's subvolume */ + + struct list_head list; /* hook to add to the queue once the + object is expired from timer wheel */ + void *data; +}; + +typedef struct br_object br_object_t; + +void +br_log_object (xlator_t *, char *, uuid_t, int32_t); + +void +br_log_object_path (xlator_t *, char *, const char *, int32_t); + +int32_t +br_object_checksum (unsigned char *, br_object_t *, fd_t *, struct iatt *); + +int32_t +br_prepare_loc (xlator_t *, br_child_t *, loc_t *, gf_dirent_t *, loc_t *); + +#endif /* __BIT_ROT_H__ */ diff --git a/xlators/features/bit-rot/src/stub/Makefile.am b/xlators/features/bit-rot/src/stub/Makefile.am index 9abcbb76db2..ec6b1ef4506 100644 --- a/xlators/features/bit-rot/src/stub/Makefile.am +++ b/xlators/features/bit-rot/src/stub/Makefile.am @@ -8,9 +8,7 @@ bitrot_stub_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la noinst_HEADERS = bit-rot-stub.h bit-rot-common.h bit-rot-stub-mem-types.h -AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \ - -I$(top_srcdir)/xlators/features/changelog/lib/src - +AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src AM_CFLAGS = -Wall $(GF_CFLAGS) diff --git a/xlators/features/bit-rot/src/stub/bit-rot-stub-mem-types.h b/xlators/features/bit-rot/src/stub/bit-rot-stub-mem-types.h index 64779923fd6..492278639b4 100644 --- a/xlators/features/bit-rot/src/stub/bit-rot-stub-mem-types.h +++ b/xlators/features/bit-rot/src/stub/bit-rot-stub-mem-types.h 
@@ -15,9 +15,13 @@ enum br_mem_types { gf_br_stub_mt_private_t = gf_common_mt_end + 1, - gf_br_stub_mt_version_t = gf_common_mt_end + 2, - gf_br_stub_mt_inode_ctx_t = gf_common_mt_end + 3, - gf_br_stub_mt_signature_t = gf_common_mt_end + 4, + gf_br_stub_mt_version_t, + gf_br_stub_mt_inode_ctx_t, + gf_br_stub_mt_signature_t, + gf_br_mt_br_private_t, + gf_br_mt_br_child_t, + gf_br_mt_br_object_t, + gf_br_mt_br_ob_n_wk_t, gf_br_stub_mt_end }; diff --git a/xlators/features/changelog/lib/src/Makefile.am b/xlators/features/changelog/lib/src/Makefile.am index 306306bd585..456e211b89d 100644 --- a/xlators/features/changelog/lib/src/Makefile.am +++ b/xlators/features/changelog/lib/src/Makefile.am @@ -32,8 +32,6 @@ noinst_HEADERS = gf-changelog-helpers.h gf-changelog-rpc.h gf-changelog-journal. $(CONTRIBDIR)/uuid/uuidd.h $(CONTRIBDIR)/uuid/uuid.h \ $(CONTRIBDIR)/uuid/uuidP.h $(CONTRIB_BUILDDIR)/uuid/uuid_types.h -libgfchangelog_HEADERS = changelog.h - CLEANFILES = CONFIG_CLEAN_FILES = $(CONTRIB_BUILDDIR)/uuid/uuid_types.h |