diff options
Diffstat (limited to 'xlators/features/bit-rot/src')
19 files changed, 10347 insertions, 0 deletions
diff --git a/xlators/features/bit-rot/src/Makefile.am b/xlators/features/bit-rot/src/Makefile.am new file mode 100644 index 00000000000..b5e4a7d62a0 --- /dev/null +++ b/xlators/features/bit-rot/src/Makefile.am @@ -0,0 +1 @@ +SUBDIRS = stub bitd diff --git a/xlators/features/bit-rot/src/bitd/Makefile.am b/xlators/features/bit-rot/src/bitd/Makefile.am new file mode 100644 index 00000000000..6db800e6565 --- /dev/null +++ b/xlators/features/bit-rot/src/bitd/Makefile.am @@ -0,0 +1,23 @@ +if WITH_SERVER +xlator_LTLIBRARIES = bit-rot.la +endif +xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features + +bit_rot_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS) + +AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \ + -I$(top_srcdir)/rpc/xdr/src/ -I$(top_builddir)/rpc/xdr/src/ \ + -I$(top_srcdir)/rpc/rpc-lib/src -I$(CONTRIBDIR)/timer-wheel \ + -I$(top_srcdir)/xlators/features/bit-rot/src/stub + +bit_rot_la_SOURCES = bit-rot.c bit-rot-scrub.c bit-rot-ssm.c \ + bit-rot-scrub-status.c +bit_rot_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la \ + $(top_builddir)/xlators/features/changelog/lib/src/libgfchangelog.la + +noinst_HEADERS = bit-rot.h bit-rot-scrub.h bit-rot-bitd-messages.h bit-rot-ssm.h \ + bit-rot-scrub-status.h + +AM_CFLAGS = -Wall -DBR_RATE_LIMIT_SIGNER $(GF_CFLAGS) + +CLEANFILES = diff --git a/xlators/features/bit-rot/src/bitd/bit-rot-bitd-messages.h b/xlators/features/bit-rot/src/bitd/bit-rot-bitd-messages.h new file mode 100644 index 00000000000..5bc5103a27c --- /dev/null +++ b/xlators/features/bit-rot/src/bitd/bit-rot-bitd-messages.h @@ -0,0 +1,101 @@ +/* + Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. + */ + +#ifndef _BITROT_BITD_MESSAGES_H_ +#define _BITROT_BITD_MESSAGES_H_ + +#include <glusterfs/glfs-message-id.h> + +/* To add new message IDs, append new identifiers at the end of the list. + * + * Never remove a message ID. If it's not used anymore, you can rename it or + * leave it as it is, but not delete it. This is to prevent reutilization of + * IDs by other messages. + * + * The component name must match one of the entries defined in + * glfs-message-id.h. + */ + +GLFS_MSGID(BITROT_BITD, BRB_MSG_FD_CREATE_FAILED, BRB_MSG_READV_FAILED, + BRB_MSG_BLOCK_READ_FAILED, BRB_MSG_CALC_CHECKSUM_FAILED, + BRB_MSG_NO_MEMORY, BRB_MSG_GET_SIGN_FAILED, BRB_MSG_SET_SIGN_FAILED, + BRB_MSG_OP_FAILED, BRB_MSG_READ_AND_SIGN_FAILED, BRB_MSG_SIGN_FAILED, + BRB_MSG_GET_SUBVOL_FAILED, BRB_MSG_SET_TIMER_FAILED, + BRB_MSG_GET_INFO_FAILED, BRB_MSG_PATH_FAILED, BRB_MSG_MARK_BAD_FILE, + BRB_MSG_TRIGGER_SIGN, BRB_MSG_REGISTER_FAILED, + BRB_MSG_CRAWLING_START, BRB_MSG_SPAWN_FAILED, + BRB_MSG_INVALID_SUBVOL_CHILD, BRB_MSG_SKIP_OBJECT, BRB_MSG_NO_CHILD, + BRB_MSG_CHECKSUM_MISMATCH, BRB_MSG_MARK_CORRUPTED, + BRB_MSG_CRAWLING_FINISH, BRB_MSG_CALC_ERROR, BRB_MSG_LOOKUP_FAILED, + BRB_MSG_PARTIAL_VERSION_PRESENCE, BRB_MSG_MEM_ACNT_FAILED, + BRB_MSG_TIMER_WHEEL_UNAVAILABLE, BRB_MSG_BITROT_LOADED, + BRB_MSG_SCALE_DOWN_FAILED, BRB_MSG_SCALE_UP_FAILED, + BRB_MSG_SCALE_DOWN_SCRUBBER, BRB_MSG_SCALING_UP_SCRUBBER, + BRB_MSG_UNKNOWN_THROTTLE, BRB_MSG_RATE_LIMIT_INFO, + BRB_MSG_SCRUB_INFO, BRB_MSG_CONNECTED_TO_BRICK, BRB_MSG_BRICK_INFO, + BRB_MSG_SUBVOL_CONNECT_FAILED, BRB_MSG_INVALID_SUBVOL, + BRB_MSG_RESCHEDULE_SCRUBBER_FAILED, BRB_MSG_SCRUB_START, + BRB_MSG_SCRUB_FINISH, BRB_MSG_SCRUB_RUNNING, + BRB_MSG_SCRUB_RESCHEDULED, BRB_MSG_SCRUB_TUNABLE, + BRB_MSG_SCRUB_THREAD_CLEANUP, BRB_MSG_SCRUBBER_CLEANED, + BRB_MSG_GENERIC_SSM_INFO, BRB_MSG_ZERO_TIMEOUT_BUG, + BRB_MSG_BAD_OBJ_READDIR_FAIL, BRB_MSG_SSM_FAILED, + BRB_MSG_SCRUB_WAIT_FAILED, BRB_MSG_TRIGGER_SIGN_FAILED, + BRB_MSG_EVENT_UNHANDLED, BRB_MSG_COULD_NOT_SCHEDULE_SCRUB, + BRB_MSG_THREAD_CREATION_FAILED, BRB_MSG_MEM_POOL_ALLOC, + BRB_MSG_SAVING_HASH_FAILED); + +#define BRB_MSG_FD_CREATE_FAILED_STR "failed to create fd for the inode" +#define BRB_MSG_READV_FAILED_STR "readv failed" +#define BRB_MSG_BLOCK_READ_FAILED_STR "reading block failed" +#define BRB_MSG_NO_MEMORY_STR "failed to allocate memory" +#define BRB_MSG_CALC_CHECKSUM_FAILED_STR "calculating checksum failed" +#define BRB_MSG_GET_SIGN_FAILED_STR "failed to get the signature" +#define BRB_MSG_SET_SIGN_FAILED_STR "signing failed" +#define BRB_MSG_OP_FAILED_STR "failed on object" +#define BRB_MSG_TRIGGER_SIGN_FAILED_STR "Could not trigger signing" +#define BRB_MSG_READ_AND_SIGN_FAILED_STR "reading and signing of object failed" +#define BRB_MSG_SET_TIMER_FAILED_STR "Failed to allocate object expiry timer" +#define BRB_MSG_GET_SUBVOL_FAILED_STR \ + "failed to get the subvolume for the brick" +#define BRB_MSG_PATH_FAILED_STR "path failed" +#define BRB_MSG_SKIP_OBJECT_STR "Entry is marked corrupted. skipping" +#define BRB_MSG_PARTIAL_VERSION_PRESENCE_STR \ + "PArtial version xattr presence detected, ignoring" +#define BRB_MSG_TRIGGER_SIGN_STR "Triggering signing" +#define BRB_MSG_CRAWLING_START_STR \ + "Crawling brick, scanning for unsigned objects" +#define BRB_MSG_CRAWLING_FINISH_STR "Completed crawling brick" +#define BRB_MSG_REGISTER_FAILED_STR "Register to changelog failed" +#define BRB_MSG_SPAWN_FAILED_STR "failed to spawn" +#define BRB_MSG_CONNECTED_TO_BRICK_STR "Connected to brick" +#define BRB_MSG_LOOKUP_FAILED_STR "lookup on root failed" +#define BRB_MSG_GET_INFO_FAILED_STR "failed to get stub info" +#define BRB_MSG_SCRUB_THREAD_CLEANUP_STR "Error cleaning up scanner thread" +#define BRB_MSG_SCRUBBER_CLEANED_STR "clened up scrubber for brick" +#define BRB_MSG_SUBVOL_CONNECT_FAILED_STR \ + "callback handler for subvolume failed" +#define BRB_MSG_MEM_ACNT_FAILED_STR "Memory accounting init failed" +#define BRB_MSG_EVENT_UNHANDLED_STR "Event unhandled for child" +#define BRB_MSG_INVALID_SUBVOL_STR "Got event from invalid subvolume" +#define BRB_MSG_RESCHEDULE_SCRUBBER_FAILED_STR \ + "on demand scrub schedule failed. Scrubber is not in pending state." +#define BRB_MSG_COULD_NOT_SCHEDULE_SCRUB_STR \ + "Could not schedule ondemand scrubbing. Scrubbing will continue " \ + "according to old frequency." +#define BRB_MSG_THREAD_CREATION_FAILED_STR "thread creation failed" +#define BRB_MSG_RATE_LIMIT_INFO_STR "Rate Limit Info" +#define BRB_MSG_MEM_POOL_ALLOC_STR "failed to allocate mem-pool for timer" +#define BRB_MSG_NO_CHILD_STR "FATAL: no children" +#define BRB_MSG_TIMER_WHEEL_UNAVAILABLE_STR "global timer wheel unavailable" +#define BRB_MSG_BITROT_LOADED_STR "bit-rot xlator loaded" +#define BRB_MSG_SAVING_HASH_FAILED_STR \ + "failed to allocate memory for saving hash of the object" +#endif /* !_BITROT_BITD_MESSAGES_H_ */ diff --git a/xlators/features/bit-rot/src/bitd/bit-rot-scrub-status.c b/xlators/features/bit-rot/src/bitd/bit-rot-scrub-status.c new file mode 100644 index 00000000000..5cef2ffa5e5 --- /dev/null +++ b/xlators/features/bit-rot/src/bitd/bit-rot-scrub-status.c @@ -0,0 +1,78 @@ +/* + Copyright (c) 2016 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#include <string.h> +#include <stdio.h> + +#include "bit-rot-scrub-status.h" + +void +br_inc_unsigned_file_count(br_scrub_stats_t *scrub_stat) +{ + if (!scrub_stat) + return; + + pthread_mutex_lock(&scrub_stat->lock); + { + scrub_stat->unsigned_files++; + } + pthread_mutex_unlock(&scrub_stat->lock); +} + +void +br_inc_scrubbed_file(br_scrub_stats_t *scrub_stat) +{ + if (!scrub_stat) + return; + + pthread_mutex_lock(&scrub_stat->lock); + { + scrub_stat->scrubbed_files++; + } + pthread_mutex_unlock(&scrub_stat->lock); +} + +void +br_update_scrub_start_time(br_scrub_stats_t *scrub_stat, time_t time) +{ + if (!scrub_stat) + return; + + pthread_mutex_lock(&scrub_stat->lock); + { + scrub_stat->scrub_start_time = time; + } + pthread_mutex_unlock(&scrub_stat->lock); +} + +void +br_update_scrub_finish_time(br_scrub_stats_t *scrub_stat, char *timestr, + time_t time) +{ + int lst_size = 0; + + if (!scrub_stat) + return; + + lst_size = sizeof(scrub_stat->last_scrub_time); + if (strlen(timestr) >= lst_size) + return; + + pthread_mutex_lock(&scrub_stat->lock); + { + scrub_stat->scrub_end_time = time; + + scrub_stat->scrub_duration = scrub_stat->scrub_end_time - + scrub_stat->scrub_start_time; + + snprintf(scrub_stat->last_scrub_time, lst_size, "%s", timestr); + } + pthread_mutex_unlock(&scrub_stat->lock); +} diff --git a/xlators/features/bit-rot/src/bitd/bit-rot-scrub-status.h b/xlators/features/bit-rot/src/bitd/bit-rot-scrub-status.h new file mode 100644 index 00000000000..f022aa831eb --- /dev/null +++ b/xlators/features/bit-rot/src/bitd/bit-rot-scrub-status.h @@ -0,0 +1,50 @@ +/* + Copyright (c) 2016 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef __BIT_ROT_SCRUB_STATUS_H__ +#define __BIT_ROT_SCRUB_STATUS_H__ + +#include <stdint.h> +#include <sys/time.h> +#include <pthread.h> + +#include <glusterfs/common-utils.h> + +struct br_scrub_stats { + uint64_t scrubbed_files; /* Total number of scrubbed files. */ + + uint64_t unsigned_files; /* Total number of unsigned files. */ + + uint64_t scrub_duration; /* Duration of last scrub. */ + + char last_scrub_time[GF_TIMESTR_SIZE]; /* Last scrub completion time. */ + + time_t scrub_start_time; /* Scrubbing starting time. */ + + time_t scrub_end_time; /* Scrubbing finishing time. */ + + int8_t scrub_running; /* Whether scrub running or not. */ + + pthread_mutex_t lock; +}; + +typedef struct br_scrub_stats br_scrub_stats_t; + +void +br_inc_unsigned_file_count(br_scrub_stats_t *scrub_stat); +void +br_inc_scrubbed_file(br_scrub_stats_t *scrub_stat); +void +br_update_scrub_start_time(br_scrub_stats_t *scrub_stat, time_t time); +void +br_update_scrub_finish_time(br_scrub_stats_t *scrub_stat, char *timestr, + time_t time); + +#endif /* __BIT_ROT_SCRUB_STATUS_H__ */ diff --git a/xlators/features/bit-rot/src/bitd/bit-rot-scrub.c b/xlators/features/bit-rot/src/bitd/bit-rot-scrub.c new file mode 100644 index 00000000000..289dd53f610 --- /dev/null +++ b/xlators/features/bit-rot/src/bitd/bit-rot-scrub.c @@ -0,0 +1,2070 @@ +/* + Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#include <math.h> +#include <ctype.h> +#include <sys/uio.h> + +#include <glusterfs/glusterfs.h> +#include <glusterfs/logging.h> +#include <glusterfs/common-utils.h> + +#include "bit-rot-scrub.h" +#include <pthread.h> +#include "bit-rot-bitd-messages.h" +#include "bit-rot-scrub-status.h" +#include <glusterfs/events.h> + +struct br_scrubbers { + pthread_t scrubthread; + + struct list_head list; +}; + +struct br_fsscan_entry { + void *data; + + loc_t parent; + + gf_dirent_t *entry; + + struct br_scanfs *fsscan; /* backpointer to subvolume scanner */ + + struct list_head list; +}; + +/** + * fetch signature extended attribute from an object's fd. + * NOTE: On success @xattr is not unref'd as @sign points + * to the dictionary value. + */ +static int32_t +bitd_fetch_signature(xlator_t *this, br_child_t *child, fd_t *fd, + dict_t **xattr, br_isignature_out_t **sign) +{ + int32_t ret = -1; + + ret = syncop_fgetxattr(child->xl, fd, xattr, GLUSTERFS_GET_OBJECT_SIGNATURE, + NULL, NULL); + if (ret < 0) { + br_log_object(this, "fgetxattr", fd->inode->gfid, -ret); + goto out; + } + + ret = dict_get_ptr(*xattr, GLUSTERFS_GET_OBJECT_SIGNATURE, (void **)sign); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, BRB_MSG_GET_SIGN_FAILED, + "failed to extract signature info [GFID: %s]", + uuid_utoa(fd->inode->gfid)); + goto unref_dict; + } + + return 0; + +unref_dict: + dict_unref(*xattr); +out: + return -1; +} + +/** + * POST COMPUTE CHECK + * + * Checks to be performed before verifying calculated signature + * Object is skipped if: + * - has stale signature + * - mismatches versions caches in pre-compute check + */ + +int32_t +bitd_scrub_post_compute_check(xlator_t *this, br_child_t *child, fd_t *fd, + unsigned long version, + br_isignature_out_t **signature, + br_scrub_stats_t *scrub_stat, + gf_boolean_t skip_stat) +{ + int32_t ret = 0; + size_t signlen = 0; + dict_t *xattr = NULL; + br_isignature_out_t *signptr = NULL; + + ret = bitd_fetch_signature(this, child, fd, &xattr, &signptr); + if (ret < 0) { + if (!skip_stat) + br_inc_unsigned_file_count(scrub_stat); + goto out; + } + + /** + * Either the object got dirtied during the time the signature was + * calculated OR the version we saved during pre-compute check does + * not match now, implying that the object got dirtied and signed in + * between scrubs pre & post compute checks (checksum window). + * + * The log entry looks pretty ugly, but helps in debugging.. + */ + if (signptr->stale || (signptr->version != version)) { + if (!skip_stat) + br_inc_unsigned_file_count(scrub_stat); + gf_msg_debug(this->name, 0, + "<STAGE: POST> Object [GFID: %s] " + "either has a stale signature OR underwent " + "signing during checksumming {Stale: %d | " + "Version: %lu,%lu}", + uuid_utoa(fd->inode->gfid), (signptr->stale) ? 1 : 0, + version, signptr->version); + ret = -1; + goto unref_dict; + } + + signlen = signptr->signaturelen; + *signature = GF_MALLOC(sizeof(br_isignature_out_t) + signlen, + gf_common_mt_char); + + (void)memcpy(*signature, signptr, sizeof(br_isignature_out_t) + signlen); + + (*signature)->signaturelen = signlen; + +unref_dict: + dict_unref(xattr); +out: + return ret; +} + +static int32_t +bitd_signature_staleness(xlator_t *this, br_child_t *child, fd_t *fd, + int *stale, unsigned long *version, + br_scrub_stats_t *scrub_stat, gf_boolean_t skip_stat) +{ + int32_t ret = -1; + dict_t *xattr = NULL; + br_isignature_out_t *signptr = NULL; + + ret = bitd_fetch_signature(this, child, fd, &xattr, &signptr); + if (ret < 0) { + if (!skip_stat) + br_inc_unsigned_file_count(scrub_stat); + goto out; + } + + /** + * save version for validation in post compute stage + * c.f. bitd_scrub_post_compute_check() + */ + *stale = signptr->stale ? 1 : 0; + *version = signptr->version; + + dict_unref(xattr); + +out: + return ret; +} + +/** + * PRE COMPUTE CHECK + * + * Checks to be performed before initiating object signature calculation. + * An object is skipped if: + * - it's already marked corrupted + * - has stale signature + */ +int32_t +bitd_scrub_pre_compute_check(xlator_t *this, br_child_t *child, fd_t *fd, + unsigned long *version, + br_scrub_stats_t *scrub_stat, + gf_boolean_t skip_stat) +{ + int stale = 0; + int32_t ret = -1; + + if (bitd_is_bad_file(this, child, NULL, fd)) { + gf_msg(this->name, GF_LOG_WARNING, 0, BRB_MSG_SKIP_OBJECT, + "Object [GFID: %s] is marked corrupted, skipping..", + uuid_utoa(fd->inode->gfid)); + goto out; + } + + ret = bitd_signature_staleness(this, child, fd, &stale, version, scrub_stat, + skip_stat); + if (!ret && stale) { + if (!skip_stat) + br_inc_unsigned_file_count(scrub_stat); + gf_msg_debug(this->name, 0, + "<STAGE: PRE> Object [GFID: %s] " + "has stale signature", + uuid_utoa(fd->inode->gfid)); + ret = -1; + } + +out: + return ret; +} + +/* static int */ +int +bitd_compare_ckum(xlator_t *this, br_isignature_out_t *sign, unsigned char *md, + inode_t *linked_inode, gf_dirent_t *entry, fd_t *fd, + br_child_t *child, loc_t *loc) +{ + int ret = -1; + dict_t *xattr = NULL; + + GF_VALIDATE_OR_GOTO("bit-rot", this, out); + GF_VALIDATE_OR_GOTO(this->name, sign, out); + GF_VALIDATE_OR_GOTO(this->name, fd, out); + GF_VALIDATE_OR_GOTO(this->name, child, out); + GF_VALIDATE_OR_GOTO(this->name, linked_inode, out); + GF_VALIDATE_OR_GOTO(this->name, md, out); + GF_VALIDATE_OR_GOTO(this->name, entry, out); + + if (strncmp(sign->signature, (char *)md, sign->signaturelen) == 0) { + gf_msg_debug(this->name, 0, + "%s [GFID: %s | Brick: %s] " + "matches calculated checksum", + loc->path, uuid_utoa(linked_inode->gfid), + child->brick_path); + return 0; + } + + gf_msg(this->name, GF_LOG_DEBUG, 0, BRB_MSG_CHECKSUM_MISMATCH, + "Object checksum mismatch: %s [GFID: %s | Brick: %s]", loc->path, + uuid_utoa(linked_inode->gfid), child->brick_path); + gf_msg(this->name, GF_LOG_ALERT, 0, BRB_MSG_CHECKSUM_MISMATCH, + "CORRUPTION DETECTED: Object %s {Brick: %s | GFID: %s}", loc->path, + child->brick_path, uuid_utoa(linked_inode->gfid)); + + /* Perform bad-file marking */ + xattr = dict_new(); + if (!xattr) { + ret = -1; + goto out; + } + + ret = dict_set_int32(xattr, BITROT_OBJECT_BAD_KEY, _gf_true); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, BRB_MSG_MARK_BAD_FILE, + "Error setting bad-file marker for %s [GFID: %s | " + "Brick: %s]", + loc->path, uuid_utoa(linked_inode->gfid), child->brick_path); + goto dictfree; + } + + gf_msg(this->name, GF_LOG_ALERT, 0, BRB_MSG_MARK_CORRUPTED, + "Marking" + " %s [GFID: %s | Brick: %s] as corrupted..", + loc->path, uuid_utoa(linked_inode->gfid), child->brick_path); + gf_event(EVENT_BITROT_BAD_FILE, "gfid=%s;path=%s;brick=%s", + uuid_utoa(linked_inode->gfid), loc->path, child->brick_path); + ret = syncop_fsetxattr(child->xl, fd, xattr, 0, NULL, NULL); + if (ret) + gf_msg(this->name, GF_LOG_ERROR, 0, BRB_MSG_MARK_BAD_FILE, + "Error marking object %s [GFID: %s] as corrupted", loc->path, + uuid_utoa(linked_inode->gfid)); + +dictfree: + dict_unref(xattr); +out: + return ret; +} + +/** + * "The Scrubber" + * + * Perform signature validation for a given object with the assumption + * that the signature is SHA256 (because signer as of now _always_ + * signs with SHA256). + */ +int +br_scrubber_scrub_begin(xlator_t *this, struct br_fsscan_entry *fsentry) +{ + int32_t ret = -1; + fd_t *fd = NULL; + loc_t loc = { + 0, + }; + struct iatt iatt = { + 0, + }; + struct iatt parent_buf = { + 0, + }; + pid_t pid = 0; + br_child_t *child = NULL; + unsigned char *md = NULL; + inode_t *linked_inode = NULL; + br_isignature_out_t *sign = NULL; + unsigned long signedversion = 0; + gf_dirent_t *entry = NULL; + br_private_t *priv = NULL; + loc_t *parent = NULL; + gf_boolean_t skip_stat = _gf_false; + uuid_t shard_root_gfid = { + 0, + }; + + GF_VALIDATE_OR_GOTO("bit-rot", fsentry, out); + + entry = fsentry->entry; + parent = &fsentry->parent; + child = fsentry->data; + + priv = this->private; + + GF_VALIDATE_OR_GOTO("bit-rot", entry, out); + GF_VALIDATE_OR_GOTO("bit-rot", parent, out); + GF_VALIDATE_OR_GOTO("bit-rot", child, out); + GF_VALIDATE_OR_GOTO("bit-rot", priv, out); + + pid = GF_CLIENT_PID_SCRUB; + + ret = br_prepare_loc(this, child, parent, entry, &loc); + if (!ret) + goto out; + + syncopctx_setfspid(&pid); + + ret = syncop_lookup(child->xl, &loc, &iatt, &parent_buf, NULL, NULL); + if (ret) { + br_log_object_path(this, "lookup", loc.path, -ret); + goto out; + } + + linked_inode = inode_link(loc.inode, parent->inode, loc.name, &iatt); + if (linked_inode) + inode_lookup(linked_inode); + + gf_msg_debug(this->name, 0, "Scrubbing object %s [GFID: %s]", entry->d_name, + uuid_utoa(linked_inode->gfid)); + + if (iatt.ia_type != IA_IFREG) { + gf_msg_debug(this->name, 0, "%s is not a regular file", entry->d_name); + ret = 0; + goto unref_inode; + } + + if (IS_DHT_LINKFILE_MODE((&iatt))) { + gf_msg_debug(this->name, 0, "%s is a dht sticky bit file", + entry->d_name); + ret = 0; + goto unref_inode; + } + + /* skip updating scrub statistics for shard entries */ + gf_uuid_parse(SHARD_ROOT_GFID, shard_root_gfid); + if (gf_uuid_compare(loc.pargfid, shard_root_gfid) == 0) + skip_stat = _gf_true; + + /** + * open() an fd for subsequent operations + */ + fd = fd_create(linked_inode, 0); + if (!fd) { + gf_msg(this->name, GF_LOG_ERROR, 0, BRB_MSG_FD_CREATE_FAILED, + "failed to create fd for inode %s", + uuid_utoa(linked_inode->gfid)); + goto unref_inode; + } + + ret = syncop_open(child->xl, &loc, O_RDWR, fd, NULL, NULL); + if (ret) { + br_log_object(this, "open", linked_inode->gfid, -ret); + ret = -1; + goto unrefd; + } + + fd_bind(fd); + + /** + * perform pre compute checks before initiating checksum + * computation + * - presence of bad object + * - signature staleness + */ + ret = bitd_scrub_pre_compute_check(this, child, fd, &signedversion, + &priv->scrub_stat, skip_stat); + if (ret) + goto unrefd; /* skip this object */ + + /* if all's good, proceed to calculate the hash */ + md = GF_MALLOC(SHA256_DIGEST_LENGTH, gf_common_mt_char); + if (!md) + goto unrefd; + + ret = br_calculate_obj_checksum(md, child, fd, &iatt); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, BRB_MSG_CALC_ERROR, + "error calculating hash for object [GFID: %s]", + uuid_utoa(fd->inode->gfid)); + ret = -1; + goto free_md; + } + + /** + * perform post compute checks as an object's signature may have + * become stale while scrubber calculated checksum. + */ + ret = bitd_scrub_post_compute_check(this, child, fd, signedversion, &sign, + &priv->scrub_stat, skip_stat); + if (ret) + goto free_md; + + ret = bitd_compare_ckum(this, sign, md, linked_inode, entry, fd, child, + &loc); + + if (!skip_stat) + br_inc_scrubbed_file(&priv->scrub_stat); + + GF_FREE(sign); /* allocated on post-compute */ + + /** fd_unref() takes care of closing fd.. like syncop_close() */ + +free_md: + GF_FREE(md); +unrefd: + fd_unref(fd); +unref_inode: + inode_unref(linked_inode); +out: + loc_wipe(&loc); + return ret; +} + +static void +_br_lock_cleaner(void *arg) +{ + pthread_mutex_t *mutex = arg; + + pthread_mutex_unlock(mutex); +} + +static void +wait_for_scrubbing(xlator_t *this, struct br_scanfs *fsscan) +{ + br_private_t *priv = NULL; + struct br_scrubber *fsscrub = NULL; + + priv = this->private; + fsscrub = &priv->fsscrub; + + pthread_cleanup_push(_br_lock_cleaner, &fsscan->waitlock); + pthread_mutex_lock(&fsscan->waitlock); + { + pthread_cleanup_push(_br_lock_cleaner, &fsscrub->mutex); + pthread_mutex_lock(&fsscrub->mutex); + { + list_replace_init(&fsscan->queued, &fsscan->ready); + + /* wake up scrubbers */ + pthread_cond_broadcast(&fsscrub->cond); + } + pthread_mutex_unlock(&fsscrub->mutex); + pthread_cleanup_pop(0); + + while (fsscan->entries != 0) + pthread_cond_wait(&fsscan->waitcond, &fsscan->waitlock); + } + pthread_mutex_unlock(&fsscan->waitlock); + pthread_cleanup_pop(0); +} + +static void +_br_fsscan_inc_entry_count(struct br_scanfs *fsscan) +{ + fsscan->entries++; +} + +static void +_br_fsscan_dec_entry_count(struct br_scanfs *fsscan) +{ + if (--fsscan->entries == 0) { + pthread_mutex_lock(&fsscan->waitlock); + { + pthread_cond_signal(&fsscan->waitcond); + } + pthread_mutex_unlock(&fsscan->waitlock); + } +} + +static void +_br_fsscan_collect_entry(struct br_scanfs *fsscan, + struct br_fsscan_entry *fsentry) +{ + list_add_tail(&fsentry->list, &fsscan->queued); + _br_fsscan_inc_entry_count(fsscan); +} + +#define NR_ENTRIES (1 << 7) /* ..bulk scrubbing */ + +int +br_fsscanner_handle_entry(xlator_t *subvol, gf_dirent_t *entry, loc_t *parent, + void *data) +{ + int32_t ret = -1; + int scrub = 0; + br_child_t *child = NULL; + xlator_t *this = NULL; + struct br_scanfs *fsscan = NULL; + struct br_fsscan_entry *fsentry = NULL; + + GF_VALIDATE_OR_GOTO("bit-rot", subvol, error_return); + GF_VALIDATE_OR_GOTO("bit-rot", data, error_return); + + child = data; + this = child->this; + fsscan = &child->fsscan; + + _mask_cancellation(); + + fsentry = GF_CALLOC(1, sizeof(*fsentry), gf_br_mt_br_fsscan_entry_t); + if (!fsentry) + goto error_return; + + { + fsentry->data = data; + fsentry->fsscan = &child->fsscan; + + /* copy parent loc */ + ret = loc_copy(&fsentry->parent, parent); + if (ret) + goto dealloc; + + /* copy child entry */ + fsentry->entry = entry_copy(entry); + if (!fsentry->entry) + goto locwipe; + + INIT_LIST_HEAD(&fsentry->list); + } + + LOCK(&fsscan->entrylock); + { + _br_fsscan_collect_entry(fsscan, fsentry); + + /** + * need not be a equality check as entries may be pushed + * back onto the scanned queue when thread(s) are cleaned. + */ + if (fsscan->entries >= NR_ENTRIES) + scrub = 1; + } + UNLOCK(&fsscan->entrylock); + + _unmask_cancellation(); + + if (scrub) + wait_for_scrubbing(this, fsscan); + + return 0; + +locwipe: + loc_wipe(&fsentry->parent); +dealloc: + GF_FREE(fsentry); +error_return: + return -1; +} + +int32_t +br_fsscan_deactivate(xlator_t *this) +{ + int ret = 0; + br_private_t *priv = NULL; + br_scrub_state_t nstate = 0; + struct br_monitor *scrub_monitor = NULL; + + priv = this->private; + scrub_monitor = &priv->scrub_monitor; + + ret = gf_tw_del_timer(priv->timer_wheel, scrub_monitor->timer); + if (ret == 0) { + nstate = BR_SCRUB_STATE_STALLED; + gf_msg(this->name, GF_LOG_INFO, 0, BRB_MSG_SCRUB_INFO, + "Volume is under active scrubbing. Pausing scrub.."); + } else { + nstate = BR_SCRUB_STATE_PAUSED; + gf_msg(this->name, GF_LOG_INFO, 0, BRB_MSG_SCRUB_INFO, + "Scrubber paused"); + } + + _br_monitor_set_scrub_state(scrub_monitor, nstate); + + return 0; +} + +static void +br_scrubber_log_time(xlator_t *this, const char *sfx) +{ + char timestr[GF_TIMESTR_SIZE] = { + 0, + }; + br_private_t *priv = NULL; + time_t now = 0; + + now = gf_time(); + priv = this->private; + + gf_time_fmt(timestr, sizeof(timestr), now, gf_timefmt_FT); + + if (strcasecmp(sfx, "started") == 0) { + br_update_scrub_start_time(&priv->scrub_stat, now); + gf_msg(this->name, GF_LOG_INFO, 0, BRB_MSG_SCRUB_START, + "Scrubbing %s at %s", sfx, timestr); + } else { + br_update_scrub_finish_time(&priv->scrub_stat, timestr, now); + gf_msg(this->name, GF_LOG_INFO, 0, BRB_MSG_SCRUB_FINISH, + "Scrubbing %s at %s", sfx, timestr); + } +} + +static void +br_fsscanner_log_time(xlator_t *this, br_child_t *child, const char *sfx) +{ + char timestr[GF_TIMESTR_SIZE] = { + 0, + }; + time_t now = 0; + + now = gf_time(); + gf_time_fmt(timestr, sizeof(timestr), now, gf_timefmt_FT); + + if (strcasecmp(sfx, "started") == 0) { + gf_msg_debug(this->name, 0, "Scrubbing \"%s\" %s at %s", + child->brick_path, sfx, timestr); + } else { + gf_msg_debug(this->name, 0, "Scrubbing \"%s\" %s at %s", + child->brick_path, sfx, timestr); + } +} + +void +br_child_set_scrub_state(br_child_t *child, gf_boolean_t state) +{ + child->active_scrubbing = state; +} + +static void +br_fsscanner_wait_until_kicked(xlator_t *this, br_child_t *child) +{ + br_private_t *priv = NULL; + struct br_monitor *scrub_monitor = NULL; + + priv = this->private; + scrub_monitor = &priv->scrub_monitor; + + pthread_cleanup_push(_br_lock_cleaner, &scrub_monitor->wakelock); + pthread_mutex_lock(&scrub_monitor->wakelock); + { + while (!scrub_monitor->kick) + pthread_cond_wait(&scrub_monitor->wakecond, + &scrub_monitor->wakelock); + + /* Child lock is to synchronize with disconnect events */ + pthread_cleanup_push(_br_lock_cleaner, &child->lock); + pthread_mutex_lock(&child->lock); + { + scrub_monitor->active_child_count++; + br_child_set_scrub_state(child, _gf_true); + } + pthread_mutex_unlock(&child->lock); + pthread_cleanup_pop(0); + } + pthread_mutex_unlock(&scrub_monitor->wakelock); + pthread_cleanup_pop(0); +} + +static void +br_scrubber_entry_control(xlator_t *this) +{ + br_private_t *priv = NULL; + struct br_monitor *scrub_monitor = NULL; + + priv = this->private; + scrub_monitor = &priv->scrub_monitor; + + LOCK(&scrub_monitor->lock); + { + /* Move the state to BR_SCRUB_STATE_ACTIVE */ + if (scrub_monitor->state == BR_SCRUB_STATE_PENDING) + scrub_monitor->state = BR_SCRUB_STATE_ACTIVE; + br_scrubber_log_time(this, "started"); + priv->scrub_stat.scrub_running = 1; + } + UNLOCK(&scrub_monitor->lock); +} + +static void +br_scrubber_exit_control(xlator_t *this) +{ + br_private_t *priv = NULL; + struct br_monitor *scrub_monitor = NULL; + + priv = this->private; + scrub_monitor = &priv->scrub_monitor; + + LOCK(&scrub_monitor->lock); + { + br_scrubber_log_time(this, "finished"); + priv->scrub_stat.scrub_running = 0; + + if (scrub_monitor->state == BR_SCRUB_STATE_ACTIVE) { + (void)br_fsscan_activate(this); + } else { + UNLOCK(&scrub_monitor->lock); + gf_msg(this->name, GF_LOG_INFO, 0, BRB_MSG_SCRUB_INFO, + "Volume waiting to get rescheduled.."); + return; + } + } + UNLOCK(&scrub_monitor->lock); +} + +static void +br_fsscanner_entry_control(xlator_t *this, br_child_t *child) +{ + br_fsscanner_log_time(this, child, "started"); +} + +static void +br_fsscanner_exit_control(xlator_t *this, br_child_t *child) +{ + br_private_t *priv = NULL; + struct br_monitor *scrub_monitor = NULL; + + priv = this->private; + scrub_monitor = &priv->scrub_monitor; + + if (!_br_is_child_connected(child)) { + gf_msg(this->name, GF_LOG_WARNING, 0, BRB_MSG_SCRUB_INFO, + "Brick [%s] disconnected while scrubbing. Scrubbing " + "might be incomplete", + child->brick_path); + } + + br_fsscanner_log_time(this, child, "finished"); + + pthread_cleanup_push(_br_lock_cleaner, &scrub_monitor->wakelock); + pthread_mutex_lock(&scrub_monitor->wakelock); + { + scrub_monitor->active_child_count--; + pthread_cleanup_push(_br_lock_cleaner, &child->lock); + pthread_mutex_lock(&child->lock); + { + br_child_set_scrub_state(child, _gf_false); + } + pthread_mutex_unlock(&child->lock); + pthread_cleanup_pop(0); + + if (scrub_monitor->active_child_count == 0) { + /* The last child has finished scrubbing. + * Set the kick to false and wake up other + * children who are waiting for the last + * child to complete scrubbing. + */ + scrub_monitor->kick = _gf_false; + pthread_cond_broadcast(&scrub_monitor->wakecond); + + /* Signal monitor thread waiting for the all + * the children to finish scrubbing. + */ + pthread_cleanup_push(_br_lock_cleaner, &scrub_monitor->donelock); + pthread_mutex_lock(&scrub_monitor->donelock); + { + scrub_monitor->done = _gf_true; + pthread_cond_signal(&scrub_monitor->donecond); + } + pthread_mutex_unlock(&scrub_monitor->donelock); + pthread_cleanup_pop(0); + } else { + while (scrub_monitor->active_child_count) + pthread_cond_wait(&scrub_monitor->wakecond, + &scrub_monitor->wakelock); + } + } + pthread_mutex_unlock(&scrub_monitor->wakelock); + pthread_cleanup_pop(0); +} + +void * +br_fsscanner(void *arg) +{ + loc_t loc = { + 0, + }; + br_child_t *child = NULL; + xlator_t *this = NULL; + struct br_scanfs *fsscan = NULL; + + child = arg; + this = child->this; + fsscan = &child->fsscan; + + THIS = this; + loc.inode = child->table->root; + + while (1) { + br_fsscanner_wait_until_kicked(this, child); + { + /* precursor for scrub */ + br_fsscanner_entry_control(this, child); + + /* scrub */ + (void)syncop_ftw(child->xl, &loc, GF_CLIENT_PID_SCRUB, child, + br_fsscanner_handle_entry); + if (!list_empty(&fsscan->queued)) + wait_for_scrubbing(this, fsscan); + + /* scrub exit criteria */ + br_fsscanner_exit_control(this, child); + } + } + + return NULL; +} + +/** + * Keep this routine extremely simple and do not ever try to acquire + * child->lock here: it may lead to deadlock. Scrubber state is + * modified in br_fsscanner(). An intermediate state change to pause + * changes the scrub state to the _correct_ state by identifying a + * non-pending timer. + */ +void +br_kickstart_scanner(struct gf_tw_timer_list *timer, void *data, + unsigned long calltime) +{ + xlator_t *this = NULL; + struct br_monitor *scrub_monitor = data; + br_private_t *priv = NULL; + + THIS = this = scrub_monitor->this; + priv = this->private; + + /* Reset scrub statistics */ + priv->scrub_stat.scrubbed_files = 0; + priv->scrub_stat.unsigned_files = 0; + + /* Moves state from PENDING to ACTIVE */ + (void)br_scrubber_entry_control(this); + + /* kickstart scanning.. */ + pthread_mutex_lock(&scrub_monitor->wakelock); + { + scrub_monitor->kick = _gf_true; + GF_ASSERT(scrub_monitor->active_child_count == 0); + pthread_cond_broadcast(&scrub_monitor->wakecond); + } + pthread_mutex_unlock(&scrub_monitor->wakelock); + + return; +} + +static uint32_t +br_fsscan_calculate_delta(uint32_t times) +{ + return times; +} + +#define BR_SCRUB_ONDEMAND (1) +#define BR_SCRUB_MINUTE (60) +#define BR_SCRUB_HOURLY (60 * 60) +#define BR_SCRUB_DAILY (1 * 24 * 60 * 60) +#define BR_SCRUB_WEEKLY (7 * 24 * 60 * 60) +#define BR_SCRUB_BIWEEKLY (14 * 24 * 60 * 60) +#define BR_SCRUB_MONTHLY (30 * 24 * 60 * 60) + +static unsigned int +br_fsscan_calculate_timeout(scrub_freq_t freq) +{ + uint32_t timo = 0; + + switch (freq) { + case BR_FSSCRUB_FREQ_MINUTE: + timo = br_fsscan_calculate_delta(BR_SCRUB_MINUTE); + break; + case BR_FSSCRUB_FREQ_HOURLY: + timo = br_fsscan_calculate_delta(BR_SCRUB_HOURLY); + break; + case BR_FSSCRUB_FREQ_DAILY: + timo = br_fsscan_calculate_delta(BR_SCRUB_DAILY); + break; + case BR_FSSCRUB_FREQ_WEEKLY: + timo = br_fsscan_calculate_delta(BR_SCRUB_WEEKLY); + break; + case BR_FSSCRUB_FREQ_BIWEEKLY: + timo = br_fsscan_calculate_delta(BR_SCRUB_BIWEEKLY); + break; + case BR_FSSCRUB_FREQ_MONTHLY: + timo = br_fsscan_calculate_delta(BR_SCRUB_MONTHLY); + break; + default: + timo = 0; + } + + return timo; +} + +int32_t +br_fsscan_schedule(xlator_t *this) +{ + uint32_t timo = 0; + br_private_t *priv = NULL; + char timestr[GF_TIMESTR_SIZE] = { + 0, + }; + struct br_scrubber *fsscrub = NULL; + struct gf_tw_timer_list *timer = NULL; + struct br_monitor *scrub_monitor = NULL; + + priv = this->private; + fsscrub = &priv->fsscrub; + scrub_monitor = &priv->scrub_monitor; + + scrub_monitor->boot = gf_time(); + + timo = br_fsscan_calculate_timeout(fsscrub->frequency); + if (timo == 0) { + gf_msg(this->name, GF_LOG_ERROR, 0, BRB_MSG_ZERO_TIMEOUT_BUG, + "BUG: Zero schedule timeout"); + goto error_return; + } + + scrub_monitor->timer = GF_CALLOC(1, sizeof(*scrub_monitor->timer), + gf_br_stub_mt_br_scanner_freq_t); + if (!scrub_monitor->timer) + goto error_return; + + timer = scrub_monitor->timer; + INIT_LIST_HEAD(&timer->entry); + + timer->data = scrub_monitor; + timer->expires = timo; + timer->function = br_kickstart_scanner; + + gf_tw_add_timer(priv->timer_wheel, timer); + _br_monitor_set_scrub_state(scrub_monitor, BR_SCRUB_STATE_PENDING); + + gf_time_fmt(timestr, sizeof(timestr), (scrub_monitor->boot + timo), + gf_timefmt_FT); + gf_msg(this->name, GF_LOG_INFO, 0, BRB_MSG_SCRUB_INFO, + "Scrubbing is " + "scheduled to run at %s", + timestr); + + return 0; + +error_return: + return -1; +} + +int32_t +br_fsscan_activate(xlator_t *this) +{ + uint32_t timo = 0; + char timestr[GF_TIMESTR_SIZE] = { + 0, + }; + time_t now = 0; + br_private_t *priv = NULL; + struct br_scrubber *fsscrub = NULL; + struct br_monitor *scrub_monitor = NULL; + + priv = this->private; + fsscrub = &priv->fsscrub; + scrub_monitor = &priv->scrub_monitor; + + now = gf_time(); + timo = br_fsscan_calculate_timeout(fsscrub->frequency); + if (timo == 0) { + gf_msg(this->name, GF_LOG_ERROR, 0, BRB_MSG_ZERO_TIMEOUT_BUG, + "BUG: Zero schedule timeout"); + return -1; + } + + pthread_mutex_lock(&scrub_monitor->donelock); + { + scrub_monitor->done = _gf_false; + } + pthread_mutex_unlock(&scrub_monitor->donelock); + + gf_time_fmt(timestr, sizeof(timestr), now + timo, gf_timefmt_FT); + (void)gf_tw_mod_timer(priv->timer_wheel, scrub_monitor->timer, timo); + + _br_monitor_set_scrub_state(scrub_monitor, BR_SCRUB_STATE_PENDING); + gf_msg(this->name, GF_LOG_INFO, 0, BRB_MSG_SCRUB_INFO, + "Scrubbing is " + "rescheduled to run at %s", + timestr); + + return 0; +} + +int32_t +br_fsscan_reschedule(xlator_t *this) +{ + int32_t ret = 0; + uint32_t timo = 0; + char timestr[GF_TIMESTR_SIZE] = { + 0, + }; + time_t now = 0; + br_private_t *priv = NULL; + struct br_scrubber *fsscrub = NULL; + struct br_monitor *scrub_monitor = NULL; + + priv = this->private; + fsscrub = &priv->fsscrub; + scrub_monitor = &priv->scrub_monitor; + + if (!fsscrub->frequency_reconf) + return 0; + + now = gf_time(); + timo = br_fsscan_calculate_timeout(fsscrub->frequency); + if (timo == 0) { + gf_msg(this->name, GF_LOG_ERROR, 0, BRB_MSG_ZERO_TIMEOUT_BUG, + "BUG: Zero schedule timeout"); + return -1; + } + + gf_time_fmt(timestr, sizeof(timestr), now + timo, gf_timefmt_FT); + + pthread_mutex_lock(&scrub_monitor->donelock); + { + scrub_monitor->done = _gf_false; + } + pthread_mutex_unlock(&scrub_monitor->donelock); + + ret = gf_tw_mod_timer_pending(priv->timer_wheel, scrub_monitor->timer, + timo); + if (ret == 0) + gf_msg(this->name, GF_LOG_INFO, 0, BRB_MSG_SCRUB_INFO, + "Scrubber is currently running and would be " + "rescheduled after completion"); + else { + _br_monitor_set_scrub_state(scrub_monitor, BR_SCRUB_STATE_PENDING); + gf_msg(this->name, GF_LOG_INFO, 0, BRB_MSG_SCRUB_INFO, + "Scrubbing rescheduled to run at %s", timestr); + } + + return 0; +} + +int32_t +br_fsscan_ondemand(xlator_t *this) +{ + int32_t ret = 0; + uint32_t timo = 0; + char timestr[GF_TIMESTR_SIZE] = { + 0, + }; + time_t now = 0; + br_private_t *priv = NULL; + struct br_monitor *scrub_monitor = NULL; + + priv = this->private; + scrub_monitor = &priv->scrub_monitor; + + now = gf_time(); + timo = BR_SCRUB_ONDEMAND; + gf_time_fmt(timestr, sizeof(timestr), now + timo, gf_timefmt_FT); + + pthread_mutex_lock(&scrub_monitor->donelock); + { + scrub_monitor->done = _gf_false; + } + pthread_mutex_unlock(&scrub_monitor->donelock); + + ret = gf_tw_mod_timer_pending(priv->timer_wheel, scrub_monitor->timer, + timo); + if (ret == 0) + gf_msg(this->name, GF_LOG_INFO, 0, BRB_MSG_SCRUB_INFO, + "Scrubber is currently running and would be " + "rescheduled after completion"); + else { + _br_monitor_set_scrub_state(scrub_monitor, BR_SCRUB_STATE_PENDING); + gf_msg(this->name, GF_LOG_INFO, 0, BRB_MSG_SCRUB_INFO, + "Ondemand Scrubbing scheduled to run at %s", timestr); + } + + return 0; +} + +#define BR_SCRUB_THREAD_SCALE_LAZY 0 +#define BR_SCRUB_THREAD_SCALE_NORMAL 0.4 +#define BR_SCRUB_THREAD_SCALE_AGGRESSIVE 1.0 + +#ifndef M_E +#define M_E 2.718 +#endif + +/** + * This is just a simple exponential scale to a fixed value selected + * per throttle config. We probably need to be more smart and select + * the scale based on the number of processor cores too. + */ +static unsigned int +br_scrubber_calc_scale(xlator_t *this, br_private_t *priv, + scrub_throttle_t throttle) +{ + unsigned int scale = 0; + + switch (throttle) { + case BR_SCRUB_THROTTLE_VOID: + case BR_SCRUB_THROTTLE_STALLED: + scale = 0; + break; + case BR_SCRUB_THROTTLE_LAZY: + scale = priv->child_count * pow(M_E, BR_SCRUB_THREAD_SCALE_LAZY); + break; + case BR_SCRUB_THROTTLE_NORMAL: + scale = priv->child_count * pow(M_E, BR_SCRUB_THREAD_SCALE_NORMAL); + break; + case BR_SCRUB_THROTTLE_AGGRESSIVE: + scale = priv->child_count * + pow(M_E, BR_SCRUB_THREAD_SCALE_AGGRESSIVE); + break; + default: + gf_msg(this->name, GF_LOG_ERROR, 0, BRB_MSG_UNKNOWN_THROTTLE, + "Unknown throttle %d", throttle); + } + + return scale; +} + +static br_child_t * +_br_scrubber_get_next_child(struct br_scrubber *fsscrub) +{ + br_child_t *child = NULL; + + child = list_first_entry(&fsscrub->scrublist, br_child_t, list); + list_rotate_left(&fsscrub->scrublist); + + return child; +} + +static void +_br_scrubber_get_entry(br_child_t *child, struct br_fsscan_entry **fsentry) +{ + struct br_scanfs *fsscan = &child->fsscan; + + if (list_empty(&fsscan->ready)) + return; + *fsentry = list_first_entry(&fsscan->ready, struct br_fsscan_entry, list); + list_del_init(&(*fsentry)->list); +} + +static void +_br_scrubber_find_scrubbable_entry(struct br_scrubber *fsscrub, + struct br_fsscan_entry **fsentry) +{ + br_child_t *child = NULL; + br_child_t *firstchild = NULL; + + while (1) { + while (list_empty(&fsscrub->scrublist)) + pthread_cond_wait(&fsscrub->cond, &fsscrub->mutex); + + firstchild = NULL; + for (child = _br_scrubber_get_next_child(fsscrub); child != firstchild; + child = _br_scrubber_get_next_child(fsscrub)) { + if (!firstchild) + firstchild = child; + + _br_scrubber_get_entry(child, fsentry); + if (*fsentry) + break; + } + + if (*fsentry) + break; + + /* nothing to work on.. wait till available */ + pthread_cond_wait(&fsscrub->cond, &fsscrub->mutex); + } +} + +static void +br_scrubber_pick_entry(struct br_scrubber *fsscrub, + struct br_fsscan_entry **fsentry) +{ + pthread_cleanup_push(_br_lock_cleaner, &fsscrub->mutex); + + pthread_mutex_lock(&fsscrub->mutex); + { + *fsentry = NULL; + _br_scrubber_find_scrubbable_entry(fsscrub, fsentry); + } + pthread_mutex_unlock(&fsscrub->mutex); + + pthread_cleanup_pop(0); +} + +struct br_scrub_entry { + gf_boolean_t scrubbed; + struct br_fsscan_entry *fsentry; +}; + +/** + * We need to be a bit careful here. These thread(s) are prone to cancellations + * when threads are scaled down (depending on the thottling value configured) + * and pausing scrub. A thread can get cancelled while it's waiting for entries + * in the ->pending queue or when an object is undergoing scrubbing. + */ +static void +br_scrubber_entry_handle(void *arg) +{ + struct br_scanfs *fsscan = NULL; + struct br_scrub_entry *sentry = NULL; + struct br_fsscan_entry *fsentry = NULL; + + sentry = arg; + + fsentry = sentry->fsentry; + fsscan = fsentry->fsscan; + + LOCK(&fsscan->entrylock); + { + if (sentry->scrubbed) { + _br_fsscan_dec_entry_count(fsscan); + + /* cleanup ->entry */ + fsentry->data = NULL; + fsentry->fsscan = NULL; + loc_wipe(&fsentry->parent); + gf_dirent_entry_free(fsentry->entry); + + GF_FREE(sentry->fsentry); + } else { + /* (re)queue the entry again for scrub */ + _br_fsscan_collect_entry(fsscan, sentry->fsentry); + } + } + UNLOCK(&fsscan->entrylock); +} + +static void +br_scrubber_scrub_entry(xlator_t *this, struct br_fsscan_entry *fsentry) +{ + struct br_scrub_entry sentry = { + 0, + }; + + sentry.scrubbed = 0; + sentry.fsentry = fsentry; + + pthread_cleanup_push(br_scrubber_entry_handle, &sentry); + { + (void)br_scrubber_scrub_begin(this, fsentry); + sentry.scrubbed = 1; + } + pthread_cleanup_pop(1); +} + +void * +br_scrubber_proc(void *arg) +{ + xlator_t *this = NULL; + struct br_scrubber *fsscrub = NULL; + struct br_fsscan_entry *fsentry = NULL; + + fsscrub = arg; + THIS = this = fsscrub->this; + + while (1) { + br_scrubber_pick_entry(fsscrub, &fsentry); + br_scrubber_scrub_entry(this, fsentry); + sleep(1); + } + + return NULL; +} + +static int32_t +br_scrubber_scale_up(xlator_t *this, struct br_scrubber *fsscrub, + unsigned int v1, unsigned int v2) +{ + int i = 0; + int32_t ret = -1; + int diff = 0; + struct br_scrubbers *scrub = NULL; + + diff = (int)(v2 - v1); + + gf_msg(this->name, GF_LOG_INFO, 0, BRB_MSG_SCALING_UP_SCRUBBER, + "Scaling up scrubbers [%d => %d]", v1, v2); + + for (i = 0; i < diff; i++) { + scrub = GF_CALLOC(diff, sizeof(*scrub), gf_br_mt_br_scrubber_t); + if (!scrub) + break; + + INIT_LIST_HEAD(&scrub->list); + ret = gf_thread_create(&scrub->scrubthread, NULL, br_scrubber_proc, + fsscrub, "brsproc"); + if (ret) + break; + + fsscrub->nr_scrubbers++; + list_add_tail(&scrub->list, &fsscrub->scrubbers); + } + + if ((i != diff) && !scrub) + goto error_return; + + if (i != diff) /* degraded scaling.. */ + gf_msg(this->name, GF_LOG_WARNING, 0, BRB_MSG_SCALE_UP_FAILED, + "Could not fully scale up to %d scrubber(s). Spawned " + "%d/%d [total scrubber(s): %d]", + v2, i, diff, (v1 + i)); + + return 0; + +error_return: + return -1; +} + +static int32_t +br_scrubber_scale_down(xlator_t *this, struct br_scrubber *fsscrub, + unsigned int v1, unsigned int v2) +{ + int i = 0; + int diff = 0; + int32_t ret = -1; + struct br_scrubbers *scrub = NULL; + + diff = (int)(v1 - v2); + + gf_msg(this->name, GF_LOG_INFO, 0, BRB_MSG_SCALE_DOWN_SCRUBBER, + "Scaling down scrubbers [%d => %d]", v1, v2); + + for (i = 0; i < diff; i++) { + scrub = list_first_entry(&fsscrub->scrubbers, struct br_scrubbers, + list); + + list_del_init(&scrub->list); + ret = gf_thread_cleanup_xint(scrub->scrubthread); + if (ret) + break; + GF_FREE(scrub); + + fsscrub->nr_scrubbers--; + } + + if (ret) { + gf_msg(this->name, GF_LOG_WARNING, 0, BRB_MSG_SCALE_DOWN_FAILED, + "Could not fully scale down " + "to %d scrubber(s). Terminated %d/%d [total " + "scrubber(s): %d]", + v1, i, diff, (v2 - i)); + ret = 0; + } + + return ret; +} + +static int32_t +br_scrubber_configure(xlator_t *this, br_private_t *priv, + struct br_scrubber *fsscrub, scrub_throttle_t nthrottle) +{ + int32_t ret = 0; + unsigned int v1 = 0; + unsigned int v2 = 0; + + v1 = fsscrub->nr_scrubbers; + v2 = br_scrubber_calc_scale(this, priv, nthrottle); + + if (v1 == v2) + return 0; + + if (v1 > v2) + ret = br_scrubber_scale_down(this, fsscrub, v1, v2); + else + ret = br_scrubber_scale_up(this, fsscrub, v1, v2); + + return ret; +} + +static int32_t +br_scrubber_fetch_option(xlator_t *this, char *opt, dict_t *options, + char **value) +{ + if (options) + GF_OPTION_RECONF(opt, *value, options, str, error_return); + else + GF_OPTION_INIT(opt, *value, str, error_return); + + return 0; + +error_return: + return -1; +} + +/* internal "throttle" override */ +#define BR_SCRUB_STALLED "STALLED" + +/* TODO: token buket spec */ +static int32_t +br_scrubber_handle_throttle(xlator_t *this, br_private_t *priv, dict_t *options, + gf_boolean_t scrubstall) +{ + int32_t ret = 0; + char *tmp = NULL; + struct br_scrubber *fsscrub = NULL; + scrub_throttle_t nthrottle = BR_SCRUB_THROTTLE_VOID; + + fsscrub = &priv->fsscrub; + fsscrub->throttle_reconf = _gf_false; + + ret = br_scrubber_fetch_option(this, "scrub-throttle", options, &tmp); + if (ret) + goto error_return; + + if (scrubstall) + tmp = BR_SCRUB_STALLED; + + if (strcasecmp(tmp, "lazy") == 0) + nthrottle = BR_SCRUB_THROTTLE_LAZY; + else if (strcasecmp(tmp, "normal") == 0) + nthrottle = BR_SCRUB_THROTTLE_NORMAL; + else if (strcasecmp(tmp, "aggressive") == 0) + nthrottle = BR_SCRUB_THROTTLE_AGGRESSIVE; + else if (strcasecmp(tmp, BR_SCRUB_STALLED) == 0) + nthrottle = BR_SCRUB_THROTTLE_STALLED; + else + goto error_return; + + /* on failure old throttling value is preserved */ + ret = br_scrubber_configure(this, priv, fsscrub, nthrottle); + if (ret) + goto error_return; + + if (fsscrub->throttle != nthrottle) + fsscrub->throttle_reconf = _gf_true; + + fsscrub->throttle = nthrottle; + return 0; + +error_return: + return -1; +} + +static int32_t +br_scrubber_handle_stall(xlator_t *this, br_private_t *priv, dict_t *options, + gf_boolean_t *scrubstall) +{ + int32_t ret = 0; + char *tmp = NULL; + + ret = br_scrubber_fetch_option(this, "scrub-state", options, &tmp); + if (ret) + goto error_return; + + if (strcasecmp(tmp, "pause") == 0) /* anything else is active */ + *scrubstall = _gf_true; + + return 0; + +error_return: + return -1; +} + +static int32_t +br_scrubber_handle_freq(xlator_t *this, br_private_t *priv, dict_t *options, + gf_boolean_t scrubstall) +{ + int32_t ret = -1; + char *tmp = NULL; + scrub_freq_t frequency = BR_FSSCRUB_FREQ_HOURLY; + struct br_scrubber *fsscrub = NULL; + + fsscrub = &priv->fsscrub; + fsscrub->frequency_reconf = _gf_true; + + ret = br_scrubber_fetch_option(this, "scrub-freq", options, &tmp); + if (ret) + goto error_return; + + if (scrubstall) + tmp = BR_SCRUB_STALLED; + + if (strcasecmp(tmp, "hourly") == 0) { + frequency = BR_FSSCRUB_FREQ_HOURLY; + } else if (strcasecmp(tmp, "daily") == 0) { + frequency = BR_FSSCRUB_FREQ_DAILY; + } else if (strcasecmp(tmp, "weekly") == 0) { + frequency = BR_FSSCRUB_FREQ_WEEKLY; + } else if (strcasecmp(tmp, "biweekly") == 0) { + frequency = BR_FSSCRUB_FREQ_BIWEEKLY; + } else if (strcasecmp(tmp, "monthly") == 0) { + frequency = BR_FSSCRUB_FREQ_MONTHLY; + } else if (strcasecmp(tmp, "minute") == 0) { + frequency = BR_FSSCRUB_FREQ_MINUTE; + } else if (strcasecmp(tmp, BR_SCRUB_STALLED) == 0) { + frequency = BR_FSSCRUB_FREQ_STALLED; + } else + goto error_return; + + if (fsscrub->frequency == frequency) + fsscrub->frequency_reconf = _gf_false; + else + fsscrub->frequency = frequency; + + return 0; + +error_return: + return -1; +} + +static void +br_scrubber_log_option(xlator_t *this, br_private_t *priv, + gf_boolean_t scrubstall) +{ + struct br_scrubber *fsscrub = &priv->fsscrub; + char *scrub_throttle_str[] = { + [BR_SCRUB_THROTTLE_LAZY] = "lazy", + [BR_SCRUB_THROTTLE_NORMAL] = "normal", + [BR_SCRUB_THROTTLE_AGGRESSIVE] = "aggressive", + [BR_SCRUB_THROTTLE_STALLED] = "stalled", + }; + + char *scrub_freq_str[] = { + [0] = "", + [BR_FSSCRUB_FREQ_HOURLY] = "hourly", + [BR_FSSCRUB_FREQ_DAILY] = "daily", + [BR_FSSCRUB_FREQ_WEEKLY] = "weekly", + [BR_FSSCRUB_FREQ_BIWEEKLY] = "biweekly", + [BR_FSSCRUB_FREQ_MONTHLY] = "monthly (30 days)", + [BR_FSSCRUB_FREQ_MINUTE] = "every minute", + }; + + if (scrubstall) + return; /* logged as pause */ + + if (fsscrub->frequency_reconf || fsscrub->throttle_reconf) { + if (fsscrub->throttle == BR_SCRUB_THROTTLE_VOID) + return; + gf_msg(this->name, GF_LOG_INFO, 0, BRB_MSG_SCRUB_TUNABLE, + "SCRUB TUNABLES:: [Frequency: %s, Throttle: %s]", + scrub_freq_str[fsscrub->frequency], + scrub_throttle_str[fsscrub->throttle]); + } +} + +int32_t +br_scrubber_handle_options(xlator_t *this, br_private_t *priv, dict_t *options) +{ + int32_t ret = 0; + gf_boolean_t scrubstall = _gf_false; /* not as dangerous as it sounds */ + + ret = br_scrubber_handle_stall(this, priv, options, &scrubstall); + if (ret) + goto error_return; + + ret = br_scrubber_handle_throttle(this, priv, options, scrubstall); + if (ret) + goto error_return; + + ret = br_scrubber_handle_freq(this, priv, options, scrubstall); + if (ret) + goto error_return; + + br_scrubber_log_option(this, priv, scrubstall); + + return 0; + +error_return: + return -1; +} + +inode_t * +br_lookup_bad_obj_dir(xlator_t *this, br_child_t *child, uuid_t gfid) +{ + struct iatt statbuf = { + 0, + }; + inode_table_t *table = NULL; + int32_t ret = -1; + loc_t loc = { + 0, + }; + inode_t *linked_inode = NULL; + int32_t op_errno = 0; + + GF_VALIDATE_OR_GOTO("bit-rot-scrubber", this, out); + GF_VALIDATE_OR_GOTO(this->name, this->private, out); + GF_VALIDATE_OR_GOTO(this->name, child, out); + + table = child->table; + + loc.inode = inode_new(table); + if (!loc.inode) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, BRB_MSG_NO_MEMORY, + "failed to allocate a new inode for" + "bad object directory"); + goto out; + } + + gf_uuid_copy(loc.gfid, gfid); + + ret = syncop_lookup(child->xl, &loc, &statbuf, NULL, NULL, NULL); + if (ret < 0) { + op_errno = -ret; + gf_msg(this->name, GF_LOG_ERROR, 0, BRB_MSG_LOOKUP_FAILED, + "failed to lookup the bad " + "objects directory (gfid: %s (%s))", + uuid_utoa(gfid), strerror(op_errno)); + goto out; + } + + linked_inode = inode_link(loc.inode, NULL, NULL, &statbuf); + if (linked_inode) + inode_lookup(linked_inode); + +out: + loc_wipe(&loc); + return linked_inode; +} + +int32_t +br_read_bad_object_dir(xlator_t *this, br_child_t *child, fd_t *fd, + dict_t *dict) +{ + gf_dirent_t entries; + gf_dirent_t *entry = NULL; + int32_t ret = -1; + off_t offset = 0; + int32_t count = 0; + char key[32] = { + 0, + }; + dict_t *out_dict = NULL; + + INIT_LIST_HEAD(&entries.list); + + while ((ret = syncop_readdir(child->xl, fd, 131072, offset, &entries, NULL, + &out_dict))) { + if (ret < 0) + goto out; + + list_for_each_entry(entry, &entries.list, list) + { + offset = entry->d_off; + + snprintf(key, sizeof(key), "quarantine-%d", count); + + /* + * ignore the dict_set errors for now. The intention is + * to get as many bad objects as possible instead of + * erroring out at the first failure. + */ + ret = dict_set_dynstr_with_alloc(dict, key, entry->d_name); + if (!ret) + count++; + + if (out_dict) { + dict_copy(out_dict, dict); + dict_unref(out_dict); + out_dict = NULL; + } + } + + gf_dirent_free(&entries); + } + + ret = count; + ret = dict_set_int32_sizen(dict, "count", count); + +out: + return ret; +} + +int32_t +br_get_bad_objects_from_child(xlator_t *this, dict_t *dict, br_child_t *child) +{ + inode_t *inode = NULL; + inode_table_t *table = NULL; + fd_t *fd = NULL; + int32_t ret = -1; + loc_t loc = { + 0, + }; + int32_t op_errno = 0; + + GF_VALIDATE_OR_GOTO("bit-rot-scrubber", this, out); + GF_VALIDATE_OR_GOTO(this->name, this->private, out); + GF_VALIDATE_OR_GOTO(this->name, child, out); + GF_VALIDATE_OR_GOTO(this->name, dict, out); + + table = child->table; + + inode = inode_find(table, BR_BAD_OBJ_CONTAINER); + if (!inode) { + inode = br_lookup_bad_obj_dir(this, child, BR_BAD_OBJ_CONTAINER); + if (!inode) + goto out; + } + + fd = fd_create(inode, 0); + if (!fd) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, BRB_MSG_FD_CREATE_FAILED, + "fd creation for the bad " + "objects directory failed (gfid: %s)", + uuid_utoa(BR_BAD_OBJ_CONTAINER)); + goto out; + } + + loc.inode = inode; + gf_uuid_copy(loc.gfid, inode->gfid); + + ret = syncop_opendir(child->xl, &loc, fd, NULL, NULL); + if (ret < 0) { + op_errno = -ret; + fd_unref(fd); + fd = NULL; + gf_msg(this->name, GF_LOG_ERROR, op_errno, BRB_MSG_FD_CREATE_FAILED, + "failed to open the bad " + "objects directory %s", + uuid_utoa(BR_BAD_OBJ_CONTAINER)); + goto out; + } + + fd_bind(fd); + + ret = br_read_bad_object_dir(this, child, fd, dict); + if (ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, 0, BRB_MSG_BAD_OBJ_READDIR_FAIL, + "readdir of the bad " + "objects directory (%s) failed ", + uuid_utoa(BR_BAD_OBJ_CONTAINER)); + goto out; + } + + ret = 0; + +out: + loc_wipe(&loc); + if (fd) + fd_unref(fd); + return ret; +} + +int32_t +br_collect_bad_objects_of_child(xlator_t *this, br_child_t *child, dict_t *dict, + dict_t *child_dict, int32_t total_count) +{ + int32_t ret = -1; + int32_t count = 0; + char key[32] = { + 0, + }; + char main_key[32] = { + 0, + }; + int32_t j = 0; + int32_t tmp_count = 0; + char *entry = NULL; + char tmp[PATH_MAX] = { + 0, + }; + char *path = NULL; + int32_t len = 0; + + ret = dict_get_int32_sizen(child_dict, "count", &count); + if (ret) + goto out; + + tmp_count = total_count; + + for (j = 0; j < count; j++) { + len = snprintf(key, sizeof(key), "quarantine-%d", j); + ret = dict_get_strn(child_dict, key, len, &entry); + if (ret) + continue; + + ret = dict_get_str(child_dict, entry, &path); + len = snprintf(tmp, PATH_MAX, "%s ==> BRICK: %s\n path: %s", entry, + child->brick_path, path); + if ((len < 0) || (len >= PATH_MAX)) { + continue; + } + snprintf(main_key, sizeof(main_key), "quarantine-%d", tmp_count); + + ret = dict_set_dynstr_with_alloc(dict, main_key, tmp); + if (!ret) + tmp_count++; + path = NULL; + } + + ret = tmp_count; + +out: + return ret; +} + +int32_t +br_collect_bad_objects_from_children(xlator_t *this, dict_t *dict) +{ + int32_t ret = -1; + dict_t *child_dict = NULL; + int32_t i = 0; + int32_t total_count = 0; + br_child_t *child = NULL; + br_private_t *priv = NULL; + dict_t *tmp_dict = NULL; + + priv = this->private; + tmp_dict = dict; + + for (i = 0; i < priv->child_count; i++) { + child = &priv->children[i]; + GF_ASSERT(child); + if (!_br_is_child_connected(child)) + continue; + + child_dict = dict_new(); + if (!child_dict) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, BRB_MSG_NO_MEMORY, + "failed to allocate dict"); + continue; + } + ret = br_get_bad_objects_from_child(this, child_dict, child); + /* + * Continue asking the remaining children for the list of + * bad objects even though getting the list from one of them + * fails. + */ + if (ret) { + dict_unref(child_dict); + continue; + } + + ret = br_collect_bad_objects_of_child(this, child, tmp_dict, child_dict, + total_count); + if (ret < 0) { + dict_unref(child_dict); + continue; + } + + total_count = ret; + dict_unref(child_dict); + child_dict = NULL; + } + + ret = dict_set_int32(tmp_dict, "total-count", total_count); + + return ret; +} + +int32_t +br_get_bad_objects_list(xlator_t *this, dict_t **dict) +{ + int32_t ret = -1; + dict_t *tmp_dict = NULL; + + GF_VALIDATE_OR_GOTO("bir-rot-scrubber", this, out); + GF_VALIDATE_OR_GOTO(this->name, dict, out); + + tmp_dict = *dict; + if (!tmp_dict) { + tmp_dict = dict_new(); + if (!tmp_dict) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, BRB_MSG_NO_MEMORY, + "failed to allocate dict"); + goto out; + } + *dict = tmp_dict; + } + + ret = br_collect_bad_objects_from_children(this, tmp_dict); + +out: + return ret; +} + +static int +wait_for_scrub_to_finish(xlator_t *this) +{ + int ret = -1; + br_private_t *priv = NULL; + struct br_monitor *scrub_monitor = NULL; + + priv = this->private; + scrub_monitor = &priv->scrub_monitor; + + GF_VALIDATE_OR_GOTO("bit-rot", scrub_monitor, out); + GF_VALIDATE_OR_GOTO("bit-rot", this, out); + + gf_msg(this->name, GF_LOG_INFO, 0, BRB_MSG_SCRUB_INFO, + "Waiting for all children to start and finish scrub"); + + pthread_mutex_lock(&scrub_monitor->donelock); + { + while (!scrub_monitor->done) + pthread_cond_wait(&scrub_monitor->donecond, + &scrub_monitor->donelock); + } + pthread_mutex_unlock(&scrub_monitor->donelock); + ret = 0; +out: + return ret; +} + +/** + * This function is executed in a separate thread. This is scrubber monitor + * thread that takes care of state machine. + */ +void * +br_monitor_thread(void *arg) +{ + int32_t ret = 0; + xlator_t *this = NULL; + br_private_t *priv = NULL; + struct br_monitor *scrub_monitor = NULL; + + this = arg; + priv = this->private; + + /* + * Since, this is the topmost xlator, THIS has to be set by bit-rot + * xlator itself (STACK_WIND won't help in this case). Also it has + * to be done for each thread that gets spawned. Otherwise, a new + * thread will get global_xlator's pointer when it does "THIS". + */ + THIS = this; + + scrub_monitor = &priv->scrub_monitor; + + pthread_mutex_lock(&scrub_monitor->mutex); + { + while (!scrub_monitor->inited) + pthread_cond_wait(&scrub_monitor->cond, &scrub_monitor->mutex); + } + pthread_mutex_unlock(&scrub_monitor->mutex); + + /* this needs to be serialized with reconfigure() */ + pthread_mutex_lock(&priv->lock); + { + ret = br_scrub_state_machine(this, _gf_false); + } + pthread_mutex_unlock(&priv->lock); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, -ret, BRB_MSG_SSM_FAILED, + "Scrub state machine failed"); + goto out; + } + + while (1) { + /* Wait for all children to finish scrubbing */ + ret = wait_for_scrub_to_finish(this); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, -ret, BRB_MSG_SCRUB_WAIT_FAILED, + "Scrub wait failed"); + goto out; + } + + /* scrub exit criteria: Move the state to PENDING */ + br_scrubber_exit_control(this); + } + +out: + return NULL; +} + +static void +br_set_scrub_state(struct br_monitor *scrub_monitor, br_scrub_state_t state) +{ + LOCK(&scrub_monitor->lock); + { + _br_monitor_set_scrub_state(scrub_monitor, state); + } + UNLOCK(&scrub_monitor->lock); +} + +int32_t +br_scrubber_monitor_init(xlator_t *this, br_private_t *priv) +{ + struct br_monitor *scrub_monitor = NULL; + int ret = 0; + + scrub_monitor = &priv->scrub_monitor; + + LOCK_INIT(&scrub_monitor->lock); + scrub_monitor->this = this; + + scrub_monitor->inited = _gf_false; + pthread_mutex_init(&scrub_monitor->mutex, NULL); + pthread_cond_init(&scrub_monitor->cond, NULL); + + scrub_monitor->kick = _gf_false; + scrub_monitor->active_child_count = 0; + pthread_mutex_init(&scrub_monitor->wakelock, NULL); + pthread_cond_init(&scrub_monitor->wakecond, NULL); + + scrub_monitor->done = _gf_false; + pthread_mutex_init(&scrub_monitor->donelock, NULL); + pthread_cond_init(&scrub_monitor->donecond, NULL); + + /* Set the state to INACTIVE */ + br_set_scrub_state(&priv->scrub_monitor, BR_SCRUB_STATE_INACTIVE); + + /* Start the monitor thread */ + ret = gf_thread_create(&scrub_monitor->thread, NULL, br_monitor_thread, + this, "brmon"); + if (ret != 0) { + gf_msg(this->name, GF_LOG_ERROR, -ret, BRB_MSG_SPAWN_FAILED, + "monitor thread creation failed"); + ret = -1; + goto err; + } + + return 0; +err: + pthread_mutex_destroy(&scrub_monitor->mutex); + pthread_cond_destroy(&scrub_monitor->cond); + + pthread_mutex_destroy(&scrub_monitor->wakelock); + pthread_cond_destroy(&scrub_monitor->wakecond); + + pthread_mutex_destroy(&scrub_monitor->donelock); + pthread_cond_destroy(&scrub_monitor->donecond); + + LOCK_DESTROY(&scrub_monitor->lock); + + return ret; +} + +int32_t +br_scrubber_init(xlator_t *this, br_private_t *priv) +{ + struct br_scrubber *fsscrub = NULL; + int ret = 0; + + priv->tbf = tbf_init(NULL, 0); + if (!priv->tbf) + return -1; + + ret = br_scrubber_monitor_init(this, priv); + if (ret) + return -1; + + fsscrub = &priv->fsscrub; + + fsscrub->this = this; + fsscrub->throttle = BR_SCRUB_THROTTLE_VOID; + + pthread_mutex_init(&fsscrub->mutex, NULL); + pthread_cond_init(&fsscrub->cond, NULL); + + fsscrub->nr_scrubbers = 0; + INIT_LIST_HEAD(&fsscrub->scrubbers); + INIT_LIST_HEAD(&fsscrub->scrublist); + + return 0; +} diff --git a/xlators/features/bit-rot/src/bitd/bit-rot-scrub.h b/xlators/features/bit-rot/src/bitd/bit-rot-scrub.h new file mode 100644 index 00000000000..4e5f67bc021 --- /dev/null +++ b/xlators/features/bit-rot/src/bitd/bit-rot-scrub.h @@ -0,0 +1,46 @@ +/* + Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef __BIT_ROT_SCRUB_H__ +#define __BIT_ROT_SCRUB_H__ + +#include <glusterfs/xlator.h> +#include "bit-rot.h" + +void * +br_fsscanner(void *); + +int32_t +br_fsscan_schedule(xlator_t *); +int32_t +br_fsscan_reschedule(xlator_t *); +int32_t +br_fsscan_activate(xlator_t *); +int32_t +br_fsscan_deactivate(xlator_t *); +int32_t +br_fsscan_ondemand(xlator_t *); + +int32_t +br_scrubber_handle_options(xlator_t *, br_private_t *, dict_t *); + +int32_t +br_scrubber_monitor_init(xlator_t *, br_private_t *); + +int32_t +br_scrubber_init(xlator_t *, br_private_t *); + +int32_t +br_collect_bad_objects_from_children(xlator_t *this, dict_t *dict); + +void +br_child_set_scrub_state(br_child_t *, gf_boolean_t); + +#endif /* __BIT_ROT_SCRUB_H__ */ diff --git a/xlators/features/bit-rot/src/bitd/bit-rot-ssm.c b/xlators/features/bit-rot/src/bitd/bit-rot-ssm.c new file mode 100644 index 00000000000..753e31a3b23 --- /dev/null +++ b/xlators/features/bit-rot/src/bitd/bit-rot-ssm.c @@ -0,0 +1,124 @@ +/* + Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#include "bit-rot-ssm.h" +#include "bit-rot-scrub.h" +#include "bit-rot-bitd-messages.h" + +int +br_scrub_ssm_noop(xlator_t *this) +{ + return 0; +} + +int +br_scrub_ssm_state_pause(xlator_t *this) +{ + br_private_t *priv = NULL; + struct br_monitor *scrub_monitor = NULL; + + priv = this->private; + scrub_monitor = &priv->scrub_monitor; + + gf_msg(this->name, GF_LOG_INFO, 0, BRB_MSG_GENERIC_SSM_INFO, + "Scrubber paused"); + _br_monitor_set_scrub_state(scrub_monitor, BR_SCRUB_STATE_PAUSED); + return 0; +} + +int +br_scrub_ssm_state_ipause(xlator_t *this) +{ + br_private_t *priv = NULL; + struct br_monitor *scrub_monitor = NULL; + + priv = this->private; + scrub_monitor = &priv->scrub_monitor; + + gf_msg(this->name, GF_LOG_INFO, 0, BRB_MSG_GENERIC_SSM_INFO, + "Scrubber paused"); + _br_monitor_set_scrub_state(scrub_monitor, BR_SCRUB_STATE_IPAUSED); + return 0; +} + +int +br_scrub_ssm_state_active(xlator_t *this) +{ + br_private_t *priv = NULL; + struct br_monitor *scrub_monitor = NULL; + + priv = this->private; + scrub_monitor = &priv->scrub_monitor; + + if (scrub_monitor->done) { + (void)br_fsscan_activate(this); + } else { + gf_msg(this->name, GF_LOG_INFO, 0, BRB_MSG_GENERIC_SSM_INFO, + "Scrubbing resumed"); + _br_monitor_set_scrub_state(scrub_monitor, BR_SCRUB_STATE_ACTIVE); + } + + return 0; +} + +int +br_scrub_ssm_state_stall(xlator_t *this) +{ + br_private_t *priv = NULL; + struct br_monitor *scrub_monitor = NULL; + + priv = this->private; + scrub_monitor = &priv->scrub_monitor; + + gf_msg(this->name, GF_LOG_INFO, 0, BRB_MSG_GENERIC_SSM_INFO, + "Volume is under active scrubbing. Pausing scrub.."); + _br_monitor_set_scrub_state(scrub_monitor, BR_SCRUB_STATE_STALLED); + return 0; +} + +static br_scrub_ssm_call *br_scrub_ssm[BR_SCRUB_MAXSTATES][BR_SCRUB_MAXEVENTS] = + { + /* INACTIVE */ + {br_fsscan_schedule, br_scrub_ssm_state_ipause, br_scrub_ssm_noop}, + /* PENDING */ + {br_fsscan_reschedule, br_fsscan_deactivate, br_fsscan_ondemand}, + /* ACTIVE */ + {br_scrub_ssm_noop, br_scrub_ssm_state_stall, br_scrub_ssm_noop}, + /* PAUSED */ + {br_fsscan_activate, br_scrub_ssm_noop, br_scrub_ssm_noop}, + /* IPAUSED */ + {br_fsscan_schedule, br_scrub_ssm_noop, br_scrub_ssm_noop}, + /* STALLED */ + {br_scrub_ssm_state_active, br_scrub_ssm_noop, br_scrub_ssm_noop}, +}; + +int32_t +br_scrub_state_machine(xlator_t *this, gf_boolean_t scrub_ondemand) +{ + br_private_t *priv = NULL; + br_scrub_ssm_call *call = NULL; + struct br_scrubber *fsscrub = NULL; + br_scrub_state_t currstate = 0; + br_scrub_event_t event = 0; + struct br_monitor *scrub_monitor = NULL; + + priv = this->private; + fsscrub = &priv->fsscrub; + scrub_monitor = &priv->scrub_monitor; + + currstate = scrub_monitor->state; + if (scrub_ondemand) + event = BR_SCRUB_EVENT_ONDEMAND; + else + event = _br_child_get_scrub_event(fsscrub); + + call = br_scrub_ssm[currstate][event]; + return call(this); +} diff --git a/xlators/features/bit-rot/src/bitd/bit-rot-ssm.h b/xlators/features/bit-rot/src/bitd/bit-rot-ssm.h new file mode 100644 index 00000000000..37b45a42eac --- /dev/null +++ b/xlators/features/bit-rot/src/bitd/bit-rot-ssm.h @@ -0,0 +1,38 @@ +/* + Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef __BIT_ROT_SSM_H__ +#define __BIT_ROT_SSM_H__ + +#include <glusterfs/xlator.h> + +typedef enum br_scrub_state { + BR_SCRUB_STATE_INACTIVE = 0, + BR_SCRUB_STATE_PENDING, + BR_SCRUB_STATE_ACTIVE, + BR_SCRUB_STATE_PAUSED, + BR_SCRUB_STATE_IPAUSED, + BR_SCRUB_STATE_STALLED, + BR_SCRUB_MAXSTATES, +} br_scrub_state_t; + +typedef enum br_scrub_event { + BR_SCRUB_EVENT_SCHEDULE = 0, + BR_SCRUB_EVENT_PAUSE, + BR_SCRUB_EVENT_ONDEMAND, + BR_SCRUB_MAXEVENTS, +} br_scrub_event_t; + +struct br_monitor; + +int32_t +br_scrub_state_machine(xlator_t *, gf_boolean_t); + +#endif /* __BIT_ROT_SSM_H__ */ diff --git a/xlators/features/bit-rot/src/bitd/bit-rot.c b/xlators/features/bit-rot/src/bitd/bit-rot.c new file mode 100644 index 00000000000..a2f1c343a1d --- /dev/null +++ b/xlators/features/bit-rot/src/bitd/bit-rot.c @@ -0,0 +1,2232 @@ +/* + Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#include <ctype.h> + +#include <glusterfs/logging.h> +#include <glusterfs/compat-errno.h> + +#include "bit-rot.h" +#include "bit-rot-scrub.h" +#include <pthread.h> +#include "bit-rot-bitd-messages.h" + +#define BR_HASH_CALC_READ_SIZE (128 * 1024) + +typedef int32_t(br_child_handler)(xlator_t *, br_child_t *); + +struct br_child_event { + xlator_t *this; + + br_child_t *child; + + br_child_handler *call; + + struct list_head list; +}; + +static int +br_find_child_index(xlator_t *this, xlator_t *child) +{ + br_private_t *priv = NULL; + int i = -1; + int index = -1; + + GF_VALIDATE_OR_GOTO("bit-rot", this, out); + GF_VALIDATE_OR_GOTO(this->name, this->private, out); + GF_VALIDATE_OR_GOTO(this->name, child, out); + + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + if (child == priv->children[i].xl) { + index = i; + break; + } + } + +out: + return index; +} + +br_child_t * +br_get_child_from_brick_path(xlator_t *this, char *brick_path) +{ + br_private_t *priv = NULL; + br_child_t *child = NULL; + br_child_t *tmp = NULL; + int i = 0; + + GF_VALIDATE_OR_GOTO("bit-rot", this, out); + GF_VALIDATE_OR_GOTO(this->name, this->private, out); + GF_VALIDATE_OR_GOTO(this->name, brick_path, out); + + priv = this->private; + + pthread_mutex_lock(&priv->lock); + { + for (i = 0; i < priv->child_count; i++) { + tmp = &priv->children[i]; + if (!strcmp(tmp->brick_path, brick_path)) { + child = tmp; + break; + } + } + } + pthread_mutex_unlock(&priv->lock); + +out: + return child; +} + +/** + * probably we'll encapsulate brick inside our own structure when + * needed -- later. + */ +void * +br_brick_init(void *xl, struct gf_brick_spec *brick) +{ + return brick; +} + +/** + * and cleanup things here when allocated br_brick_init(). + */ +void +br_brick_fini(void *xl, char *brick, void *data) +{ + return; +} + +/** + * TODO: Signature can contain null terminators which causes bitrot + * stub to store truncated hash as it depends on string length of + * the hash. + * + * FIX: Send the string length as part of the signature struct and + * change stub to handle this change. + */ +static br_isignature_t * +br_prepare_signature(const unsigned char *sign, unsigned long hashlen, + int8_t hashtype, br_object_t *object) +{ + br_isignature_t *signature = NULL; + + /* TODO: use mem-pool */ + signature = GF_CALLOC(1, signature_size(hashlen + 1), + gf_br_stub_mt_signature_t); + if (!signature) + return NULL; + + /* object version */ + signature->signedversion = object->signedversion; + + /* signature length & type */ + signature->signaturelen = hashlen; + signature->signaturetype = hashtype; + + /* signature itself */ + memcpy(signature->signature, (char *)sign, hashlen); + signature->signature[hashlen + 1] = '\0'; + + return signature; +} + +gf_boolean_t +bitd_is_bad_file(xlator_t *this, br_child_t *child, loc_t *loc, fd_t *fd) +{ + int32_t ret = -1; + dict_t *xattr = NULL; + inode_t *inode = NULL; + gf_boolean_t bad_file = _gf_false; + + GF_VALIDATE_OR_GOTO("bit-rot", this, out); + + inode = (loc) ? loc->inode : fd->inode; + + if (fd) + ret = syncop_fgetxattr(child->xl, fd, &xattr, BITROT_OBJECT_BAD_KEY, + NULL, NULL); + else if (loc) + ret = syncop_getxattr(child->xl, loc, &xattr, BITROT_OBJECT_BAD_KEY, + NULL, NULL); + + if (!ret) { + gf_msg_debug(this->name, 0, "[GFID: %s] is marked corrupted", + uuid_utoa(inode->gfid)); + bad_file = _gf_true; + } + + if (xattr) + dict_unref(xattr); + +out: + return bad_file; +} + +/** + * Do a lookup on the gfid present within the object. + */ +static int32_t +br_object_lookup(xlator_t *this, br_object_t *object, struct iatt *iatt, + inode_t **linked_inode) +{ + int ret = -EINVAL; + loc_t loc = { + 0, + }; + inode_t *inode = NULL; + + GF_VALIDATE_OR_GOTO("bit-rot", this, out); + GF_VALIDATE_OR_GOTO(this->name, object, out); + + inode = inode_find(object->child->table, object->gfid); + + if (inode) + loc.inode = inode; + else + loc.inode = inode_new(object->child->table); + + if (!loc.inode) { + ret = -ENOMEM; + goto out; + } + + gf_uuid_copy(loc.gfid, object->gfid); + + ret = syncop_lookup(object->child->xl, &loc, iatt, NULL, NULL, NULL); + if (ret < 0) + goto out; + + /* + * The file might have been deleted by the application + * after getting the event, but before doing a lookup. + * So use linked_inode after inode_link is done. + */ + *linked_inode = inode_link(loc.inode, NULL, NULL, iatt); + if (*linked_inode) + inode_lookup(*linked_inode); + +out: + loc_wipe(&loc); + return ret; +} + +/** + * open the object with O_RDONLY flags and return the fd. How to let brick + * know that open is being done by bitd because syncop framework does not allow + * passing xdata -- may be use frame->root->pid itself. + */ +static int32_t +br_object_open(xlator_t *this, br_object_t *object, inode_t *inode, + fd_t **openfd) +{ + int32_t ret = -1; + fd_t *fd = NULL; + loc_t loc = { + 0, + }; + + GF_VALIDATE_OR_GOTO("bit-rot", this, out); + GF_VALIDATE_OR_GOTO(this->name, object, out); + GF_VALIDATE_OR_GOTO(this->name, inode, out); + + ret = -EINVAL; + fd = fd_create(inode, 0); + if (!fd) { + gf_smsg(this->name, GF_LOG_ERROR, 0, BRB_MSG_FD_CREATE_FAILED, + "gfid=%s", uuid_utoa(inode->gfid), NULL); + goto out; + } + + loc.inode = inode_ref(inode); + gf_uuid_copy(loc.gfid, inode->gfid); + + ret = syncop_open(object->child->xl, &loc, O_RDONLY, fd, NULL, NULL); + if (ret) { + br_log_object(this, "open", inode->gfid, -ret); + fd_unref(fd); + fd = NULL; + } else { + fd_bind(fd); + *openfd = fd; + } + + loc_wipe(&loc); + +out: + return ret; +} + +/** + * read 128k block from the object @object from the offset @offset + * and return the buffer. + */ +static int32_t +br_object_read_block_and_sign(xlator_t *this, fd_t *fd, br_child_t *child, + off_t offset, size_t size, SHA256_CTX *sha256) +{ + int32_t ret = -1; + tbf_t *tbf = NULL; + struct iovec *iovec = NULL; + struct iobref *iobref = NULL; + br_private_t *priv = NULL; + int count = 0; + int i = 0; + + GF_VALIDATE_OR_GOTO("bit-rot", this, out); + GF_VALIDATE_OR_GOTO(this->name, fd, out); + GF_VALIDATE_OR_GOTO(this->name, fd->inode, out); + GF_VALIDATE_OR_GOTO(this->name, child, out); + GF_VALIDATE_OR_GOTO(this->name, this->private, out); + + priv = this->private; + + GF_VALIDATE_OR_GOTO(this->name, priv->tbf, out); + tbf = priv->tbf; + + ret = syncop_readv(child->xl, fd, size, offset, 0, &iovec, &count, &iobref, + NULL, NULL, NULL); + + if (ret < 0) { + gf_smsg(this->name, GF_LOG_ERROR, errno, BRB_MSG_READV_FAILED, + "gfid=%s", uuid_utoa(fd->inode->gfid), NULL); + ret = -1; + goto out; + } + + if (ret == 0) + goto out; + + for (i = 0; i < count; i++) { + TBF_THROTTLE_BEGIN(tbf, TBF_OP_HASH, iovec[i].iov_len); + { + SHA256_Update(sha256, (const unsigned char *)(iovec[i].iov_base), + iovec[i].iov_len); + } + TBF_THROTTLE_BEGIN(tbf, TBF_OP_HASH, iovec[i].iov_len); + } + +out: + if (iovec) + GF_FREE(iovec); + + if (iobref) + iobref_unref(iobref); + + return ret; +} + +int32_t +br_calculate_obj_checksum(unsigned char *md, br_child_t *child, fd_t *fd, + struct iatt *iatt) +{ + int32_t ret = -1; + off_t offset = 0; + size_t block = BR_HASH_CALC_READ_SIZE; + xlator_t *this = NULL; + + SHA256_CTX sha256; + + GF_VALIDATE_OR_GOTO("bit-rot", child, out); + GF_VALIDATE_OR_GOTO("bit-rot", iatt, out); + GF_VALIDATE_OR_GOTO("bit-rot", fd, out); + + this = child->this; + + SHA256_Init(&sha256); + + while (1) { + ret = br_object_read_block_and_sign(this, fd, child, offset, block, + &sha256); + if (ret < 0) { + gf_smsg(this->name, GF_LOG_ERROR, 0, BRB_MSG_BLOCK_READ_FAILED, + "offset=%" PRIu64, offset, "object-gfid=%s", + uuid_utoa(fd->inode->gfid), NULL); + break; + } + + if (ret == 0) + break; + + offset += ret; + } + + if (ret == 0) + SHA256_Final(md, &sha256); + +out: + return ret; +} + +static int32_t +br_object_checksum(unsigned char *md, br_object_t *object, fd_t *fd, + struct iatt *iatt) +{ + return br_calculate_obj_checksum(md, object->child, fd, iatt); +} + +static int32_t +br_object_read_sign(inode_t *linked_inode, fd_t *fd, br_object_t *object, + struct iatt *iatt) +{ + int32_t ret = -1; + xlator_t *this = NULL; + dict_t *xattr = NULL; + unsigned char *md = NULL; + br_isignature_t *sign = NULL; + + GF_VALIDATE_OR_GOTO("bit-rot", object, out); + GF_VALIDATE_OR_GOTO("bit-rot", linked_inode, out); + GF_VALIDATE_OR_GOTO("bit-rot", fd, out); + + this = object->this; + + md = GF_MALLOC(SHA256_DIGEST_LENGTH, gf_common_mt_char); + if (!md) { + gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, BRB_MSG_SAVING_HASH_FAILED, + "object-gfid=%s", uuid_utoa(fd->inode->gfid), NULL); + goto out; + } + + ret = br_object_checksum(md, object, fd, iatt); + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, 0, BRB_MSG_CALC_CHECKSUM_FAILED, + "object-gfid=%s", uuid_utoa(linked_inode->gfid), NULL); + goto free_signature; + } + + sign = br_prepare_signature(md, SHA256_DIGEST_LENGTH, + BR_SIGNATURE_TYPE_SHA256, object); + if (!sign) { + gf_smsg(this->name, GF_LOG_ERROR, 0, BRB_MSG_GET_SIGN_FAILED, + "object-gfid=%s", uuid_utoa(fd->inode->gfid), NULL); + goto free_signature; + } + + xattr = dict_for_key_value(GLUSTERFS_SET_OBJECT_SIGNATURE, (void *)sign, + signature_size(SHA256_DIGEST_LENGTH), _gf_true); + + if (!xattr) { + gf_smsg(this->name, GF_LOG_ERROR, 0, BRB_MSG_SET_SIGN_FAILED, + "dict-allocation object-gfid=%s", uuid_utoa(fd->inode->gfid), + NULL); + goto free_isign; + } + + ret = syncop_fsetxattr(object->child->xl, fd, xattr, 0, NULL, NULL); + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, 0, BRB_MSG_SET_SIGN_FAILED, + "fsetxattr object-gfid=%s", uuid_utoa(fd->inode->gfid), NULL); + goto unref_dict; + } + + ret = 0; + +unref_dict: + dict_unref(xattr); +free_isign: + GF_FREE(sign); +free_signature: + GF_FREE(md); +out: + return ret; +} + +static int +br_object_sign_softerror(int32_t op_errno) +{ + return ((op_errno == ENOENT) || (op_errno == ESTALE) || + (op_errno == ENODATA)); +} + +void +br_log_object(xlator_t *this, char *op, uuid_t gfid, int32_t op_errno) +{ + int softerror = br_object_sign_softerror(op_errno); + if (softerror) { + gf_msg_debug(this->name, 0, + "%s() failed on object %s " + "[reason: %s]", + op, uuid_utoa(gfid), strerror(op_errno)); + } else { + gf_smsg(this->name, GF_LOG_ERROR, op_errno, BRB_MSG_OP_FAILED, "op=%s", + op, "gfid=%s", uuid_utoa(gfid), NULL); + } +} + +void +br_log_object_path(xlator_t *this, char *op, const char *path, int32_t op_errno) +{ + int softerror = br_object_sign_softerror(op_errno); + if (softerror) { + gf_msg_debug(this->name, 0, + "%s() failed on object %s " + "[reason: %s]", + op, path, strerror(op_errno)); + } else { + gf_smsg(this->name, GF_LOG_ERROR, op_errno, BRB_MSG_OP_FAILED, "op=%s", + op, "path=%s", path, NULL); + } +} + +static void +br_trigger_sign(xlator_t *this, br_child_t *child, inode_t *linked_inode, + loc_t *loc, gf_boolean_t need_reopen) +{ + fd_t *fd = NULL; + int32_t ret = -1; + uint32_t val = 0; + dict_t *dict = NULL; + pid_t pid = GF_CLIENT_PID_BITD; + + syncopctx_setfspid(&pid); + + val = (need_reopen == _gf_true) ? BR_OBJECT_REOPEN : BR_OBJECT_RESIGN; + + dict = dict_new(); + if (!dict) + goto out; + + ret = dict_set_uint32(dict, BR_REOPEN_SIGN_HINT_KEY, val); + if (ret) + goto cleanup_dict; + + ret = -1; + fd = fd_create(linked_inode, 0); + if (!fd) { + gf_smsg(this->name, GF_LOG_ERROR, 0, BRB_MSG_FD_CREATE_FAILED, + "gfid=%s", uuid_utoa(linked_inode->gfid), NULL); + goto cleanup_dict; + } + + ret = syncop_open(child->xl, loc, O_RDWR, fd, NULL, NULL); + if (ret) { + br_log_object(this, "open", linked_inode->gfid, -ret); + goto unref_fd; + } + + fd_bind(fd); + + ret = syncop_fsetxattr(child->xl, fd, dict, 0, NULL, NULL); + if (ret) + br_log_object(this, "fsetxattr", linked_inode->gfid, -ret); + + /* passthough: fd_unref() */ + +unref_fd: + fd_unref(fd); +cleanup_dict: + dict_unref(dict); +out: + if (ret) { + gf_smsg(this->name, GF_LOG_WARNING, 0, BRB_MSG_TRIGGER_SIGN_FAILED, + "gfid=%s", uuid_utoa(linked_inode->gfid), "reopen-hint-val=%d", + val, NULL); + } +} + +static void +br_object_resign(xlator_t *this, br_object_t *object, inode_t *linked_inode) +{ + loc_t loc = { + 0, + }; + + loc.inode = inode_ref(linked_inode); + gf_uuid_copy(loc.gfid, linked_inode->gfid); + + br_trigger_sign(this, object->child, linked_inode, &loc, _gf_false); + + loc_wipe(&loc); +} + +/** + * Sign a given object. This routine runs full throttle. There needs to be + * some form of priority scheduling and/or read burstness to avoid starving + * (or kicking) client I/O's. + */ +static int32_t +br_sign_object(br_object_t *object) +{ + int32_t ret = -1; + inode_t *linked_inode = NULL; + xlator_t *this = NULL; + fd_t *fd = NULL; + struct iatt iatt = { + 0, + }; + pid_t pid = GF_CLIENT_PID_BITD; + br_sign_state_t sign_info = BR_SIGN_NORMAL; + + GF_VALIDATE_OR_GOTO("bit-rot", object, out); + + this = object->this; + + /** + * FIXME: This is required as signing an object is restricted to + * clients with special frame->root->pid. Change the way client + * pid is set. + */ + syncopctx_setfspid(&pid); + + ret = br_object_lookup(this, object, &iatt, &linked_inode); + if (ret) { + br_log_object(this, "lookup", object->gfid, -ret); + goto out; + } + + /** + * For fd's that have notified for reopening, we send an explicit + * open() followed by a dummy write() call. This triggers the + * actual signing of the object. + */ + sign_info = ntohl(object->sign_info); + if (sign_info == BR_SIGN_REOPEN_WAIT) { + br_object_resign(this, object, linked_inode); + goto unref_inode; + } + + ret = br_object_open(this, object, linked_inode, &fd); + if (!fd) { + br_log_object(this, "open", object->gfid, -ret); + goto unref_inode; + } + + /** + * we have an open file descriptor on the object. from here on, + * do not be generous to file operation errors. + */ + gf_msg_debug(this->name, 0, "Signing object [%s]", + uuid_utoa(linked_inode->gfid)); + + ret = br_object_read_sign(linked_inode, fd, object, &iatt); + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, 0, BRB_MSG_READ_AND_SIGN_FAILED, + "gfid=%s", uuid_utoa(linked_inode->gfid), NULL); + goto unref_fd; + } + + ret = 0; + +unref_fd: + fd_unref(fd); +unref_inode: + inode_unref(linked_inode); +out: + return ret; +} + +static br_object_t * +__br_pick_object(br_private_t *priv) +{ + br_object_t *object = NULL; + + while (list_empty(&priv->obj_queue->objects)) { + pthread_cond_wait(&priv->object_cond, &priv->lock); + } + + object = list_first_entry(&priv->obj_queue->objects, br_object_t, list); + list_del_init(&object->list); + + return object; +} + +/** + * This is the place where the signing of the objects is triggered. + */ +void * +br_process_object(void *arg) +{ + xlator_t *this = NULL; + br_object_t *object = NULL; + br_private_t *priv = NULL; + int32_t ret = -1; + + this = arg; + priv = this->private; + + THIS = this; + + for (;;) { + pthread_mutex_lock(&priv->lock); + { + object = __br_pick_object(priv); + } + pthread_mutex_unlock(&priv->lock); + + ret = br_sign_object(object); + if (ret && !br_object_sign_softerror(-ret)) + gf_smsg(this->name, GF_LOG_ERROR, 0, BRB_MSG_SET_SIGN_FAILED, + "gfid=%s", uuid_utoa(object->gfid), NULL); + GF_FREE(object); + } + + return NULL; +} + +/** + * This function gets kicked in once the object is expired from the + * timer wheel. This actually adds the object received via notification + * from the changelog to the queue from where the objects gets picked + * up for signing. + * + * This routine can be made lightweight by introducing an alternate + * timer-wheel API that dispatches _all_ expired objects in one-shot + * rather than an object at-a-time. This routine can then just simply + * be a call to list_splice_tail(). + * + * NOTE: use call_time to instrument signing time in br_sign_object(). + */ +void +br_add_object_to_queue(struct gf_tw_timer_list *timer, void *data, + unsigned long call_time) +{ + br_object_t *object = NULL; + xlator_t *this = NULL; + br_private_t *priv = NULL; + + object = data; + this = object->this; + priv = this->private; + + THIS = this; + + pthread_mutex_lock(&priv->lock); + { + list_add_tail(&object->list, &priv->obj_queue->objects); + pthread_cond_broadcast(&priv->object_cond); + } + pthread_mutex_unlock(&priv->lock); + + if (timer) + mem_put(timer); + return; +} + +static br_object_t * +br_initialize_object(xlator_t *this, br_child_t *child, changelog_event_t *ev) +{ + br_object_t *object = NULL; + + object = GF_CALLOC(1, sizeof(*object), gf_br_mt_br_object_t); + if (!object) + goto out; + INIT_LIST_HEAD(&object->list); + + object->this = this; + object->child = child; + gf_uuid_copy(object->gfid, ev->u.releasebr.gfid); + + /* NOTE: it's BE, but no worry */ + object->signedversion = ev->u.releasebr.version; + object->sign_info = ev->u.releasebr.sign_info; + +out: + return object; +} + +static struct gf_tw_timer_list * +br_initialize_timer(xlator_t *this, br_object_t *object, br_child_t *child, + changelog_event_t *ev) +{ + br_private_t *priv = NULL; + struct gf_tw_timer_list *timer = NULL; + + priv = this->private; + + timer = mem_get0(child->timer_pool); + if (!timer) + goto out; + INIT_LIST_HEAD(&timer->entry); + + timer->expires = priv->expiry_time; + if (!timer->expires) + timer->expires = 1; + + timer->data = object; + timer->function = br_add_object_to_queue; + gf_tw_add_timer(priv->timer_wheel, timer); + +out: + return timer; +} + +static int32_t +br_schedule_object_reopen(xlator_t *this, br_object_t *object, + br_child_t *child, changelog_event_t *ev) +{ + struct gf_tw_timer_list *timer = NULL; + + timer = br_initialize_timer(this, object, child, ev); + if (!timer) + gf_smsg(this->name, GF_LOG_ERROR, 0, BRB_MSG_SET_TIMER_FAILED, + "gfid=%s", uuid_utoa(object->gfid), NULL); + return timer ? 0 : -1; +} + +static int32_t +br_object_quicksign(xlator_t *this, br_object_t *object) +{ + br_add_object_to_queue(NULL, object, 0ULL); + return 0; +} + +/** + * This callback function registered with the changelog is executed + * whenever a notification from the changelog is received. This should + * add the object (or the gfid) on which the notification has come to + * the timer-wheel with some expiry time. + * + * TODO: use mem-pool for allocations and maybe allocate timer and + * object as a single alloc and bifurcate their respective pointers. + */ +void +br_brick_callback(void *xl, char *brick, void *data, changelog_event_t *ev) +{ + int32_t ret = 0; + uuid_t gfid = { + 0, + }; + xlator_t *this = NULL; + br_object_t *object = NULL; + br_child_t *child = NULL; + br_sign_state_t sign_info = BR_SIGN_INVALID; + + this = xl; + + GF_VALIDATE_OR_GOTO(this->name, ev, out); + GF_VALIDATE_OR_GOTO("bit-rot", this, out); + GF_VALIDATE_OR_GOTO(this->name, this->private, out); + + GF_ASSERT(ev->ev_type == CHANGELOG_OP_TYPE_BR_RELEASE); + GF_ASSERT(!gf_uuid_is_null(ev->u.releasebr.gfid)); + + gf_uuid_copy(gfid, ev->u.releasebr.gfid); + + gf_msg_debug(this->name, 0, "RELEASE EVENT [GFID %s]", uuid_utoa(gfid)); + + child = br_get_child_from_brick_path(this, brick); + if (!child) { + gf_smsg(this->name, GF_LOG_ERROR, 0, BRB_MSG_GET_SUBVOL_FAILED, + "brick=%s", brick, NULL); + goto out; + } + + object = br_initialize_object(this, child, ev); + if (!object) { + gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, BRB_MSG_NO_MEMORY, + "object-gfid=%s", uuid_utoa(gfid), NULL); + goto out; + } + + /* sanity check */ + sign_info = ntohl(object->sign_info); + GF_ASSERT(sign_info != BR_SIGN_NORMAL); + + if (sign_info == BR_SIGN_REOPEN_WAIT) + ret = br_schedule_object_reopen(this, object, child, ev); + else + ret = br_object_quicksign(this, object); + + if (ret) + goto free_object; + + gf_msg_debug(this->name, 0, "->callback: brick [%s], type [%d]\n", brick, + ev->ev_type); + return; + +free_object: + GF_FREE(object); +out: + return; +} + +void +br_fill_brick_spec(struct gf_brick_spec *brick, char *path) +{ + brick->brick_path = gf_strdup(path); + brick->filter = CHANGELOG_OP_TYPE_BR_RELEASE; + + brick->init = br_brick_init; + brick->fini = br_brick_fini; + brick->callback = br_brick_callback; + brick->connected = NULL; + brick->disconnected = NULL; +} + +static gf_boolean_t +br_check_object_need_sign(xlator_t *this, dict_t *xattr, br_child_t *child) +{ + int32_t ret = -1; + gf_boolean_t need_sign = _gf_false; + br_isignature_out_t *sign = NULL; + + GF_VALIDATE_OR_GOTO("bit-rot", this, out); + GF_VALIDATE_OR_GOTO(this->name, xattr, out); + GF_VALIDATE_OR_GOTO(this->name, child, out); + + ret = dict_get_ptr(xattr, GLUSTERFS_GET_OBJECT_SIGNATURE, (void **)&sign); + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, 0, BRB_MSG_GET_SIGN_FAILED, + "object-info", NULL); + goto out; + } + + /* Object has been opened and hence dirty. Do not sign it */ + if (sign->stale) + need_sign = _gf_true; + +out: + return need_sign; +} + +int32_t +br_prepare_loc(xlator_t *this, br_child_t *child, loc_t *parent, + gf_dirent_t *entry, loc_t *loc) +{ + int32_t ret = -1; + inode_t *inode = NULL; + + inode = inode_grep(child->table, parent->inode, entry->d_name); + if (!inode) + loc->inode = inode_new(child->table); + else { + loc->inode = inode; + if (loc->inode->ia_type != IA_IFREG) { + gf_msg_debug(this->name, 0, + "%s is not a regular " + "file", + entry->d_name); + ret = 0; + goto out; + } + } + + loc->parent = inode_ref(parent->inode); + gf_uuid_copy(loc->pargfid, parent->inode->gfid); + + ret = inode_path(parent->inode, entry->d_name, (char **)&loc->path); + if (ret < 0 || !loc->path) { + gf_smsg(this->name, GF_LOG_ERROR, 0, BRB_MSG_PATH_FAILED, + "inode_path=%s", entry->d_name, "parent-gfid=%s", + uuid_utoa(parent->inode->gfid), NULL); + goto out; + } + + loc->name = strrchr(loc->path, '/'); + if (loc->name) + loc->name++; + + ret = 1; + +out: + return ret; +} + +/** + * Oneshot crawler + * --------------- + * This is a catchup mechanism. Objects that remained unsigned from the + * last run for whatever reason (node crashes, reboots, etc..) become + * candidates for signing. This allows the signature to "catch up" with + * the current state of the object. Triggering signing is easy: perform + * an open() followed by a close() thereby resulting in call boomerang. + * (though not back to itself :)) + */ +int +bitd_oneshot_crawl(xlator_t *subvol, gf_dirent_t *entry, loc_t *parent, + void *data) +{ + int op_errno = 0; + br_child_t *child = NULL; + xlator_t *this = NULL; + loc_t loc = { + 0, + }; + struct iatt iatt = { + 0, + }; + struct iatt parent_buf = { + 0, + }; + dict_t *xattr = NULL; + int32_t ret = -1; + inode_t *linked_inode = NULL; + gf_boolean_t need_signing = _gf_false; + gf_boolean_t need_reopen = _gf_true; + + GF_VALIDATE_OR_GOTO("bit-rot", subvol, out); + GF_VALIDATE_OR_GOTO("bit-rot", data, out); + + child = data; + this = child->this; + + ret = br_prepare_loc(this, child, parent, entry, &loc); + if (!ret) + goto out; + + ret = syncop_lookup(child->xl, &loc, &iatt, &parent_buf, NULL, NULL); + if (ret) { + br_log_object_path(this, "lookup", loc.path, -ret); + goto out; + } + + linked_inode = inode_link(loc.inode, parent->inode, loc.name, &iatt); + if (linked_inode) + inode_lookup(linked_inode); + + if (iatt.ia_type != IA_IFREG) { + gf_msg_debug(this->name, 0, + "%s is not a regular file, " + "skipping..", + entry->d_name); + ret = 0; + goto unref_inode; + } + + /** + * As of now, 2 cases are possible and handled. + * 1) GlusterFS is upgraded from a previous version which does not + * have any idea about bit-rot and have data in the filesystem. + * In this case syncop_getxattr fails with ENODATA and the object + * is signed. (In real, when crawler sends lookup, bit-rot-stub + * creates the xattrs before returning lookup reply) + * 2) Bit-rot was not enabled or BitD was does for some reasons, during + * which some files were created, but since BitD was down, were not + * signed. + * If the file was just created and was being written some data when + * the down BitD came up, then bit-rot stub should be intelligent to + * identify this case (by comparing the ongoing version or by checking + * if there are any fds present for that inode) and handle properly. + */ + + if (bitd_is_bad_file(this, child, &loc, NULL)) { + gf_smsg(this->name, GF_LOG_WARNING, 0, BRB_MSG_SKIP_OBJECT, "path=%s", + loc.path, NULL); + goto unref_inode; + } + + ret = syncop_getxattr(child->xl, &loc, &xattr, + GLUSTERFS_GET_OBJECT_SIGNATURE, NULL, NULL); + if (ret < 0) { + op_errno = -ret; + br_log_object(this, "getxattr", linked_inode->gfid, op_errno); + + /** + * No need to sign the zero byte objects as the signing + * happens upon first modification of the object. + */ + if (op_errno == ENODATA && (iatt.ia_size != 0)) + need_signing = _gf_true; + if (op_errno == EINVAL) + gf_smsg(this->name, GF_LOG_WARNING, 0, + BRB_MSG_PARTIAL_VERSION_PRESENCE, "gfid=%s", + uuid_utoa(linked_inode->gfid), NULL); + } else { + need_signing = br_check_object_need_sign(this, xattr, child); + + /* + * If we are here means, bitrot daemon has started. Is it just + * a simple restart of the daemon or is it started because the + * feature is enabled is something hard to determine. Hence, + * if need_signing is false (because bit-rot version and signature + * are present), then still go ahead and sign it. + */ + if (!need_signing) { + need_signing = _gf_true; + need_reopen = _gf_true; + } + } + + if (!need_signing) + goto unref_dict; + + gf_smsg(this->name, GF_LOG_INFO, 0, BRB_MSG_TRIGGER_SIGN, "path=%s", + loc.path, "gfid=%s", uuid_utoa(linked_inode->gfid), "Brick-path=%s", + child->brick_path, NULL); + br_trigger_sign(this, child, linked_inode, &loc, need_reopen); + + ret = 0; + +unref_dict: + if (xattr) + dict_unref(xattr); +unref_inode: + inode_unref(linked_inode); +out: + loc_wipe(&loc); + + return ret; +} + +#define BR_CRAWL_THROTTLE_COUNT 50 +#define BR_CRAWL_THROTTLE_ZZZ 5 + +void * +br_oneshot_signer(void *arg) +{ + loc_t loc = { + 0, + }; + xlator_t *this = NULL; + br_child_t *child = NULL; + + child = arg; + this = child->this; + + THIS = this; + + gf_smsg(this->name, GF_LOG_INFO, 0, BRB_MSG_CRAWLING_START, "brick-path=%s", + child->brick_path, NULL); + + loc.inode = child->table->root; + (void)syncop_ftw_throttle(child->xl, &loc, GF_CLIENT_PID_BITD, child, + bitd_oneshot_crawl, BR_CRAWL_THROTTLE_COUNT, + BR_CRAWL_THROTTLE_ZZZ); + + gf_smsg(this->name, GF_LOG_INFO, 0, BRB_MSG_CRAWLING_FINISH, + "brick-path=%s", child->brick_path, NULL); + + return NULL; +} + +static void +br_set_child_state(br_child_t *child, br_child_state_t state) +{ + pthread_mutex_lock(&child->lock); + { + _br_set_child_state(child, state); + } + pthread_mutex_unlock(&child->lock); +} + +/** + * At this point a thread is spawned to crawl the filesystem (in + * tortoise pace) to sign objects that were not signed in previous run(s). + * Such objects are identified by examining it's dirtyness and timestamp. + * + * pick object: + * signature_is_stale() && (object_timestamp() <= stub_init_time()) + * + * Also, we register to the changelog library to subscribe for event + * notifications. + */ +static int32_t +br_enact_signer(xlator_t *this, br_child_t *child, br_stub_init_t *stub) +{ + int32_t ret = 0; + br_private_t *priv = NULL; + struct gf_brick_spec *brick = NULL; + + priv = this->private; + + brick = GF_CALLOC(1, sizeof(struct gf_brick_spec), + gf_common_mt_gf_brick_spec_t); + if (!brick) + goto error_return; + + br_fill_brick_spec(brick, stub->export); + ret = gf_changelog_register_generic(brick, 1, 1, + this->ctx->cmd_args.log_file, -1, this); + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, errno, BRB_MSG_REGISTER_FAILED, NULL); + goto dealloc; + } + + child->threadrunning = 0; + ret = gf_thread_create(&child->thread, NULL, br_oneshot_signer, child, + "brosign"); + if (ret) + gf_smsg(this->name, GF_LOG_WARNING, 0, BRB_MSG_SPAWN_FAILED, + "FS-crawler-thread", NULL); + else + child->threadrunning = 1; + + /* it's OK to continue, "old" objects would be signed when modified */ + list_add_tail(&child->list, &priv->signing); + return 0; + +dealloc: + GF_FREE(brick); +error_return: + return -1; +} + +static int32_t +br_launch_scrubber(xlator_t *this, br_child_t *child, struct br_scanfs *fsscan, + struct br_scrubber *fsscrub) +{ + int32_t ret = -1; + br_private_t *priv = NULL; + struct br_monitor *scrub_monitor = NULL; + + priv = this->private; + + scrub_monitor = &priv->scrub_monitor; + ret = gf_thread_create(&child->thread, NULL, br_fsscanner, child, + "brfsscan"); + if (ret != 0) { + gf_smsg(this->name, GF_LOG_ALERT, 0, BRB_MSG_SPAWN_FAILED, + "bitrot-scrubber-daemon Brick-path=%s", child->brick_path, + NULL); + goto error_return; + } + + /* Signal monitor to kick off state machine*/ + pthread_mutex_lock(&scrub_monitor->mutex); + { + if (!scrub_monitor->inited) + pthread_cond_signal(&scrub_monitor->cond); + scrub_monitor->inited = _gf_true; + } + pthread_mutex_unlock(&scrub_monitor->mutex); + + /** + * Everything has been setup.. add this subvolume to scrubbers + * list. + */ + pthread_mutex_lock(&fsscrub->mutex); + { + list_add_tail(&child->list, &fsscrub->scrublist); + pthread_cond_broadcast(&fsscrub->cond); + } + pthread_mutex_unlock(&fsscrub->mutex); + + return 0; + +error_return: + return -1; +} + +static int32_t +br_enact_scrubber(xlator_t *this, br_child_t *child) +{ + int32_t ret = 0; + br_private_t *priv = NULL; + struct br_scanfs *fsscan = NULL; + struct br_scrubber *fsscrub = NULL; + + priv = this->private; + + fsscan = &child->fsscan; + fsscrub = &priv->fsscrub; + + /** + * if this child already witnesses a successful connection earlier + * there's no need to initialize mutexes, condvars, etc.. + */ + if (_br_child_witnessed_connection(child)) + return br_launch_scrubber(this, child, fsscan, fsscrub); + + LOCK_INIT(&fsscan->entrylock); + pthread_mutex_init(&fsscan->waitlock, NULL); + pthread_cond_init(&fsscan->waitcond, NULL); + + fsscan->entries = 0; + INIT_LIST_HEAD(&fsscan->queued); + INIT_LIST_HEAD(&fsscan->ready); + + ret = br_launch_scrubber(this, child, fsscan, fsscrub); + if (ret) + goto error_return; + + return 0; + +error_return: + LOCK_DESTROY(&fsscan->entrylock); + pthread_mutex_destroy(&fsscan->waitlock); + pthread_cond_destroy(&fsscan->waitcond); + + return -1; +} + +static int32_t +br_child_enaction(xlator_t *this, br_child_t *child, br_stub_init_t *stub) +{ + int32_t ret = -1; + br_private_t *priv = this->private; + + pthread_mutex_lock(&child->lock); + { + if (priv->iamscrubber) + ret = br_enact_scrubber(this, child); + else + ret = br_enact_signer(this, child, stub); + + if (!ret) { + child->witnessed = 1; + _br_set_child_state(child, BR_CHILD_STATE_CONNECTED); + gf_smsg(this->name, GF_LOG_INFO, 0, BRB_MSG_CONNECTED_TO_BRICK, + "brick-path=%s", child->brick_path, NULL); + } + } + pthread_mutex_unlock(&child->lock); + + return ret; +} + +/** + * This routine fetches various attributes associated with a child which + * is basically a subvolume. Attributes include brick path and the stub + * birth time. This is done by performing a lookup on the root followed + * by getxattr() on a virtual key. Depending on the configuration, the + * process either acts as a signer or a scrubber. + */ +int32_t +br_brick_connect(xlator_t *this, br_child_t *child) +{ + int32_t ret = -1; + loc_t loc = { + 0, + }; + struct iatt buf = { + 0, + }; + struct iatt parent = { + 0, + }; + br_stub_init_t *stub = NULL; + dict_t *xattr = NULL; + int op_errno = 0; + + GF_VALIDATE_OR_GOTO("bit-rot", this, out); + GF_VALIDATE_OR_GOTO(this->name, child, out); + GF_VALIDATE_OR_GOTO(this->name, this->private, out); + + br_child_set_scrub_state(child, _gf_false); + br_set_child_state(child, BR_CHILD_STATE_INITIALIZING); + + loc.inode = inode_ref(child->table->root); + gf_uuid_copy(loc.gfid, loc.inode->gfid); + loc.path = gf_strdup("/"); + + ret = syncop_lookup(child->xl, &loc, &buf, &parent, NULL, NULL); + if (ret) { + op_errno = -ret; + ret = -1; + gf_smsg(this->name, GF_LOG_ERROR, op_errno, BRB_MSG_LOOKUP_FAILED, + NULL); + goto wipeloc; + } + + ret = syncop_getxattr(child->xl, &loc, &xattr, + GLUSTERFS_GET_BR_STUB_INIT_TIME, NULL, NULL); + if (ret) { + op_errno = -ret; + ret = -1; + gf_smsg(this->name, GF_LOG_ERROR, op_errno, BRB_MSG_GET_INFO_FAILED, + NULL); + goto wipeloc; + } + + ret = dict_get_ptr(xattr, GLUSTERFS_GET_BR_STUB_INIT_TIME, (void **)&stub); + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, 0, BRB_MSG_GET_INFO_FAILED, NULL); + goto free_dict; + } + + memcpy(child->brick_path, stub->export, strlen(stub->export) + 1); + child->tv.tv_sec = ntohl(stub->timebuf[0]); + child->tv.tv_usec = ntohl(stub->timebuf[1]); + + ret = br_child_enaction(this, child, stub); + +free_dict: + dict_unref(xattr); +wipeloc: + loc_wipe(&loc); +out: + if (ret) + br_set_child_state(child, BR_CHILD_STATE_CONNFAILED); + return ret; +} + +/* TODO: cleanup signer */ +static int32_t +br_cleanup_signer(xlator_t *this, br_child_t *child) +{ + return 0; +} + +static int32_t +br_cleanup_scrubber(xlator_t *this, br_child_t *child) +{ + int32_t ret = 0; + br_private_t *priv = NULL; + struct br_scrubber *fsscrub = NULL; + struct br_monitor *scrub_monitor = NULL; + + priv = this->private; + fsscrub = &priv->fsscrub; + scrub_monitor = &priv->scrub_monitor; + + if (_br_is_child_scrub_active(child)) { + scrub_monitor->active_child_count--; + br_child_set_scrub_state(child, _gf_false); + } + + /** + * 0x0: child (brick) goes out of rotation + * + * This is fully safe w.r.t. entries for this child being actively + * scrubbed. Each of the scrubber thread(s) would finish scrubbing + * the entry (probably failing due to disconnection) and either + * putting the entry back into the queue or continuing further. + * Either way, pending entries for this child's queue need not be + * drained; entries just sit there in the queued/ready list to be + * consumed later upon re-connection. + */ + pthread_mutex_lock(&fsscrub->mutex); + { + list_del_init(&child->list); + } + pthread_mutex_unlock(&fsscrub->mutex); + + /** + * 0x1: cleanup scanner thread + * + * The pending timer needs to be removed _after_ cleaning up the + * filesystem scanner (scheduling the next scrub time is not a + * cancellation point). + */ + ret = gf_thread_cleanup_xint(child->thread); + if (ret) + gf_smsg(this->name, GF_LOG_INFO, 0, BRB_MSG_SCRUB_THREAD_CLEANUP, NULL); + + gf_smsg(this->name, GF_LOG_INFO, 0, BRB_MSG_SCRUBBER_CLEANED, + "brick-path=%s", child->brick_path, NULL); + + return 0; +} + +/** + * OK.. this child has made it's mind to go down the drain. So, + * let's clean up what it touched. (NOTE: there's no need to clean + * the inode table, it's just reused taking care of stale inodes) + */ +int32_t +br_brick_disconnect(xlator_t *this, br_child_t *child) +{ + int32_t ret = 0; + struct br_monitor *scrub_monitor = NULL; + br_private_t *priv = this->private; + + scrub_monitor = &priv->scrub_monitor; + + /* Lock order should be wakelock and then child lock to + * dead locks. + */ + pthread_mutex_lock(&scrub_monitor->wakelock); + { + pthread_mutex_lock(&child->lock); + { + if (!_br_is_child_connected(child)) + goto unblock; + + /* child is on death row.. */ + _br_set_child_state(child, BR_CHILD_STATE_DISCONNECTED); + + if (priv->iamscrubber) + ret = br_cleanup_scrubber(this, child); + else + ret = br_cleanup_signer(this, child); + } + unblock: + pthread_mutex_unlock(&child->lock); + } + pthread_mutex_unlock(&scrub_monitor->wakelock); + + return ret; +} + +/** + * This function is executed in a separate thread. The thread gets the + * brick from where CHILD_UP has received from the queue and gets the + * information regarding that brick (such as brick path). + */ +void * +br_handle_events(void *arg) +{ + int32_t ret = 0; + xlator_t *this = NULL; + br_private_t *priv = NULL; + br_child_t *child = NULL; + struct br_child_event *childev = NULL; + + this = arg; + priv = this->private; + + /* + * Since, this is the topmost xlator, THIS has to be set by bit-rot + * xlator itself (STACK_WIND won't help in this case). Also it has + * to be done for each thread that gets spawned. Otherwise, a new + * thread will get global_xlator's pointer when it does "THIS". + */ + THIS = this; + + while (1) { + pthread_mutex_lock(&priv->lock); + { + while (list_empty(&priv->bricks)) + pthread_cond_wait(&priv->cond, &priv->lock); + + childev = list_first_entry(&priv->bricks, struct br_child_event, + list); + list_del_init(&childev->list); + } + pthread_mutex_unlock(&priv->lock); + + child = childev->child; + ret = childev->call(this, child); + if (ret) + gf_smsg(this->name, GF_LOG_ERROR, 0, BRB_MSG_SUBVOL_CONNECT_FAILED, + "name=%s", child->xl->name, NULL); + GF_FREE(childev); + } + + return NULL; +} + +int32_t +mem_acct_init(xlator_t *this) +{ + int32_t ret = -1; + + if (!this) + return ret; + + ret = xlator_mem_acct_init(this, gf_br_stub_mt_end + 1); + + if (ret != 0) { + gf_smsg(this->name, GF_LOG_WARNING, 0, BRB_MSG_MEM_ACNT_FAILED, NULL); + return ret; + } + + return ret; +} + +static void +_br_qchild_event(xlator_t *this, br_child_t *child, br_child_handler *call) +{ + br_private_t *priv = NULL; + struct br_child_event *childev = NULL; + + priv = this->private; + + childev = GF_CALLOC(1, sizeof(*childev), gf_br_mt_br_child_event_t); + if (!childev) { + gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, BRB_MSG_EVENT_UNHANDLED, + "Brick-name=%s", child->xl->name, NULL); + return; + } + + INIT_LIST_HEAD(&childev->list); + childev->this = this; + childev->child = child; + childev->call = call; + + list_add_tail(&childev->list, &priv->bricks); +} + +int +br_scrubber_status_get(xlator_t *this, dict_t **dict) +{ + int ret = -1; + br_private_t *priv = NULL; + struct br_scrub_stats *scrub_stats = NULL; + + priv = this->private; + + GF_VALIDATE_OR_GOTO("bit-rot", priv, out); + + scrub_stats = &priv->scrub_stat; + + ret = br_get_bad_objects_list(this, dict); + if (ret) { + gf_msg_debug(this->name, 0, + "Failed to collect corrupt " + "files"); + } + + ret = dict_set_int8(*dict, "scrub-running", scrub_stats->scrub_running); + if (ret) { + gf_msg_debug(this->name, 0, + "Failed setting scrub_running " + "entry to the dictionary"); + } + + ret = dict_set_uint64(*dict, "scrubbed-files", scrub_stats->scrubbed_files); + if (ret) { + gf_msg_debug(this->name, 0, + "Failed to setting scrubbed file " + "entry to the dictionary"); + } + + ret = dict_set_uint64(*dict, "unsigned-files", scrub_stats->unsigned_files); + if (ret) { + gf_msg_debug(this->name, 0, + "Failed to set unsigned file count" + " entry to the dictionary"); + } + + ret = dict_set_uint64(*dict, "scrub-duration", scrub_stats->scrub_duration); + if (ret) { + gf_msg_debug(this->name, 0, + "Failed to set scrub duration" + " entry to the dictionary"); + } + + ret = dict_set_dynstr_with_alloc(*dict, "last-scrub-time", + scrub_stats->last_scrub_time); + if (ret) { + gf_msg_debug(this->name, 0, + "Failed to set " + "last scrub time value"); + } + +out: + return ret; +} + +int +notify(xlator_t *this, int32_t event, void *data, ...) +{ + int idx = -1; + int ret = -1; + xlator_t *subvol = NULL; + br_child_t *child = NULL; + br_private_t *priv = NULL; + dict_t *output = NULL; + va_list ap; + struct br_monitor *scrub_monitor = NULL; + + subvol = (xlator_t *)data; + priv = this->private; + scrub_monitor = &priv->scrub_monitor; + + gf_msg_trace(this->name, 0, "Notification received: %d", event); + + idx = br_find_child_index(this, subvol); + + switch (event) { + case GF_EVENT_CHILD_UP: + if (idx < 0) { + gf_smsg(this->name, GF_LOG_ERROR, 0, BRB_MSG_INVALID_SUBVOL, + "event=%d", event, NULL); + goto out; + } + + pthread_mutex_lock(&priv->lock); + { + child = &priv->children[idx]; + if (child->child_up == 1) + goto unblock_0; + priv->up_children++; + + child->child_up = 1; + child->xl = subvol; + if (!child->table) + child->table = inode_table_new(4096, subvol); + + _br_qchild_event(this, child, br_brick_connect); + pthread_cond_signal(&priv->cond); + } + unblock_0: + pthread_mutex_unlock(&priv->lock); + + if (priv->up_children == priv->child_count) + default_notify(this, event, data); + break; + + case GF_EVENT_CHILD_DOWN: + if (idx < 0) { + gf_smsg(this->name, GF_LOG_ERROR, 0, BRB_MSG_INVALID_SUBVOL, + "event=%d", event, NULL); + goto out; + } + + pthread_mutex_lock(&priv->lock); + { + child = &priv->children[idx]; + if (child->child_up == 0) + goto unblock_1; + + child->child_up = 0; + priv->up_children--; + + _br_qchild_event(this, child, br_brick_disconnect); + pthread_cond_signal(&priv->cond); + } + unblock_1: + pthread_mutex_unlock(&priv->lock); + + if (priv->up_children == 0) + default_notify(this, event, data); + break; + + case GF_EVENT_SCRUB_STATUS: + gf_msg_debug(this->name, GF_LOG_INFO, + "BitRot scrub status " + "called"); + va_start(ap, data); + output = va_arg(ap, dict_t *); + va_end(ap); + + ret = br_scrubber_status_get(this, &output); + gf_msg_debug(this->name, 0, "returning %d", ret); + break; + + case GF_EVENT_SCRUB_ONDEMAND: + gf_log(this->name, GF_LOG_INFO, + "BitRot scrub ondemand " + "called"); + + if (scrub_monitor->state != BR_SCRUB_STATE_PENDING) { + gf_smsg(this->name, GF_LOG_ERROR, 0, + BRB_MSG_RESCHEDULE_SCRUBBER_FAILED, "Current-state=%d", + scrub_monitor->state, NULL); + return -2; + } + + /* Needs synchronization with reconfigure thread */ + pthread_mutex_lock(&priv->lock); + { + ret = br_scrub_state_machine(this, _gf_true); + } + pthread_mutex_unlock(&priv->lock); + + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, 0, + BRB_MSG_COULD_NOT_SCHEDULE_SCRUB, NULL); + } + gf_msg_debug(this->name, 0, "returning %d", ret); + break; + default: + default_notify(this, event, data); + } + +out: + return 0; +} + +static void +br_fini_signer(xlator_t *this, br_private_t *priv) +{ + int i = 0; + + if (priv == NULL) + return; + + for (; i < priv->signer_th_count; i++) { + (void)gf_thread_cleanup_xint(priv->obj_queue->workers[i]); + } + GF_FREE(priv->obj_queue->workers); + + pthread_cond_destroy(&priv->object_cond); +} + +/** + * Initialize signer specific structures, spawn worker threads. + */ + +static int32_t +br_init_signer(xlator_t *this, br_private_t *priv) +{ + int i = 0; + int32_t ret = -1; + + /* initialize gfchangelog xlator context */ + ret = gf_changelog_init(this); + if (ret) + goto out; + + pthread_cond_init(&priv->object_cond, NULL); + + priv->obj_queue = GF_CALLOC(1, sizeof(*priv->obj_queue), + gf_br_mt_br_ob_n_wk_t); + if (!priv->obj_queue) + goto cleanup_cond; + INIT_LIST_HEAD(&priv->obj_queue->objects); + + priv->obj_queue->workers = GF_CALLOC( + priv->signer_th_count, sizeof(pthread_t), gf_br_mt_br_worker_t); + if (!priv->obj_queue->workers) + goto cleanup_obj_queue; + + for (i = 0; i < priv->signer_th_count; i++) { + ret = gf_thread_create(&priv->obj_queue->workers[i], NULL, + br_process_object, this, "brpobj"); + if (ret != 0) { + gf_smsg(this->name, GF_LOG_ERROR, -ret, + BRB_MSG_THREAD_CREATION_FAILED, NULL); + ret = -1; + goto cleanup_threads; + } + } + + return 0; + +cleanup_threads: + for (i--; i >= 0; i--) { + (void)gf_thread_cleanup_xint(priv->obj_queue->workers[i]); + } + GF_FREE(priv->obj_queue->workers); + +cleanup_obj_queue: + GF_FREE(priv->obj_queue); + +cleanup_cond: + /* that's explicit */ + pthread_cond_destroy(&priv->object_cond); +out: + return -1; +} + +/** + * For signer, only rate limit CPU usage (during hash calculation) when + * compiled with -DBR_RATE_LIMIT_SIGNER cflags, else let it run full + * throttle. + */ +static int32_t +br_rate_limit_signer(xlator_t *this, int child_count, int numbricks) +{ + br_private_t *priv = NULL; + tbf_opspec_t spec = { + 0, + }; + + priv = this->private; + + spec.op = TBF_OP_HASH; + spec.rate = 0; + spec.maxlimit = 0; + + /** + * OK. Most implementations of TBF I've come across generate tokens + * every second (UML, etc..) and some chose sub-second granularity + * (blk-iothrottle cgroups). TBF algorithm itself does not enforce + * any logic for choosing generation interval and it seems pretty + * logical as one could jack up token count per interval w.r.t. + * generation rate. + * + * Value used here is chosen based on a series of test(s) performed + * to balance object signing time and not maxing out on all available + * CPU cores. It's obvious to have seconds granularity and jack up + * token count per interval, thereby achieving close to similar + * results. Let's stick to this as it seems to be working fine for + * the set of ops that are throttled. + **/ + spec.token_gen_interval = 600000; /* In usec */ + +#ifdef BR_RATE_LIMIT_SIGNER + + double contribution = 0; + contribution = ((double)1 - ((double)child_count / (double)numbricks)); + if (contribution == 0) + contribution = 1; + spec.rate = BR_HASH_CALC_READ_SIZE * contribution; + spec.maxlimit = priv->signer_th_count * BR_HASH_CALC_READ_SIZE; + +#endif + + if (!spec.rate) + gf_smsg(this->name, GF_LOG_INFO, 0, BRB_MSG_RATE_LIMIT_INFO, + "FULL THROTTLE", NULL); + else + gf_smsg(this->name, GF_LOG_INFO, 0, BRB_MSG_RATE_LIMIT_INFO, + "tokens/sec-rate=%lu", spec.rate, "maxlimit=%lu", spec.maxlimit, + NULL); + + priv->tbf = tbf_init(&spec, 1); + return priv->tbf ? 0 : -1; +} + +static int32_t +br_signer_handle_options(xlator_t *this, br_private_t *priv, dict_t *options) +{ + if (options) { + GF_OPTION_RECONF("expiry-time", priv->expiry_time, options, uint32, + error_return); + GF_OPTION_RECONF("signer-threads", priv->signer_th_count, options, + uint32, error_return); + } else { + GF_OPTION_INIT("expiry-time", priv->expiry_time, uint32, error_return); + GF_OPTION_INIT("signer-threads", priv->signer_th_count, uint32, + error_return); + } + + return 0; + +error_return: + return -1; +} + +static int32_t +br_signer_init(xlator_t *this, br_private_t *priv) +{ + int32_t ret = 0; + int numbricks = 0; + + GF_OPTION_INIT("expiry-time", priv->expiry_time, uint32, error_return); + GF_OPTION_INIT("brick-count", numbricks, int32, error_return); + GF_OPTION_INIT("signer-threads", priv->signer_th_count, uint32, + error_return); + + ret = br_rate_limit_signer(this, priv->child_count, numbricks); + if (ret) + goto error_return; + + ret = br_init_signer(this, priv); + if (ret) + goto cleanup_tbf; + + return 0; + +cleanup_tbf: + /* cleanup TBF */ +error_return: + return -1; +} + +static void +br_free_scrubber_monitor(xlator_t *this, br_private_t *priv) +{ + struct br_monitor *scrub_monitor = &priv->scrub_monitor; + + if (scrub_monitor->timer) { + (void)gf_tw_del_timer(priv->timer_wheel, scrub_monitor->timer); + + GF_FREE(scrub_monitor->timer); + scrub_monitor->timer = NULL; + } + + (void)gf_thread_cleanup_xint(scrub_monitor->thread); + + /* Clean up cond and mutex variables */ + pthread_mutex_destroy(&scrub_monitor->mutex); + pthread_cond_destroy(&scrub_monitor->cond); + + pthread_mutex_destroy(&scrub_monitor->wakelock); + pthread_cond_destroy(&scrub_monitor->wakecond); + + pthread_mutex_destroy(&scrub_monitor->donelock); + pthread_cond_destroy(&scrub_monitor->donecond); + + LOCK_DESTROY(&scrub_monitor->lock); +} + +static void +br_free_children(xlator_t *this, br_private_t *priv, int count) +{ + br_child_t *child = NULL; + + for (--count; count >= 0; count--) { + child = &priv->children[count]; + mem_pool_destroy(child->timer_pool); + pthread_mutex_destroy(&child->lock); + } + + GF_FREE(priv->children); + priv->children = NULL; +} + +static int +br_init_children(xlator_t *this, br_private_t *priv) +{ + int i = 0; + br_child_t *child = NULL; + xlator_list_t *trav = NULL; + + priv->child_count = xlator_subvolume_count(this); + priv->children = GF_CALLOC(priv->child_count, sizeof(*priv->children), + gf_br_mt_br_child_t); + if (!priv->children) + goto err; + + trav = this->children; + while (trav) { + child = &priv->children[i]; + + pthread_mutex_init(&child->lock, NULL); + child->witnessed = 0; + + br_set_child_state(child, BR_CHILD_STATE_DISCONNECTED); + + child->this = this; + child->xl = trav->xlator; + + child->timer_pool = mem_pool_new(struct gf_tw_timer_list, 4096); + if (!child->timer_pool) { + gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, BRB_MSG_MEM_POOL_ALLOC, + NULL); + errno = ENOMEM; + goto freechild; + } + + INIT_LIST_HEAD(&child->list); + + i++; + trav = trav->next; + } + + return 0; + +freechild: + br_free_children(this, priv, i); +err: + return -1; +} + +int32_t +init(xlator_t *this) +{ + int32_t ret = -1; + br_private_t *priv = NULL; + + if (!this->children) { + gf_smsg(this->name, GF_LOG_ERROR, 0, BRB_MSG_NO_CHILD, NULL); + goto out; + } + + priv = GF_CALLOC(1, sizeof(*priv), gf_br_mt_br_private_t); + if (!priv) { + gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, BRB_MSG_NO_MEMORY, NULL); + goto out; + } + + GF_OPTION_INIT("scrubber", priv->iamscrubber, bool, free_priv); + + ret = br_init_children(this, priv); + if (ret) + goto free_priv; + + pthread_mutex_init(&priv->lock, NULL); + pthread_cond_init(&priv->cond, NULL); + + INIT_LIST_HEAD(&priv->bricks); + INIT_LIST_HEAD(&priv->signing); + + priv->timer_wheel = glusterfs_ctx_tw_get(this->ctx); + if (!priv->timer_wheel) { + gf_smsg(this->name, GF_LOG_ERROR, 0, BRB_MSG_TIMER_WHEEL_UNAVAILABLE, + NULL); + goto cleanup; + } + + this->private = priv; + + if (!priv->iamscrubber) { + ret = br_signer_init(this, priv); + if (!ret) + ret = br_signer_handle_options(this, priv, NULL); + } else { + ret = br_scrubber_init(this, priv); + if (!ret) + ret = br_scrubber_handle_options(this, priv, NULL); + } + + if (ret) + goto cleanup; + + ret = gf_thread_create(&priv->thread, NULL, br_handle_events, this, + "brhevent"); + if (ret != 0) { + gf_smsg(this->name, GF_LOG_ERROR, -ret, BRB_MSG_THREAD_CREATION_FAILED, + NULL); + ret = -1; + } + + if (!ret) { + gf_smsg(this->name, GF_LOG_INFO, 0, BRB_MSG_BITROT_LOADED, "mode=%s", + (priv->iamscrubber) ? "SCRUBBER" : "SIGNER", NULL); + return 0; + } + +cleanup: + (void)pthread_cond_destroy(&priv->cond); + (void)pthread_mutex_destroy(&priv->lock); + + br_free_children(this, priv, priv->child_count); + +free_priv: + GF_FREE(priv); +out: + this->private = NULL; + return -1; +} + +void +fini(xlator_t *this) +{ + br_private_t *priv = this->private; + + if (!priv) + return; + + if (!priv->iamscrubber) + br_fini_signer(this, priv); + else + (void)br_free_scrubber_monitor(this, priv); + + br_free_children(this, priv, priv->child_count); + + this->private = NULL; + GF_FREE(priv); + + glusterfs_ctx_tw_put(this->ctx); + + return; +} + +static void +br_reconfigure_monitor(xlator_t *this) +{ + int32_t ret = 0; + + ret = br_scrub_state_machine(this, _gf_false); + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, 0, BRB_MSG_COULD_NOT_SCHEDULE_SCRUB, + NULL); + } +} + +static int +br_reconfigure_scrubber(xlator_t *this, dict_t *options) +{ + int32_t ret = -1; + br_private_t *priv = NULL; + + priv = this->private; + + pthread_mutex_lock(&priv->lock); + { + ret = br_scrubber_handle_options(this, priv, options); + } + pthread_mutex_unlock(&priv->lock); + + if (ret) + goto err; + + /* change state for all _up_ subvolume(s) */ + pthread_mutex_lock(&priv->lock); + { + br_reconfigure_monitor(this); + } + pthread_mutex_unlock(&priv->lock); + +err: + return ret; +} + +static int +br_reconfigure_signer(xlator_t *this, dict_t *options) +{ + br_private_t *priv = this->private; + + return br_signer_handle_options(this, priv, options); +} + +int +reconfigure(xlator_t *this, dict_t *options) +{ + int ret = 0; + br_private_t *priv = NULL; + + priv = this->private; + + if (priv->iamscrubber) + ret = br_reconfigure_scrubber(this, options); + else + ret = br_reconfigure_signer(this, options); + + return ret; +} + +struct xlator_fops fops; + +struct xlator_cbks cbks; + +struct volume_options options[] = { + { + .key = {"expiry-time"}, + .type = GF_OPTION_TYPE_INT, + .default_value = SIGNING_TIMEOUT, + .op_version = {GD_OP_VERSION_3_7_0}, + .flags = OPT_FLAG_SETTABLE, + .description = "Waiting time for an object on which it waits " + "before it is signed", + }, + { + .key = {"brick-count"}, + .type = GF_OPTION_TYPE_STR, + .description = "Total number of bricks for the current node for " + "all volumes in the trusted storage pool.", + }, + { + .key = {"scrubber", "scrub"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "false", + .op_version = {GD_OP_VERSION_3_7_0}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_FORCE, + .description = "option to run as a scrubber", + }, + { + .key = {"scrub-throttle"}, + .type = GF_OPTION_TYPE_STR, + .default_value = "lazy", + .op_version = {GD_OP_VERSION_3_7_0}, + .flags = OPT_FLAG_SETTABLE, + .description = "Scrub-throttle value is a measure of how fast " + "or slow the scrubber scrubs the filesystem for " + "volume <VOLNAME>", + }, + { + .key = {"scrub-freq"}, + .type = GF_OPTION_TYPE_STR, + .default_value = "biweekly", + .op_version = {GD_OP_VERSION_3_7_0}, + .flags = OPT_FLAG_SETTABLE, + .description = "Scrub frequency for volume <VOLNAME>", + }, + { + .key = {"scrub-state"}, + .type = GF_OPTION_TYPE_STR, + .default_value = "active", + .op_version = {GD_OP_VERSION_4_0_0}, + .flags = OPT_FLAG_SETTABLE, + .description = "Pause/Resume scrub. Upon resume, scrubber " + "continues from where it left off.", + }, + { + .key = {"signer-threads"}, + .type = GF_OPTION_TYPE_INT, + .default_value = BR_WORKERS, + .op_version = {GD_OP_VERSION_8_0}, + .flags = OPT_FLAG_SETTABLE, + .description = "Number of signing process threads. As a best " + "practice, set this to the number of processor cores", + }, + {.key = {NULL}}, +}; + +xlator_api_t xlator_api = { + .init = init, + .fini = fini, + .notify = notify, + .reconfigure = reconfigure, + .mem_acct_init = mem_acct_init, + .op_version = {1}, /* Present from the initial version */ + .fops = &fops, + .cbks = &cbks, + .options = options, + .identifier = "bit-rot", + .category = GF_MAINTAINED, +}; diff --git a/xlators/features/bit-rot/src/bitd/bit-rot.h b/xlators/features/bit-rot/src/bitd/bit-rot.h new file mode 100644 index 00000000000..8ac7dcdac3d --- /dev/null +++ b/xlators/features/bit-rot/src/bitd/bit-rot.h @@ -0,0 +1,302 @@ +/* + Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef __BIT_ROT_H__ +#define __BIT_ROT_H__ + +#include <glusterfs/glusterfs.h> +#include <glusterfs/logging.h> +#include <glusterfs/dict.h> +#include <glusterfs/xlator.h> +#include <glusterfs/defaults.h> +#include <glusterfs/syncop.h> +#include <glusterfs/syncop-utils.h> +#include "changelog.h" +#include "timer-wheel.h" + +#include <glusterfs/throttle-tbf.h> +#include "bit-rot-ssm.h" + +#include "bit-rot-common.h" +#include "bit-rot-stub-mem-types.h" +#include "bit-rot-scrub-status.h" + +#include <openssl/sha.h> + +typedef enum scrub_throttle { + BR_SCRUB_THROTTLE_VOID = -1, + BR_SCRUB_THROTTLE_LAZY = 0, + BR_SCRUB_THROTTLE_NORMAL = 1, + BR_SCRUB_THROTTLE_AGGRESSIVE = 2, + BR_SCRUB_THROTTLE_STALLED = 3, +} scrub_throttle_t; + +typedef enum scrub_freq { + BR_FSSCRUB_FREQ_HOURLY = 1, + BR_FSSCRUB_FREQ_DAILY, + BR_FSSCRUB_FREQ_WEEKLY, + BR_FSSCRUB_FREQ_BIWEEKLY, + BR_FSSCRUB_FREQ_MONTHLY, + BR_FSSCRUB_FREQ_MINUTE, + BR_FSSCRUB_FREQ_STALLED, +} scrub_freq_t; + +#define signature_size(hl) (sizeof(br_isignature_t) + hl + 1) + +struct br_scanfs { + gf_lock_t entrylock; + + pthread_mutex_t waitlock; + pthread_cond_t waitcond; + + unsigned int entries; + struct list_head queued; + struct list_head ready; +}; + +/* just need three states to track child status */ +typedef enum br_child_state { + BR_CHILD_STATE_CONNECTED = 1, + BR_CHILD_STATE_INITIALIZING, + BR_CHILD_STATE_CONNFAILED, + BR_CHILD_STATE_DISCONNECTED, +} br_child_state_t; + +struct br_child { + pthread_mutex_t lock; /* protects child state */ + char witnessed; /* witnessed at least one successful + connection */ + br_child_state_t c_state; /* current state of this child */ + + char child_up; /* Indicates whether this child is + up or not */ + xlator_t *xl; /* client xlator corresponding to + this child */ + inode_table_t *table; /* inode table for this child */ + char brick_path[PATH_MAX]; /* brick export directory of this + child */ + struct list_head list; /* hook to attach to the list of + UP children */ + xlator_t *this; /* Bit rot xlator */ + + pthread_t thread; /* initial crawler for unsigned + object(s) or scrub crawler */ + int threadrunning; /* active thread */ + + struct mem_pool *timer_pool; /* timer-wheel's timer mem-pool */ + + struct timeval tv; + + struct br_scanfs fsscan; /* per subvolume FS scanner */ + + gf_boolean_t active_scrubbing; /* Actively scrubbing or not */ +}; + +typedef struct br_child br_child_t; + +struct br_obj_n_workers { + struct list_head objects; /* queue of objects expired from the + timer wheel and ready to be picked + up for signing */ + pthread_t *workers; /* Threads which pick up the objects + from the above queue and start + signing each object */ +}; + +struct br_scrubber { + xlator_t *this; + + scrub_throttle_t throttle; + + /** + * frequency of scanning for this subvolume. this should + * normally be per-child, but since all children follow the + * same frequency for a volume, this option ends up here + * instead of br_child_t. + */ + scrub_freq_t frequency; + + gf_boolean_t frequency_reconf; + gf_boolean_t throttle_reconf; + + pthread_mutex_t mutex; + pthread_cond_t cond; + + unsigned int nr_scrubbers; + struct list_head scrubbers; + + /** + * list of "rotatable" subvolume(s) undergoing scrubbing + */ + struct list_head scrublist; +}; + +struct br_monitor { + gf_lock_t lock; + pthread_t thread; /* Monitor thread */ + + gf_boolean_t inited; + pthread_mutex_t mutex; + pthread_cond_t cond; /* Thread starts and will be waiting on cond. + First child which is up wakes this up */ + + xlator_t *this; + /* scheduler */ + uint32_t boot; + + int32_t active_child_count; /* Number of children currently scrubbing */ + gf_boolean_t kick; /* This variable tracks the scrubber is + * kicked or not. Both 'kick' and + * 'active_child_count' uses the same pair + * of mutex-cond variable, i.e, wakelock and + * wakecond. */ + + pthread_mutex_t wakelock; + pthread_cond_t wakecond; + + gf_boolean_t done; + pthread_mutex_t donelock; + pthread_cond_t donecond; + + struct gf_tw_timer_list *timer; + br_scrub_state_t state; /* current scrub state */ +}; + +typedef struct br_obj_n_workers br_obj_n_workers_t; + +typedef struct br_private br_private_t; + +typedef void (*br_scrubbed_file_update)(br_private_t *priv); + +struct br_private { + pthread_mutex_t lock; + + struct list_head bricks; /* list of bricks from which enents + have been received */ + + struct list_head signing; + + pthread_cond_t object_cond; /* handling signing of objects */ + int child_count; + br_child_t *children; /* list of subvolumes */ + int up_children; + + pthread_cond_t cond; /* handling CHILD_UP notifications */ + pthread_t thread; /* thread for connecting each UP + child with changelog */ + + struct tvec_base *timer_wheel; /* timer wheel where the objects which + changelog has sent sits and waits + for expiry */ + br_obj_n_workers_t *obj_queue; /* place holder for all the objects + that are expired from timer wheel + and ready to be picked up for + signing and the workers which sign + the objects */ + + uint32_t expiry_time; /* objects "wait" time */ + + uint32_t signer_th_count; /* Number of signing process threads */ + + tbf_t *tbf; /* token bucket filter */ + + gf_boolean_t iamscrubber; /* function as a fs scrubber */ + + struct br_scrub_stats scrub_stat; /* statistics of scrub*/ + + struct br_scrubber fsscrub; /* scrubbers for this subvolume */ + + struct br_monitor scrub_monitor; /* scrubber monitor */ +}; + +struct br_object { + xlator_t *this; + + uuid_t gfid; + + unsigned long signedversion; /* version against which this object will + be signed */ + br_child_t *child; /* object's subvolume */ + + int sign_info; + + struct list_head list; /* hook to add to the queue once the + object is expired from timer wheel */ + void *data; +}; + +typedef struct br_object br_object_t; +typedef int32_t(br_scrub_ssm_call)(xlator_t *); + +void +br_log_object(xlator_t *, char *, uuid_t, int32_t); + +void +br_log_object_path(xlator_t *, char *, const char *, int32_t); + +int32_t +br_calculate_obj_checksum(unsigned char *, br_child_t *, fd_t *, struct iatt *); + +int32_t +br_prepare_loc(xlator_t *, br_child_t *, loc_t *, gf_dirent_t *, loc_t *); + +gf_boolean_t +bitd_is_bad_file(xlator_t *, br_child_t *, loc_t *, fd_t *); + +static inline void +_br_set_child_state(br_child_t *child, br_child_state_t state) +{ + child->c_state = state; +} + +static inline int +_br_is_child_connected(br_child_t *child) +{ + return (child->c_state == BR_CHILD_STATE_CONNECTED); +} + +static inline int +_br_is_child_scrub_active(br_child_t *child) +{ + return child->active_scrubbing; +} + +static inline int +_br_child_failed_conn(br_child_t *child) +{ + return (child->c_state == BR_CHILD_STATE_CONNFAILED); +} + +static inline int +_br_child_witnessed_connection(br_child_t *child) +{ + return (child->witnessed == 1); +} + +/* scrub state */ +static inline void +_br_monitor_set_scrub_state(struct br_monitor *scrub_monitor, + br_scrub_state_t state) +{ + scrub_monitor->state = state; +} + +static inline br_scrub_event_t +_br_child_get_scrub_event(struct br_scrubber *fsscrub) +{ + return (fsscrub->frequency == BR_FSSCRUB_FREQ_STALLED) + ? BR_SCRUB_EVENT_PAUSE + : BR_SCRUB_EVENT_SCHEDULE; +} + +int32_t +br_get_bad_objects_list(xlator_t *this, dict_t **dict); + +#endif /* __BIT_ROT_H__ */ diff --git a/xlators/features/bit-rot/src/stub/Makefile.am b/xlators/features/bit-rot/src/stub/Makefile.am new file mode 100644 index 00000000000..f13de7145fc --- /dev/null +++ b/xlators/features/bit-rot/src/stub/Makefile.am @@ -0,0 +1,20 @@ +if WITH_SERVER +xlator_LTLIBRARIES = bitrot-stub.la +endif +xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features + +bitrot_stub_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS) + +bitrot_stub_la_SOURCES = bit-rot-stub-helpers.c bit-rot-stub.c +bitrot_stub_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la + +noinst_HEADERS = bit-rot-stub.h bit-rot-common.h bit-rot-stub-mem-types.h \ + bit-rot-object-version.h bit-rot-stub-messages.h + +AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \ + -I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src \ + -I$(top_srcdir)/rpc/rpc-lib/src + +AM_CFLAGS = -Wall $(GF_CFLAGS) + +CLEANFILES = diff --git a/xlators/features/bit-rot/src/stub/bit-rot-common.h b/xlators/features/bit-rot/src/stub/bit-rot-common.h new file mode 100644 index 00000000000..20561aa7764 --- /dev/null +++ b/xlators/features/bit-rot/src/stub/bit-rot-common.h @@ -0,0 +1,178 @@ +/* + Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef __BIT_ROT_COMMON_H__ +#define __BIT_ROT_COMMON_H__ + +#include <glusterfs/glusterfs.h> +#include "bit-rot-object-version.h" + +#define BR_VXATTR_VERSION (1 << 0) +#define BR_VXATTR_SIGNATURE (1 << 1) + +#define BR_VXATTR_SIGN_MISSING (BR_VXATTR_SIGNATURE) +#define BR_VXATTR_ALL_MISSING (BR_VXATTR_VERSION | BR_VXATTR_SIGNATURE) + +#define BR_BAD_OBJ_CONTAINER \ + (uuid_t) { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8 } + +typedef enum br_vxattr_state { + BR_VXATTR_STATUS_FULL = 0, + BR_VXATTR_STATUS_MISSING = 1, + BR_VXATTR_STATUS_UNSIGNED = 2, + BR_VXATTR_STATUS_INVALID = 3, +} br_vxattr_status_t; + +typedef enum br_sign_state { + BR_SIGN_INVALID = -1, + BR_SIGN_NORMAL = 0, + BR_SIGN_REOPEN_WAIT = 1, + BR_SIGN_QUICK = 2, +} br_sign_state_t; + +static inline br_vxattr_status_t +br_version_xattr_state(dict_t *xattr, br_version_t **obuf, + br_signature_t **sbuf, gf_boolean_t *objbad) +{ + int32_t ret = 0; + int32_t vxattr = 0; + br_vxattr_status_t status; + void *data = NULL; + + /** + * The key being present in the dict indicates the xattr was set on + * disk. The presence of xattr itself as of now is suffecient to say + * the the object is bad. + */ + *objbad = _gf_false; + ret = dict_get_bin(xattr, BITROT_OBJECT_BAD_KEY, (void **)&data); + if (!ret) + *objbad = _gf_true; + + ret = dict_get_bin(xattr, BITROT_CURRENT_VERSION_KEY, (void **)obuf); + if (ret) + vxattr |= BR_VXATTR_VERSION; + + ret = dict_get_bin(xattr, BITROT_SIGNING_VERSION_KEY, (void **)sbuf); + if (ret) + vxattr |= BR_VXATTR_SIGNATURE; + + switch (vxattr) { + case 0: + status = BR_VXATTR_STATUS_FULL; + break; + case BR_VXATTR_SIGN_MISSING: + status = BR_VXATTR_STATUS_UNSIGNED; + break; + case BR_VXATTR_ALL_MISSING: + status = BR_VXATTR_STATUS_MISSING; + break; + default: + status = BR_VXATTR_STATUS_INVALID; + } + + return status; +} + +/** + * in-memory representation of signature used by signer for object + * signing. + */ +typedef struct br_isignature_in { + int8_t signaturetype; /* signature type */ + + unsigned long signedversion; /* version against which the + object was signed */ + + size_t signaturelen; /* signature length */ + char signature[0]; /* object signature */ +} br_isignature_t; + +/** + * in-memory representation of signature used by scrubber for object + * verification. + */ +typedef struct br_isignature_out { + char stale; /* stale signature? */ + + unsigned long version; /* current signed version */ + + uint32_t time[2]; /* time when the object + got dirtied */ + + int8_t signaturetype; /* hash type */ + size_t signaturelen; /* signature length */ + char signature[0]; /* signature (hash) */ +} br_isignature_out_t; + +typedef struct br_stub_init { + uint32_t timebuf[2]; + char export[PATH_MAX]; +} br_stub_init_t; + +typedef enum { + BR_SIGNATURE_TYPE_VOID = -1, /* object is not signed */ + BR_SIGNATURE_TYPE_ZERO = 0, /* min boundary */ + BR_SIGNATURE_TYPE_SHA256 = 1, /* signed with SHA256 */ + BR_SIGNATURE_TYPE_MAX = 2, /* max boundary */ +} br_signature_type; + +/* BitRot stub start time (virtual xattr) */ +#define GLUSTERFS_GET_BR_STUB_INIT_TIME "trusted.glusterfs.bit-rot.stub-init" + +/* signing/reopen hint */ +#define BR_OBJECT_RESIGN 0 +#define BR_OBJECT_REOPEN 1 +#define BR_REOPEN_SIGN_HINT_KEY "trusted.glusterfs.bit-rot.reopen-hint" + +static inline int +br_is_signature_type_valid(int8_t signaturetype) +{ + return ((signaturetype > BR_SIGNATURE_TYPE_ZERO) && + (signaturetype < BR_SIGNATURE_TYPE_MAX)); +} + +static inline void +br_set_default_ongoingversion(br_version_t *buf, uint32_t *tv) +{ + buf->ongoingversion = BITROT_DEFAULT_CURRENT_VERSION; + buf->timebuf[0] = tv[0]; + buf->timebuf[1] = tv[1]; +} + +static inline void +br_set_default_signature(br_signature_t *buf, size_t *size) +{ + buf->signaturetype = (int8_t)BR_SIGNATURE_TYPE_VOID; + buf->signedversion = BITROT_DEFAULT_SIGNING_VERSION; + + *size = sizeof(br_signature_t); /* no signature */ +} + +static inline void +br_set_ongoingversion(br_version_t *buf, unsigned long version, uint32_t *tv) +{ + buf->ongoingversion = version; + buf->timebuf[0] = tv[0]; + buf->timebuf[1] = tv[1]; +} + +static inline void +br_set_signature(br_signature_t *buf, br_isignature_t *sign, + size_t signaturelen, size_t *size) +{ + buf->signaturetype = sign->signaturetype; + buf->signedversion = ntohl(sign->signedversion); + + memcpy(buf->signature, sign->signature, signaturelen); + *size = sizeof(br_signature_t) + signaturelen; +} + +#endif /* __BIT_ROT_COMMON_H__ */ diff --git a/xlators/features/bit-rot/src/stub/bit-rot-object-version.h b/xlators/features/bit-rot/src/stub/bit-rot-object-version.h new file mode 100644 index 00000000000..7ae6a5200df --- /dev/null +++ b/xlators/features/bit-rot/src/stub/bit-rot-object-version.h @@ -0,0 +1,30 @@ +/* + Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef __BIT_ROT_OBJECT_VERSION_H +#define __BIT_ROT_OBJECT_VERSION_H + +/** + * on-disk formats for ongoing version and object signature. + */ +typedef struct br_version { + unsigned long ongoingversion; + uint32_t timebuf[2]; +} br_version_t; + +typedef struct __attribute__((__packed__)) br_signature { + int8_t signaturetype; + + unsigned long signedversion; + + char signature[0]; +} br_signature_t; + +#endif diff --git a/xlators/features/bit-rot/src/stub/bit-rot-stub-helpers.c b/xlators/features/bit-rot/src/stub/bit-rot-stub-helpers.c new file mode 100644 index 00000000000..8ac13a09941 --- /dev/null +++ b/xlators/features/bit-rot/src/stub/bit-rot-stub-helpers.c @@ -0,0 +1,796 @@ +/* + Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#include "bit-rot-stub.h" + +br_stub_fd_t * +br_stub_fd_new(void) +{ + br_stub_fd_t *br_stub_fd = NULL; + + br_stub_fd = GF_CALLOC(1, sizeof(*br_stub_fd), gf_br_stub_mt_br_stub_fd_t); + + return br_stub_fd; +} + +int +__br_stub_fd_ctx_set(xlator_t *this, fd_t *fd, br_stub_fd_t *br_stub_fd) +{ + uint64_t value = 0; + int ret = -1; + + GF_VALIDATE_OR_GOTO("bit-rot-stub", this, out); + GF_VALIDATE_OR_GOTO(this->name, fd, out); + GF_VALIDATE_OR_GOTO(this->name, br_stub_fd, out); + + value = (uint64_t)(long)br_stub_fd; + + ret = __fd_ctx_set(fd, this, value); + +out: + return ret; +} + +br_stub_fd_t * +__br_stub_fd_ctx_get(xlator_t *this, fd_t *fd) +{ + br_stub_fd_t *br_stub_fd = NULL; + uint64_t value = 0; + int ret = -1; + + GF_VALIDATE_OR_GOTO("bit-rot-stub", this, out); + GF_VALIDATE_OR_GOTO(this->name, fd, out); + + ret = __fd_ctx_get(fd, this, &value); + if (ret) + return NULL; + + br_stub_fd = (br_stub_fd_t *)((long)value); + +out: + return br_stub_fd; +} + +br_stub_fd_t * +br_stub_fd_ctx_get(xlator_t *this, fd_t *fd) +{ + br_stub_fd_t *br_stub_fd = NULL; + + GF_VALIDATE_OR_GOTO("bit-rot-stub", this, out); + GF_VALIDATE_OR_GOTO(this->name, fd, out); + + LOCK(&fd->lock); + { + br_stub_fd = __br_stub_fd_ctx_get(this, fd); + } + UNLOCK(&fd->lock); + +out: + return br_stub_fd; +} + +int32_t +br_stub_fd_ctx_set(xlator_t *this, fd_t *fd, br_stub_fd_t *br_stub_fd) +{ + int32_t ret = -1; + + GF_VALIDATE_OR_GOTO("bit-rot-stub", this, out); + GF_VALIDATE_OR_GOTO(this->name, fd, out); + GF_VALIDATE_OR_GOTO(this->name, br_stub_fd, out); + + LOCK(&fd->lock); + { + ret = __br_stub_fd_ctx_set(this, fd, br_stub_fd); + } + UNLOCK(&fd->lock); + +out: + return ret; +} + +/** + * Adds an entry to the bad objects directory. + * @gfid: gfid of the bad object being added to the bad objects directory + */ +int +br_stub_add(xlator_t *this, uuid_t gfid) +{ + char gfid_path[BR_PATH_MAX_PLUS] = {0}; + char bad_gfid_path[BR_PATH_MAX_PLUS] = {0}; + int ret = 0; + br_stub_private_t *priv = NULL; + struct stat st = {0}; + + priv = this->private; + GF_ASSERT_AND_GOTO_WITH_ERROR(this->name, !gf_uuid_is_null(gfid), out, + errno, EINVAL); + + snprintf(gfid_path, sizeof(gfid_path), "%s/%s", priv->stub_basepath, + uuid_utoa(gfid)); + + ret = sys_stat(gfid_path, &st); + if (!ret) + goto out; + snprintf(bad_gfid_path, sizeof(bad_gfid_path), "%s/stub-%s", + priv->stub_basepath, uuid_utoa(priv->bad_object_dir_gfid)); + + ret = sys_link(bad_gfid_path, gfid_path); + if (ret) { + if ((errno != ENOENT) && (errno != EMLINK) && (errno != EEXIST)) + goto out; + + /* + * Continue with success. At least we'll have half of the + * functionality, in the sense, object is marked bad and + * would be inaccessible. It's only scrub status that would + * show up less number of objects. That's fine as we'll have + * the log files that will have the missing information. + */ + gf_smsg(this->name, GF_LOG_WARNING, errno, BRS_MSG_LINK_FAIL, "gfid=%s", + uuid_utoa(gfid), NULL); + } + + return 0; +out: + return -1; +} + +int +br_stub_del(xlator_t *this, uuid_t gfid) +{ + int32_t op_errno __attribute__((unused)) = 0; + br_stub_private_t *priv = NULL; + int ret = 0; + char gfid_path[BR_PATH_MAX_PLUS] = {0}; + + priv = this->private; + GF_ASSERT_AND_GOTO_WITH_ERROR(this->name, !gf_uuid_is_null(gfid), out, + op_errno, EINVAL); + snprintf(gfid_path, sizeof(gfid_path), "%s/%s", priv->stub_basepath, + uuid_utoa(gfid)); + ret = sys_unlink(gfid_path); + if (ret && (errno != ENOENT)) { + gf_smsg(this->name, GF_LOG_ERROR, errno, BRS_MSG_BAD_OBJ_UNLINK_FAIL, + "path=%s", gfid_path, NULL); + ret = -errno; + goto out; + } + + ret = 0; + +out: + return ret; +} + +static int +br_stub_check_stub_directory(xlator_t *this, char *fullpath) +{ + int ret = 0; + struct stat st = { + 0, + }; + char oldpath[BR_PATH_MAX_PLUS] = {0}; + br_stub_private_t *priv = NULL; + + priv = this->private; + + snprintf(oldpath, sizeof(oldpath), "%s/%s", priv->export, + OLD_BR_STUB_QUARANTINE_DIR); + + ret = sys_stat(fullpath, &st); + if (!ret && !S_ISDIR(st.st_mode)) + goto error_return; + if (ret) { + if (errno != ENOENT) + goto error_return; + ret = sys_stat(oldpath, &st); + if (ret) + ret = mkdir_p(fullpath, 0600, _gf_true); + else + ret = sys_rename(oldpath, fullpath); + } + + if (ret) + gf_smsg(this->name, GF_LOG_ERROR, errno, BRS_MSG_BAD_OBJECT_DIR_FAIL, + "create-path=%s", fullpath, NULL); + return ret; + +error_return: + gf_smsg(this->name, GF_LOG_ERROR, errno, BRS_MSG_BAD_OBJECT_DIR_FAIL, + "verify-path=%s", fullpath, NULL); + return -1; +} + +/** + * Function to create the container for the bad objects within the bad objects + * directory. + */ +static int +br_stub_check_stub_file(xlator_t *this, char *path) +{ + int ret = 0; + int fd = -1; + struct stat st = { + 0, + }; + + ret = sys_stat(path, &st); + if (!ret && !S_ISREG(st.st_mode)) + goto error_return; + if (ret) { + if (errno != ENOENT) + goto error_return; + fd = sys_creat(path, 0); + if (fd < 0) + gf_smsg(this->name, GF_LOG_ERROR, errno, + BRS_MSG_BAD_OBJECT_DIR_FAIL, "create-path=%s", path, NULL); + } + + if (fd >= 0) { + sys_close(fd); + ret = 0; + } + + return ret; + +error_return: + gf_smsg(this->name, GF_LOG_ERROR, errno, BRS_MSG_BAD_OBJECT_DIR_FAIL, + "verify-path=%s", path, NULL); + return -1; +} + +int +br_stub_dir_create(xlator_t *this, br_stub_private_t *priv) +{ + int ret = -1; + char fullpath[BR_PATH_MAX_PLUS] = { + 0, + }; + char stub_gfid_path[BR_PATH_MAX_PLUS] = { + 0, + }; + + gf_uuid_copy(priv->bad_object_dir_gfid, BR_BAD_OBJ_CONTAINER); + + if (snprintf(fullpath, sizeof(fullpath), "%s", priv->stub_basepath) >= + sizeof(fullpath)) + goto out; + + if (snprintf(stub_gfid_path, sizeof(stub_gfid_path), "%s/stub-%s", + priv->stub_basepath, uuid_utoa(priv->bad_object_dir_gfid)) >= + sizeof(stub_gfid_path)) + goto out; + + ret = br_stub_check_stub_directory(this, fullpath); + if (ret) + goto out; + ret = br_stub_check_stub_file(this, stub_gfid_path); + if (ret) + goto out; + + return 0; + +out: + return -1; +} + +call_stub_t * +__br_stub_dequeue(struct list_head *callstubs) +{ + call_stub_t *stub = NULL; + + if (!list_empty(callstubs)) { + stub = list_entry(callstubs->next, call_stub_t, list); + list_del_init(&stub->list); + } + + return stub; +} + +void +__br_stub_enqueue(struct list_head *callstubs, call_stub_t *stub) +{ + list_add_tail(&stub->list, callstubs); +} + +void +br_stub_worker_enqueue(xlator_t *this, call_stub_t *stub) +{ + br_stub_private_t *priv = NULL; + + priv = this->private; + pthread_mutex_lock(&priv->container.bad_lock); + { + __br_stub_enqueue(&priv->container.bad_queue, stub); + pthread_cond_signal(&priv->container.bad_cond); + } + pthread_mutex_unlock(&priv->container.bad_lock); +} + +void * +br_stub_worker(void *data) +{ + br_stub_private_t *priv = NULL; + xlator_t *this = NULL; + call_stub_t *stub = NULL; + + THIS = data; + this = data; + priv = this->private; + + for (;;) { + pthread_mutex_lock(&priv->container.bad_lock); + { + while (list_empty(&priv->container.bad_queue)) { + (void)pthread_cond_wait(&priv->container.bad_cond, + &priv->container.bad_lock); + } + + stub = __br_stub_dequeue(&priv->container.bad_queue); + } + pthread_mutex_unlock(&priv->container.bad_lock); + + if (stub) /* guard against spurious wakeups */ + call_resume(stub); + } + + return NULL; +} + +int32_t +br_stub_lookup_wrapper(call_frame_t *frame, xlator_t *this, loc_t *loc, + dict_t *xattr_req) +{ + br_stub_private_t *priv = NULL; + struct stat lstatbuf = {0}; + int ret = 0; + int32_t op_errno = EINVAL; + int32_t op_ret = -1; + struct iatt stbuf = { + 0, + }; + struct iatt postparent = { + 0, + }; + dict_t *xattr = NULL; + gf_boolean_t ver_enabled = _gf_false; + + BR_STUB_VER_ENABLED_IN_CALLPATH(frame, ver_enabled); + priv = this->private; + BR_STUB_VER_COND_GOTO(priv, (!ver_enabled), done); + + VALIDATE_OR_GOTO(loc, done); + if (gf_uuid_compare(loc->gfid, priv->bad_object_dir_gfid)) + goto done; + + ret = sys_lstat(priv->stub_basepath, &lstatbuf); + if (ret) { + gf_msg_debug(this->name, errno, + "Stat failed on stub bad " + "object dir"); + op_errno = errno; + goto done; + } else if (!S_ISDIR(lstatbuf.st_mode)) { + gf_msg_debug(this->name, errno, + "bad object container is not " + "a directory"); + op_errno = ENOTDIR; + goto done; + } + + iatt_from_stat(&stbuf, &lstatbuf); + gf_uuid_copy(stbuf.ia_gfid, priv->bad_object_dir_gfid); + + op_ret = op_errno = 0; + xattr = dict_new(); + if (!xattr) { + op_ret = -1; + op_errno = ENOMEM; + } + +done: + STACK_UNWIND_STRICT(lookup, frame, op_ret, op_errno, loc->inode, &stbuf, + xattr, &postparent); + if (xattr) + dict_unref(xattr); + return 0; +} + +static int +is_bad_gfid_file_current(char *filename, uuid_t gfid) +{ + char current_stub_gfid[GF_UUID_BUF_SIZE + 16] = { + 0, + }; + + snprintf(current_stub_gfid, sizeof current_stub_gfid, "stub-%s", + uuid_utoa(gfid)); + return (!strcmp(filename, current_stub_gfid)); +} + +static void +check_delete_stale_bad_file(xlator_t *this, char *filename) +{ + int ret = 0; + struct stat st = {0}; + char filepath[BR_PATH_MAX_PLUS] = {0}; + br_stub_private_t *priv = NULL; + + priv = this->private; + + if (is_bad_gfid_file_current(filename, priv->bad_object_dir_gfid)) + return; + + snprintf(filepath, sizeof(filepath), "%s/%s", priv->stub_basepath, + filename); + + ret = sys_stat(filepath, &st); + if (!ret && st.st_nlink == 1) + sys_unlink(filepath); +} + +static int +br_stub_fill_readdir(fd_t *fd, br_stub_fd_t *fctx, DIR *dir, off_t off, + size_t size, gf_dirent_t *entries) +{ + off_t in_case = -1; + off_t last_off = 0; + size_t filled = 0; + int count = 0; + int32_t this_size = -1; + gf_dirent_t *this_entry = NULL; + xlator_t *this = NULL; + struct dirent *entry = NULL; + struct dirent scratch[2] = { + { + 0, + }, + }; + + this = THIS; + if (!off) { + rewinddir(dir); + } else { + seekdir(dir, off); +#ifndef GF_LINUX_HOST_OS + if ((u_long)telldir(dir) != off && off != fctx->bad_object.dir_eof) { + gf_smsg(THIS->name, GF_LOG_ERROR, 0, + BRS_MSG_BAD_OBJECT_DIR_SEEK_FAIL, "off=(0x%llx)", off, + "dir=%p", dir, NULL); + errno = EINVAL; + count = -1; + goto out; + } +#endif /* GF_LINUX_HOST_OS */ + } + + while (filled <= size) { + in_case = (u_long)telldir(dir); + + if (in_case == -1) { + gf_smsg(THIS->name, GF_LOG_ERROR, 0, + BRS_MSG_BAD_OBJECT_DIR_TELL_FAIL, "dir=%p", dir, "err=%s", + strerror(errno), NULL); + goto out; + } + + errno = 0; + entry = sys_readdir(dir, scratch); + if (!entry || errno != 0) { + if (errno == EBADF) { + gf_smsg(THIS->name, GF_LOG_WARNING, 0, + BRS_MSG_BAD_OBJECT_DIR_READ_FAIL, "dir=%p", dir, + "err=%s", strerror(errno), NULL); + goto out; + } + break; + } + + if (!strcmp(entry->d_name, ".") || !strcmp(entry->d_name, "..")) + continue; + + if (!strncmp(entry->d_name, "stub-", strlen("stub-"))) { + check_delete_stale_bad_file(this, entry->d_name); + continue; + } + + this_size = max(sizeof(gf_dirent_t), sizeof(gfs3_dirplist)) + + strlen(entry->d_name) + 1; + + if (this_size + filled > size) { + seekdir(dir, in_case); +#ifndef GF_LINUX_HOST_OS + if ((u_long)telldir(dir) != in_case && + in_case != fctx->bad_object.dir_eof) { + gf_smsg(THIS->name, GF_LOG_ERROR, 0, + BRS_MSG_BAD_OBJECT_DIR_SEEK_FAIL, "in_case=(0x%llx)", + in_case, "dir=%p", dir, NULL); + errno = EINVAL; + count = -1; + goto out; + } +#endif /* GF_LINUX_HOST_OS */ + break; + } + + this_entry = gf_dirent_for_name(entry->d_name); + + if (!this_entry) { + gf_smsg(THIS->name, GF_LOG_ERROR, 0, + BRS_MSG_CREATE_GF_DIRENT_FAILED, "entry-name=%s", + entry->d_name, "err=%s", strerror(errno), NULL); + goto out; + } + /* + * we store the offset of next entry here, which is + * probably not intended, but code using syncop_readdir() + * (glfs-heal.c, afr-self-heald.c, pump.c) rely on it + * for directory read resumption. + */ + last_off = (u_long)telldir(dir); + this_entry->d_off = last_off; + this_entry->d_ino = entry->d_ino; + + list_add_tail(&this_entry->list, &entries->list); + + filled += this_size; + count++; + } + + if ((!sys_readdir(dir, scratch) && (errno == 0))) { + /* Indicate EOF */ + errno = ENOENT; + /* Remember EOF offset for later detection */ + fctx->bad_object.dir_eof = last_off; + } +out: + return count; +} + +int32_t +br_stub_readdir_wrapper(call_frame_t *frame, xlator_t *this, fd_t *fd, + size_t size, off_t off, dict_t *xdata) +{ + br_stub_fd_t *fctx = NULL; + DIR *dir = NULL; + int ret = -1; + int32_t op_ret = -1; + int32_t op_errno = 0; + int count = 0; + gf_dirent_t entries; + gf_boolean_t xdata_unref = _gf_false; + dict_t *dict = NULL; + + INIT_LIST_HEAD(&entries.list); + + fctx = br_stub_fd_ctx_get(this, fd); + if (!fctx) { + gf_smsg(this->name, GF_LOG_WARNING, 0, BRS_MSG_GET_FD_CONTEXT_FAILED, + "fd=%p", fd, NULL); + op_errno = -ret; + goto done; + } + + dir = fctx->bad_object.dir; + + if (!dir) { + gf_smsg(this->name, GF_LOG_WARNING, 0, BRS_MSG_BAD_HANDLE_DIR_NULL, + "fd=%p", fd, NULL); + op_errno = EINVAL; + goto done; + } + + count = br_stub_fill_readdir(fd, fctx, dir, off, size, &entries); + + /* pick ENOENT to indicate EOF */ + op_errno = errno; + op_ret = count; + + dict = xdata; + (void)br_stub_bad_objects_path(this, fd, &entries, &dict); + if (!xdata && dict) { + xdata = dict; + xdata_unref = _gf_true; + } + +done: + STACK_UNWIND_STRICT(readdir, frame, op_ret, op_errno, &entries, xdata); + gf_dirent_free(&entries); + if (xdata_unref) + dict_unref(xdata); + return 0; +} + +/** + * This function is called to mainly obtain the paths of the corrupt + * objects (files as of now). Currently scrub status prints only the + * gfid of the corrupted files. Reason is, bitrot-stub maintains the + * list of the corrupted objects as entries inside the quarantine + * directory (<brick export>/.glusterfs/quarantine) + * + * And the name of each entry in the qurantine directory is the gfid + * of the corrupted object. So scrub status will just show that info. + * But it helps the users a lot if the actual path to the object is + * also reported. Hence the below function to get that information. + * The function allocates a new dict to be returned (if it does not + * get one from the caller of readdir i.e. scrubber as of now), and + * stores the paths of each corrupted gfid there. The gfid is used as + * the key and path is used as the value. + * + * NOTE: The path will be there in following situations + * 1) gfid2path option has been enabled (posix xlator option) + * and the corrupted file contains the path as an extended + * attribute. + * 2) If the gfid2path option is not enabled, OR if the xattr + * is absent, then the inode table should have it. + * The path will be there if a name based lookup has happened + * on the file which has been corrupted. With lookup a inode and + * dentry would be created in the inode table. And the path is + * constructed using the in memory inode and dentry. If a lookup + * has not happened OR the inode corresponding to the corrupted + * file does not exist in the inode table (because it got purged + * as lru limit of the inodes exceeded) OR a nameless lookup had + * happened to populate the inode in the inode table, then the + * path will not be printed in scrub and only the gfid will be there. + **/ +int +br_stub_bad_objects_path(xlator_t *this, fd_t *fd, gf_dirent_t *entries, + dict_t **dict) +{ + gf_dirent_t *entry = NULL; + inode_t *inode = NULL; + char *hpath = NULL; + uuid_t gfid = {0}; + int ret = -1; + dict_t *tmp_dict = NULL; + char str_gfid[64] = {0}; + + if (list_empty(&entries->list)) + return 0; + + tmp_dict = *dict; + + if (!tmp_dict) { + tmp_dict = dict_new(); + /* + * If the allocation of dict fails then no need treat it + * it as a error. This path (or function) is executed when + * "gluster volume bitrot <volume name> scrub status" is + * executed, to get the list of the corrupted objects. + * And the motive of this function is to get the paths of + * the corrupted objects. If the dict allocation fails, then + * the scrub status will only show the gfids of those corrupted + * objects (which is the behavior as of the time of this patch + * being worked upon). So just return and only the gfids will + * be shown. + */ + if (!tmp_dict) { + gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_ALLOC_FAILED, NULL); + goto out; + } + } + + list_for_each_entry(entry, &entries->list, list) + { + gf_uuid_clear(gfid); + gf_uuid_parse(entry->d_name, gfid); + + inode = inode_find(fd->inode->table, gfid); + + /* No need to check the return value here. + * Because @hpath is examined. + */ + (void)br_stub_get_path_of_gfid(this, fd->inode, inode, gfid, &hpath); + + if (hpath) { + gf_msg_debug(this->name, 0, + "path of the corrupted " + "object (gfid: %s) is %s", + uuid_utoa(gfid), hpath); + br_stub_entry_xattr_fill(this, hpath, entry, tmp_dict); + } else + gf_smsg(this->name, GF_LOG_WARNING, 0, BRS_MSG_PATH_GET_FAILED, + "gfid=%s", uuid_utoa_r(gfid, str_gfid), NULL); + + inode = NULL; + hpath = NULL; + } + + ret = 0; + *dict = tmp_dict; + +out: + return ret; +} + +int +br_stub_get_path_of_gfid(xlator_t *this, inode_t *parent, inode_t *inode, + uuid_t gfid, char **path) +{ + int32_t ret = -1; + char gfid_str[64] = {0}; + + GF_VALIDATE_OR_GOTO("bitrot-stub", this, out); + GF_VALIDATE_OR_GOTO(this->name, parent, out); + GF_VALIDATE_OR_GOTO(this->name, path, out); + + /* Above, No need to validate the @inode for hard resolution. Because + * inode can be NULL and if it is NULL, then syncop_gfid_to_path_hard + * will allocate a new inode and proceed. So no need to bother about + * @inode. Because we need it only to send a syncop_getxattr call + * from inside syncop_gfid_to_path_hard. And getxattr fetches the + * path from the backend. + */ + + ret = syncop_gfid_to_path_hard(parent->table, FIRST_CHILD(this), gfid, + inode, path, _gf_true); + if (ret < 0) + gf_smsg(this->name, GF_LOG_WARNING, 0, BRS_MSG_PATH_GET_FAILED, + "gfid=%s", uuid_utoa_r(gfid, gfid_str), NULL); + + /* + * Try with soft resolution of path if hard resolve fails. Because + * checking the xattr on disk to get the path of a inode (or gfid) + * is dependent on whether that option is enabled in the posix + * xlator or not. If it is not enabled, then hard resolution by + * checking the on disk xattr fails. + * + * Thus in such situations fall back to the soft resolution which + * mainly depends on the inode_path() function. And for using + * inode_path, @inode has to be linked i.e. a successful lookup should + * have happened on the gfid (or the path) to link the inode to the + * inode table. And if @inode is NULL, means, the inode has not been + * found in the inode table and better not to do inode_path() on the + * inode which has not been linked. + */ + if (ret < 0 && inode) { + ret = syncop_gfid_to_path_hard(parent->table, FIRST_CHILD(this), gfid, + inode, path, _gf_false); + if (ret < 0) + gf_smsg(this->name, GF_LOG_WARNING, 0, BRS_MSG_PATH_GET_FAILED, + "from-memory gfid=%s", uuid_utoa_r(gfid, gfid_str), NULL); + } + +out: + return ret; +} + +/** + * NOTE: If the file has multiple hardlinks (in gluster volume + * namespace), the path would be one of the hardlinks. Its up to + * the user to find the remaining hardlinks (using find -samefile) + * and remove them. + **/ +void +br_stub_entry_xattr_fill(xlator_t *this, char *hpath, gf_dirent_t *entry, + dict_t *dict) +{ + int32_t ret = -1; + + GF_VALIDATE_OR_GOTO("bit-rot-stub", this, out); + GF_VALIDATE_OR_GOTO(this->name, hpath, out); + + /* + * Use the entry->d_name (which is nothing but the gfid of the + * corrupted object) as the key. And the value will be the actual + * path of that object (or file). + * + * ALso ignore the dict_set errors. scrubber will get the gfid of + * the corrupted object for sure. So, for now lets just log the + * dict_set_dynstr failure and move on. + */ + + ret = dict_set_dynstr(dict, entry->d_name, hpath); + if (ret) + gf_smsg(this->name, GF_LOG_WARNING, 0, BRS_MSG_DICT_SET_FAILED, + "path=%s", hpath, "object-name=%s", entry->d_name, NULL); +out: + return; +} diff --git a/xlators/features/bit-rot/src/stub/bit-rot-stub-mem-types.h b/xlators/features/bit-rot/src/stub/bit-rot-stub-mem-types.h new file mode 100644 index 00000000000..9d93caf069f --- /dev/null +++ b/xlators/features/bit-rot/src/stub/bit-rot-stub-mem-types.h @@ -0,0 +1,36 @@ +/* + Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef _BR_MEM_TYPES_H +#define _BR_MEM_TYPES_H + +#include <glusterfs/mem-types.h> + +enum br_mem_types { + gf_br_stub_mt_private_t = gf_common_mt_end + 1, + gf_br_stub_mt_version_t, + gf_br_stub_mt_inode_ctx_t, + gf_br_stub_mt_signature_t, + gf_br_mt_br_private_t, + gf_br_mt_br_child_t, + gf_br_mt_br_object_t, + gf_br_mt_br_ob_n_wk_t, + gf_br_mt_br_scrubber_t, + gf_br_mt_br_fsscan_entry_t, + gf_br_stub_mt_br_stub_fd_t, + gf_br_stub_mt_br_scanner_freq_t, + gf_br_stub_mt_sigstub_t, + gf_br_mt_br_child_event_t, + gf_br_stub_mt_misc, + gf_br_mt_br_worker_t, + gf_br_stub_mt_end, +}; + +#endif diff --git a/xlators/features/bit-rot/src/stub/bit-rot-stub-messages.h b/xlators/features/bit-rot/src/stub/bit-rot-stub-messages.h new file mode 100644 index 00000000000..6c15a166f18 --- /dev/null +++ b/xlators/features/bit-rot/src/stub/bit-rot-stub-messages.h @@ -0,0 +1,117 @@ +/* + Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. + */ + +#ifndef _BITROT_STUB_MESSAGES_H_ +#define _BITROT_STUB_MESSAGES_H_ + +#include <glusterfs/glfs-message-id.h> + +/* To add new message IDs, append new identifiers at the end of the list. + * + * Never remove a message ID. If it's not used anymore, you can rename it or + * leave it as it is, but not delete it. This is to prevent reutilization of + * IDs by other messages. + * + * The component name must match one of the entries defined in + * glfs-message-id.h. + */ + +GLFS_MSGID(BITROT_STUB, BRS_MSG_NO_MEMORY, BRS_MSG_SET_EVENT_FAILED, + BRS_MSG_MEM_ACNT_FAILED, BRS_MSG_CREATE_FRAME_FAILED, + BRS_MSG_SET_CONTEXT_FAILED, BRS_MSG_CHANGE_VERSION_FAILED, + BRS_MSG_ADD_FD_TO_LIST_FAILED, BRS_MSG_SET_FD_CONTEXT_FAILED, + BRS_MSG_CREATE_ANONYMOUS_FD_FAILED, BRS_MSG_NO_CHILD, + BRS_MSG_STUB_ALLOC_FAILED, BRS_MSG_GET_INODE_CONTEXT_FAILED, + BRS_MSG_CANCEL_SIGN_THREAD_FAILED, BRS_MSG_ADD_FD_TO_INODE, + BRS_MSG_SIGN_VERSION_ERROR, BRS_MSG_BAD_OBJ_MARK_FAIL, + BRS_MSG_NON_SCRUB_BAD_OBJ_MARK, BRS_MSG_REMOVE_INTERNAL_XATTR, + BRS_MSG_SET_INTERNAL_XATTR, BRS_MSG_BAD_OBJECT_ACCESS, + BRS_MSG_BAD_CONTAINER_FAIL, BRS_MSG_BAD_OBJECT_DIR_FAIL, + BRS_MSG_BAD_OBJECT_DIR_SEEK_FAIL, BRS_MSG_BAD_OBJECT_DIR_TELL_FAIL, + BRS_MSG_BAD_OBJECT_DIR_READ_FAIL, BRS_MSG_GET_FD_CONTEXT_FAILED, + BRS_MSG_BAD_HANDLE_DIR_NULL, BRS_MSG_BAD_OBJ_THREAD_FAIL, + BRS_MSG_BAD_OBJ_DIR_CLOSE_FAIL, BRS_MSG_LINK_FAIL, + BRS_MSG_BAD_OBJ_UNLINK_FAIL, BRS_MSG_DICT_SET_FAILED, + BRS_MSG_PATH_GET_FAILED, BRS_MSG_NULL_LOCAL, + BRS_MSG_SPAWN_SIGN_THRD_FAILED, BRS_MSG_KILL_SIGN_THREAD, + BRS_MSG_NON_BITD_PID, BRS_MSG_SIGN_PREPARE_FAIL, + BRS_MSG_USING_DEFAULT_THREAD_SIZE, BRS_MSG_ALLOC_MEM_FAILED, + BRS_MSG_DICT_ALLOC_FAILED, BRS_MSG_CREATE_GF_DIRENT_FAILED, + BRS_MSG_ALLOC_FAILED, BRS_MSG_PATH_XATTR_GET_FAILED, + BRS_MSG_VERSION_PREPARE_FAIL); + +#define BRS_MSG_MEM_ACNT_FAILED_STR "Memory accounting init failed" +#define BRS_MSG_BAD_OBJ_THREAD_FAIL_STR "pthread_init failed" +#define BRS_MSG_USING_DEFAULT_THREAD_SIZE_STR "Using default thread stack size" +#define BRS_MSG_NO_CHILD_STR "FATAL: no children" +#define BRS_MSG_SPAWN_SIGN_THRD_FAILED_STR \ + "failed to create the new thread for signer" +#define BRS_MSG_BAD_CONTAINER_FAIL_STR \ + "failed to launch the thread for storing bad gfids" +#define BRS_MSG_CANCEL_SIGN_THREAD_FAILED_STR \ + "Could not cancel sign serializer thread" +#define BRS_MSG_KILL_SIGN_THREAD_STR "killed the signer thread" +#define BRS_MSG_GET_INODE_CONTEXT_FAILED_STR \ + "failed to init the inode context for the inode" +#define BRS_MSG_ADD_FD_TO_INODE_STR "failed to add fd to the inode" +#define BRS_MSG_NO_MEMORY_STR "local allocation failed" +#define BRS_MSG_BAD_OBJECT_ACCESS_STR "bad object accessed. Returning" +#define BRS_MSG_SIGN_VERSION_ERROR_STR "Signing version exceeds current version" +#define BRS_MSG_NON_BITD_PID_STR \ + "PID from where signature request came, does not belong to bit-rot " \ + "daemon. Unwinding the fop" +#define BRS_MSG_SIGN_PREPARE_FAIL_STR \ + "failed to prepare the signature. Unwinding the fop" +#define BRS_MSG_VERSION_PREPARE_FAIL_STR \ + "failed to prepare the version. Unwinding the fop" +#define BRS_MSG_STUB_ALLOC_FAILED_STR "failed to allocate stub fop, Unwinding" +#define BRS_MSG_BAD_OBJ_MARK_FAIL_STR "failed to mark object as bad" +#define BRS_MSG_NON_SCRUB_BAD_OBJ_MARK_STR \ + "bad object marking is not from the scrubber" +#define BRS_MSG_ALLOC_MEM_FAILED_STR "failed to allocate memory" +#define BRS_MSG_SET_INTERNAL_XATTR_STR "called on the internal xattr" +#define BRS_MSG_REMOVE_INTERNAL_XATTR_STR "removexattr called on internal xattr" +#define BRS_MSG_CREATE_ANONYMOUS_FD_FAILED_STR \ + "failed to create anonymous fd for the inode" +#define BRS_MSG_ADD_FD_TO_LIST_FAILED_STR "failed add fd to the list" +#define BRS_MSG_SET_FD_CONTEXT_FAILED_STR \ + "failed to set the fd context for the file" +#define BRS_MSG_NULL_LOCAL_STR "local is NULL" +#define BRS_MSG_DICT_ALLOC_FAILED_STR \ + "dict allocation failed: cannot send IPC FOP to changelog" +#define BRS_MSG_SET_EVENT_FAILED_STR "cannot set release event in dict" +#define BRS_MSG_CREATE_FRAME_FAILED_STR "create_frame() failure" +#define BRS_MSG_BAD_OBJ_DIR_CLOSE_FAIL_STR "closedir error" +#define BRS_MSG_LINK_FAIL_STR "failed to record gfid" +#define BRS_MSG_BAD_OBJ_UNLINK_FAIL_STR \ + "failed to delete bad object link from quaratine directory" +#define BRS_MSG_BAD_OBJECT_DIR_FAIL_STR "failed stub directory" +#define BRS_MSG_BAD_OBJECT_DIR_SEEK_FAIL_STR \ + "seekdir failed. Invalid argument (offset reused from another DIR * " \ + "structure)" +#define BRS_MSG_BAD_OBJECT_DIR_TELL_FAIL_STR "telldir failed on dir" +#define BRS_MSG_BAD_OBJECT_DIR_READ_FAIL_STR "readdir failed on dir" +#define BRS_MSG_CREATE_GF_DIRENT_FAILED_STR "could not create gf_dirent" +#define BRS_MSG_GET_FD_CONTEXT_FAILED_STR "pfd is NULL" +#define BRS_MSG_BAD_HANDLE_DIR_NULL_STR "dir if NULL" +#define BRS_MSG_ALLOC_FAILED_STR \ + "failed to allocate new dict for saving the paths of the corrupted " \ + "objects. Scrub status will only display the gfid" +#define BRS_MSG_PATH_GET_FAILED_STR "failed to get the path" +#define BRS_MSG_PATH_XATTR_GET_FAILED_STR \ + "failed to get the path xattr from disk for the gfid. Trying to get path " \ + "from the memory" +#define BRS_MSG_DICT_SET_FAILED_STR \ + "failed to set the actual path as the value in the dict for the " \ + "corrupted object" +#define BRS_MSG_SET_CONTEXT_FAILED_STR \ + "could not set fd context for release callback" +#define BRS_MSG_CHANGE_VERSION_FAILED_STR "change version failed" +#endif /* !_BITROT_STUB_MESSAGES_H_ */ diff --git a/xlators/features/bit-rot/src/stub/bit-rot-stub.c b/xlators/features/bit-rot/src/stub/bit-rot-stub.c new file mode 100644 index 00000000000..447dd47ff41 --- /dev/null +++ b/xlators/features/bit-rot/src/stub/bit-rot-stub.c @@ -0,0 +1,3590 @@ +/* + Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#include <ctype.h> +#include <sys/uio.h> +#include <signal.h> + +#include <glusterfs/glusterfs.h> +#include <glusterfs/logging.h> +#include "changelog.h" +#include <glusterfs/compat-errno.h> +#include <glusterfs/call-stub.h> + +#include "bit-rot-stub.h" +#include "bit-rot-stub-mem-types.h" +#include "bit-rot-stub-messages.h" +#include "bit-rot-common.h" + +#define BR_STUB_REQUEST_COOKIE 0x1 + +void +br_stub_lock_cleaner(void *arg) +{ + pthread_mutex_t *clean_mutex = arg; + + pthread_mutex_unlock(clean_mutex); + return; +} + +void * +br_stub_signth(void *); + +struct br_stub_signentry { + unsigned long v; + + call_stub_t *stub; + + struct list_head list; +}; + +int32_t +mem_acct_init(xlator_t *this) +{ + int32_t ret = -1; + + if (!this) + return ret; + + ret = xlator_mem_acct_init(this, gf_br_stub_mt_end + 1); + + if (ret != 0) { + gf_smsg(this->name, GF_LOG_WARNING, 0, BRS_MSG_MEM_ACNT_FAILED, NULL); + return ret; + } + + return ret; +} + +int +br_stub_bad_object_container_init(xlator_t *this, br_stub_private_t *priv) +{ + pthread_attr_t w_attr; + int ret = -1; + + ret = pthread_cond_init(&priv->container.bad_cond, NULL); + if (ret != 0) { + gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_BAD_OBJ_THREAD_FAIL, + "cond_init ret=%d", ret, NULL); + goto out; + } + + ret = pthread_mutex_init(&priv->container.bad_lock, NULL); + if (ret != 0) { + gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_BAD_OBJ_THREAD_FAIL, + "mutex_init ret=%d", ret, NULL); + goto cleanup_cond; + } + + ret = pthread_attr_init(&w_attr); + if (ret != 0) { + gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_BAD_OBJ_THREAD_FAIL, + "attr_init ret=%d", ret, NULL); + goto cleanup_lock; + } + + ret = pthread_attr_setstacksize(&w_attr, BAD_OBJECT_THREAD_STACK_SIZE); + if (ret == EINVAL) { + gf_smsg(this->name, GF_LOG_WARNING, 0, + BRS_MSG_USING_DEFAULT_THREAD_SIZE, NULL); + } + + INIT_LIST_HEAD(&priv->container.bad_queue); + ret = br_stub_dir_create(this, priv); + if (ret < 0) + goto cleanup_lock; + + ret = gf_thread_create(&priv->container.thread, &w_attr, br_stub_worker, + this, "brswrker"); + if (ret) + goto cleanup_attr; + + return 0; + +cleanup_attr: + pthread_attr_destroy(&w_attr); +cleanup_lock: + pthread_mutex_destroy(&priv->container.bad_lock); +cleanup_cond: + pthread_cond_destroy(&priv->container.bad_cond); +out: + return -1; +} + +int32_t +init(xlator_t *this) +{ + int ret = 0; + char *tmp = NULL; + struct timeval tv = { + 0, + }; + br_stub_private_t *priv = NULL; + + if (!this->children) { + gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_NO_CHILD, NULL); + goto error_return; + } + + priv = GF_CALLOC(1, sizeof(*priv), gf_br_stub_mt_private_t); + if (!priv) + goto error_return; + + priv->local_pool = mem_pool_new(br_stub_local_t, 512); + if (!priv->local_pool) + goto free_priv; + + GF_OPTION_INIT("bitrot", priv->do_versioning, bool, free_mempool); + + GF_OPTION_INIT("export", tmp, str, free_mempool); + + if (snprintf(priv->export, PATH_MAX, "%s", tmp) >= PATH_MAX) + goto free_mempool; + + if (snprintf(priv->stub_basepath, sizeof(priv->stub_basepath), "%s/%s", + priv->export, + BR_STUB_QUARANTINE_DIR) >= sizeof(priv->stub_basepath)) + goto free_mempool; + + (void)gettimeofday(&tv, NULL); + + /* boot time is in network endian format */ + priv->boot[0] = htonl(tv.tv_sec); + priv->boot[1] = htonl(tv.tv_usec); + + pthread_mutex_init(&priv->lock, NULL); + pthread_cond_init(&priv->cond, NULL); + INIT_LIST_HEAD(&priv->squeue); + + /* Thread creations need 'this' to be passed so that THIS can be + * assigned inside the thread. So setting this->private here. + */ + this->private = priv; + if (!priv->do_versioning) + return 0; + + ret = gf_thread_create(&priv->signth, NULL, br_stub_signth, this, + "brssign"); + if (ret != 0) { + gf_smsg(this->name, GF_LOG_WARNING, 0, BRS_MSG_SPAWN_SIGN_THRD_FAILED, + NULL); + goto cleanup_lock; + } + + ret = br_stub_bad_object_container_init(this, priv); + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_BAD_CONTAINER_FAIL, NULL); + goto cleanup_lock; + } + + gf_msg_debug(this->name, 0, "bit-rot stub loaded"); + + return 0; + +cleanup_lock: + pthread_cond_destroy(&priv->cond); + pthread_mutex_destroy(&priv->lock); +free_mempool: + mem_pool_destroy(priv->local_pool); + priv->local_pool = NULL; +free_priv: + GF_FREE(priv); + this->private = NULL; +error_return: + return -1; +} + +/* TODO: + * As of now enabling bitrot option does 2 things. + * 1) Start the Bitrot Daemon which signs the objects (currently files only) + * upon getting notified by the stub. + * 2) Enable versioning of the objects. Object versions (again files only) are + * incremented upon modification. + * So object versioning is tied to bitrot daemon's signing. In future, object + * versioning might be necessary for other things as well apart from bit-rot + * detection (well that's the objective of bringing in object-versioning :)). + * In that case, better to make versioning a new option and letting it to be + * enabled despite bit-rot detection is not needed. + * Ex: ICAP. + */ +int32_t +reconfigure(xlator_t *this, dict_t *options) +{ + int32_t ret = -1; + br_stub_private_t *priv = NULL; + + priv = this->private; + + GF_OPTION_RECONF("bitrot", priv->do_versioning, options, bool, err); + if (priv->do_versioning && !priv->signth) { + ret = gf_thread_create(&priv->signth, NULL, br_stub_signth, this, + "brssign"); + if (ret != 0) { + gf_smsg(this->name, GF_LOG_WARNING, 0, + BRS_MSG_SPAWN_SIGN_THRD_FAILED, NULL); + goto err; + } + + ret = br_stub_bad_object_container_init(this, priv); + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_BAD_CONTAINER_FAIL, + NULL); + goto err; + } + } else { + if (priv->signth) { + if (gf_thread_cleanup_xint(priv->signth)) { + gf_smsg(this->name, GF_LOG_ERROR, 0, + BRS_MSG_CANCEL_SIGN_THREAD_FAILED, NULL); + } else { + gf_smsg(this->name, GF_LOG_INFO, 0, BRS_MSG_KILL_SIGN_THREAD, + NULL); + priv->signth = 0; + } + } + + if (priv->container.thread) { + if (gf_thread_cleanup_xint(priv->container.thread)) { + gf_smsg(this->name, GF_LOG_ERROR, 0, + BRS_MSG_CANCEL_SIGN_THREAD_FAILED, NULL); + } + priv->container.thread = 0; + } + } + + ret = 0; + return ret; +err: + if (priv->signth) { + if (gf_thread_cleanup_xint(priv->signth)) { + gf_smsg(this->name, GF_LOG_ERROR, 0, + BRS_MSG_CANCEL_SIGN_THREAD_FAILED, NULL); + } + priv->signth = 0; + } + + if (priv->container.thread) { + if (gf_thread_cleanup_xint(priv->container.thread)) { + gf_smsg(this->name, GF_LOG_ERROR, 0, + BRS_MSG_CANCEL_SIGN_THREAD_FAILED, NULL); + } + priv->container.thread = 0; + } + ret = -1; + return ret; +} + +int +notify(xlator_t *this, int event, void *data, ...) +{ + br_stub_private_t *priv = NULL; + + if (!this) + return 0; + + priv = this->private; + if (!priv) + return 0; + + default_notify(this, event, data); + return 0; +} + +void +fini(xlator_t *this) +{ + int32_t ret = 0; + br_stub_private_t *priv = this->private; + struct br_stub_signentry *sigstub = NULL; + call_stub_t *stub = NULL; + + if (!priv) + return; + + if (!priv->do_versioning) + goto cleanup; + + ret = gf_thread_cleanup_xint(priv->signth); + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_CANCEL_SIGN_THREAD_FAILED, + NULL); + goto out; + } + priv->signth = 0; + + while (!list_empty(&priv->squeue)) { + sigstub = list_first_entry(&priv->squeue, struct br_stub_signentry, + list); + list_del_init(&sigstub->list); + + call_stub_destroy(sigstub->stub); + GF_FREE(sigstub); + } + + ret = gf_thread_cleanup_xint(priv->container.thread); + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_CANCEL_SIGN_THREAD_FAILED, + NULL); + goto out; + } + + priv->container.thread = 0; + + while (!list_empty(&priv->container.bad_queue)) { + stub = list_first_entry(&priv->container.bad_queue, call_stub_t, list); + list_del_init(&stub->list); + call_stub_destroy(stub); + } + + pthread_mutex_destroy(&priv->container.bad_lock); + pthread_cond_destroy(&priv->container.bad_cond); + +cleanup: + pthread_mutex_destroy(&priv->lock); + pthread_cond_destroy(&priv->cond); + + if (priv->local_pool) { + mem_pool_destroy(priv->local_pool); + priv->local_pool = NULL; + } + + this->private = NULL; + GF_FREE(priv); + +out: + return; +} + +static int +br_stub_alloc_versions(br_version_t **obuf, br_signature_t **sbuf, + size_t signaturelen) +{ + void *mem = NULL; + size_t size = 0; + + if (obuf) + size += sizeof(br_version_t); + if (sbuf) + size += sizeof(br_signature_t) + signaturelen; + + mem = GF_CALLOC(1, size, gf_br_stub_mt_version_t); + if (!mem) + goto error_return; + + if (obuf) { + *obuf = (br_version_t *)mem; + mem = ((char *)mem + sizeof(br_version_t)); + } + if (sbuf) { + *sbuf = (br_signature_t *)mem; + } + + return 0; + +error_return: + return -1; +} + +static void +br_stub_dealloc_versions(void *mem) +{ + GF_FREE(mem); +} + +static br_stub_local_t * +br_stub_alloc_local(xlator_t *this) +{ + br_stub_private_t *priv = this->private; + + return mem_get0(priv->local_pool); +} + +static void +br_stub_dealloc_local(br_stub_local_t *ptr) +{ + if (!ptr) + return; + + mem_put(ptr); +} + +static int +br_stub_prepare_version_request(xlator_t *this, dict_t *dict, + br_version_t *obuf, unsigned long oversion) +{ + br_stub_private_t *priv = NULL; + + priv = this->private; + br_set_ongoingversion(obuf, oversion, priv->boot); + + return dict_set_bin(dict, BITROT_CURRENT_VERSION_KEY, (void *)obuf, + sizeof(br_version_t)); +} + +static int +br_stub_prepare_signing_request(dict_t *dict, br_signature_t *sbuf, + br_isignature_t *sign, size_t signaturelen) +{ + size_t size = 0; + + br_set_signature(sbuf, sign, signaturelen, &size); + + return dict_set_bin(dict, BITROT_SIGNING_VERSION_KEY, (void *)sbuf, size); +} + +/** + * initialize an inode context starting with a given ongoing version. + * a fresh lookup() or a first creat() call initializes the inode + * context, hence the inode is marked dirty. this routine also + * initializes the transient inode version. + */ +static int +br_stub_init_inode_versions(xlator_t *this, fd_t *fd, inode_t *inode, + unsigned long version, gf_boolean_t markdirty, + gf_boolean_t bad_object, uint64_t *ctx_addr) +{ + int32_t ret = 0; + br_stub_inode_ctx_t *ctx = NULL; + + ctx = GF_CALLOC(1, sizeof(br_stub_inode_ctx_t), gf_br_stub_mt_inode_ctx_t); + if (!ctx) + goto error_return; + + INIT_LIST_HEAD(&ctx->fd_list); + (markdirty) ? __br_stub_mark_inode_dirty(ctx) + : __br_stub_mark_inode_synced(ctx); + __br_stub_set_ongoing_version(ctx, version); + + if (bad_object) + __br_stub_mark_object_bad(ctx); + + if (fd) { + ret = br_stub_add_fd_to_inode(this, fd, ctx); + if (ret) + goto free_ctx; + } + + ret = br_stub_set_inode_ctx(this, inode, ctx); + if (ret) + goto free_ctx; + + if (ctx_addr) + *ctx_addr = (uint64_t)(uintptr_t)ctx; + return 0; + +free_ctx: + GF_FREE(ctx); +error_return: + return -1; +} + +/** + * modify the ongoing version of an inode. + */ +static int +br_stub_mod_inode_versions(xlator_t *this, fd_t *fd, inode_t *inode, + unsigned long version) +{ + int32_t ret = -1; + br_stub_inode_ctx_t *ctx = 0; + + LOCK(&inode->lock); + { + ctx = __br_stub_get_ongoing_version_ctx(this, inode, NULL); + if (ctx == NULL) + goto unblock; + if (__br_stub_is_inode_dirty(ctx)) { + __br_stub_set_ongoing_version(ctx, version); + __br_stub_mark_inode_synced(ctx); + } + + ret = 0; + } +unblock: + UNLOCK(&inode->lock); + + return ret; +} + +static void +br_stub_fill_local(br_stub_local_t *local, call_stub_t *stub, fd_t *fd, + inode_t *inode, uuid_t gfid, int versioningtype, + unsigned long memversion) +{ + local->fopstub = stub; + local->versioningtype = versioningtype; + local->u.context.version = memversion; + if (fd) + local->u.context.fd = fd_ref(fd); + if (inode) + local->u.context.inode = inode_ref(inode); + gf_uuid_copy(local->u.context.gfid, gfid); +} + +static void +br_stub_cleanup_local(br_stub_local_t *local) +{ + if (!local) + return; + + local->fopstub = NULL; + local->versioningtype = 0; + local->u.context.version = 0; + if (local->u.context.fd) { + fd_unref(local->u.context.fd); + local->u.context.fd = NULL; + } + if (local->u.context.inode) { + inode_unref(local->u.context.inode); + local->u.context.inode = NULL; + } + memset(local->u.context.gfid, '\0', sizeof(uuid_t)); +} + +static int +br_stub_need_versioning(xlator_t *this, fd_t *fd, gf_boolean_t *versioning, + gf_boolean_t *modified, br_stub_inode_ctx_t **ctx) +{ + int32_t ret = -1; + uint64_t ctx_addr = 0; + br_stub_inode_ctx_t *c = NULL; + unsigned long version = BITROT_DEFAULT_CURRENT_VERSION; + + *versioning = _gf_false; + *modified = _gf_false; + + /* Bitrot stub inode context was initialized only in lookup, create + * and mknod cbk path. Object versioning was enabled by default + * irrespective of bitrot enabled or not. But it's made optional now. + * As a consequence there could be cases where getting inode ctx would + * fail because it's not set yet. + * e.g., If versioning (with bitrot enable) is enabled while I/O is + * happening, it could directly get other fops like writev without + * lookup, where getting inode ctx would fail. Hence initialize the + * inode ctx on failure to get ctx. This is done in all places where + * applicable. + */ + ret = br_stub_get_inode_ctx(this, fd->inode, &ctx_addr); + if (ret < 0) { + ret = br_stub_init_inode_versions(this, fd, fd->inode, version, + _gf_true, _gf_false, &ctx_addr); + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, 0, + BRS_MSG_GET_INODE_CONTEXT_FAILED, "gfid=%s", + uuid_utoa(fd->inode->gfid), NULL); + goto error_return; + } + } + + c = (br_stub_inode_ctx_t *)(long)ctx_addr; + + LOCK(&fd->inode->lock); + { + if (__br_stub_is_inode_dirty(c)) + *versioning = _gf_true; + if (__br_stub_is_inode_modified(c)) + *modified = _gf_true; + } + UNLOCK(&fd->inode->lock); + + if (ctx) + *ctx = c; + return 0; + +error_return: + return -1; +} + +static int32_t +br_stub_anon_fd_ctx(xlator_t *this, fd_t *fd, br_stub_inode_ctx_t *ctx) +{ + int32_t ret = -1; + br_stub_fd_t *br_stub_fd = NULL; + + br_stub_fd = br_stub_fd_ctx_get(this, fd); + if (!br_stub_fd) { + ret = br_stub_add_fd_to_inode(this, fd, ctx); + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_ADD_FD_TO_INODE, + "gfid=%s", uuid_utoa(fd->inode->gfid), NULL); + goto out; + } + } + + ret = 0; + +out: + return ret; +} + +static int +br_stub_versioning_prep(call_frame_t *frame, xlator_t *this, fd_t *fd, + br_stub_inode_ctx_t *ctx) +{ + int32_t ret = -1; + br_stub_local_t *local = NULL; + + local = br_stub_alloc_local(this); + if (!local) { + gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, BRS_MSG_NO_MEMORY, "gfid=%s", + uuid_utoa(fd->inode->gfid), NULL); + goto error_return; + } + + if (fd_is_anonymous(fd)) { + ret = br_stub_anon_fd_ctx(this, fd, ctx); + if (ret) + goto free_local; + } + + frame->local = local; + + return 0; + +free_local: + br_stub_dealloc_local(local); +error_return: + return -1; +} + +static int +br_stub_mark_inode_modified(xlator_t *this, br_stub_local_t *local) +{ + fd_t *fd = NULL; + int32_t ret = 0; + uint64_t ctx_addr = 0; + br_stub_inode_ctx_t *ctx = NULL; + unsigned long version = BITROT_DEFAULT_CURRENT_VERSION; + + fd = local->u.context.fd; + + ret = br_stub_get_inode_ctx(this, fd->inode, &ctx_addr); + if (ret < 0) { + ret = br_stub_init_inode_versions(this, fd, fd->inode, version, + _gf_true, _gf_false, &ctx_addr); + if (ret) + goto error_return; + } + + ctx = (br_stub_inode_ctx_t *)(long)ctx_addr; + + LOCK(&fd->inode->lock); + { + __br_stub_set_inode_modified(ctx); + } + UNLOCK(&fd->inode->lock); + + return 0; + +error_return: + return -1; +} + +/** + * The possible return values from br_stub_is_bad_object () are: + * 1) 0 => as per the inode context object is not bad + * 2) -1 => Failed to get the inode context itself + * 3) -2 => As per the inode context object is bad + * Both -ve values means the fop which called this function is failed + * and error is returned upwards. + */ +static int +br_stub_check_bad_object(xlator_t *this, inode_t *inode, int32_t *op_ret, + int32_t *op_errno) +{ + int ret = -1; + unsigned long version = BITROT_DEFAULT_CURRENT_VERSION; + + ret = br_stub_is_bad_object(this, inode); + if (ret == -2) { + gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_BAD_OBJECT_ACCESS, + "gfid=%s", uuid_utoa(inode->gfid), NULL); + *op_ret = -1; + *op_errno = EIO; + } + + if (ret == -1) { + ret = br_stub_init_inode_versions(this, NULL, inode, version, _gf_true, + _gf_false, NULL); + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, 0, + BRS_MSG_GET_INODE_CONTEXT_FAILED, "gfid=%s", + uuid_utoa(inode->gfid), NULL); + *op_ret = -1; + *op_errno = EINVAL; + } + } + + return ret; +} + +/** + * callback for inode/fd versioning + */ +int +br_stub_fd_incversioning_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xdata) +{ + fd_t *fd = NULL; + inode_t *inode = NULL; + unsigned long version = 0; + br_stub_local_t *local = NULL; + + local = (br_stub_local_t *)frame->local; + if (op_ret < 0) + goto done; + fd = local->u.context.fd; + inode = local->u.context.inode; + version = local->u.context.version; + + op_ret = br_stub_mod_inode_versions(this, fd, inode, version); + if (op_ret < 0) + op_errno = EINVAL; + +done: + if (op_ret < 0) { + frame->local = NULL; + call_unwind_error(local->fopstub, -1, op_errno); + br_stub_cleanup_local(local); + br_stub_dealloc_local(local); + } else { + call_resume(local->fopstub); + } + return 0; +} + +/** + * Initial object versioning + * + * Version persists two (2) extended attributes as explained below: + * 1. Current (ongoing) version: This is incremented on an writev () + * or truncate () and is the running version for an object. + * 2. Signing version: This is the version against which an object + * was signed (checksummed). + * + * During initial versioning, both ongoing and signing versions are + * set of one and zero respectively. A write() call increments the + * ongoing version as an indication of modification to the object. + * Additionally this needs to be persisted on disk and needs to be + * durable: fsync().. :-/ + * As an optimization only the first write() synchronizes the ongoing + * version to disk, subsequent write()s before the *last* release() + * are no-op's. + * + * create(), just like lookup() initializes the object versions to + * the default. As an optimization this is not a durable operation: + * in case of a crash, hard reboot etc.. absence of versioning xattrs + * is ignored in scrubber along with the one time crawler explicitly + * triggering signing for such objects. + * + * c.f. br_stub_writev() / br_stub_truncate() + */ + +/** + * perform full or incremental versioning on an inode pointd by an + * fd. incremental versioning is done when an inode is dirty and a + * writeback is triggered. + */ + +int +br_stub_fd_versioning(xlator_t *this, call_frame_t *frame, call_stub_t *stub, + dict_t *dict, fd_t *fd, br_stub_version_cbk *callback, + unsigned long memversion, int versioningtype, int durable) +{ + int32_t ret = -1; + int flags = 0; + dict_t *xdata = NULL; + br_stub_local_t *local = NULL; + + xdata = dict_new(); + if (!xdata) + goto done; + + ret = dict_set_int32(xdata, GLUSTERFS_INTERNAL_FOP_KEY, 1); + if (ret) + goto dealloc_xdata; + + if (durable) { + ret = dict_set_int32(xdata, GLUSTERFS_DURABLE_OP, 0); + if (ret) + goto dealloc_xdata; + } + + local = frame->local; + + br_stub_fill_local(local, stub, fd, fd->inode, fd->inode->gfid, + versioningtype, memversion); + + STACK_WIND(frame, callback, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsetxattr, fd, dict, flags, xdata); + + ret = 0; + +dealloc_xdata: + dict_unref(xdata); +done: + return ret; +} + +static int +br_stub_perform_incversioning(xlator_t *this, call_frame_t *frame, + call_stub_t *stub, fd_t *fd, + br_stub_inode_ctx_t *ctx) +{ + int32_t ret = -1; + dict_t *dict = NULL; + br_version_t *obuf = NULL; + unsigned long writeback_version = 0; + int op_errno = 0; + br_stub_local_t *local = NULL; + + op_errno = EINVAL; + local = frame->local; + + writeback_version = __br_stub_writeback_version(ctx); + + op_errno = ENOMEM; + dict = dict_new(); + if (!dict) + goto out; + ret = br_stub_alloc_versions(&obuf, NULL, 0); + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_ALLOC_MEM_FAILED, + "gfid=%s", uuid_utoa(fd->inode->gfid), NULL); + goto out; + } + ret = br_stub_prepare_version_request(this, dict, obuf, writeback_version); + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_VERSION_PREPARE_FAIL, + "gfid=%s", uuid_utoa(fd->inode->gfid), NULL); + br_stub_dealloc_versions(obuf); + goto out; + } + + ret = br_stub_fd_versioning( + this, frame, stub, dict, fd, br_stub_fd_incversioning_cbk, + writeback_version, BR_STUB_INCREMENTAL_VERSIONING, !WRITEBACK_DURABLE); +out: + if (dict) + dict_unref(dict); + if (ret) { + if (local) + frame->local = NULL; + call_unwind_error(stub, -1, op_errno); + if (local) { + br_stub_cleanup_local(local); + br_stub_dealloc_local(local); + } + } + + return ret; +} + +/** {{{ */ + +/* fsetxattr() */ + +int32_t +br_stub_perform_objsign(call_frame_t *frame, xlator_t *this, fd_t *fd, + dict_t *dict, int flags, dict_t *xdata) +{ + STACK_WIND(frame, default_fsetxattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsetxattr, fd, dict, flags, xdata); + + dict_unref(xdata); + return 0; +} + +void * +br_stub_signth(void *arg) +{ + xlator_t *this = arg; + br_stub_private_t *priv = this->private; + struct br_stub_signentry *sigstub = NULL; + + THIS = this; + while (1) { + /* + * Disabling bit-rot feature leads to this particular thread + * getting cleaned up by reconfigure via a call to the function + * gf_thread_cleanup_xint (which in turn calls pthread_cancel + * and pthread_join). But, if this thread had held the mutex + * &priv->lock at the time of cancellation, then it leads to + * deadlock in future when bit-rot feature is enabled (which + * again spawns this thread which cant hold the lock as the + * mutex is still held by the previous instance of the thread + * which got killed). Also, the br_stub_handle_object_signature + * function which is called whenever file has to be signed + * also gets blocked as it too attempts to acquire &priv->lock. + * + * So, arrange for the lock to be unlocked as part of the + * cleanup of this thread using pthread_cleanup_push and + * pthread_cleanup_pop. + */ + pthread_cleanup_push(br_stub_lock_cleaner, &priv->lock); + pthread_mutex_lock(&priv->lock); + { + while (list_empty(&priv->squeue)) + pthread_cond_wait(&priv->cond, &priv->lock); + + sigstub = list_first_entry(&priv->squeue, struct br_stub_signentry, + list); + list_del_init(&sigstub->list); + } + pthread_mutex_unlock(&priv->lock); + pthread_cleanup_pop(0); + + call_resume(sigstub->stub); + + GF_FREE(sigstub); + } + + return NULL; +} + +static gf_boolean_t +br_stub_internal_xattr(dict_t *dict) +{ + if (dict_get(dict, GLUSTERFS_SET_OBJECT_SIGNATURE) || + dict_get(dict, GLUSTERFS_GET_OBJECT_SIGNATURE) || + dict_get(dict, BR_REOPEN_SIGN_HINT_KEY) || + dict_get(dict, BITROT_OBJECT_BAD_KEY) || + dict_get(dict, BITROT_SIGNING_VERSION_KEY) || + dict_get(dict, BITROT_CURRENT_VERSION_KEY)) + return _gf_true; + + return _gf_false; +} + +int +orderq(struct list_head *elem1, struct list_head *elem2) +{ + struct br_stub_signentry *s1 = NULL; + struct br_stub_signentry *s2 = NULL; + + s1 = list_entry(elem1, struct br_stub_signentry, list); + s2 = list_entry(elem2, struct br_stub_signentry, list); + + return (s1->v > s2->v); +} + +static int +br_stub_compare_sign_version(xlator_t *this, inode_t *inode, + br_signature_t *sbuf, dict_t *dict, + int *fakesuccess) +{ + int32_t ret = -1; + uint64_t tmp_ctx = 0; + gf_boolean_t invalid = _gf_false; + br_stub_inode_ctx_t *ctx = NULL; + + GF_VALIDATE_OR_GOTO("bit-rot-stub", this, out); + GF_VALIDATE_OR_GOTO(this->name, inode, out); + GF_VALIDATE_OR_GOTO(this->name, sbuf, out); + GF_VALIDATE_OR_GOTO(this->name, dict, out); + + ret = br_stub_get_inode_ctx(this, inode, &tmp_ctx); + if (ret) { + dict_del(dict, BITROT_SIGNING_VERSION_KEY); + goto out; + } + + ctx = (br_stub_inode_ctx_t *)(long)tmp_ctx; + + LOCK(&inode->lock); + { + if (ctx->currentversion < sbuf->signedversion) { + invalid = _gf_true; + } else if (ctx->currentversion > sbuf->signedversion) { + gf_msg_debug(this->name, 0, + "\"Signing version\" " + "(%lu) lower than \"Current version \" " + "(%lu)", + ctx->currentversion, sbuf->signedversion); + *fakesuccess = 1; + } + } + UNLOCK(&inode->lock); + + if (invalid) { + ret = -1; + gf_smsg(this->name, GF_LOG_WARNING, 0, BRS_MSG_SIGN_VERSION_ERROR, + "Signing-ver=%lu", sbuf->signedversion, "current-ver=%lu", + ctx->currentversion, NULL); + } + +out: + return ret; +} + +static int +br_stub_prepare_signature(xlator_t *this, dict_t *dict, inode_t *inode, + br_isignature_t *sign, int *fakesuccess) +{ + int32_t ret = -1; + size_t signaturelen = 0; + br_signature_t *sbuf = NULL; + + if (!br_is_signature_type_valid(sign->signaturetype)) + goto out; + + signaturelen = sign->signaturelen; + ret = br_stub_alloc_versions(NULL, &sbuf, signaturelen); + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_ALLOC_MEM_FAILED, + "gfid=%s", uuid_utoa(inode->gfid), NULL); + ret = -1; + goto out; + } + ret = br_stub_prepare_signing_request(dict, sbuf, sign, signaturelen); + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_SIGN_PREPARE_FAIL, + "gfid=%s", uuid_utoa(inode->gfid), NULL); + ret = -1; + br_stub_dealloc_versions(sbuf); + goto out; + } + + /* At this point sbuf has been added to dict, so the memory will be freed + * when the data from the dict is destroyed + */ + ret = br_stub_compare_sign_version(this, inode, sbuf, dict, fakesuccess); +out: + return ret; +} + +static void +br_stub_handle_object_signature(call_frame_t *frame, xlator_t *this, fd_t *fd, + dict_t *dict, br_isignature_t *sign, + dict_t *xdata) +{ + int32_t ret = -1; + int32_t op_ret = -1; + int32_t op_errno = EINVAL; + int fakesuccess = 0; + br_stub_private_t *priv = NULL; + struct br_stub_signentry *sigstub = NULL; + + priv = this->private; + + if (frame->root->pid != GF_CLIENT_PID_BITD) { + gf_smsg(this->name, GF_LOG_WARNING, op_errno, BRS_MSG_NON_BITD_PID, + "PID=%d", frame->root->pid, NULL); + goto dofop; + } + + ret = br_stub_prepare_signature(this, dict, fd->inode, sign, &fakesuccess); + if (ret) { + gf_smsg(this->name, GF_LOG_WARNING, 0, BRS_MSG_SIGN_PREPARE_FAIL, + "gfid=%s", uuid_utoa(fd->inode->gfid), NULL); + goto dofop; + } + if (fakesuccess) { + op_ret = op_errno = 0; + goto dofop; + } + + dict_del(dict, GLUSTERFS_SET_OBJECT_SIGNATURE); + + ret = -1; + if (!xdata) { + xdata = dict_new(); + if (!xdata) + goto dofop; + } else { + dict_ref(xdata); + } + + ret = dict_set_int32(xdata, GLUSTERFS_DURABLE_OP, 0); + if (ret) + goto unref_dict; + + /* prepare dispatch stub to order object signing */ + sigstub = GF_CALLOC(1, sizeof(*sigstub), gf_br_stub_mt_sigstub_t); + if (!sigstub) + goto unref_dict; + + INIT_LIST_HEAD(&sigstub->list); + sigstub->v = ntohl(sign->signedversion); + sigstub->stub = fop_fsetxattr_stub(frame, br_stub_perform_objsign, fd, dict, + 0, xdata); + if (!sigstub->stub) + goto cleanup_stub; + + pthread_mutex_lock(&priv->lock); + { + list_add_order(&sigstub->list, &priv->squeue, orderq); + pthread_cond_signal(&priv->cond); + } + pthread_mutex_unlock(&priv->lock); + + return; + +cleanup_stub: + GF_FREE(sigstub); +unref_dict: + dict_unref(xdata); +dofop: + STACK_UNWIND_STRICT(fsetxattr, frame, op_ret, op_errno, NULL); +} + +int32_t +br_stub_fsetxattr_resume(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + int32_t ret = -1; + br_stub_local_t *local = NULL; + + local = frame->local; + frame->local = NULL; + + ret = br_stub_mark_inode_modified(this, local); + if (ret) { + op_ret = -1; + op_errno = EINVAL; + } + + STACK_UNWIND_STRICT(fsetxattr, frame, op_ret, op_errno, xdata); + + br_stub_cleanup_local(local); + br_stub_dealloc_local(local); + + return 0; +} + +/** + * Handles object reopens. Object reopens can be of 3 types. 2 are from + * oneshot crawler and 1 from the regular signer. + * ONESHOT CRAWLER: + * For those objects which were created before bitrot was enabled. oneshow + * crawler crawls the namespace and signs all the objects. It has to do + * the versioning before making bit-rot-stub send a sign notification. + * So it sends fsetxattr with BR_OBJECT_REOPEN as the value. And bit-rot-stub + * upon getting BR_OBJECT_REOPEN value checks if the version has to be + * increased or not. By default the version will be increased. But if the + * object is modified before BR_OBJECT_REOPEN from oneshot crawler, then + * versioning need not be done. In that case simply a success is returned. + * SIGNER: + * Signer wait for 2 minutes upon getting the notification from bit-rot-stub + * and then it sends a dummy write (in reality a fsetxattr) call, to change + * the state of the inode from REOPEN_WAIT to SIGN_QUICK. The funny part here + * is though the inode's state is REOPEN_WAIT, the call sent by signer is + * BR_OBJECT_RESIGN. Once the state is changed to SIGN_QUICK, then yet another + * notification is sent upon release (RESIGN would have happened via fsetxattr, + * so a fd is needed) and the object is signed truly this time. + * There is a challenge in the above RESIGN method by signer. After sending + * the 1st notification, the inode could be forgotten before RESIGN request + * is received. In that case, the inode's context (the newly looked up inode) + * would not indicate the inode as being modified (it would be in the default + * state) and because of this, a SIGN_QUICK notification to truly sign the + * object would not be sent. So, this is how its handled. + * if (request == RESIGN) { + * if (inode->sign_info == NORMAL) { + * mark_inode_non_dirty; + * mark_inode_modified; + * } + * GOBACK (means unwind without doing versioning) + * } + */ +static void +br_stub_handle_object_reopen(call_frame_t *frame, xlator_t *this, fd_t *fd, + uint32_t val) +{ + int32_t ret = -1; + int32_t op_ret = -1; + int32_t op_errno = EINVAL; + call_stub_t *stub = NULL; + gf_boolean_t inc_version = _gf_false; + gf_boolean_t modified = _gf_false; + br_stub_inode_ctx_t *ctx = NULL; + br_stub_local_t *local = NULL; + gf_boolean_t goback = _gf_true; + + ret = br_stub_need_versioning(this, fd, &inc_version, &modified, &ctx); + if (ret) + goto unwind; + + LOCK(&fd->inode->lock); + { + if ((val == BR_OBJECT_REOPEN) && inc_version) + goback = _gf_false; + if (val == BR_OBJECT_RESIGN && ctx->info_sign == BR_SIGN_NORMAL) { + __br_stub_mark_inode_synced(ctx); + __br_stub_set_inode_modified(ctx); + } + (void)__br_stub_inode_sign_state(ctx, GF_FOP_FSETXATTR, fd); + } + UNLOCK(&fd->inode->lock); + + if (goback) { + op_ret = op_errno = 0; + goto unwind; + } + + ret = br_stub_versioning_prep(frame, this, fd, ctx); + if (ret) + goto unwind; + local = frame->local; + + stub = fop_fsetxattr_cbk_stub(frame, br_stub_fsetxattr_resume, 0, 0, NULL); + if (!stub) { + gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_STUB_ALLOC_FAILED, + "fsetxattr gfid=%s", uuid_utoa(fd->inode->gfid), NULL); + goto cleanup_local; + } + + (void)br_stub_perform_incversioning(this, frame, stub, fd, ctx); + return; + +cleanup_local: + br_stub_cleanup_local(local); + br_stub_dealloc_local(local); + +unwind: + frame->local = NULL; + STACK_UNWIND_STRICT(fsetxattr, frame, op_ret, op_errno, NULL); +} + +/** + * This function only handles bad file identification. Instead of checking in + * fops like open, readv, writev whether the object is bad or not by doing + * getxattr calls, better to catch them when scrubber marks it as bad. + * So this callback is called only when the fsetxattr is sent by the scrubber + * to mark the object as bad. + */ +int +br_stub_fsetxattr_bad_object_cbk(call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, + int32_t op_errno, dict_t *xdata) +{ + br_stub_local_t *local = NULL; + int32_t ret = -1; + + local = frame->local; + frame->local = NULL; + + if (op_ret < 0) + goto unwind; + + /* + * What to do if marking the object as bad fails? (i.e. in memory + * marking within the inode context. If we are here means fsetxattr + * fop has succeeded on disk and the bad object xattr has been set). + * We can return failure to scruber, but there is nothing the scrubber + * can do with it (it might assume that the on disk setxattr itself has + * failed). The main purpose of this operation is to help identify the + * bad object by checking the inode context itself (thus avoiding the + * necessity of doing a getxattr fop on the disk). + * + * So as of now, success itself is being returned even though inode + * context set operation fails. + * In future if there is any change in the policy which can handle this, + * then appropriate response should be sent (i.e. success or error). + */ + ret = br_stub_mark_object_bad(this, local->u.context.inode); + if (ret) + gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_BAD_OBJ_MARK_FAIL, + "gfid=%s", uuid_utoa(local->u.context.inode->gfid), NULL); + + ret = br_stub_add(this, local->u.context.inode->gfid); + +unwind: + STACK_UNWIND_STRICT(fsetxattr, frame, op_ret, op_errno, xdata); + br_stub_cleanup_local(local); + br_stub_dealloc_local(local); + return 0; +} + +static int32_t +br_stub_handle_bad_object_key(call_frame_t *frame, xlator_t *this, fd_t *fd, + dict_t *dict, int flags, dict_t *xdata) +{ + br_stub_local_t *local = NULL; + int32_t op_ret = -1; + int32_t op_errno = EINVAL; + + if (frame->root->pid != GF_CLIENT_PID_SCRUB) { + gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_NON_SCRUB_BAD_OBJ_MARK, + "gfid=%s", uuid_utoa(fd->inode->gfid), NULL); + goto unwind; + } + + local = br_stub_alloc_local(this); + if (!local) { + gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_ALLOC_MEM_FAILED, + "fsetxattr gfid=%s", uuid_utoa(fd->inode->gfid), NULL); + op_ret = -1; + op_errno = ENOMEM; + goto unwind; + } + + br_stub_fill_local(local, NULL, fd, fd->inode, fd->inode->gfid, + BR_STUB_NO_VERSIONING, 0); + frame->local = local; + + STACK_WIND(frame, br_stub_fsetxattr_bad_object_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsetxattr, fd, dict, flags, xdata); + return 0; +unwind: + STACK_UNWIND_STRICT(fsetxattr, frame, op_ret, op_errno, NULL); + return 0; +} + +/** + * As of now, versioning is done by the stub (though as a setxattr + * operation) as part of inode modification operations such as writev, + * truncate, ftruncate. And signing is done by BitD by a fsetxattr call. + * So any kind of setxattr coming on the versioning and the signing xattr is + * not allowed (i.e. BITROT_CURRENT_VERSION_KEY and BITROT_SIGNING_VERSION_KEY). + * In future if BitD/scrubber are allowed to change the versioning + * xattrs (though I cannot see a reason for it as of now), then the below + * function can be modified to block setxattr on version for only applications. + * + * NOTE: BitD sends sign request on GLUSTERFS_SET_OBJECT_SIGNATURE key. + * BITROT_SIGNING_VERSION_KEY is the xattr used to save the signature. + * + */ +static int32_t +br_stub_handle_internal_xattr(call_frame_t *frame, xlator_t *this, fd_t *fd, + char *key) +{ + int32_t op_ret = -1; + int32_t op_errno = EINVAL; + + gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_SET_INTERNAL_XATTR, + "setxattr key=%s", key, "inode-gfid=%s", uuid_utoa(fd->inode->gfid), + NULL); + + STACK_UNWIND_STRICT(fsetxattr, frame, op_ret, op_errno, NULL); + return 0; +} + +static void +br_stub_dump_xattr(xlator_t *this, dict_t *dict, int *op_errno) +{ + char *format = "(%s:%s)"; + char *dump = NULL; + + dump = GF_CALLOC(1, BR_STUB_DUMP_STR_SIZE, gf_br_stub_mt_misc); + if (!dump) { + *op_errno = ENOMEM; + goto out; + } + dict_dump_to_str(dict, dump, BR_STUB_DUMP_STR_SIZE, format); + gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_SET_INTERNAL_XATTR, + "fsetxattr dump=%s", dump, NULL); +out: + if (dump) { + GF_FREE(dump); + } + return; +} + +int +br_stub_fsetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict, + int flags, dict_t *xdata) +{ + int32_t ret = 0; + uint32_t val = 0; + br_isignature_t *sign = NULL; + br_stub_private_t *priv = NULL; + int32_t op_ret = -1; + int32_t op_errno = EINVAL; + + priv = this->private; + + if ((frame->root->pid != GF_CLIENT_PID_BITD && + frame->root->pid != GF_CLIENT_PID_SCRUB) && + br_stub_internal_xattr(dict)) { + br_stub_dump_xattr(this, dict, &op_errno); + goto unwind; + } + + if (!priv->do_versioning) + goto wind; + + if (!IA_ISREG(fd->inode->ia_type)) + goto wind; + + /* object signature request */ + ret = dict_get_bin(dict, GLUSTERFS_SET_OBJECT_SIGNATURE, (void **)&sign); + if (!ret) { + gf_msg_debug(this->name, 0, "got SIGNATURE request on %s", + uuid_utoa(fd->inode->gfid)); + br_stub_handle_object_signature(frame, this, fd, dict, sign, xdata); + goto done; + } + + /* signing xattr */ + if (dict_get(dict, BITROT_SIGNING_VERSION_KEY)) { + br_stub_handle_internal_xattr(frame, this, fd, + BITROT_SIGNING_VERSION_KEY); + goto done; + } + + /* version xattr */ + if (dict_get(dict, BITROT_CURRENT_VERSION_KEY)) { + br_stub_handle_internal_xattr(frame, this, fd, + BITROT_CURRENT_VERSION_KEY); + goto done; + } + + if (dict_get(dict, GLUSTERFS_GET_OBJECT_SIGNATURE)) { + br_stub_handle_internal_xattr(frame, this, fd, + GLUSTERFS_GET_OBJECT_SIGNATURE); + goto done; + } + + /* object reopen request */ + ret = dict_get_uint32(dict, BR_REOPEN_SIGN_HINT_KEY, &val); + if (!ret) { + br_stub_handle_object_reopen(frame, this, fd, val); + goto done; + } + + /* handle bad object */ + if (dict_get(dict, BITROT_OBJECT_BAD_KEY)) { + br_stub_handle_bad_object_key(frame, this, fd, dict, flags, xdata); + goto done; + } + +wind: + STACK_WIND(frame, default_fsetxattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsetxattr, fd, dict, flags, xdata); + return 0; + +unwind: + STACK_UNWIND_STRICT(fsetxattr, frame, op_ret, op_errno, NULL); + +done: + return 0; +} + +/** + * Currently BitD and scrubber are doing fsetxattr to either sign the object + * or to mark it as bad. Hence setxattr on any of those keys is denied directly + * without checking from where the fop is coming. + * Later, if BitD or Scrubber does setxattr of those keys, then appropriate + * check has to be added below. + */ +int +br_stub_setxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict, + int flags, dict_t *xdata) +{ + int32_t op_ret = -1; + int32_t op_errno = EINVAL; + + if (br_stub_internal_xattr(dict)) { + br_stub_dump_xattr(this, dict, &op_errno); + goto unwind; + } + + STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->setxattr, + loc, dict, flags, xdata); + return 0; +unwind: + STACK_UNWIND_STRICT(setxattr, frame, op_ret, op_errno, NULL); + return 0; +} + +/** }}} */ + +/** {{{ */ + +/* {f}removexattr() */ + +int32_t +br_stub_removexattr(call_frame_t *frame, xlator_t *this, loc_t *loc, + const char *name, dict_t *xdata) +{ + int32_t op_ret = -1; + int32_t op_errno = EINVAL; + + if (!strcmp(BITROT_OBJECT_BAD_KEY, name) || + !strcmp(BITROT_SIGNING_VERSION_KEY, name) || + !strcmp(BITROT_CURRENT_VERSION_KEY, name)) { + gf_smsg(this->name, GF_LOG_WARNING, 0, BRS_MSG_REMOVE_INTERNAL_XATTR, + "name=%s", name, "file-path=%s", loc->path, NULL); + goto unwind; + } + + STACK_WIND_TAIL(frame, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->removexattr, loc, name, xdata); + return 0; +unwind: + STACK_UNWIND_STRICT(removexattr, frame, op_ret, op_errno, NULL); + return 0; +} + +int32_t +br_stub_fremovexattr(call_frame_t *frame, xlator_t *this, fd_t *fd, + const char *name, dict_t *xdata) +{ + int32_t op_ret = -1; + int32_t op_errno = EINVAL; + + if (!strcmp(BITROT_OBJECT_BAD_KEY, name) || + !strcmp(BITROT_SIGNING_VERSION_KEY, name) || + !strcmp(BITROT_CURRENT_VERSION_KEY, name)) { + gf_smsg(this->name, GF_LOG_WARNING, 0, BRS_MSG_REMOVE_INTERNAL_XATTR, + "name=%s", name, "inode-gfid=%s", uuid_utoa(fd->inode->gfid), + NULL); + goto unwind; + } + + STACK_WIND_TAIL(frame, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fremovexattr, fd, name, xdata); + return 0; +unwind: + STACK_UNWIND_STRICT(fremovexattr, frame, op_ret, op_errno, NULL); + return 0; +} + +/** }}} */ + +/** {{{ */ + +/* {f}getxattr() */ + +int +br_stub_listxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xattr, dict_t *xdata) +{ + if (op_ret < 0) + goto unwind; + + br_stub_remove_vxattrs(xattr, _gf_true); + +unwind: + STACK_UNWIND_STRICT(getxattr, frame, op_ret, op_errno, xattr, xdata); + return 0; +} + +/** + * ONE SHOT CRAWLER from BitD signs the objects that it encounters while + * crawling, if the object is identified as stale by the stub. Stub follows + * the below logic to mark an object as stale or not. + * If the ongoing version and the signed_version match, then the object is not + * stale. Just return. Otherwise if they does not match, then it means one + * of the below things. + * 1) If the inode does not need write back of the version and the sign state is + * is NORMAL, then some active i/o is going on the object. So skip it. + * A notification will be sent to trigger the sign once the release is + * received on the object. + * 2) If inode does not need writeback of the version and the sign state is + * either reopen wait or quick sign, then it means: + * A) BitD restarted and it is not sure whether the object it encountered + * while crawling is in its timer wheel or not. Since there is no way to + * scan the timer wheel as of now, ONE SHOT CRAWLER just goes ahead and + * signs the object. Since the inode does not need writeback, version will + * not be incremented and directly the object will be signed. + * 3) If the inode needs writeback, then it means the inode was forgotten after + * the versioning and it has to be signed now. + * + * This is the algorithm followed: + * if (ongoing_version == signed_version); then + * object_is_not_stale; + * return; + * else; then + * if (!inode_needs_writeback && inode_sign_state != NORMAL); then + * object_is_stale; + * if (inode_needs_writeback); then + * object_is_stale; + * + * For SCRUBBER, no need to check for the sign state and inode writeback. + * If the ondisk ongoingversion and the ondisk signed version does not match, + * then treat the object as stale. + */ +char +br_stub_is_object_stale(xlator_t *this, call_frame_t *frame, inode_t *inode, + br_version_t *obuf, br_signature_t *sbuf) +{ + uint64_t ctx_addr = 0; + br_stub_inode_ctx_t *ctx = NULL; + int32_t ret = -1; + char stale = 0; + + if (obuf->ongoingversion == sbuf->signedversion) + goto out; + + if (frame->root->pid == GF_CLIENT_PID_SCRUB) { + stale = 1; + goto out; + } + + ret = br_stub_get_inode_ctx(this, inode, &ctx_addr); + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_GET_INODE_CONTEXT_FAILED, + "gfid=%s", uuid_utoa(inode->gfid), NULL); + goto out; + } + + ctx = (br_stub_inode_ctx_t *)(long)ctx_addr; + + LOCK(&inode->lock); + { + if ((!__br_stub_is_inode_dirty(ctx) && + ctx->info_sign != BR_SIGN_NORMAL) || + __br_stub_is_inode_dirty(ctx)) + stale = 1; + } + UNLOCK(&inode->lock); + +out: + return stale; +} + +int +br_stub_getxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xattr, dict_t *xdata) +{ + int32_t ret = 0; + size_t totallen = 0; + size_t signaturelen = 0; + br_stub_private_t *priv = NULL; + br_version_t *obuf = NULL; + br_signature_t *sbuf = NULL; + br_isignature_out_t *sign = NULL; + br_vxattr_status_t status; + br_stub_local_t *local = NULL; + inode_t *inode = NULL; + gf_boolean_t bad_object = _gf_false; + gf_boolean_t ver_enabled = _gf_false; + + BR_STUB_VER_ENABLED_IN_CALLPATH(frame, ver_enabled); + priv = this->private; + + if (op_ret < 0) + goto unwind; + BR_STUB_VER_COND_GOTO(priv, (!ver_enabled), delkeys); + + if (cookie != (void *)BR_STUB_REQUEST_COOKIE) + goto unwind; + + local = frame->local; + frame->local = NULL; + if (!local) { + op_ret = -1; + op_errno = EINVAL; + goto unwind; + } + inode = local->u.context.inode; + + op_ret = -1; + status = br_version_xattr_state(xattr, &obuf, &sbuf, &bad_object); + + op_errno = EIO; + if (bad_object) + goto delkeys; + + op_errno = EINVAL; + if (status == BR_VXATTR_STATUS_INVALID) + goto delkeys; + + op_errno = ENODATA; + if ((status == BR_VXATTR_STATUS_MISSING) || + (status == BR_VXATTR_STATUS_UNSIGNED)) + goto delkeys; + + /** + * okay.. we have enough information to satisfy the request, + * namely: version and signing extended attribute. what's + * pending is the signature length -- that's figured out + * indirectly via the size of the _whole_ xattr and the + * on-disk signing xattr header size. + */ + op_errno = EINVAL; + ret = dict_get_uint32(xattr, BITROT_SIGNING_XATTR_SIZE_KEY, + (uint32_t *)&signaturelen); + if (ret) + goto delkeys; + + signaturelen -= sizeof(br_signature_t); + totallen = sizeof(br_isignature_out_t) + signaturelen; + + op_errno = ENOMEM; + sign = GF_CALLOC(1, totallen, gf_br_stub_mt_signature_t); + if (!sign) + goto delkeys; + + sign->time[0] = obuf->timebuf[0]; + sign->time[1] = obuf->timebuf[1]; + + /* Object's dirty state & current signed version */ + sign->version = sbuf->signedversion; + sign->stale = br_stub_is_object_stale(this, frame, inode, obuf, sbuf); + + /* Object's signature */ + sign->signaturelen = signaturelen; + sign->signaturetype = sbuf->signaturetype; + (void)memcpy(sign->signature, sbuf->signature, signaturelen); + + op_errno = EINVAL; + ret = dict_set_bin(xattr, GLUSTERFS_GET_OBJECT_SIGNATURE, (void *)sign, + totallen); + if (ret < 0) { + GF_FREE(sign); + goto delkeys; + } + op_errno = 0; + op_ret = totallen; + +delkeys: + br_stub_remove_vxattrs(xattr, _gf_true); + +unwind: + STACK_UNWIND_STRICT(getxattr, frame, op_ret, op_errno, xattr, xdata); + br_stub_cleanup_local(local); + br_stub_dealloc_local(local); + return 0; +} + +static void +br_stub_send_stub_init_time(call_frame_t *frame, xlator_t *this) +{ + int op_ret = 0; + int op_errno = 0; + dict_t *xattr = NULL; + br_stub_init_t stub = { + { + 0, + }, + }; + br_stub_private_t *priv = NULL; + + priv = this->private; + + xattr = dict_new(); + if (!xattr) { + op_ret = -1; + op_errno = ENOMEM; + goto unwind; + } + + stub.timebuf[0] = priv->boot[0]; + stub.timebuf[1] = priv->boot[1]; + memcpy(stub.export, priv->export, strlen(priv->export) + 1); + + op_ret = dict_set_static_bin(xattr, GLUSTERFS_GET_BR_STUB_INIT_TIME, + (void *)&stub, sizeof(br_stub_init_t)); + if (op_ret < 0) { + op_errno = EINVAL; + goto unwind; + } + + op_ret = sizeof(br_stub_init_t); + +unwind: + STACK_UNWIND_STRICT(getxattr, frame, op_ret, op_errno, xattr, NULL); + + if (xattr) + dict_unref(xattr); +} + +int +br_stub_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, + const char *name, dict_t *xdata) +{ + void *cookie = NULL; + static uuid_t rootgfid = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1}; + fop_getxattr_cbk_t cbk = br_stub_getxattr_cbk; + int32_t op_ret = -1; + int32_t op_errno = EINVAL; + br_stub_local_t *local = NULL; + br_stub_private_t *priv = NULL; + + GF_VALIDATE_OR_GOTO("bit-rot-stub", this, unwind); + GF_VALIDATE_OR_GOTO(this->name, loc, unwind); + GF_VALIDATE_OR_GOTO(this->name, this->private, unwind); + GF_VALIDATE_OR_GOTO(this->name, loc->inode, unwind); + + if (!name) { + cbk = br_stub_listxattr_cbk; + goto wind; + } + + if (br_stub_is_internal_xattr(name)) + goto unwind; + + priv = this->private; + BR_STUB_VER_NOT_ACTIVE_THEN_GOTO(frame, priv, wind); + + /** + * If xattr is node-uuid and the inode is marked bad, return EIO. + * Returning EIO would result in AFR to choose correct node-uuid + * corresponding to the subvolume * where the good copy of the + * file resides. + */ + if (IA_ISREG(loc->inode->ia_type) && XATTR_IS_NODE_UUID(name) && + br_stub_check_bad_object(this, loc->inode, &op_ret, &op_errno)) { + goto unwind; + } + + /** + * this special extended attribute is allowed only on root + */ + if (name && + (strncmp(name, GLUSTERFS_GET_BR_STUB_INIT_TIME, + sizeof(GLUSTERFS_GET_BR_STUB_INIT_TIME) - 1) == 0) && + ((gf_uuid_compare(loc->gfid, rootgfid) == 0) || + (gf_uuid_compare(loc->inode->gfid, rootgfid) == 0))) { + BR_STUB_RESET_LOCAL_NULL(frame); + br_stub_send_stub_init_time(frame, this); + return 0; + } + + if (!IA_ISREG(loc->inode->ia_type)) + goto wind; + + if (name && (strncmp(name, GLUSTERFS_GET_OBJECT_SIGNATURE, + sizeof(GLUSTERFS_GET_OBJECT_SIGNATURE) - 1) == 0)) { + cookie = (void *)BR_STUB_REQUEST_COOKIE; + + local = br_stub_alloc_local(this); + if (!local) { + op_ret = -1; + op_errno = ENOMEM; + goto unwind; + } + + br_stub_fill_local(local, NULL, NULL, loc->inode, loc->inode->gfid, + BR_STUB_NO_VERSIONING, 0); + frame->local = local; + } + +wind: + STACK_WIND_COOKIE(frame, cbk, cookie, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->getxattr, loc, name, xdata); + return 0; +unwind: + BR_STUB_RESET_LOCAL_NULL(frame); + STACK_UNWIND_STRICT(getxattr, frame, op_ret, op_errno, NULL, NULL); + return 0; +} + +int +br_stub_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, + const char *name, dict_t *xdata) +{ + void *cookie = NULL; + static uuid_t rootgfid = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1}; + fop_fgetxattr_cbk_t cbk = br_stub_getxattr_cbk; + int32_t op_ret = -1; + int32_t op_errno = EINVAL; + br_stub_local_t *local = NULL; + br_stub_private_t *priv = NULL; + + priv = this->private; + + if (!name) { + cbk = br_stub_listxattr_cbk; + goto wind; + } + + if (br_stub_is_internal_xattr(name)) + goto unwind; + + BR_STUB_VER_NOT_ACTIVE_THEN_GOTO(frame, priv, wind); + + /** + * If xattr is node-uuid and the inode is marked bad, return EIO. + * Returning EIO would result in AFR to choose correct node-uuid + * corresponding to the subvolume * where the good copy of the + * file resides. + */ + if (IA_ISREG(fd->inode->ia_type) && XATTR_IS_NODE_UUID(name) && + br_stub_check_bad_object(this, fd->inode, &op_ret, &op_errno)) { + goto unwind; + } + + /** + * this special extended attribute is allowed only on root + */ + if (name && + (strncmp(name, GLUSTERFS_GET_BR_STUB_INIT_TIME, + sizeof(GLUSTERFS_GET_BR_STUB_INIT_TIME) - 1) == 0) && + (gf_uuid_compare(fd->inode->gfid, rootgfid) == 0)) { + BR_STUB_RESET_LOCAL_NULL(frame); + br_stub_send_stub_init_time(frame, this); + return 0; + } + + if (!IA_ISREG(fd->inode->ia_type)) + goto wind; + + if (name && (strncmp(name, GLUSTERFS_GET_OBJECT_SIGNATURE, + sizeof(GLUSTERFS_GET_OBJECT_SIGNATURE) - 1) == 0)) { + cookie = (void *)BR_STUB_REQUEST_COOKIE; + + local = br_stub_alloc_local(this); + if (!local) { + op_ret = -1; + op_errno = ENOMEM; + goto unwind; + } + + br_stub_fill_local(local, NULL, fd, fd->inode, fd->inode->gfid, + BR_STUB_NO_VERSIONING, 0); + frame->local = local; + } + +wind: + STACK_WIND_COOKIE(frame, cbk, cookie, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fgetxattr, fd, name, xdata); + return 0; +unwind: + BR_STUB_RESET_LOCAL_NULL(frame); + STACK_UNWIND_STRICT(fgetxattr, frame, op_ret, op_errno, NULL, NULL); + return 0; +} + +int32_t +br_stub_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, uint32_t flags, dict_t *xdata) +{ + int32_t op_ret = -1; + int32_t op_errno = EINVAL; + int32_t ret = -1; + br_stub_private_t *priv = NULL; + + GF_VALIDATE_OR_GOTO("bit-rot-stub", this, unwind); + GF_VALIDATE_OR_GOTO(this->name, frame, unwind); + GF_VALIDATE_OR_GOTO(this->name, this->private, unwind); + GF_VALIDATE_OR_GOTO(this->name, fd, unwind); + GF_VALIDATE_OR_GOTO(this->name, fd->inode, unwind); + + priv = this->private; + if (!priv->do_versioning) + goto wind; + + ret = br_stub_check_bad_object(this, fd->inode, &op_ret, &op_errno); + if (ret) + goto unwind; + +wind: + STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->readv, + fd, size, offset, flags, xdata); + return 0; + +unwind: + STACK_UNWIND_STRICT(readv, frame, op_ret, op_errno, NULL, 0, NULL, NULL, + NULL); + return 0; +} + +/** + * The first write response on the first fd in the list of fds will set + * the flag to indicate that the inode is modified. The subsequent write + * respnses coming on either the first fd or some other fd will not change + * the fd. The inode-modified flag is unset only upon release of all the + * fds. + */ +int32_t +br_stub_writev_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + int32_t ret = 0; + br_stub_local_t *local = NULL; + + local = frame->local; + frame->local = NULL; + + if (op_ret < 0) + goto unwind; + + ret = br_stub_mark_inode_modified(this, local); + if (ret) { + op_ret = -1; + op_errno = EINVAL; + } + +unwind: + STACK_UNWIND_STRICT(writev, frame, op_ret, op_errno, prebuf, postbuf, + xdata); + + br_stub_cleanup_local(local); + br_stub_dealloc_local(local); + + return 0; +} + +int32_t +br_stub_writev_resume(call_frame_t *frame, xlator_t *this, fd_t *fd, + struct iovec *vector, int32_t count, off_t offset, + uint32_t flags, struct iobref *iobref, dict_t *xdata) +{ + STACK_WIND(frame, br_stub_writev_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->writev, fd, vector, count, offset, + flags, iobref, xdata); + return 0; +} + +/** + * This is probably the most crucial part about the whole versioning thing. + * There's absolutely no differentiation as such between an anonymous fd + * and a regular fd except the fd context initialization. Object versioning + * is performed when the inode is dirty. Parallel write operations are no + * special with each write performing object versioning followed by marking + * the inode as non-dirty (synced). This is followed by the actual operation + * (writev() in this case) which on a success marks the inode as modified. + * This prevents signing of objects that have not been modified. + */ +int32_t +br_stub_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, + struct iovec *vector, int32_t count, off_t offset, + uint32_t flags, struct iobref *iobref, dict_t *xdata) +{ + call_stub_t *stub = NULL; + int32_t op_ret = -1; + int32_t op_errno = EINVAL; + gf_boolean_t inc_version = _gf_false; + gf_boolean_t modified = _gf_false; + br_stub_inode_ctx_t *ctx = NULL; + int32_t ret = -1; + fop_writev_cbk_t cbk = default_writev_cbk; + br_stub_local_t *local = NULL; + br_stub_private_t *priv = NULL; + + GF_VALIDATE_OR_GOTO("bit-rot-stub", this, unwind); + GF_VALIDATE_OR_GOTO(this->name, this->private, unwind); + GF_VALIDATE_OR_GOTO(this->name, frame, unwind); + GF_VALIDATE_OR_GOTO(this->name, fd, unwind); + + priv = this->private; + if (!priv->do_versioning) + goto wind; + + ret = br_stub_need_versioning(this, fd, &inc_version, &modified, &ctx); + if (ret) + goto unwind; + + ret = br_stub_check_bad_object(this, fd->inode, &op_ret, &op_errno); + if (ret) + goto unwind; + + /** + * The inode is not dirty and also witnessed at least one successful + * modification operation. Therefore, subsequent operations need not + * perform any special tracking. + */ + if (!inc_version && modified) + goto wind; + + /** + * okay.. so, either the inode needs versioning or the modification + * needs to be tracked. ->cbk is set to the appropriate callback + * routine for this. + * NOTE: ->local needs to be deallocated on failures from here on. + */ + ret = br_stub_versioning_prep(frame, this, fd, ctx); + if (ret) + goto unwind; + + local = frame->local; + if (!inc_version) { + br_stub_fill_local(local, NULL, fd, fd->inode, fd->inode->gfid, + BR_STUB_NO_VERSIONING, 0); + cbk = br_stub_writev_cbk; + goto wind; + } + + stub = fop_writev_stub(frame, br_stub_writev_resume, fd, vector, count, + offset, flags, iobref, xdata); + + if (!stub) { + gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_STUB_ALLOC_FAILED, + "write gfid=%s", uuid_utoa(fd->inode->gfid), NULL); + goto cleanup_local; + } + + /* Perform Versioning */ + return br_stub_perform_incversioning(this, frame, stub, fd, ctx); + +wind: + STACK_WIND(frame, cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->writev, + fd, vector, count, offset, flags, iobref, xdata); + return 0; + +cleanup_local: + br_stub_cleanup_local(local); + br_stub_dealloc_local(local); + +unwind: + frame->local = NULL; + STACK_UNWIND_STRICT(writev, frame, op_ret, op_errno, NULL, NULL, NULL); + + return 0; +} + +int32_t +br_stub_ftruncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + int32_t ret = -1; + br_stub_local_t *local = NULL; + + local = frame->local; + frame->local = NULL; + + if (op_ret < 0) + goto unwind; + + ret = br_stub_mark_inode_modified(this, local); + if (ret) { + op_ret = -1; + op_errno = EINVAL; + } + +unwind: + STACK_UNWIND_STRICT(ftruncate, frame, op_ret, op_errno, prebuf, postbuf, + xdata); + + br_stub_cleanup_local(local); + br_stub_dealloc_local(local); + + return 0; +} + +int32_t +br_stub_ftruncate_resume(call_frame_t *frame, xlator_t *this, fd_t *fd, + off_t offset, dict_t *xdata) +{ + STACK_WIND(frame, br_stub_ftruncate_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->ftruncate, fd, offset, xdata); + return 0; +} + +/* c.f. br_stub_writev() for explanation */ +int32_t +br_stub_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + dict_t *xdata) +{ + br_stub_local_t *local = NULL; + call_stub_t *stub = NULL; + int32_t op_ret = -1; + int32_t op_errno = EINVAL; + gf_boolean_t inc_version = _gf_false; + gf_boolean_t modified = _gf_false; + br_stub_inode_ctx_t *ctx = NULL; + int32_t ret = -1; + fop_ftruncate_cbk_t cbk = default_ftruncate_cbk; + br_stub_private_t *priv = NULL; + + GF_VALIDATE_OR_GOTO("bit-rot-stub", this, unwind); + GF_VALIDATE_OR_GOTO(this->name, this->private, unwind); + GF_VALIDATE_OR_GOTO(this->name, frame, unwind); + GF_VALIDATE_OR_GOTO(this->name, fd, unwind); + + priv = this->private; + if (!priv->do_versioning) + goto wind; + + ret = br_stub_need_versioning(this, fd, &inc_version, &modified, &ctx); + if (ret) + goto unwind; + + ret = br_stub_check_bad_object(this, fd->inode, &op_ret, &op_errno); + if (ret) + goto unwind; + + if (!inc_version && modified) + goto wind; + + ret = br_stub_versioning_prep(frame, this, fd, ctx); + if (ret) + goto unwind; + + local = frame->local; + if (!inc_version) { + br_stub_fill_local(local, NULL, fd, fd->inode, fd->inode->gfid, + BR_STUB_NO_VERSIONING, 0); + cbk = br_stub_ftruncate_cbk; + goto wind; + } + + stub = fop_ftruncate_stub(frame, br_stub_ftruncate_resume, fd, offset, + xdata); + if (!stub) { + gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_STUB_ALLOC_FAILED, + "ftruncate gfid=%s", uuid_utoa(fd->inode->gfid), NULL); + goto cleanup_local; + } + + return br_stub_perform_incversioning(this, frame, stub, fd, ctx); + +wind: + STACK_WIND(frame, cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->ftruncate, fd, offset, xdata); + return 0; + +cleanup_local: + br_stub_cleanup_local(local); + br_stub_dealloc_local(local); + +unwind: + frame->local = NULL; + STACK_UNWIND_STRICT(ftruncate, frame, op_ret, op_errno, NULL, NULL, NULL); + + return 0; +} + +int32_t +br_stub_truncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + int32_t ret = 0; + br_stub_local_t *local = NULL; + + local = frame->local; + frame->local = NULL; + + if (op_ret < 0) + goto unwind; + + ret = br_stub_mark_inode_modified(this, local); + if (ret) { + op_ret = -1; + op_errno = EINVAL; + } + +unwind: + STACK_UNWIND_STRICT(truncate, frame, op_ret, op_errno, prebuf, postbuf, + xdata); + br_stub_cleanup_local(local); + br_stub_dealloc_local(local); + return 0; +} + +int32_t +br_stub_truncate_resume(call_frame_t *frame, xlator_t *this, loc_t *loc, + off_t offset, dict_t *xdata) +{ + br_stub_local_t *local = frame->local; + + fd_unref(local->u.context.fd); + STACK_WIND(frame, br_stub_ftruncate_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->truncate, loc, offset, xdata); + return 0; +} + +/** + * Bit-rot-stub depends heavily on the fd based operations to for doing + * versioning and sending notification. It starts tracking the operation + * upon getting first fd based modify operation by doing versioning and + * sends notification when last fd using which the inode was modified is + * released. + * But for truncate there is no fd and hence it becomes difficult to do + * the versioning and send notification. It is handled by doing versioning + * on an anonymous fd. The fd will be valid till the completion of the + * truncate call. It guarantees that release on this anonymous fd will happen + * after the truncate call and notification is sent after the truncate call. + * + * c.f. br_writev_cbk() for explanation + */ +int32_t +br_stub_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset, + dict_t *xdata) +{ + br_stub_local_t *local = NULL; + call_stub_t *stub = NULL; + int32_t op_ret = -1; + int32_t op_errno = EINVAL; + gf_boolean_t inc_version = _gf_false; + gf_boolean_t modified = _gf_false; + br_stub_inode_ctx_t *ctx = NULL; + int32_t ret = -1; + fd_t *fd = NULL; + fop_truncate_cbk_t cbk = default_truncate_cbk; + br_stub_private_t *priv = NULL; + + GF_VALIDATE_OR_GOTO("bit-rot-stub", this, unwind); + GF_VALIDATE_OR_GOTO(this->name, this->private, unwind); + GF_VALIDATE_OR_GOTO(this->name, frame, unwind); + GF_VALIDATE_OR_GOTO(this->name, loc, unwind); + GF_VALIDATE_OR_GOTO(this->name, loc->inode, unwind); + + priv = this->private; + if (!priv->do_versioning) + goto wind; + + fd = fd_anonymous(loc->inode); + if (!fd) { + gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_CREATE_ANONYMOUS_FD_FAILED, + "inode-gfid=%s", uuid_utoa(loc->inode->gfid), NULL); + goto unwind; + } + + ret = br_stub_need_versioning(this, fd, &inc_version, &modified, &ctx); + if (ret) + goto cleanup_fd; + + ret = br_stub_check_bad_object(this, fd->inode, &op_ret, &op_errno); + if (ret) + goto unwind; + + if (!inc_version && modified) + goto wind; + + ret = br_stub_versioning_prep(frame, this, fd, ctx); + if (ret) + goto cleanup_fd; + + local = frame->local; + if (!inc_version) { + br_stub_fill_local(local, NULL, fd, fd->inode, fd->inode->gfid, + BR_STUB_NO_VERSIONING, 0); + cbk = br_stub_truncate_cbk; + goto wind; + } + + stub = fop_truncate_stub(frame, br_stub_truncate_resume, loc, offset, + xdata); + if (!stub) { + gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_STUB_ALLOC_FAILED, + "truncate gfid=%s", uuid_utoa(fd->inode->gfid), NULL); + goto cleanup_local; + } + + return br_stub_perform_incversioning(this, frame, stub, fd, ctx); + +wind: + STACK_WIND(frame, cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->truncate, + loc, offset, xdata); + if (fd) + fd_unref(fd); + return 0; + +cleanup_local: + br_stub_cleanup_local(local); + br_stub_dealloc_local(local); +cleanup_fd: + fd_unref(fd); +unwind: + frame->local = NULL; + STACK_UNWIND_STRICT(truncate, frame, op_ret, op_errno, NULL, NULL, NULL); + + return 0; +} + +/** }}} */ + +/** {{{ */ + +/* open() */ + +/** + * It's probably worth mentioning a bit about why some of the housekeeping + * work is done in open() call path, rather than the callback path. + * Two (or more) open()'s in parallel can race and lead to a situation + * where a release() gets triggered (possibly after a series of write() + * calls) when *other* open()'s have still not reached callback path + * thereby having an active fd on an inode that is in process of getting + * signed with the current version. + * + * Maintaining fd list in the call path ensures that a release() would + * not be triggered if an open() call races ahead (followed by a close()) + * threby finding non-empty fd list. + */ + +int +br_stub_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + fd_t *fd, dict_t *xdata) +{ + int32_t ret = -1; + br_stub_inode_ctx_t *ctx = NULL; + uint64_t ctx_addr = 0; + int32_t op_ret = -1; + int32_t op_errno = EINVAL; + br_stub_private_t *priv = NULL; + unsigned long version = BITROT_DEFAULT_CURRENT_VERSION; + + GF_VALIDATE_OR_GOTO("bit-rot-stub", this, unwind); + GF_VALIDATE_OR_GOTO(this->name, this->private, unwind); + GF_VALIDATE_OR_GOTO(this->name, loc, unwind); + GF_VALIDATE_OR_GOTO(this->name, fd, unwind); + GF_VALIDATE_OR_GOTO(this->name, fd->inode, unwind); + + priv = this->private; + + if (!priv->do_versioning) + goto wind; + + ret = br_stub_get_inode_ctx(this, fd->inode, &ctx_addr); + if (ret) { + ret = br_stub_init_inode_versions(this, fd, fd->inode, version, + _gf_true, _gf_false, &ctx_addr); + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, 0, + BRS_MSG_GET_INODE_CONTEXT_FAILED, "path=%s", loc->path, + "gfid=%s", uuid_utoa(fd->inode->gfid), NULL); + goto unwind; + } + } + + ctx = (br_stub_inode_ctx_t *)(long)ctx_addr; + + ret = br_stub_check_bad_object(this, fd->inode, &op_ret, &op_errno); + if (ret) + goto unwind; + + if (frame->root->pid == GF_CLIENT_PID_SCRUB) + goto wind; + + if (flags == O_RDONLY) + goto wind; + + ret = br_stub_add_fd_to_inode(this, fd, ctx); + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_ADD_FD_TO_LIST_FAILED, + "gfid=%s", uuid_utoa(fd->inode->gfid), NULL); + goto unwind; + } + +wind: + STACK_WIND(frame, default_open_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->open, loc, flags, fd, xdata); + return 0; +unwind: + STACK_UNWIND_STRICT(open, frame, op_ret, op_errno, NULL, NULL); + return 0; +} + +/** }}} */ + +/** {{{ */ + +/* creat() */ + +/** + * This routine registers a release callback for the given fd and adds the + * fd to the inode context fd tracking list. + */ +int32_t +br_stub_add_fd_to_inode(xlator_t *this, fd_t *fd, br_stub_inode_ctx_t *ctx) +{ + int32_t ret = -1; + br_stub_fd_t *br_stub_fd = NULL; + + ret = br_stub_require_release_call(this, fd, &br_stub_fd); + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_SET_FD_CONTEXT_FAILED, + "gfid=%s", uuid_utoa(fd->inode->gfid), NULL); + goto out; + } + + LOCK(&fd->inode->lock); + { + list_add_tail(&ctx->fd_list, &br_stub_fd->list); + } + UNLOCK(&fd->inode->lock); + + ret = 0; + +out: + return ret; +} + +int +br_stub_create_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, fd_t *fd, inode_t *inode, + struct iatt *stbuf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + int32_t ret = 0; + uint64_t ctx_addr = 0; + br_stub_inode_ctx_t *ctx = NULL; + unsigned long version = BITROT_DEFAULT_CURRENT_VERSION; + br_stub_private_t *priv = NULL; + + priv = this->private; + + if (op_ret < 0) + goto unwind; + + if (!priv->do_versioning) + goto unwind; + + ret = br_stub_get_inode_ctx(this, fd->inode, &ctx_addr); + if (ret < 0) { + ret = br_stub_init_inode_versions(this, fd, inode, version, _gf_true, + _gf_false, &ctx_addr); + if (ret) { + op_ret = -1; + op_errno = EINVAL; + } + } else { + ctx = (br_stub_inode_ctx_t *)(long)ctx_addr; + ret = br_stub_add_fd_to_inode(this, fd, ctx); + } + +unwind: + STACK_UNWIND_STRICT(create, frame, op_ret, op_errno, fd, inode, stbuf, + preparent, postparent, xdata); + return 0; +} + +int +br_stub_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata) +{ + GF_VALIDATE_OR_GOTO("bit-rot-stub", this, unwind); + GF_VALIDATE_OR_GOTO(this->name, loc, unwind); + GF_VALIDATE_OR_GOTO(this->name, loc->inode, unwind); + GF_VALIDATE_OR_GOTO(this->name, fd, unwind); + GF_VALIDATE_OR_GOTO(this->name, fd->inode, unwind); + + STACK_WIND(frame, br_stub_create_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->create, loc, flags, mode, umask, fd, + xdata); + return 0; +unwind: + STACK_UNWIND_STRICT(create, frame, -1, EINVAL, NULL, NULL, NULL, NULL, NULL, + NULL); + return 0; +} + +int +br_stub_mknod_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, inode_t *inode, struct iatt *stbuf, + struct iatt *preparent, struct iatt *postparent, + dict_t *xdata) +{ + int32_t ret = -1; + unsigned long version = BITROT_DEFAULT_CURRENT_VERSION; + br_stub_private_t *priv = NULL; + + priv = this->private; + + if (op_ret < 0) + goto unwind; + + if (!priv->do_versioning) + goto unwind; + + ret = br_stub_init_inode_versions(this, NULL, inode, version, _gf_true, + _gf_false, NULL); + /** + * Like lookup, if init_inode_versions fail, return EINVAL + */ + if (ret) { + op_ret = -1; + op_errno = EINVAL; + } + +unwind: + STACK_UNWIND_STRICT(mknod, frame, op_ret, op_errno, inode, stbuf, preparent, + postparent, xdata); + return 0; +} + +int +br_stub_mknod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + dev_t dev, mode_t umask, dict_t *xdata) +{ + GF_VALIDATE_OR_GOTO("bit-rot-stub", this, unwind); + GF_VALIDATE_OR_GOTO(this->name, loc, unwind); + GF_VALIDATE_OR_GOTO(this->name, loc->inode, unwind); + + STACK_WIND(frame, br_stub_mknod_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->mknod, loc, mode, dev, umask, xdata); + return 0; +unwind: + STACK_UNWIND_STRICT(mknod, frame, -1, EINVAL, NULL, NULL, NULL, NULL, NULL); + return 0; +} + +/** }}} */ + +/** + * As of now, only lookup searches for bad object xattr and marks the + * object as bad in its inode context if the xattr is present. But there + * is a possibility that, at the time of the lookup the object was not + * marked bad (i.e. bad object xattr was not set), and later its marked + * as bad. In this case, object is not bad, so when a fop such as open or + * readv or writev comes on the object, the fop will be sent downward instead + * of sending as error upwards. + * The solution for this is to do a getxattr for the below list of fops. + * lookup, readdirp, open, readv, writev. + * But doing getxattr for each of the above fops might be costly. + * So another method followed is to catch the bad file marking by the scrubber + * and set that info within the object's inode context. In this way getxattr + * calls can be avoided and bad objects can be caught instantly. Fetching the + * xattr is needed only in lookups when there is a brick restart or inode + * forget. + * + * If the dict (@xattr) is NULL, then how should that be handled? Fail the + * lookup operation? Or let it continue with version being initialized to + * BITROT_DEFAULT_CURRENT_VERSION. But what if the version was different + * on disk (and also a right signature was there), but posix failed to + * successfully allocate the dict? Posix does not treat call back xdata + * creattion failure as the lookup failure. + */ +static int32_t +br_stub_lookup_version(xlator_t *this, uuid_t gfid, inode_t *inode, + dict_t *xattr) +{ + unsigned long version = 0; + br_version_t *obuf = NULL; + br_signature_t *sbuf = NULL; + br_vxattr_status_t status; + gf_boolean_t bad_object = _gf_false; + + /** + * versioning xattrs were requested from POSIX. if available, figure + * out the correct version to use in the inode context (start with + * the default version if unavailable). As of now versions are not + * persisted on-disk. The inode is marked dirty, so that the first + * operation (such as write(), etc..) triggers synchronization to + * disk. + */ + status = br_version_xattr_state(xattr, &obuf, &sbuf, &bad_object); + version = ((status == BR_VXATTR_STATUS_FULL) || + (status == BR_VXATTR_STATUS_UNSIGNED)) + ? obuf->ongoingversion + : BITROT_DEFAULT_CURRENT_VERSION; + + /** + * If signature is there, but version is not there then that status is + * is treated as INVALID. So in that case, we should not initialize the + * inode context with wrong version names etc. + */ + if (status == BR_VXATTR_STATUS_INVALID) + return -1; + + return br_stub_init_inode_versions(this, NULL, inode, version, _gf_true, + bad_object, NULL); +} + +/** {{{ */ + +int32_t +br_stub_opendir(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd, + dict_t *xdata) +{ + br_stub_private_t *priv = NULL; + br_stub_fd_t *fd_ctx = NULL; + int32_t op_ret = -1; + int32_t op_errno = EINVAL; + + priv = this->private; + if (gf_uuid_compare(fd->inode->gfid, priv->bad_object_dir_gfid)) + goto normal; + + fd_ctx = br_stub_fd_new(); + if (!fd_ctx) { + op_errno = ENOMEM; + goto unwind; + } + + fd_ctx->bad_object.dir_eof = -1; + fd_ctx->bad_object.dir = sys_opendir(priv->stub_basepath); + if (!fd_ctx->bad_object.dir) { + op_errno = errno; + goto err_freectx; + } + + op_ret = br_stub_fd_ctx_set(this, fd, fd_ctx); + if (!op_ret) + goto unwind; + + sys_closedir(fd_ctx->bad_object.dir); + +err_freectx: + GF_FREE(fd_ctx); +unwind: + STACK_UNWIND_STRICT(opendir, frame, op_ret, op_errno, fd, NULL); + return 0; + +normal: + STACK_WIND(frame, default_opendir_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->opendir, loc, fd, xdata); + return 0; +} + +int32_t +br_stub_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t off, dict_t *xdata) +{ + call_stub_t *stub = NULL; + br_stub_private_t *priv = NULL; + + priv = this->private; + if (!priv->do_versioning) + goto out; + + if (gf_uuid_compare(fd->inode->gfid, priv->bad_object_dir_gfid)) + goto out; + stub = fop_readdir_stub(frame, br_stub_readdir_wrapper, fd, size, off, + xdata); + if (!stub) { + STACK_UNWIND_STRICT(readdir, frame, -1, ENOMEM, NULL, NULL); + return 0; + } + br_stub_worker_enqueue(this, stub); + return 0; +out: + STACK_WIND(frame, default_readdir_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readdir, fd, size, off, xdata); + return 0; +} + +int +br_stub_readdirp_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, gf_dirent_t *entries, + dict_t *dict) +{ + int32_t ret = 0; + uint64_t ctxaddr = 0; + gf_dirent_t *entry = NULL; + br_stub_private_t *priv = NULL; + gf_boolean_t ver_enabled = _gf_false; + + BR_STUB_VER_ENABLED_IN_CALLPATH(frame, ver_enabled); + priv = this->private; + BR_STUB_VER_COND_GOTO(priv, (!ver_enabled), unwind); + + if (op_ret < 0) + goto unwind; + + list_for_each_entry(entry, &entries->list, list) + { + if ((strcmp(entry->d_name, ".") == 0) || + (strcmp(entry->d_name, "..") == 0)) + continue; + + if (!IA_ISREG(entry->d_stat.ia_type)) + continue; + + /* + * Readdirp for most part is a bulk lookup for all the entries + * present in the directory being read. Ideally, for each + * entry, the handling should be similar to that of a lookup + * callback. But for now, just keeping this as it has been + * until now (which means, this comment has been added much + * later as part of a change that wanted to send the flag + * of true/false to br_stub_remove_vxattrs to indicate whether + * the bad-object xattr should be removed from the entry->dict + * or not). Until this change, the function br_stub_remove_vxattrs + * was just removing all the xattrs associated with bit-rot-stub + * (like version, bad-object, signature etc). But, there are + * scenarios where we only want to send bad-object xattr and not + * others. So this comment is part of that change which also + * mentions about another possible change that might be needed + * in future. + * But for now, adding _gf_true means functionally its same as + * what this function was doing before. Just remove all the stub + * related xattrs. + */ + ret = br_stub_get_inode_ctx(this, entry->inode, &ctxaddr); + if (ret < 0) + ctxaddr = 0; + if (ctxaddr) { /* already has the context */ + br_stub_remove_vxattrs(entry->dict, _gf_true); + continue; + } + + ret = br_stub_lookup_version(this, entry->inode->gfid, entry->inode, + entry->dict); + br_stub_remove_vxattrs(entry->dict, _gf_true); + if (ret) { + /** + * there's no per-file granularity support in case of + * failure. let's fail the entire request for now.. + */ + break; + } + } + + if (ret) { + op_ret = -1; + op_errno = EINVAL; + } + +unwind: + STACK_UNWIND_STRICT(readdirp, frame, op_ret, op_errno, entries, dict); + + return 0; +} + +int +br_stub_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, dict_t *dict) +{ + int32_t ret = -1; + int op_errno = 0; + gf_boolean_t xref = _gf_false; + br_stub_private_t *priv = NULL; + + priv = this->private; + BR_STUB_VER_NOT_ACTIVE_THEN_GOTO(frame, priv, wind); + + op_errno = ENOMEM; + if (!dict) { + dict = dict_new(); + if (!dict) + goto unwind; + } else { + dict = dict_ref(dict); + } + + xref = _gf_true; + + op_errno = EINVAL; + ret = dict_set_uint32(dict, BITROT_CURRENT_VERSION_KEY, 0); + if (ret) + goto unwind; + ret = dict_set_uint32(dict, BITROT_SIGNING_VERSION_KEY, 0); + if (ret) + goto unwind; + ret = dict_set_uint32(dict, BITROT_OBJECT_BAD_KEY, 0); + if (ret) + goto unwind; + +wind: + STACK_WIND(frame, br_stub_readdirp_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readdirp, fd, size, offset, dict); + goto unref_dict; + +unwind: + if (frame->local == (void *)0x1) + frame->local = NULL; + STACK_UNWIND_STRICT(readdirp, frame, -1, op_errno, NULL, NULL); + return 0; + +unref_dict: + if (xref) + dict_unref(dict); + return 0; +} + +/** }}} */ + +/** {{{ */ + +/* lookup() */ + +/** + * This function mainly handles the ENOENT error for the bad objects. Though + * br_stub_forget () handles removal of the link for the bad object from the + * quarantine directory, its better to handle it in lookup as well, where + * a failed lookup on a bad object with ENOENT, will trigger deletion of the + * link for the bad object from quarantine directory. So whoever comes first + * either forget () or lookup () will take care of removing the link. + */ +void +br_stub_handle_lookup_error(xlator_t *this, inode_t *inode, int32_t op_errno) +{ + int32_t ret = -1; + uint64_t ctx_addr = 0; + br_stub_inode_ctx_t *ctx = NULL; + + if (op_errno != ENOENT) + goto out; + + if (!inode_is_linked(inode)) + goto out; + + ret = br_stub_get_inode_ctx(this, inode, &ctx_addr); + if (ret) + goto out; + + ctx = (br_stub_inode_ctx_t *)(long)ctx_addr; + + LOCK(&inode->lock); + { + if (__br_stub_is_bad_object(ctx)) + (void)br_stub_del(this, inode->gfid); + } + UNLOCK(&inode->lock); + + if (__br_stub_is_bad_object(ctx)) { + /* File is not present, might be deleted for recovery, + * del the bitrot inode context + */ + ctx_addr = 0; + inode_ctx_del(inode, this, &ctx_addr); + if (ctx_addr) { + ctx = (br_stub_inode_ctx_t *)(long)ctx_addr; + GF_FREE(ctx); + } + } + +out: + return; +} + +int +br_stub_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, inode_t *inode, struct iatt *stbuf, + dict_t *xattr, struct iatt *postparent) +{ + int32_t ret = 0; + br_stub_private_t *priv = NULL; + gf_boolean_t ver_enabled = _gf_false; + gf_boolean_t remove_bad_file_marker = _gf_true; + + BR_STUB_VER_ENABLED_IN_CALLPATH(frame, ver_enabled); + priv = this->private; + + if (op_ret < 0) { + (void)br_stub_handle_lookup_error(this, inode, op_errno); + + /* + * If the lookup error is not ENOENT, then it is better + * to send the bad file marker to the higher layer (if + * it has been set) + */ + if (op_errno != ENOENT) + remove_bad_file_marker = _gf_false; + goto delkey; + } + + BR_STUB_VER_COND_GOTO(priv, (!ver_enabled), delkey); + + if (!IA_ISREG(stbuf->ia_type)) + goto unwind; + + /** + * If the object is bad, then "bad inode" marker has to be sent back + * in resoinse, for revalidated lookups as well. Some xlators such as + * quick-read might cache the data in revalidated lookup as fresh + * lookup would anyway have sent "bad inode" marker. + * In general send bad inode marker for every lookup operation on the + * bad object. + */ + if (cookie != (void *)BR_STUB_REQUEST_COOKIE) { + ret = br_stub_mark_xdata_bad_object(this, inode, xattr); + if (ret) { + op_ret = -1; + op_errno = EIO; + /* + * This flag ensures that in the label @delkey below, + * bad file marker is not removed from the dictinary, + * but other virtual xattrs (such as version, signature) + * are removed. + */ + remove_bad_file_marker = _gf_false; + } + goto delkey; + } + + ret = br_stub_lookup_version(this, stbuf->ia_gfid, inode, xattr); + if (ret < 0) { + op_ret = -1; + op_errno = EINVAL; + goto delkey; + } + + /** + * If the object is bad, send "bad inode" marker back in response + * for xlator(s) to act accordingly (such as quick-read, etc..) + */ + ret = br_stub_mark_xdata_bad_object(this, inode, xattr); + if (ret) { + /** + * aaha! bad object, but sorry we would not + * satisfy the request on allocation failures. + */ + op_ret = -1; + op_errno = EIO; + goto delkey; + } + +delkey: + br_stub_remove_vxattrs(xattr, remove_bad_file_marker); +unwind: + STACK_UNWIND_STRICT(lookup, frame, op_ret, op_errno, inode, stbuf, xattr, + postparent); + + return 0; +} + +int +br_stub_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) +{ + int32_t ret = 0; + int op_errno = 0; + void *cookie = NULL; + uint64_t ctx_addr = 0; + gf_boolean_t xref = _gf_false; + br_stub_private_t *priv = NULL; + call_stub_t *stub = NULL; + + GF_VALIDATE_OR_GOTO("bit-rot-stub", this, unwind); + GF_VALIDATE_OR_GOTO(this->name, loc, unwind); + GF_VALIDATE_OR_GOTO(this->name, loc->inode, unwind); + + priv = this->private; + + BR_STUB_VER_NOT_ACTIVE_THEN_GOTO(frame, priv, wind); + + if (!gf_uuid_compare(loc->gfid, priv->bad_object_dir_gfid) || + !gf_uuid_compare(loc->pargfid, priv->bad_object_dir_gfid)) { + stub = fop_lookup_stub(frame, br_stub_lookup_wrapper, loc, xdata); + if (!stub) { + op_errno = ENOMEM; + goto unwind; + } + br_stub_worker_enqueue(this, stub); + return 0; + } + + ret = br_stub_get_inode_ctx(this, loc->inode, &ctx_addr); + if (ret < 0) + ctx_addr = 0; + if (ctx_addr != 0) + goto wind; + + /** + * fresh lookup: request version keys from POSIX + */ + op_errno = ENOMEM; + if (!xdata) { + xdata = dict_new(); + if (!xdata) + goto unwind; + } else { + xdata = dict_ref(xdata); + } + + xref = _gf_true; + + /** + * Requesting both xattrs provides a way of sanity checking the + * object. Anomaly checking is done in cbk by examining absence + * of either or both xattrs. + */ + op_errno = EINVAL; + ret = dict_set_uint32(xdata, BITROT_CURRENT_VERSION_KEY, 0); + if (ret) + goto unwind; + ret = dict_set_uint32(xdata, BITROT_SIGNING_VERSION_KEY, 0); + if (ret) + goto unwind; + ret = dict_set_uint32(xdata, BITROT_OBJECT_BAD_KEY, 0); + if (ret) + goto unwind; + cookie = (void *)BR_STUB_REQUEST_COOKIE; + +wind: + STACK_WIND_COOKIE(frame, br_stub_lookup_cbk, cookie, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->lookup, loc, xdata); + goto dealloc_dict; + +unwind: + if (frame->local == (void *)0x1) + frame->local = NULL; + STACK_UNWIND_STRICT(lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL); +dealloc_dict: + if (xref) + dict_unref(xdata); + return 0; +} + +/** }}} */ + +/** {{{ */ + +/* stat */ +int +br_stub_stat(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) +{ + int32_t ret = 0; + int32_t op_ret = -1; + int32_t op_errno = EINVAL; + br_stub_private_t *priv = NULL; + + priv = this->private; + + if (!priv->do_versioning) + goto wind; + + if (!IA_ISREG(loc->inode->ia_type)) + goto wind; + + ret = br_stub_check_bad_object(this, loc->inode, &op_ret, &op_errno); + if (ret) + goto unwind; + +wind: + STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->stat, + loc, xdata); + return 0; + +unwind: + STACK_UNWIND_STRICT(stat, frame, op_ret, op_errno, NULL, NULL); + return 0; +} + +/* fstat */ +int +br_stub_fstat(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) +{ + int32_t ret = 0; + int32_t op_ret = -1; + int32_t op_errno = EINVAL; + br_stub_private_t *priv = NULL; + + priv = this->private; + + if (!priv->do_versioning) + goto wind; + + if (!IA_ISREG(fd->inode->ia_type)) + goto wind; + + ret = br_stub_check_bad_object(this, fd->inode, &op_ret, &op_errno); + if (ret) + goto unwind; + +wind: + STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->fstat, + fd, xdata); + return 0; + +unwind: + STACK_UNWIND_STRICT(fstat, frame, op_ret, op_errno, NULL, NULL); + return 0; +} + +/** }}} */ + +/** {{{ */ + +/* unlink() */ + +int +br_stub_unlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + br_stub_local_t *local = NULL; + inode_t *inode = NULL; + uint64_t ctx_addr = 0; + br_stub_inode_ctx_t *ctx = NULL; + int32_t ret = -1; + br_stub_private_t *priv = NULL; + gf_boolean_t ver_enabled = _gf_false; + + BR_STUB_VER_ENABLED_IN_CALLPATH(frame, ver_enabled); + priv = this->private; + BR_STUB_VER_COND_GOTO(priv, (!ver_enabled), unwind); + + local = frame->local; + frame->local = NULL; + + if (op_ret < 0) + goto unwind; + + if (!local) { + gf_smsg(this->name, GF_LOG_WARNING, 0, BRS_MSG_NULL_LOCAL, NULL); + goto unwind; + } + inode = local->u.context.inode; + if (!IA_ISREG(inode->ia_type)) + goto unwind; + + ret = br_stub_get_inode_ctx(this, inode, &ctx_addr); + if (ret) { + /** + * If the inode is bad AND context is not there, then there + * is a possibility of the gfid of the object being listed + * in the quarantine directory and will be shown in the + * bad objects list. So continuing with the fop with a + * warning log. The entry from the quarantine directory + * has to be removed manually. Its not a good idea to fail + * the fop, as the object has already been deleted. + */ + gf_smsg(this->name, GF_LOG_WARNING, 0, BRS_MSG_GET_INODE_CONTEXT_FAILED, + "inode-gfid=%s", uuid_utoa(inode->gfid), NULL); + goto unwind; + } + + ctx = (br_stub_inode_ctx_t *)(long)ctx_addr; + + LOCK(&inode->lock); + { + /** + * Ignoring the return value of br_stub_del (). + * There is not much that can be done if unlinking + * of the entry in the quarantine directory fails. + * The failure is logged. + */ + if (__br_stub_is_bad_object(ctx)) + (void)br_stub_del(this, inode->gfid); + } + UNLOCK(&inode->lock); + +unwind: + STACK_UNWIND_STRICT(unlink, frame, op_ret, op_errno, preparent, postparent, + xdata); + br_stub_cleanup_local(local); + br_stub_dealloc_local(local); + return 0; +} + +int +br_stub_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int flag, + dict_t *xdata) +{ + br_stub_local_t *local = NULL; + int32_t op_ret = -1; + int32_t op_errno = 0; + br_stub_private_t *priv = NULL; + + priv = this->private; + BR_STUB_VER_NOT_ACTIVE_THEN_GOTO(frame, priv, wind); + + local = br_stub_alloc_local(this); + if (!local) { + op_ret = -1; + op_errno = ENOMEM; + gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, BRS_MSG_ALLOC_MEM_FAILED, + "local path=%s", loc->path, "gfid=%s", + uuid_utoa(loc->inode->gfid), NULL); + goto unwind; + } + + br_stub_fill_local(local, NULL, NULL, loc->inode, loc->inode->gfid, + BR_STUB_NO_VERSIONING, 0); + + frame->local = local; + +wind: + STACK_WIND(frame, br_stub_unlink_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->unlink, loc, flag, xdata); + return 0; + +unwind: + if (frame->local == (void *)0x1) + frame->local = NULL; + STACK_UNWIND_STRICT(unlink, frame, op_ret, op_errno, NULL, NULL, NULL); + return 0; +} + +/** }}} */ + +/** {{{ */ + +/* forget() */ + +int +br_stub_forget(xlator_t *this, inode_t *inode) +{ + uint64_t ctx_addr = 0; + br_stub_inode_ctx_t *ctx = NULL; + + inode_ctx_del(inode, this, &ctx_addr); + if (!ctx_addr) + return 0; + + ctx = (br_stub_inode_ctx_t *)(long)ctx_addr; + + GF_FREE(ctx); + + return 0; +} + +/** }}} */ + +/** {{{ */ + +int32_t +br_stub_noop(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, dict_t *xdata) +{ + STACK_DESTROY(frame->root); + return 0; +} + +static void +br_stub_send_ipc_fop(xlator_t *this, fd_t *fd, unsigned long releaseversion, + int sign_info) +{ + int32_t op = 0; + int32_t ret = 0; + dict_t *xdata = NULL; + call_frame_t *frame = NULL; + changelog_event_t ev = { + 0, + }; + + ev.ev_type = CHANGELOG_OP_TYPE_BR_RELEASE; + ev.u.releasebr.version = releaseversion; + ev.u.releasebr.sign_info = sign_info; + gf_uuid_copy(ev.u.releasebr.gfid, fd->inode->gfid); + + xdata = dict_new(); + if (!xdata) { + gf_smsg(this->name, GF_LOG_WARNING, ENOMEM, BRS_MSG_DICT_ALLOC_FAILED, + NULL); + goto out; + } + + ret = dict_set_static_bin(xdata, "RELEASE-EVENT", &ev, CHANGELOG_EV_SIZE); + if (ret) { + gf_smsg(this->name, GF_LOG_WARNING, 0, BRS_MSG_SET_EVENT_FAILED, NULL); + goto dealloc_dict; + } + + frame = create_frame(this, this->ctx->pool); + if (!frame) { + gf_smsg(this->name, GF_LOG_WARNING, 0, BRS_MSG_CREATE_FRAME_FAILED, + NULL); + goto dealloc_dict; + } + + op = GF_IPC_TARGET_CHANGELOG; + STACK_WIND(frame, br_stub_noop, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->ipc, op, xdata); + +dealloc_dict: + dict_unref(xdata); +out: + return; +} + +/** + * This is how the state machine of sign info works: + * 3 states: + * 1) BR_SIGN_NORMAL => The default State of the inode + * 2) BR_SIGN_REOPEN_WAIT => A release has been sent and is waiting for reopen + * 3) BR_SIGN_QUICK => reopen has happened and this release should trigger sign + * 2 events: + * 1) GF_FOP_RELEASE + * 2) GF_FOP_WRITE (actually a dummy write for BitD) + * + * This is how states are changed based on events: + * EVENT: GF_FOP_RELEASE: + * if (state == BR_SIGN_NORMAL) ; then + * set state = BR_SIGN_REOPEN_WAIT; + * if (state == BR_SIGN_QUICK); then + * set state = BR_SIGN_NORMAL; + * EVENT: GF_FOP_WRITE: + * if (state == BR_SIGN_REOPEN_WAIT); then + * set state = BR_SIGN_QUICK; + */ +br_sign_state_t +__br_stub_inode_sign_state(br_stub_inode_ctx_t *ctx, glusterfs_fop_t fop, + fd_t *fd) +{ + br_sign_state_t sign_info = BR_SIGN_INVALID; + + switch (fop) { + case GF_FOP_FSETXATTR: + sign_info = ctx->info_sign = BR_SIGN_QUICK; + break; + + case GF_FOP_RELEASE: + GF_ASSERT(ctx->info_sign != BR_SIGN_REOPEN_WAIT); + + if (ctx->info_sign == BR_SIGN_NORMAL) { + sign_info = ctx->info_sign = BR_SIGN_REOPEN_WAIT; + } else { + sign_info = ctx->info_sign; + ctx->info_sign = BR_SIGN_NORMAL; + } + + break; + default: + break; + } + + return sign_info; +} + +int32_t +br_stub_release(xlator_t *this, fd_t *fd) +{ + int32_t ret = 0; + int32_t flags = 0; + inode_t *inode = NULL; + unsigned long releaseversion = 0; + br_stub_inode_ctx_t *ctx = NULL; + uint64_t tmp = 0; + br_stub_fd_t *br_stub_fd = NULL; + int32_t signinfo = 0; + + inode = fd->inode; + + LOCK(&inode->lock); + { + ctx = __br_stub_get_ongoing_version_ctx(this, inode, NULL); + if (ctx == NULL) + goto unblock; + br_stub_fd = br_stub_fd_ctx_get(this, fd); + if (br_stub_fd) { + list_del_init(&br_stub_fd->list); + } + + ret = __br_stub_can_trigger_release(inode, ctx, &releaseversion); + if (!ret) + goto unblock; + + signinfo = __br_stub_inode_sign_state(ctx, GF_FOP_RELEASE, fd); + signinfo = htonl(signinfo); + + /* inode back to initital state: mark dirty */ + if (ctx->info_sign == BR_SIGN_NORMAL) { + __br_stub_mark_inode_dirty(ctx); + __br_stub_unset_inode_modified(ctx); + } + } +unblock: + UNLOCK(&inode->lock); + + if (ret) { + gf_msg_debug(this->name, 0, + "releaseversion: %lu | flags: %d " + "| signinfo: %d", + (unsigned long)ntohl(releaseversion), flags, + ntohl(signinfo)); + br_stub_send_ipc_fop(this, fd, releaseversion, signinfo); + } + + ret = fd_ctx_del(fd, this, &tmp); + br_stub_fd = (br_stub_fd_t *)(long)tmp; + + GF_FREE(br_stub_fd); + + return 0; +} + +int32_t +br_stub_releasedir(xlator_t *this, fd_t *fd) +{ + br_stub_fd_t *fctx = NULL; + uint64_t ctx = 0; + int ret = 0; + + ret = fd_ctx_del(fd, this, &ctx); + if (ret < 0) + goto out; + + fctx = (br_stub_fd_t *)(long)ctx; + if (fctx->bad_object.dir) { + ret = sys_closedir(fctx->bad_object.dir); + if (ret) + gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_BAD_OBJ_DIR_CLOSE_FAIL, + "error=%s", strerror(errno), NULL); + } + + GF_FREE(fctx); +out: + return 0; +} + +/** }}} */ + +/** {{{ */ + +/* ictxmerge */ + +void +br_stub_ictxmerge(xlator_t *this, fd_t *fd, inode_t *inode, + inode_t *linked_inode) +{ + int32_t ret = 0; + uint64_t ctxaddr = 0; + uint64_t lctxaddr = 0; + br_stub_inode_ctx_t *ctx = NULL; + br_stub_inode_ctx_t *lctx = NULL; + br_stub_fd_t *br_stub_fd = NULL; + + ret = br_stub_get_inode_ctx(this, inode, &ctxaddr); + if (ret < 0) + goto done; + ctx = (br_stub_inode_ctx_t *)(uintptr_t)ctxaddr; + + LOCK(&linked_inode->lock); + { + ret = __br_stub_get_inode_ctx(this, linked_inode, &lctxaddr); + if (ret < 0) + goto unblock; + lctx = (br_stub_inode_ctx_t *)(uintptr_t)lctxaddr; + + GF_ASSERT(list_is_singular(&ctx->fd_list)); + br_stub_fd = list_first_entry(&ctx->fd_list, br_stub_fd_t, list); + if (br_stub_fd) { + GF_ASSERT(br_stub_fd->fd == fd); + list_move_tail(&br_stub_fd->list, &lctx->fd_list); + } + } +unblock: + UNLOCK(&linked_inode->lock); + +done: + return; +} + +/** }}} */ + +struct xlator_fops fops = { + .lookup = br_stub_lookup, + .stat = br_stub_stat, + .fstat = br_stub_fstat, + .open = br_stub_open, + .create = br_stub_create, + .readdirp = br_stub_readdirp, + .getxattr = br_stub_getxattr, + .fgetxattr = br_stub_fgetxattr, + .fsetxattr = br_stub_fsetxattr, + .writev = br_stub_writev, + .truncate = br_stub_truncate, + .ftruncate = br_stub_ftruncate, + .mknod = br_stub_mknod, + .readv = br_stub_readv, + .removexattr = br_stub_removexattr, + .fremovexattr = br_stub_fremovexattr, + .setxattr = br_stub_setxattr, + .opendir = br_stub_opendir, + .readdir = br_stub_readdir, + .unlink = br_stub_unlink, +}; + +struct xlator_cbks cbks = { + .forget = br_stub_forget, + .release = br_stub_release, + .ictxmerge = br_stub_ictxmerge, +}; + +struct volume_options options[] = { + {.key = {"bitrot"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .op_version = {GD_OP_VERSION_3_7_0}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_FORCE, + .tags = {"bitrot"}, + .description = "enable/disable bitrot stub"}, + {.key = {"export"}, + .type = GF_OPTION_TYPE_PATH, + .op_version = {GD_OP_VERSION_3_7_0}, + .tags = {"bitrot"}, + .description = "brick path for versioning", + .default_value = "{{ brick.path }}"}, + {.key = {NULL}}, +}; + +xlator_api_t xlator_api = { + .init = init, + .fini = fini, + .notify = notify, + .reconfigure = reconfigure, + .mem_acct_init = mem_acct_init, + .op_version = {1}, /* Present from the initial version */ + .fops = &fops, + .cbks = &cbks, + .options = options, + .identifier = "bitrot-stub", + .category = GF_MAINTAINED, +}; diff --git a/xlators/features/bit-rot/src/stub/bit-rot-stub.h b/xlators/features/bit-rot/src/stub/bit-rot-stub.h new file mode 100644 index 00000000000..edd79a77e4f --- /dev/null +++ b/xlators/features/bit-rot/src/stub/bit-rot-stub.h @@ -0,0 +1,515 @@ +/* + Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ +#ifndef __BIT_ROT_STUB_H__ +#define __BIT_ROT_STUB_H__ + +#include <glusterfs/glusterfs.h> +#include <glusterfs/logging.h> +#include <glusterfs/dict.h> +#include <glusterfs/xlator.h> +#include <glusterfs/defaults.h> +#include <glusterfs/call-stub.h> +#include "bit-rot-stub-mem-types.h" +#include <glusterfs/syscall.h> +#include <glusterfs/common-utils.h> +#include "bit-rot-common.h" +#include "bit-rot-stub-messages.h" +#include "glusterfs3-xdr.h" +#include <glusterfs/syncop.h> +#include <glusterfs/syncop-utils.h> + +#define BAD_OBJECT_THREAD_STACK_SIZE ((size_t)(1024 * 1024)) +#define BR_STUB_DUMP_STR_SIZE 65536 + +#define BR_PATH_MAX_EXTRA (PATH_MAX + 1024) +#define BR_PATH_MAX_PLUS (PATH_MAX + 2048) + +/* + * Oops. Spelling mistake. Correcting it + */ +#define OLD_BR_STUB_QUARANTINE_DIR GF_HIDDEN_PATH "/quanrantine" +#define BR_STUB_QUARANTINE_DIR GF_HIDDEN_PATH "/quarantine" + +/* do not reference frame->local in cbk unless initialized. + * Assigned 0x1 marks verisoning flag between call path and + * cbk path. + */ +#define BR_STUB_VER_NOT_ACTIVE_THEN_GOTO(frame, priv, label) \ + do { \ + if (priv->do_versioning) \ + frame->local = (void *)0x1; \ + else \ + goto label; \ + } while (0) + +#define BR_STUB_VER_COND_GOTO(priv, cond, label) \ + do { \ + if (!priv->do_versioning || cond) \ + goto label; \ + } while (0) + +#define BR_STUB_VER_ENABLED_IN_CALLPATH(frame, flag) \ + do { \ + if (frame->local) \ + flag = _gf_true; \ + if (frame->local == (void *)0x1) \ + frame->local = NULL; \ + } while (0) + +#define BR_STUB_RESET_LOCAL_NULL(frame) \ + do { \ + if (frame->local == (void *)0x1) \ + frame->local = NULL; \ + } while (0) + +typedef int(br_stub_version_cbk)(call_frame_t *, void *, xlator_t *, int32_t, + int32_t, dict_t *); + +typedef struct br_stub_inode_ctx { + int need_writeback; /* does the inode need + a writeback to disk? */ + unsigned long currentversion; /* ongoing version */ + + int info_sign; + struct list_head fd_list; /* list of open fds or fds participating in + write operations */ + gf_boolean_t bad_object; +} br_stub_inode_ctx_t; + +typedef struct br_stub_fd { + fd_t *fd; + struct list_head list; + struct bad_object_dir { + DIR *dir; + off_t dir_eof; + } bad_object; +} br_stub_fd_t; + +#define I_DIRTY (1 << 0) /* inode needs writeback */ +#define I_MODIFIED (1 << 1) +#define WRITEBACK_DURABLE 1 /* writeback is durable */ + +/** + * This could just have been a plain struct without unions and all, + * but we may need additional things in the future. + */ +typedef struct br_stub_local { + call_stub_t *fopstub; /* stub for original fop */ + + int versioningtype; /* not much used atm */ + + union { + struct br_stub_ctx { + fd_t *fd; + uuid_t gfid; + inode_t *inode; + unsigned long version; + } context; + } u; +} br_stub_local_t; + +#define BR_STUB_NO_VERSIONING (1 << 0) +#define BR_STUB_INCREMENTAL_VERSIONING (1 << 1) + +typedef struct br_stub_private { + gf_boolean_t do_versioning; + + uint32_t boot[2]; + char export[PATH_MAX]; + + pthread_mutex_t lock; + pthread_cond_t cond; + + struct list_head squeue; /* ordered signing queue */ + pthread_t signth; + struct bad_objects_container { + pthread_t thread; + pthread_mutex_t bad_lock; + pthread_cond_t bad_cond; + struct list_head bad_queue; + } container; + struct mem_pool *local_pool; + + char stub_basepath[BR_PATH_MAX_EXTRA]; + + uuid_t bad_object_dir_gfid; +} br_stub_private_t; + +br_stub_fd_t * +br_stub_fd_new(void); + +int +__br_stub_fd_ctx_set(xlator_t *this, fd_t *fd, br_stub_fd_t *br_stub_fd); + +br_stub_fd_t * +__br_stub_fd_ctx_get(xlator_t *this, fd_t *fd); + +br_stub_fd_t * +br_stub_fd_ctx_get(xlator_t *this, fd_t *fd); + +int32_t +br_stub_fd_ctx_set(xlator_t *this, fd_t *fd, br_stub_fd_t *br_stub_fd); + +static inline gf_boolean_t +__br_stub_is_bad_object(br_stub_inode_ctx_t *ctx) +{ + return ctx->bad_object; +} + +static inline void +__br_stub_mark_object_bad(br_stub_inode_ctx_t *ctx) +{ + ctx->bad_object = _gf_true; +} + +/* inode writeback helpers */ +static inline void +__br_stub_mark_inode_dirty(br_stub_inode_ctx_t *ctx) +{ + ctx->need_writeback |= I_DIRTY; +} + +static inline void +__br_stub_mark_inode_synced(br_stub_inode_ctx_t *ctx) +{ + ctx->need_writeback &= ~I_DIRTY; +} + +static inline int +__br_stub_is_inode_dirty(br_stub_inode_ctx_t *ctx) +{ + return (ctx->need_writeback & I_DIRTY); +} + +/* inode mofification markers */ +static inline void +__br_stub_set_inode_modified(br_stub_inode_ctx_t *ctx) +{ + ctx->need_writeback |= I_MODIFIED; +} + +static inline void +__br_stub_unset_inode_modified(br_stub_inode_ctx_t *ctx) +{ + ctx->need_writeback &= ~I_MODIFIED; +} + +static inline int +__br_stub_is_inode_modified(br_stub_inode_ctx_t *ctx) +{ + return (ctx->need_writeback & I_MODIFIED); +} + +static inline int +br_stub_require_release_call(xlator_t *this, fd_t *fd, br_stub_fd_t **fd_ctx) +{ + int32_t ret = 0; + br_stub_fd_t *br_stub_fd = NULL; + + br_stub_fd = br_stub_fd_new(); + if (!br_stub_fd) + return -1; + + br_stub_fd->fd = fd; + INIT_LIST_HEAD(&br_stub_fd->list); + + ret = br_stub_fd_ctx_set(this, fd, br_stub_fd); + if (ret) + gf_smsg(this->name, GF_LOG_WARNING, 0, BRS_MSG_SET_CONTEXT_FAILED, + NULL); + else + *fd_ctx = br_stub_fd; + + return ret; +} + +/* get/set inode context helpers */ + +static inline int +__br_stub_get_inode_ctx(xlator_t *this, inode_t *inode, uint64_t *ctx) +{ + return __inode_ctx_get(inode, this, ctx); +} + +static inline int +br_stub_get_inode_ctx(xlator_t *this, inode_t *inode, uint64_t *ctx) +{ + int ret = -1; + + LOCK(&inode->lock); + { + ret = __br_stub_get_inode_ctx(this, inode, ctx); + } + UNLOCK(&inode->lock); + + return ret; +} + +static inline int +br_stub_set_inode_ctx(xlator_t *this, inode_t *inode, br_stub_inode_ctx_t *ctx) +{ + uint64_t ctx_addr = (uint64_t)(uintptr_t)ctx; + return inode_ctx_set(inode, this, &ctx_addr); +} + +/* version get/set helpers */ + +static inline unsigned long +__br_stub_writeback_version(br_stub_inode_ctx_t *ctx) +{ + return (ctx->currentversion + 1); +} + +static inline void +__br_stub_set_ongoing_version(br_stub_inode_ctx_t *ctx, unsigned long version) +{ + if (ctx->currentversion < version) + ctx->currentversion = version; + else + gf_smsg("bit-rot-stub", GF_LOG_WARNING, 0, + BRS_MSG_CHANGE_VERSION_FAILED, "current version=%lu", + ctx->currentversion, "new version=%lu", version, NULL); +} + +static inline int +__br_stub_can_trigger_release(inode_t *inode, br_stub_inode_ctx_t *ctx, + unsigned long *version) +{ + /** + * If the inode is modified, then it has to be dirty. An inode is + * marked dirty once version is increased. Its marked as modified + * when the modification call (write/truncate) which triggered + * the versioning is successful. + */ + if (__br_stub_is_inode_modified(ctx) && list_empty(&ctx->fd_list) && + (ctx->info_sign != BR_SIGN_REOPEN_WAIT)) { + GF_ASSERT(__br_stub_is_inode_dirty(ctx) == 0); + + if (version) + *version = htonl(ctx->currentversion); + return 1; + } + + return 0; +} + +static inline int32_t +br_stub_get_ongoing_version(xlator_t *this, inode_t *inode, + unsigned long *version) +{ + int32_t ret = 0; + uint64_t ctx_addr = 0; + br_stub_inode_ctx_t *ctx = NULL; + + LOCK(&inode->lock); + { + ret = __inode_ctx_get(inode, this, &ctx_addr); + if (ret < 0) + goto unblock; + ctx = (br_stub_inode_ctx_t *)(long)ctx_addr; + *version = ctx->currentversion; + } +unblock: + UNLOCK(&inode->lock); + + return ret; +} + +/** + * fetch the current version from inode and return the context. + * inode->lock should be held before invoking this as context + * *needs* to be valid in the caller. + */ +static inline br_stub_inode_ctx_t * +__br_stub_get_ongoing_version_ctx(xlator_t *this, inode_t *inode, + unsigned long *version) +{ + int32_t ret = 0; + uint64_t ctx_addr = 0; + br_stub_inode_ctx_t *ctx = NULL; + + ret = __inode_ctx_get(inode, this, &ctx_addr); + if (ret < 0) + return NULL; + ctx = (br_stub_inode_ctx_t *)(long)ctx_addr; + if (version) + *version = ctx->currentversion; + + return ctx; +} + +/* filter for xattr fetch */ +static inline int +br_stub_is_internal_xattr(const char *name) +{ + if (name && ((strncmp(name, BITROT_CURRENT_VERSION_KEY, + SLEN(BITROT_CURRENT_VERSION_KEY)) == 0) || + (strncmp(name, BITROT_SIGNING_VERSION_KEY, + SLEN(BITROT_SIGNING_VERSION_KEY)) == 0))) + return 1; + return 0; +} + +static inline void +br_stub_remove_vxattrs(dict_t *xattr, gf_boolean_t remove_bad_marker) +{ + if (xattr) { + /* + * When a file is corrupted, bad-object should be + * set in the dict. But, other info such as version, + * signature etc should not be set. Hence the flag + * remove_bad_marker. The consumer should know whether + * to send the bad-object info in the dict or not. + */ + if (remove_bad_marker) + dict_del(xattr, BITROT_OBJECT_BAD_KEY); + dict_del(xattr, BITROT_CURRENT_VERSION_KEY); + dict_del(xattr, BITROT_SIGNING_VERSION_KEY); + dict_del(xattr, BITROT_SIGNING_XATTR_SIZE_KEY); + } +} + +/** + * This function returns the below values for different situations + * 0 => as per the inode context object is not bad + * -1 => Failed to get the inode context itself + * -2 => As per the inode context object is bad + * Both -ve values means the fop which called this function is failed + * and error is returned upwards. + * In future if needed or more errors have to be handled, then those + * errors can be made into enums. + */ +static inline int +br_stub_is_bad_object(xlator_t *this, inode_t *inode) +{ + int bad_object = 0; + gf_boolean_t tmp = _gf_false; + uint64_t ctx_addr = 0; + br_stub_inode_ctx_t *ctx = NULL; + int32_t ret = -1; + + ret = br_stub_get_inode_ctx(this, inode, &ctx_addr); + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_GET_INODE_CONTEXT_FAILED, + "inode-gfid=%s", uuid_utoa(inode->gfid), NULL); + bad_object = -1; + goto out; + } + + ctx = (br_stub_inode_ctx_t *)(long)ctx_addr; + + LOCK(&inode->lock); + { + tmp = __br_stub_is_bad_object(ctx); + if (tmp) + bad_object = -2; + } + UNLOCK(&inode->lock); + +out: + return bad_object; +} + +static inline int32_t +br_stub_mark_object_bad(xlator_t *this, inode_t *inode) +{ + int32_t ret = -1; + uint64_t ctx_addr = 0; + br_stub_inode_ctx_t *ctx = NULL; + + ret = br_stub_get_inode_ctx(this, inode, &ctx_addr); + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_GET_INODE_CONTEXT_FAILED, + "inode-gfid=%s", uuid_utoa(inode->gfid), NULL); + goto out; + } + + ctx = (br_stub_inode_ctx_t *)(long)ctx_addr; + + LOCK(&inode->lock); + { + __br_stub_mark_object_bad(ctx); + } + UNLOCK(&inode->lock); + +out: + return ret; +} + +/** + * There is a possibility that dict_set might fail. The o/p of dict_set is + * given to the caller and the caller has to decide what to do. + */ +static inline int32_t +br_stub_mark_xdata_bad_object(xlator_t *this, inode_t *inode, dict_t *xdata) +{ + int32_t ret = 0; + + if (br_stub_is_bad_object(this, inode) == -2) + ret = dict_set_int32(xdata, GLUSTERFS_BAD_INODE, 1); + + return ret; +} + +int32_t +br_stub_add_fd_to_inode(xlator_t *this, fd_t *fd, br_stub_inode_ctx_t *ctx); + +br_sign_state_t +__br_stub_inode_sign_state(br_stub_inode_ctx_t *ctx, glusterfs_fop_t fop, + fd_t *fd); + +int +br_stub_dir_create(xlator_t *this, br_stub_private_t *priv); + +int +br_stub_add(xlator_t *this, uuid_t gfid); + +int32_t +br_stub_create_stub_gfid(xlator_t *this, char *stub_gfid_path, uuid_t gfid); + +int +br_stub_dir_create(xlator_t *this, br_stub_private_t *priv); + +call_stub_t * +__br_stub_dequeue(struct list_head *callstubs); + +void +__br_stub_enqueue(struct list_head *callstubs, call_stub_t *stub); + +void +br_stub_worker_enqueue(xlator_t *this, call_stub_t *stub); + +void * +br_stub_worker(void *data); + +int32_t +br_stub_lookup_wrapper(call_frame_t *frame, xlator_t *this, loc_t *loc, + dict_t *xattr_req); + +int32_t +br_stub_readdir_wrapper(call_frame_t *frame, xlator_t *this, fd_t *fd, + size_t size, off_t off, dict_t *xdata); + +int +br_stub_del(xlator_t *this, uuid_t gfid); + +int +br_stub_bad_objects_path(xlator_t *this, fd_t *fd, gf_dirent_t *entries, + dict_t **dict); + +void +br_stub_entry_xattr_fill(xlator_t *this, char *hpath, gf_dirent_t *entry, + dict_t *dict); + +int +br_stub_get_path_of_gfid(xlator_t *this, inode_t *parent, inode_t *inode, + uuid_t gfid, char **path); + +#endif /* __BIT_ROT_STUB_H__ */ |
