diff options
Diffstat (limited to 'xlators/features')
79 files changed, 11505 insertions, 4160 deletions
diff --git a/xlators/features/Makefile.am b/xlators/features/Makefile.am index 6a73301d7..d2f5ef192 100644 --- a/xlators/features/Makefile.am +++ b/xlators/features/Makefile.am @@ -1,4 +1,4 @@ SUBDIRS = locks quota read-only mac-compat quiesce marker index \ - protect $(GLUPY_SUBDIR) # trash path-converter # filter + protect compress changelog gfid-access $(GLUPY_SUBDIR) qemu-block # trash path-converter # filter CLEANFILES = diff --git a/xlators/features/changelog/Makefile.am b/xlators/features/changelog/Makefile.am new file mode 100644 index 000000000..153bb6850 --- /dev/null +++ b/xlators/features/changelog/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = src lib + +CLEANFILES = diff --git a/xlators/features/changelog/lib/Makefile.am b/xlators/features/changelog/lib/Makefile.am new file mode 100644 index 000000000..a985f42a8 --- /dev/null +++ b/xlators/features/changelog/lib/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = src + +CLEANFILES = diff --git a/xlators/features/changelog/lib/examples/c/get-changes.c b/xlators/features/changelog/lib/examples/c/get-changes.c new file mode 100644 index 000000000..14562585a --- /dev/null +++ b/xlators/features/changelog/lib/examples/c/get-changes.c @@ -0,0 +1,87 @@ +/* + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +/** + * get set of new changes every 10 seconds (just print the file names) + * + * Compile it using: + * gcc -o getchanges `pkg-config --cflags libgfchangelog` get-changes.c \ + * `pkg-config --libs libgfchangelog` + */ + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <sys/un.h> +#include <limits.h> +#include <sys/socket.h> +#include <sys/types.h> +#include <errno.h> + +#include "changelog.h" + +#define handle_error(fn) \ + printf ("%s (reason: %s)\n", fn, strerror (errno)) + +int +main (int argc, char ** argv) +{ + int i = 0; + int ret = 0; + ssize_t nr_changes = 0; + ssize_t changes = 0; + char fbuf[PATH_MAX] = {0,}; + + /* get changes for brick "/home/vshankar/export/yow/yow-1" */ + ret = gf_changelog_register ("/home/vshankar/export/yow/yow-1", + "/tmp/scratch", "/tmp/change.log", 9, 5); + if (ret) { + handle_error ("register failed"); + goto out; + } + + while (1) { + i = 0; + nr_changes = gf_changelog_scan (); + if (nr_changes < 0) { + handle_error ("scan(): "); + break; + } + + if (nr_changes == 0) + goto next; + + printf ("Got %ld changelog files\n", nr_changes); + + while ( (changes = + gf_changelog_next_change (fbuf, PATH_MAX)) > 0) { + printf ("changelog file [%d]: %s\n", ++i, fbuf); + + /* process changelog */ + /* ... */ + /* ... */ + /* ... */ + /* done processing */ + + ret = gf_changelog_done (fbuf); + if (ret) + handle_error ("gf_changelog_done"); + } + + if (changes == -1) + handle_error ("gf_changelog_next_change"); + + next: + sleep (10); + } + + out: + return ret; +} diff --git a/xlators/features/changelog/lib/examples/python/changes.py b/xlators/features/changelog/lib/examples/python/changes.py new file mode 100644 index 000000000..d21db8eab --- /dev/null +++ b/xlators/features/changelog/lib/examples/python/changes.py @@ -0,0 +1,32 @@ +#!/usr/bin/python + +import os +import sys +import time +import libgfchangelog + +cl = libgfchangelog.Changes() + +def get_changes(brick, scratch_dir, log_file, log_level, interval): + change_list = [] + try: + cl.cl_register(brick, scratch_dir, log_file, log_level) + while True: + cl.cl_scan() + change_list = cl.cl_getchanges() + if change_list: + print change_list + for change in change_list: + print('done with %s' % (change)) + cl.cl_done(change) + time.sleep(interval) + except OSError: + ex = sys.exc_info()[1] + print ex + +if __name__ == '__main__': + if len(sys.argv) != 5: + print("usage: %s <brick> <scratch-dir> <log-file> <fetch-interval>" + % (sys.argv[0])) + sys.exit(1) + get_changes(sys.argv[1], sys.argv[2], sys.argv[3], 9, int(sys.argv[4])) diff --git a/xlators/features/changelog/lib/examples/python/libgfchangelog.py b/xlators/features/changelog/lib/examples/python/libgfchangelog.py new file mode 100644 index 000000000..68ec3baf1 --- /dev/null +++ b/xlators/features/changelog/lib/examples/python/libgfchangelog.py @@ -0,0 +1,64 @@ +import os +from ctypes import * +from ctypes.util import find_library + +class Changes(object): + libgfc = CDLL(find_library("gfchangelog"), use_errno=True) + + @classmethod + def geterrno(cls): + return get_errno() + + @classmethod + def raise_oserr(cls): + errn = cls.geterrno() + raise OSError(errn, os.strerror(errn)) + + @classmethod + def _get_api(cls, call): + return getattr(cls.libgfc, call) + + @classmethod + def cl_register(cls, brick, path, log_file, log_level, retries = 0): + ret = cls._get_api('gf_changelog_register')(brick, path, + log_file, log_level, retries) + if ret == -1: + cls.raise_oserr() + + @classmethod + def cl_scan(cls): + ret = cls._get_api('gf_changelog_scan')() + if ret == -1: + cls.raise_oserr() + + @classmethod + def cl_startfresh(cls): + ret = cls._get_api('gf_changelog_start_fresh')() + if ret == -1: + cls.raise_oserr() + + @classmethod + def cl_getchanges(cls): + """ remove hardcoding for path name length """ + def clsort(f): + return f.split('.')[-1] + changes = [] + buf = create_string_buffer('\0', 4096) + call = cls._get_api('gf_changelog_next_change') + + while True: + ret = call(buf, 4096) + if ret in (0, -1): + break; + changes.append(buf.raw[:ret-1]) + if ret == -1: + cls.raise_oserr() + # cleanup tracker + cls.cl_startfresh() + return sorted(changes, key=clsort) + + @classmethod + def cl_done(cls, clfile): + ret = cls._get_api('gf_changelog_done')(clfile) + if ret == -1: + cls.raise_oserr() diff --git a/xlators/features/changelog/lib/src/Makefile.am b/xlators/features/changelog/lib/src/Makefile.am new file mode 100644 index 000000000..fbaaea628 --- /dev/null +++ b/xlators/features/changelog/lib/src/Makefile.am @@ -0,0 +1,37 @@ +libgfchangelog_la_CFLAGS = -Wall $(GF_CFLAGS) $(GF_DARWIN_LIBGLUSTERFS_CFLAGS) \ + -DDATADIR=\"$(localstatedir)\" + +libgfchangelog_la_CPPFLAGS = $(GF_CPPFLAGS) -D__USE_FILE_OFFSET64 -fpic \ + -I../../../src/ -I$(top_srcdir)/libglusterfs/src \ + -I$(top_srcdir)/xlators/features/changelog/src \ + -DDATADIR=\"$(localstatedir)\" + +libgfchangelog_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la \ + $(GF_GLUSTERFS_LIBS) + +libgfchangelog_la_LDFLAGS = $(GF_LDFLAGS) + +libgfchangelogdir = $(includedir)/glusterfs/gfchangelog +lib_LTLIBRARIES = libgfchangelog.la + +CONTRIB_BUILDDIR = $(top_builddir)/contrib + +libgfchangelog_la_SOURCES = gf-changelog.c gf-changelog-process.c \ + gf-changelog-helpers.c $(CONTRIBDIR)/uuid/clear.c \ + $(CONTRIBDIR)/uuid/copy.c $(CONTRIBDIR)/uuid/gen_uuid.c \ + $(CONTRIBDIR)/uuid/pack.c $(CONTRIBDIR)/uuid/parse.c \ + $(CONTRIBDIR)/uuid/unparse.c $(CONTRIBDIR)/uuid/uuid_time.c \ + $(CONTRIBDIR)/uuid/compare.c $(CONTRIBDIR)/uuid/isnull.c \ + $(CONTRIBDIR)/uuid/unpack.c + +noinst_HEADERS = gf-changelog-helpers.h $(CONTRIBDIR)/uuid/uuidd.h \ + $(CONTRIBDIR)/uuid/uuid.h $(CONTRIBDIR)/uuid/uuidP.h \ + $(CONTRIB_BUILDDIR)/uuid/uuid_types.h + +libgfchangelog_HEADERS = changelog.h + +CLEANFILES = +CONFIG_CLEAN_FILES = $(CONTRIB_BUILDDIR)/uuid/uuid_types.h + +$(top_builddir)/libglusterfs/src/libglusterfs.la: + $(MAKE) -C $(top_builddir)/libglusterfs/src/ all diff --git a/xlators/features/changelog/lib/src/changelog.h b/xlators/features/changelog/lib/src/changelog.h new file mode 100644 index 000000000..5cddfb583 --- /dev/null +++ b/xlators/features/changelog/lib/src/changelog.h @@ -0,0 +1,31 @@ +/* + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef _GF_CHANGELOG_H +#define _GF_CHANGELOG_H + +/* API set */ + +int +gf_changelog_register (char *brick_path, char *scratch_dir, + char *log_file, int log_levl, int max_reconnects); +ssize_t +gf_changelog_scan (); + +int +gf_changelog_start_fresh (); + +ssize_t +gf_changelog_next_change (char *bufptr, size_t maxlen); + +int +gf_changelog_done (char *file); + +#endif diff --git a/xlators/features/changelog/lib/src/gf-changelog-helpers.c b/xlators/features/changelog/lib/src/gf-changelog-helpers.c new file mode 100644 index 000000000..1eef8bf04 --- /dev/null +++ b/xlators/features/changelog/lib/src/gf-changelog-helpers.c @@ -0,0 +1,180 @@ +/* + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#include "changelog-mem-types.h" +#include "gf-changelog-helpers.h" + +ssize_t gf_changelog_read_path (int fd, char *buffer, size_t bufsize) +{ + return read (fd, buffer, bufsize); +} + +size_t +gf_changelog_write (int fd, char *buffer, size_t len) +{ + ssize_t size = 0; + size_t writen = 0; + + while (writen < len) { + size = write (fd, + buffer + writen, len - writen); + if (size <= 0) + break; + + writen += size; + } + + return writen; +} + +void +gf_rfc3986_encode (unsigned char *s, char *enc, char *estr) +{ + for (; *s; s++) { + if (estr[*s]) + sprintf(enc, "%c", estr[*s]); + else + sprintf(enc, "%%%02X", *s); + while (*++enc); + } +} + +/** + * thread safe version of readline with buffering + * (taken from Unix Network Programming Volume I, W.R. Stevens) + * + * This is favoured over fgets() as we'd need to ftruncate() + * (see gf_changelog_scan() API) to record new changelog files. + * stream open functions does have a truncate like api (although + * that can be done via @fflush(fp), @ftruncate(fd) and @fseek(fp), + * but this involves mixing POSIX file descriptors and stream FILE *). + * + * NOTE: This implmentation still does work with more than one fd's + * used to perform gf_readline(). For this very reason it's not + * made a part of libglusterfs. + */ + +static pthread_key_t rl_key; +static pthread_once_t rl_once = PTHREAD_ONCE_INIT; + +static void +readline_destructor (void *ptr) +{ + GF_FREE (ptr); +} + +static void +readline_once (void) +{ + pthread_key_create (&rl_key, readline_destructor); +} + +static ssize_t +my_read (read_line_t *tsd, int fd, char *ptr) +{ + if (tsd->rl_cnt <= 0) { + if ( (tsd->rl_cnt = read (fd, tsd->rl_buf, MAXLINE)) < 0 ) + return -1; + else if (tsd->rl_cnt == 0) + return 0; + tsd->rl_bufptr = tsd->rl_buf; + } + + tsd->rl_cnt--; + *ptr = *tsd->rl_bufptr++; + return 1; +} + +static int +gf_readline_init_once (read_line_t **tsd) +{ + if (pthread_once (&rl_once, readline_once) != 0) + return -1; + + *tsd = pthread_getspecific (rl_key); + if (*tsd) + goto out; + + *tsd = GF_CALLOC (1, sizeof (**tsd), + gf_changelog_mt_libgfchangelog_rl_t); + if (!*tsd) + return -1; + + if (pthread_setspecific (rl_key, *tsd) != 0) + return -1; + + out: + return 0; +} + +ssize_t +gf_readline (int fd, void *vptr, size_t maxlen) +{ + size_t n = 0; + size_t rc = 0; + char c = ' '; + char *ptr = NULL; + read_line_t *tsd = NULL; + + if (gf_readline_init_once (&tsd)) + return -1; + + ptr = vptr; + for (n = 1; n < maxlen; n++) { + if ( (rc = my_read (tsd, fd, &c)) == 1 ) { + *ptr++ = c; + if (c == '\n') + break; + } else if (rc == 0) { + *ptr = '\0'; + return (n - 1); + } else + return -1; + } + + *ptr = '\0'; + return n; + +} + +off_t +gf_lseek (int fd, off_t offset, int whence) +{ + off_t off = 0; + read_line_t *tsd = NULL; + + if (gf_readline_init_once (&tsd)) + return -1; + + if ( (off = lseek (fd, offset, whence)) == -1) + return -1; + + tsd->rl_cnt = 0; + tsd->rl_bufptr = tsd->rl_buf; + + return off; +} + +int +gf_ftruncate (int fd, off_t length) +{ + read_line_t *tsd = NULL; + + if (gf_readline_init_once (&tsd)) + return -1; + + if (ftruncate (fd, 0)) + return -1; + + tsd->rl_cnt = 0; + tsd->rl_bufptr = tsd->rl_buf; + + return 0; +} diff --git a/xlators/features/changelog/lib/src/gf-changelog-helpers.h b/xlators/features/changelog/lib/src/gf-changelog-helpers.h new file mode 100644 index 000000000..3aa6ed7b8 --- /dev/null +++ b/xlators/features/changelog/lib/src/gf-changelog-helpers.h @@ -0,0 +1,97 @@ +/* + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef _GF_CHANGELOG_HELPERS_H +#define _GF_CHANGELOG_HELPERS_H + +#include <unistd.h> +#include <dirent.h> +#include <limits.h> +#include <pthread.h> + +#include <xlator.h> + +#define GF_CHANGELOG_TRACKER "tracker" + +#define GF_CHANGELOG_CURRENT_DIR ".current" +#define GF_CHANGELOG_PROCESSED_DIR ".processed" +#define GF_CHANGELOG_PROCESSING_DIR ".processing" + +#ifndef MAXLINE +#define MAXLINE 4096 +#endif + +#define GF_CHANGELOG_FILL_BUFFER(ptr, ascii, off, len) do { \ + memcpy (ascii + off, ptr, len); \ + off += len; \ + } while (0) + +typedef struct read_line { + int rl_cnt; + char *rl_bufptr; + char rl_buf[MAXLINE]; +} read_line_t; + +typedef struct gf_changelog { + xlator_t *this; + + /* 'processing' directory stream */ + DIR *gfc_dir; + + /* fd to the tracker file */ + int gfc_fd; + + /* connection retries */ + int gfc_connretries; + + char gfc_sockpath[PATH_MAX]; + + char gfc_brickpath[PATH_MAX]; + + /* socket for recieving notifications */ + int gfc_sockfd; + + char *gfc_working_dir; + + /* RFC 3986 string encoding */ + char rfc3986[256]; + + char gfc_current_dir[PATH_MAX]; + char gfc_processed_dir[PATH_MAX]; + char gfc_processing_dir[PATH_MAX]; + + pthread_t gfc_changelog_processor; +} gf_changelog_t; + +int +gf_changelog_notification_init (xlator_t *this, gf_changelog_t *gfc); + +void * +gf_changelog_process (void *data); + +ssize_t +gf_changelog_read_path (int fd, char *buffer, size_t bufsize); + +void +gf_rfc3986_encode (unsigned char *s, char *enc, char *estr); + +size_t +gf_changelog_write (int fd, char *buffer, size_t len); + +ssize_t +gf_readline (int fd, void *vptr, size_t maxlen); + +int +gf_ftruncate (int fd, off_t length); + +off_t +gf_lseek (int fd, off_t offset, int whence); + +#endif diff --git a/xlators/features/changelog/lib/src/gf-changelog-process.c b/xlators/features/changelog/lib/src/gf-changelog-process.c new file mode 100644 index 000000000..df7204931 --- /dev/null +++ b/xlators/features/changelog/lib/src/gf-changelog-process.c @@ -0,0 +1,571 @@ +/* + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#include <unistd.h> +#include <pthread.h> + +#include "uuid.h" +#include "globals.h" +#include "glusterfs.h" + +#include "gf-changelog-helpers.h" + +/* from the changelog translator */ +#include "changelog-misc.h" + +extern int byebye; + +/** + * number of gfid records after fop number + */ +int nr_gfids[] = { + [GF_FOP_MKNOD] = 1, + [GF_FOP_MKDIR] = 1, + [GF_FOP_UNLINK] = 1, + [GF_FOP_RMDIR] = 1, + [GF_FOP_SYMLINK] = 1, + [GF_FOP_RENAME] = 2, + [GF_FOP_LINK] = 1, + [GF_FOP_CREATE] = 1, +}; + +static char * +binary_to_ascii (uuid_t uuid) +{ + return uuid_utoa (uuid); +} + +static char * +conv_noop (char *ptr) { return ptr; } + +#define VERIFY_SEPARATOR(ptr, plen, perr) \ + { \ + if (*(ptr + plen) != '\0') { \ + perr = 1; \ + break; \ + } \ + } + +#define MOVER_MOVE(mover, nleft, bytes) \ + { \ + mover += bytes; \ + nleft -= bytes; \ + } \ + +#define PARSE_GFID(mov, ptr, le, fn, perr) \ + { \ + VERIFY_SEPARATOR (mov, le, perr); \ + ptr = fn (mov); \ + if (!ptr) { \ + perr = 1; \ + break; \ + } \ + } + +#define FILL_AND_MOVE(pt, buf, of, mo, nl, le) \ + { \ + GF_CHANGELOG_FILL_BUFFER (pt, buf, of, strlen (pt)); \ + MOVER_MOVE (mo, nl, le); \ + } + + +#define PARSE_GFID_MOVE(ptr, uuid, mover, nleft, perr) \ + { \ + memcpy (uuid, mover, sizeof (uuid_t)); \ + ptr = binary_to_ascii (uuid); \ + if (!ptr) { \ + perr = 1; \ + break; \ + } \ + MOVER_MOVE (mover, nleft, sizeof (uuid_t)); \ + } \ + +#define LINE_BUFSIZE 3*PATH_MAX /* enough buffer for extra chars too */ + +/** + * using mmap() makes parsing easy. fgets() cannot be used here as + * the binary gfid could contain a line-feed (0x0A), in that case fgets() + * would read an incomplete line and parsing would fail. using POSIX fds + * would result is additional code to maintain state in case of partial + * reads of data (where multiple entries do not fit extirely in the buffer). + * + * mmap() gives the flexibility of pointing to an offset in the file + * without us worrying about reading it in memory (VM does that for us for + * free). + */ + +static int +gf_changelog_parse_binary (xlator_t *this, + gf_changelog_t *gfc, int from_fd, int to_fd, + size_t start_offset, struct stat *stbuf) + +{ + int ret = -1; + off_t off = 0; + off_t nleft = 0; + uuid_t uuid = {0,}; + char *ptr = NULL; + char *bname_start = NULL; + char *bname_end = NULL; + char *mover = NULL; + char *start = NULL; + char current_mover = ' '; + size_t blen = 0; + int parse_err = 0; + char ascii[LINE_BUFSIZE] = {0,}; + + nleft = stbuf->st_size; + + start = (char *) mmap (NULL, nleft, + PROT_READ, MAP_PRIVATE, from_fd, 0); + if (!start) { + gf_log (this->name, GF_LOG_ERROR, + "mmap() error (reason: %s)", strerror (errno)); + goto out; + } + + mover = start; + + MOVER_MOVE (mover, nleft, start_offset); + + while (nleft > 0) { + + off = blen = 0; + ptr = bname_start = bname_end = NULL; + + current_mover = *mover; + + switch (current_mover) { + case 'D': + case 'M': + MOVER_MOVE (mover, nleft, 1); + PARSE_GFID_MOVE (ptr, uuid, mover, nleft, parse_err); + + break; + + case 'E': + MOVER_MOVE (mover, nleft, 1); + PARSE_GFID_MOVE (ptr, uuid, mover, nleft, parse_err); + + bname_start = mover; + if ( (bname_end = strchr (mover, '\n')) == NULL ) { + parse_err = 1; + break; + } + + blen = bname_end - bname_start; + MOVER_MOVE (mover, nleft, blen); + + break; + + default: + parse_err = 1; + } + + if (parse_err) + break; + + GF_CHANGELOG_FILL_BUFFER (¤t_mover, ascii, off, 1); + GF_CHANGELOG_FILL_BUFFER (" ", ascii, off, 1); + GF_CHANGELOG_FILL_BUFFER (ptr, ascii, off, strlen (ptr)); + if (blen) + GF_CHANGELOG_FILL_BUFFER (bname_start, + ascii, off, blen); + GF_CHANGELOG_FILL_BUFFER ("\n", ascii, off, 1); + + if (gf_changelog_write (to_fd, ascii, off) != off) { + gf_log (this->name, GF_LOG_ERROR, + "processing binary changelog failed due to " + " error in writing ascii change (reason: %s)", + strerror (errno)); + break; + } + + MOVER_MOVE (mover, nleft, 1); + } + + if ( (nleft == 0) && (!parse_err)) + ret = 0; + + if (munmap (start, stbuf->st_size)) + gf_log (this->name, GF_LOG_ERROR, + "munmap() error (reason: %s)", strerror (errno)); + out: + return ret; +} + +/** + * ascii decoder: + * - separate out one entry from another + * - use fop name rather than fop number + */ +static int +gf_changelog_parse_ascii (xlator_t *this, + gf_changelog_t *gfc, int from_fd, int to_fd, + size_t start_offset, struct stat *stbuf) +{ + int ng = 0; + int ret = -1; + int fop = 0; + int len = 0; + off_t off = 0; + off_t nleft = 0; + char *ptr = NULL; + char *eptr = NULL; + char *start = NULL; + char *mover = NULL; + int parse_err = 0; + char current_mover = ' '; + char ascii[LINE_BUFSIZE] = {0,}; + const char *fopname = NULL; + + nleft = stbuf->st_size; + + start = (char *) mmap (NULL, nleft, + PROT_READ, MAP_PRIVATE, from_fd, 0); + if (!start) { + gf_log (this->name, GF_LOG_ERROR, + "mmap() error (reason: %s)", strerror (errno)); + goto out; + } + + mover = start; + + MOVER_MOVE (mover, nleft, start_offset); + + while (nleft > 0) { + off = 0; + current_mover = *mover; + + GF_CHANGELOG_FILL_BUFFER (¤t_mover, ascii, off, 1); + GF_CHANGELOG_FILL_BUFFER (" ", ascii, off, 1); + + switch (current_mover) { + case 'D': + case 'M': + MOVER_MOVE (mover, nleft, 1); + + /* target gfid */ + PARSE_GFID (mover, ptr, UUID_CANONICAL_FORM_LEN, + conv_noop, parse_err); + FILL_AND_MOVE(ptr, ascii, off, + mover, nleft, UUID_CANONICAL_FORM_LEN); + break; + + case 'E': + MOVER_MOVE (mover, nleft, 1); + + /* target gfid */ + PARSE_GFID (mover, ptr, UUID_CANONICAL_FORM_LEN, + conv_noop, parse_err); + FILL_AND_MOVE (ptr, ascii, off, + mover, nleft, UUID_CANONICAL_FORM_LEN); + FILL_AND_MOVE (" ", ascii, off, + mover, nleft, 1); + + /* fop */ + len = strlen (mover); + VERIFY_SEPARATOR (mover, len, parse_err); + + fop = atoi (mover); + if ( (fopname = gf_fop_list[fop]) == NULL) { + parse_err = 1; + break; + } + + MOVER_MOVE (mover, nleft, len); + + len = strlen (fopname); + GF_CHANGELOG_FILL_BUFFER (fopname, ascii, off, len); + + /* pargfid + bname */ + ng = nr_gfids[fop]; + while (ng-- > 0) { + MOVER_MOVE (mover, nleft, 1); + len = strlen (mover); + GF_CHANGELOG_FILL_BUFFER (" ", ascii, off, 1); + + PARSE_GFID (mover, ptr, len, + conv_noop, parse_err); + eptr = calloc (3, strlen (ptr)); + if (!eptr) { + parse_err = 1; + break; + } + + gf_rfc3986_encode ((unsigned char *) ptr, + eptr, gfc->rfc3986); + FILL_AND_MOVE (eptr, ascii, off, + mover, nleft, len); + free (eptr); + } + + break; + default: + parse_err = 1; + } + + if (parse_err) + break; + + GF_CHANGELOG_FILL_BUFFER ("\n", ascii, off, 1); + + if (gf_changelog_write (to_fd, ascii, off) != off) { + gf_log (this->name, GF_LOG_ERROR, + "processing ascii changelog failed due to " + " wrror in writing change (reason: %s)", + strerror (errno)); + break; + } + + MOVER_MOVE (mover, nleft, 1); + + } + + if ( (nleft == 0) && (!parse_err)) + ret = 0; + + if (munmap (start, stbuf->st_size)) + gf_log (this->name, GF_LOG_ERROR, + "munmap() error (reason: %s)", strerror (errno)); + + out: + return ret; +} + +#define COPY_BUFSIZE 8192 +static int +gf_changelog_copy (xlator_t *this, int from_fd, int to_fd) +{ + ssize_t size = 0; + char buffer[COPY_BUFSIZE+1] = {0,}; + + while (1) { + size = read (from_fd, buffer, COPY_BUFSIZE); + if (size <= 0) + break; + + if (gf_changelog_write (to_fd, + buffer, size) != size) { + gf_log (this->name, GF_LOG_ERROR, + "error processing ascii changlog"); + size = -1; + break; + } + } + + return (size < 0 ? -1 : 0); +} + +static int +gf_changelog_decode (xlator_t *this, gf_changelog_t *gfc, int from_fd, + int to_fd, struct stat *stbuf, int *zerob) +{ + int ret = -1; + int encoding = -1; + size_t elen = 0; + char buffer[1024] = {0,}; + + CHANGELOG_GET_ENCODING (from_fd, buffer, 1024, encoding, elen); + if (encoding == -1) /* unknown encoding */ + goto out; + + if (!CHANGELOG_VALID_ENCODING (encoding)) + goto out; + + if (elen == stbuf->st_size) { + *zerob = 1; + goto out; + } + + /** + * start processing after the header + */ + lseek (from_fd, elen, SEEK_SET); + + switch (encoding) { + case CHANGELOG_ENCODE_BINARY: + /** + * this ideally should have been a part of changelog-encoders.c + * (ie. part of the changelog translator). + */ + ret = gf_changelog_parse_binary (this, gfc, from_fd, + to_fd, elen, stbuf); + break; + + case CHANGELOG_ENCODE_ASCII: + ret = gf_changelog_parse_ascii (this, gfc, from_fd, + to_fd, elen, stbuf); + break; + default: + ret = gf_changelog_copy (this, from_fd, to_fd); + } + + out: + return ret; +} + +static int +gf_changelog_consume (xlator_t *this, gf_changelog_t *gfc, char *from_path) +{ + int ret = -1; + int fd1 = 0; + int fd2 = 0; + int zerob = 0; + struct stat stbuf = {0,}; + char dest[PATH_MAX] = {0,}; + char to_path[PATH_MAX] = {0,}; + + ret = stat (from_path, &stbuf); + if (ret || !S_ISREG(stbuf.st_mode)) { + gf_log (this->name, GF_LOG_ERROR, + "stat failed on changelog file: %s", from_path); + goto out; + } + + fd1 = open (from_path, O_RDONLY); + if (fd1 < 0) { + gf_log (this->name, GF_LOG_ERROR, + "cannot open changelog file: %s (reason: %s)", + from_path, strerror (errno)); + goto out; + } + + (void) snprintf (to_path, PATH_MAX, "%s%s", + gfc->gfc_current_dir, basename (from_path)); + (void) snprintf (dest, PATH_MAX, "%s%s", + gfc->gfc_processing_dir, basename (from_path)); + + fd2 = open (to_path, O_CREAT | O_TRUNC | O_RDWR, + S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH); + if (fd2 < 0) { + gf_log (this->name, GF_LOG_ERROR, + "cannot create ascii changelog file %s (reason %s)", + to_path, strerror (errno)); + goto close_fd; + } else { + ret = gf_changelog_decode (this, gfc, fd1, + fd2, &stbuf, &zerob); + + close (fd2); + + if (!ret) { + /* move it to processing on a successfull + decode */ + ret = rename (to_path, dest); + if (ret) + gf_log (this->name, GF_LOG_ERROR, + "error moving %s to processing dir" + " (reason: %s)", to_path, + strerror (errno)); + } + + /* remove it from .current if it's an empty file */ + if (zerob) { + ret = unlink (to_path); + if (ret) + gf_log (this->name, GF_LOG_ERROR, + "could not unlink %s (reason: %s", + to_path, strerror (errno)); + } + } + + close_fd: + close (fd1); + + out: + return ret; +} + +static char * +gf_changelog_ext_change (xlator_t *this, + gf_changelog_t *gfc, char *path, size_t readlen) +{ + int alo = 0; + int ret = 0; + size_t len = 0; + char *buf = NULL; + + buf = path; + while (len < readlen) { + if (*buf == '\0') { + alo = 1; + gf_log (this->name, GF_LOG_DEBUG, + "processing changelog: %s", path); + ret = gf_changelog_consume (this, gfc, path); + } + + if (ret) + break; + + len++; buf++; + if (alo) { + alo = 0; + path = buf; + } + } + + return (ret) ? NULL : path; +} + +void * +gf_changelog_process (void *data) +{ + ssize_t len = 0; + ssize_t offlen = 0; + xlator_t *this = NULL; + char *sbuf = NULL; + gf_changelog_t *gfc = NULL; + char from_path[PATH_MAX] = {0,}; + + gfc = (gf_changelog_t *) data; + this = gfc->this; + + pthread_detach (pthread_self()); + + for (;;) { + len = gf_changelog_read_path (gfc->gfc_sockfd, + from_path + offlen, + PATH_MAX - offlen); + if (len < 0) + continue; /* ignore it for now */ + + if (len == 0) { /* close() from the changelog translator */ + gf_log (this->name, GF_LOG_INFO, "close from changelog" + " notification translator."); + + if (gfc->gfc_connretries != 1) { + if (!gf_changelog_notification_init(this, gfc)) + continue; + } + + byebye = 1; + break; + } + + len += offlen; + sbuf = gf_changelog_ext_change (this, gfc, from_path, len); + if (!sbuf) { + gf_log (this->name, GF_LOG_ERROR, + "could not extract changelog filename"); + continue; + } + + offlen = 0; + if (sbuf != (from_path + len)) { + offlen = from_path + len - sbuf; + memmove (from_path, sbuf, offlen); + } + } + + gf_log (this->name, GF_LOG_DEBUG, + "byebye (%d) from processing thread...", byebye); + return NULL; +} diff --git a/xlators/features/changelog/lib/src/gf-changelog.c b/xlators/features/changelog/lib/src/gf-changelog.c new file mode 100644 index 000000000..ca8e373e7 --- /dev/null +++ b/xlators/features/changelog/lib/src/gf-changelog.c @@ -0,0 +1,515 @@ +/* + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#include <errno.h> +#include <dirent.h> +#include <stddef.h> +#include <sys/types.h> +#include <sys/socket.h> +#include <sys/un.h> + +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif +#include <string.h> + +#include "globals.h" +#include "glusterfs.h" +#include "logging.h" + +#include "gf-changelog-helpers.h" + +/* from the changelog translator */ +#include "changelog-misc.h" +#include "changelog-mem-types.h" + +int byebye = 0; + +static void +gf_changelog_cleanup (gf_changelog_t *gfc) +{ + /* socket */ + if (gfc->gfc_sockfd != -1) + close (gfc->gfc_sockfd); + /* tracker fd */ + if (gfc->gfc_fd != -1) + close (gfc->gfc_fd); + /* processing dir */ + if (gfc->gfc_dir) + closedir (gfc->gfc_dir); + + if (gfc->gfc_working_dir) + free (gfc->gfc_working_dir); /* allocated by realpath */ +} + +void +__attribute__ ((constructor)) gf_changelog_ctor (void) +{ + glusterfs_ctx_t *ctx = NULL; + + ctx = glusterfs_ctx_new (); + if (!ctx) + return; + + if (glusterfs_globals_init (ctx)) { + free (ctx); + ctx = NULL; + return; + } + + THIS->ctx = ctx; +} + +void +__attribute__ ((destructor)) gf_changelog_dtor (void) +{ + xlator_t *this = NULL; + glusterfs_ctx_t *ctx = NULL; + gf_changelog_t *gfc = NULL; + + this = THIS; + if (!this) + return; + + ctx = this->ctx; + gfc = this->private; + + if (gfc) { + gf_changelog_cleanup (gfc); + GF_FREE (gfc); + } + + if (ctx) { + pthread_mutex_destroy (&ctx->lock); + free (ctx); + ctx = NULL; + } +} + + +static int +gf_changelog_open_dirs (gf_changelog_t *gfc) +{ + int ret = -1; + DIR *dir = NULL; + int tracker_fd = 0; + char tracker_path[PATH_MAX] = {0,}; + + (void) snprintf (gfc->gfc_current_dir, PATH_MAX, + "%s/"GF_CHANGELOG_CURRENT_DIR"/", + gfc->gfc_working_dir); + ret = mkdir_p (gfc->gfc_current_dir, 0600, _gf_false); + if (ret) + goto out; + + (void) snprintf (gfc->gfc_processed_dir, PATH_MAX, + "%s/"GF_CHANGELOG_PROCESSED_DIR"/", + gfc->gfc_working_dir); + ret = mkdir_p (gfc->gfc_processed_dir, 0600, _gf_false); + if (ret) + goto out; + + (void) snprintf (gfc->gfc_processing_dir, PATH_MAX, + "%s/"GF_CHANGELOG_PROCESSING_DIR"/", + gfc->gfc_working_dir); + ret = mkdir_p (gfc->gfc_processing_dir, 0600, _gf_false); + if (ret) + goto out; + + dir = opendir (gfc->gfc_processing_dir); + if (!dir) { + gf_log ("", GF_LOG_ERROR, + "opendir() error [reason: %s]", strerror (errno)); + goto out; + } + + gfc->gfc_dir = dir; + + (void) snprintf (tracker_path, PATH_MAX, + "%s/"GF_CHANGELOG_TRACKER, gfc->gfc_working_dir); + + tracker_fd = open (tracker_path, O_CREAT | O_APPEND | O_RDWR, + S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH); + if (tracker_fd < 0) { + closedir (gfc->gfc_dir); + ret = -1; + goto out; + } + + gfc->gfc_fd = tracker_fd; + ret = 0; + out: + return ret; +} + +int +gf_changelog_notification_init (xlator_t *this, gf_changelog_t *gfc) +{ + int ret = 0; + int len = 0; + int tries = 0; + int sockfd = 0; + struct sockaddr_un remote; + + this = gfc->this; + + if (gfc->gfc_sockfd != -1) { + gf_log (this->name, GF_LOG_INFO, + "Reconnecting..."); + close (gfc->gfc_sockfd); + } + + sockfd = socket (AF_UNIX, SOCK_STREAM, 0); + if (sockfd < 0) { + ret = -1; + goto out; + } + + CHANGELOG_MAKE_SOCKET_PATH (gfc->gfc_brickpath, + gfc->gfc_sockpath, PATH_MAX); + gf_log (this->name, GF_LOG_INFO, + "connecting to changelog socket: %s (brick: %s)", + gfc->gfc_sockpath, gfc->gfc_brickpath); + + remote.sun_family = AF_UNIX; + strcpy (remote.sun_path, gfc->gfc_sockpath); + + len = strlen (remote.sun_path) + sizeof (remote.sun_family); + + while (tries < gfc->gfc_connretries) { + gf_log (this->name, GF_LOG_WARNING, + "connection attempt %d/%d...", + tries + 1, gfc->gfc_connretries); + + /* initiate a connect */ + if (connect (sockfd, (struct sockaddr *) &remote, len) == 0) { + gfc->gfc_sockfd = sockfd; + break; + } + + tries++; + sleep (2); + } + + if (tries == gfc->gfc_connretries) { + gf_log (this->name, GF_LOG_ERROR, + "could not connect to changelog socket!" + " bailing out..."); + ret = -1; + } else + gf_log (this->name, GF_LOG_INFO, + "connection successful"); + + out: + return ret; +} + +int +gf_changelog_done (char *file) +{ + int ret = -1; + char *buffer = NULL; + xlator_t *this = NULL; + gf_changelog_t *gfc = NULL; + char to_path[PATH_MAX] = {0,}; + + errno = EINVAL; + + this = THIS; + if (!this) + goto out; + + gfc = (gf_changelog_t *) this->private; + if (!gfc) + goto out; + + if (!file || !strlen (file)) + goto out; + + /* make sure 'file' is inside ->gfc_working_dir */ + buffer = realpath (file, NULL); + if (!buffer) + goto out; + + if (strncmp (gfc->gfc_working_dir, + buffer, strlen (gfc->gfc_working_dir))) + goto out; + + (void) snprintf (to_path, PATH_MAX, "%s%s", + gfc->gfc_processed_dir, basename (buffer)); + gf_log (this->name, GF_LOG_DEBUG, + "moving %s to processed directory", file); + ret = rename (buffer, to_path); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "cannot move %s to %s (reason: %s)", + file, to_path, strerror (errno)); + goto out; + } + + ret = 0; + + out: + if (buffer) + free (buffer); /* allocated by realpath() */ + return ret; +} + +/** + * @API + * for a set of changelogs, start from the begining + */ +int +gf_changelog_start_fresh () +{ + xlator_t *this = NULL; + gf_changelog_t *gfc = NULL; + + this = THIS; + if (!this) + goto out; + + errno = EINVAL; + + gfc = (gf_changelog_t *) this->private; + if (!gfc) + goto out; + + if (gf_ftruncate (gfc->gfc_fd, 0)) + goto out; + + return 0; + + out: + return -1; +} + +/** + * @API + * return the next changelog file entry. zero means all chanelogs + * consumed. + */ +ssize_t +gf_changelog_next_change (char *bufptr, size_t maxlen) +{ + ssize_t size = 0; + int tracker_fd = 0; + xlator_t *this = NULL; + gf_changelog_t *gfc = NULL; + char buffer[PATH_MAX] = {0,}; + + errno = EINVAL; + + this = THIS; + if (!this) + goto out; + + gfc = (gf_changelog_t *) this->private; + if (!gfc) + goto out; + + tracker_fd = gfc->gfc_fd; + + size = gf_readline (tracker_fd, buffer, maxlen); + if (size < 0) + goto out; + if (size == 0) + return 0; + + memcpy (bufptr, buffer, size - 1); + *(buffer + size) = '\0'; + + return size; + + out: + return -1; +} + +/** + * @API + * gf_changelog_scan() - scan and generate a list of change entries + * + * calling this api multiple times (without calling gf_changlog_done()) + * would result new changelogs(s) being refreshed in the tracker file. + * This call also acts as a cancellation point for the consumer. + */ +ssize_t +gf_changelog_scan () +{ + int ret = 0; + int tracker_fd = 0; + size_t len = 0; + size_t off = 0; + xlator_t *this = NULL; + size_t nr_entries = 0; + gf_changelog_t *gfc = NULL; + struct dirent *entryp = NULL; + struct dirent *result = NULL; + char buffer[PATH_MAX] = {0,}; + + this = THIS; + if (!this) + goto out; + + gfc = (gf_changelog_t *) this->private; + if (!gfc) + goto out; + + /** + * do we need to protect 'byebye' with locks? worst, the + * consumer would get notified during next scan(). + */ + if (byebye) { + errno = ECONNREFUSED; + goto out; + } + + errno = EINVAL; + + tracker_fd = gfc->gfc_fd; + + if (gf_ftruncate (tracker_fd, 0)) + goto out; + + len = offsetof(struct dirent, d_name) + + pathconf(gfc->gfc_processing_dir, _PC_NAME_MAX) + 1; + entryp = GF_CALLOC (1, len, + gf_changelog_mt_libgfchangelog_dirent_t); + if (!entryp) + goto out; + + rewinddir (gfc->gfc_dir); + while (1) { + ret = readdir_r (gfc->gfc_dir, entryp, &result); + if (ret || !result) + break; + + if ( !strcmp (basename (entryp->d_name), ".") + || !strcmp (basename (entryp->d_name), "..") ) + continue; + + nr_entries++; + + GF_CHANGELOG_FILL_BUFFER (gfc->gfc_processing_dir, + buffer, off, + strlen (gfc->gfc_processing_dir)); + GF_CHANGELOG_FILL_BUFFER (entryp->d_name, buffer, + off, strlen (entryp->d_name)); + GF_CHANGELOG_FILL_BUFFER ("\n", buffer, off, 1); + + if (gf_changelog_write (tracker_fd, buffer, off) != off) { + gf_log (this->name, GF_LOG_ERROR, + "error writing changelog filename" + " to tracker file"); + break; + } + off = 0; + } + + GF_FREE (entryp); + + if (!result) { + if (gf_lseek (tracker_fd, 0, SEEK_SET) != -1) + return nr_entries; + } + out: + return -1; +} + +/** + * @API + * gf_changelog_register() - register a client for updates. + */ +int +gf_changelog_register (char *brick_path, char *scratch_dir, + char *log_file, int log_level, int max_reconnects) +{ + int i = 0; + int ret = -1; + int errn = 0; + xlator_t *this = NULL; + gf_changelog_t *gfc = NULL; + + this = THIS; + if (!this->ctx) + goto out; + + errno = ENOMEM; + + gfc = GF_CALLOC (1, sizeof (*gfc), + gf_changelog_mt_libgfchangelog_t); + if (!gfc) + goto out; + + gfc->this = this; + + gfc->gfc_dir = NULL; + gfc->gfc_fd = gfc->gfc_sockfd = -1; + + gfc->gfc_working_dir = realpath (scratch_dir, NULL); + if (!gfc->gfc_working_dir) { + errn = errno; + goto cleanup; + } + + ret = gf_changelog_open_dirs (gfc); + if (ret) { + errn = errno; + gf_log (this->name, GF_LOG_ERROR, + "could not create entries in scratch dir"); + goto cleanup; + } + + /* passing ident as NULL means to use default ident for syslog */ + if (gf_log_init (this->ctx, log_file, NULL)) + goto cleanup; + + gf_log_set_loglevel ((log_level == -1) ? GF_LOG_INFO : + log_level); + + gfc->gfc_connretries = (max_reconnects <= 0) ? 1 : max_reconnects; + (void) strncpy (gfc->gfc_brickpath, brick_path, PATH_MAX); + + ret = gf_changelog_notification_init (this, gfc); + if (ret) { + errn = errno; + goto cleanup; + } + + ret = gf_thread_create (&gfc->gfc_changelog_processor, + NULL, gf_changelog_process, gfc); + if (ret) { + errn = errno; + gf_log (this->name, GF_LOG_ERROR, + "error creating changelog processor thread" + " new changes won't be recorded!!!"); + goto cleanup; + } + + for (; i < 256; i++) { + gfc->rfc3986[i] = + (isalnum(i) || i == '~' || + i == '-' || i == '.' || i == '_') ? i : 0; + } + + ret = 0; + this->private = gfc; + + goto out; + + cleanup: + gf_changelog_cleanup (gfc); + GF_FREE (gfc); + this->private = NULL; + errno = errn; + + out: + return ret; +} diff --git a/xlators/features/changelog/src/Makefile.am b/xlators/features/changelog/src/Makefile.am new file mode 100644 index 000000000..e85031ad4 --- /dev/null +++ b/xlators/features/changelog/src/Makefile.am @@ -0,0 +1,19 @@ +xlator_LTLIBRARIES = changelog.la + +xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features + +noinst_HEADERS = changelog-helpers.h changelog-mem-types.h changelog-rt.h \ + changelog-misc.h changelog-encoders.h changelog-notifier.h + +changelog_la_LDFLAGS = -module -avoidversion + +changelog_la_SOURCES = changelog.c changelog-rt.c changelog-helpers.c \ + changelog-encoders.c changelog-notifier.c +changelog_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la + +AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src -fPIC -D_FILE_OFFSET_BITS=64 \ + -D_GNU_SOURCE -D$(GF_HOST_OS) -shared -nostartfiles -DDATADIR=\"$(localstatedir)\" + +AM_CFLAGS = -Wall $(GF_CFLAGS) + +CLEANFILES = diff --git a/xlators/features/changelog/src/changelog-encoders.c b/xlators/features/changelog/src/changelog-encoders.c new file mode 100644 index 000000000..553eec85c --- /dev/null +++ b/xlators/features/changelog/src/changelog-encoders.c @@ -0,0 +1,176 @@ +/* + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "changelog-encoders.h" + +size_t +entry_fn (void *data, char *buffer, gf_boolean_t encode) +{ + char *tmpbuf = NULL; + size_t bufsz = 0; + struct changelog_entry_fields *ce = NULL; + + ce = (struct changelog_entry_fields *) data; + + if (encode) { + tmpbuf = uuid_utoa (ce->cef_uuid); + CHANGELOG_FILL_BUFFER (buffer, bufsz, tmpbuf, strlen (tmpbuf)); + } else { + CHANGELOG_FILL_BUFFER (buffer, bufsz, + ce->cef_uuid, sizeof (uuid_t)); + } + + CHANGELOG_FILL_BUFFER (buffer, bufsz, "/", 1); + CHANGELOG_FILL_BUFFER (buffer, bufsz, + ce->cef_bname, strlen (ce->cef_bname)); + return bufsz; +} + +size_t +fop_fn (void *data, char *buffer, gf_boolean_t encode) +{ + char buf[10] = {0,}; + size_t bufsz = 0; + glusterfs_fop_t fop = 0; + + fop = *(glusterfs_fop_t *) data; + + if (encode) { + (void) snprintf (buf, sizeof (buf), "%d", fop); + CHANGELOG_FILL_BUFFER (buffer, bufsz, buf, strlen (buf)); + } else + CHANGELOG_FILL_BUFFER (buffer, bufsz, &fop, sizeof (fop)); + + return bufsz; +} + +void +entry_free_fn (void *data) +{ + changelog_opt_t *co = data; + + if (!co) + return; + + GF_FREE (co->co_entry.cef_bname); +} + +/** + * try to write all data in one shot + */ + +static inline void +changelog_encode_write_xtra (changelog_log_data_t *cld, + char *buffer, size_t *off, gf_boolean_t encode) +{ + int i = 0; + size_t offset = 0; + void *data = NULL; + changelog_opt_t *co = NULL; + + offset = *off; + + co = (changelog_opt_t *) cld->cld_ptr; + + for (; i < cld->cld_xtra_records; i++, co++) { + CHANGELOG_FILL_BUFFER (buffer, offset, "\0", 1); + + switch (co->co_type) { + case CHANGELOG_OPT_REC_FOP: + data = &co->co_fop; + break; + case CHANGELOG_OPT_REC_ENTRY: + data = &co->co_entry; + break; + } + + if (co->co_convert) + offset += co->co_convert (data, + buffer + offset, encode); + else /* no coversion: write it out as it is */ + CHANGELOG_FILL_BUFFER (buffer, offset, + data, co->co_len); + } + + *off = offset; +} + +int +changelog_encode_ascii (xlator_t *this, changelog_log_data_t *cld) +{ + size_t off = 0; + size_t gfid_len = 0; + char *gfid_str = NULL; + char *buffer = NULL; + changelog_priv_t *priv = NULL; + + priv = this->private; + + gfid_str = uuid_utoa (cld->cld_gfid); + gfid_len = strlen (gfid_str); + + /* extra bytes for decorations */ + buffer = alloca (gfid_len + cld->cld_ptr_len + 10); + CHANGELOG_STORE_ASCII (priv, buffer, + off, gfid_str, gfid_len, cld); + + if (cld->cld_xtra_records) + changelog_encode_write_xtra (cld, buffer, &off, _gf_true); + + CHANGELOG_FILL_BUFFER (buffer, off, "\0", 1); + + return changelog_write_change (priv, buffer, off); +} + +int +changelog_encode_binary (xlator_t *this, changelog_log_data_t *cld) +{ + size_t off = 0; + char *buffer = NULL; + changelog_priv_t *priv = NULL; + + priv = this->private; + + /* extra bytes for decorations */ + buffer = alloca (sizeof (uuid_t) + cld->cld_ptr_len + 10); + CHANGELOG_STORE_BINARY (priv, buffer, off, cld->cld_gfid, cld); + + if (cld->cld_xtra_records) + changelog_encode_write_xtra (cld, buffer, &off, _gf_false); + + CHANGELOG_FILL_BUFFER (buffer, off, "\0", 1); + + return changelog_write_change (priv, buffer, off); +} + +static struct changelog_encoder +cb_encoder[] = { + [CHANGELOG_ENCODE_BINARY] = + { + .encoder = CHANGELOG_ENCODE_BINARY, + .encode = changelog_encode_binary, + }, + [CHANGELOG_ENCODE_ASCII] = + { + .encoder = CHANGELOG_ENCODE_ASCII, + .encode = changelog_encode_ascii, + }, +}; + +void +changelog_encode_change( changelog_priv_t * priv) +{ + priv->ce = &cb_encoder[priv->encode_mode]; +} diff --git a/xlators/features/changelog/src/changelog-encoders.h b/xlators/features/changelog/src/changelog-encoders.h new file mode 100644 index 000000000..a3efbee05 --- /dev/null +++ b/xlators/features/changelog/src/changelog-encoders.h @@ -0,0 +1,46 @@ +/* + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef _CHANGELOG_ENCODERS_H +#define _CHANGELOG_ENCODERS_H + +#include "xlator.h" +#include "defaults.h" + +#include "changelog-helpers.h" + +#define CHANGELOG_STORE_ASCII(priv, buf, off, gfid, gfid_len, cld) do { \ + CHANGELOG_FILL_BUFFER (buffer, off, \ + priv->maps[cld->cld_type], 1); \ + CHANGELOG_FILL_BUFFER (buffer, \ + off, gfid, gfid_len); \ + } while (0) + +#define CHANGELOG_STORE_BINARY(priv, buf, off, gfid, cld) do { \ + CHANGELOG_FILL_BUFFER (buffer, off, \ + priv->maps[cld->cld_type], 1); \ + CHANGELOG_FILL_BUFFER (buffer, \ + off, gfid, sizeof (uuid_t)); \ + } while (0) + +size_t +entry_fn (void *data, char *buffer, gf_boolean_t encode); +size_t +fop_fn (void *data, char *buffer, gf_boolean_t encode); +void +entry_free_fn (void *data); +int +changelog_encode_binary (xlator_t *, changelog_log_data_t *); +int +changelog_encode_ascii (xlator_t *, changelog_log_data_t *); +void +changelog_encode_change(changelog_priv_t *); + +#endif /* _CHANGELOG_ENCODERS_H */ diff --git a/xlators/features/changelog/src/changelog-helpers.c b/xlators/features/changelog/src/changelog-helpers.c new file mode 100644 index 000000000..7ab0091b5 --- /dev/null +++ b/xlators/features/changelog/src/changelog-helpers.c @@ -0,0 +1,693 @@ +/* + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "xlator.h" +#include "defaults.h" +#include "logging.h" +#include "iobuf.h" + +#include "changelog-helpers.h" +#include "changelog-mem-types.h" + +#include "changelog-encoders.h" +#include <pthread.h> + +void +changelog_thread_cleanup (xlator_t *this, pthread_t thr_id) +{ + int ret = 0; + void *retval = NULL; + + /* send a cancel request to the thread */ + ret = pthread_cancel (thr_id); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "could not cancel thread (reason: %s)", + strerror (errno)); + goto out; + } + + ret = pthread_join (thr_id, &retval); + if (ret || (retval != PTHREAD_CANCELED)) { + gf_log (this->name, GF_LOG_ERROR, + "cancel request not adhered as expected" + " (reason: %s)", strerror (errno)); + } + + out: + return; +} + +inline void * +changelog_get_usable_buffer (changelog_local_t *local) +{ + changelog_log_data_t *cld = NULL; + + cld = &local->cld; + if (!cld->cld_iobuf) + return NULL; + + return cld->cld_iobuf->ptr; +} + +inline void +changelog_set_usable_record_and_length (changelog_local_t *local, + size_t len, int xr) +{ + changelog_log_data_t *cld = NULL; + + cld = &local->cld; + + cld->cld_ptr_len = len; + cld->cld_xtra_records = xr; +} + +void +changelog_local_cleanup (xlator_t *xl, changelog_local_t *local) +{ + int i = 0; + changelog_opt_t *co = NULL; + changelog_log_data_t *cld = NULL; + + if (!local) + return; + + cld = &local->cld; + + /* cleanup dynamic allocation for extra records */ + if (cld->cld_xtra_records) { + co = (changelog_opt_t *) cld->cld_ptr; + for (; i < cld->cld_xtra_records; i++, co++) + if (co->co_free) + co->co_free (co); + } + + CHANGELOG_IOBUF_UNREF (cld->cld_iobuf); + + if (local->inode) + inode_unref (local->inode); + + mem_put (local); +} + +inline int +changelog_write (int fd, char *buffer, size_t len) +{ + ssize_t size = 0; + size_t writen = 0; + + while (writen < len) { + size = write (fd, + buffer + writen, len - writen); + if (size <= 0) + break; + + writen += size; + } + + return (writen != len); +} + +static int +changelog_rollover_changelog (xlator_t *this, + changelog_priv_t *priv, unsigned long ts) +{ + int ret = -1; + int notify = 0; + char *bname = NULL; + char ofile[PATH_MAX] = {0,}; + char nfile[PATH_MAX] = {0,}; + + if (priv->changelog_fd != -1) { + close (priv->changelog_fd); + priv->changelog_fd = -1; + } + + (void) snprintf (ofile, PATH_MAX, + "%s/"CHANGELOG_FILE_NAME, priv->changelog_dir); + (void) snprintf (nfile, PATH_MAX, + "%s/"CHANGELOG_FILE_NAME".%lu", + priv->changelog_dir, ts); + + ret = rename (ofile, nfile); + if (!ret) + notify = 1; + + if (ret && (errno == ENOENT)) { + ret = 0; + } + + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "error renaming %s -> %s (reason %s)", + ofile, nfile, strerror (errno)); + } + + if (notify) { + bname = basename (nfile); + gf_log (this->name, GF_LOG_DEBUG, "notifying: %s", bname); + ret = changelog_write (priv->wfd, bname, strlen (bname) + 1); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to send file name to notify thread" + " (reason: %s)", strerror (errno)); + } + } + + return ret; +} + +int +changelog_open (xlator_t *this, + changelog_priv_t *priv) +{ + int fd = 0; + int ret = -1; + int flags = 0; + char buffer[1024] = {0,}; + char changelog_path[PATH_MAX] = {0,}; + + (void) snprintf (changelog_path, PATH_MAX, + "%s/"CHANGELOG_FILE_NAME, + priv->changelog_dir); + + flags |= (O_CREAT | O_RDWR); + if (priv->fsync_interval == 0) + flags |= O_SYNC; + + fd = open (changelog_path, flags, + S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH); + if (fd < 0) { + gf_log (this->name, GF_LOG_ERROR, + "unable to open/create changelog file %s" + " (reason: %s). change-logging will be" + " inactive", changelog_path, strerror (errno)); + goto out; + } + + priv->changelog_fd = fd; + + (void) snprintf (buffer, 1024, CHANGELOG_HEADER, + CHANGELOG_VERSION_MAJOR, + CHANGELOG_VERSION_MINOR, + priv->ce->encoder); + ret = changelog_write_change (priv, buffer, strlen (buffer)); + if (ret) { + close (priv->changelog_fd); + priv->changelog_fd = -1; + goto out; + } + + ret = 0; + + out: + return ret; +} + +int +changelog_start_next_change (xlator_t *this, + changelog_priv_t *priv, + unsigned long ts, gf_boolean_t finale) +{ + int ret = -1; + + ret = changelog_rollover_changelog (this, priv, ts); + + if (!ret && !finale) + ret = changelog_open (this, priv); + + return ret; +} + +/** + * return the length of entry + */ +inline size_t +changelog_entry_length () +{ + return sizeof (changelog_log_data_t); +} + +int +changelog_fill_rollover_data (changelog_log_data_t *cld, gf_boolean_t is_last) +{ + struct timeval tv = {0,}; + + cld->cld_type = CHANGELOG_TYPE_ROLLOVER; + + if (gettimeofday (&tv, NULL)) + return -1; + + cld->cld_roll_time = (unsigned long) tv.tv_sec; + cld->cld_finale = is_last; + return 0; +} + +int +changelog_write_change (changelog_priv_t *priv, char *buffer, size_t len) +{ + return changelog_write (priv->changelog_fd, buffer, len); +} + +inline int +changelog_handle_change (xlator_t *this, + changelog_priv_t *priv, changelog_log_data_t *cld) +{ + int ret = 0; + + if (CHANGELOG_TYPE_IS_ROLLOVER (cld->cld_type)) { + changelog_encode_change(priv); + ret = changelog_start_next_change (this, priv, + cld->cld_roll_time, + cld->cld_finale); + if (ret) + gf_log (this->name, GF_LOG_ERROR, + "Problem rolling over changelog(s)"); + goto out; + } + + /** + * case when there is reconfigure done (disabling changelog) and there + * are still fops that have updates in prgress. + */ + if (priv->changelog_fd == -1) + return 0; + + if (CHANGELOG_TYPE_IS_FSYNC (cld->cld_type)) { + ret = fsync (priv->changelog_fd); + if (ret < 0) { + gf_log (this->name, GF_LOG_ERROR, + "fsync failed (reason: %s)", + strerror (errno)); + } + goto out; + } + + ret = priv->ce->encode (this, cld); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "error writing changelog to disk"); + } + + out: + return ret; +} + +changelog_local_t * +changelog_local_init (xlator_t *this, inode_t *inode, + uuid_t gfid, int xtra_records, + gf_boolean_t update_flag) +{ + changelog_local_t *local = NULL; + struct iobuf *iobuf = NULL; + + /** + * We relax the presence of inode if @update_flag is true. + * The caller (implmentation of the fop) needs to be careful to + * not blindly use local->inode. + */ + if (!update_flag && !inode) { + gf_log_callingfn (this->name, GF_LOG_WARNING, + "inode needed for version checking !!!"); + goto out; + } + + if (xtra_records) { + iobuf = iobuf_get2 (this->ctx->iobuf_pool, + xtra_records * CHANGELOG_OPT_RECORD_LEN); + if (!iobuf) + goto out; + } + + local = mem_get0 (this->local_pool); + if (!local) { + CHANGELOG_IOBUF_UNREF (iobuf); + goto out; + } + + local->update_no_check = update_flag; + + uuid_copy (local->cld.cld_gfid, gfid); + + local->cld.cld_iobuf = iobuf; + local->cld.cld_xtra_records = 0; /* set by the caller */ + + if (inode) + local->inode = inode_ref (inode); + + out: + return local; +} + +int +changelog_forget (xlator_t *this, inode_t *inode) +{ + uint64_t ctx_addr = 0; + changelog_inode_ctx_t *ctx = NULL; + + inode_ctx_del (inode, this, &ctx_addr); + if (!ctx_addr) + return 0; + + ctx = (changelog_inode_ctx_t *) (long) ctx_addr; + GF_FREE (ctx); + + return 0; +} + +int +changelog_inject_single_event (xlator_t *this, + changelog_priv_t *priv, + changelog_log_data_t *cld) +{ + return priv->cd.dispatchfn (this, priv, priv->cd.cd_data, cld, NULL); +} + +/** + * TODO: these threads have many thing in common (wake up after + * a certain time etc..). move them into separate routine. + */ +void * +changelog_rollover (void *data) +{ + int ret = 0; + xlator_t *this = NULL; + struct timeval tv = {0,}; + changelog_log_data_t cld = {0,}; + changelog_time_slice_t *slice = NULL; + changelog_priv_t *priv = data; + + this = priv->cr.this; + slice = &priv->slice; + + while (1) { + tv.tv_sec = priv->rollover_time; + tv.tv_usec = 0; + + ret = select (0, NULL, NULL, NULL, &tv); + if (ret) + continue; + + ret = changelog_fill_rollover_data (&cld, _gf_false); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "failed to fill rollover data"); + continue; + } + + LOCK (&priv->lock); + { + ret = changelog_inject_single_event (this, priv, &cld); + if (!ret) + SLICE_VERSION_UPDATE (slice); + } + UNLOCK (&priv->lock); + } + + return NULL; +} + +void * +changelog_fsync_thread (void *data) +{ + int ret = 0; + xlator_t *this = NULL; + struct timeval tv = {0,}; + changelog_log_data_t cld = {0,}; + changelog_priv_t *priv = data; + + this = priv->cf.this; + cld.cld_type = CHANGELOG_TYPE_FSYNC; + + while (1) { + tv.tv_sec = priv->fsync_interval; + tv.tv_usec = 0; + + ret = select (0, NULL, NULL, NULL, &tv); + if (ret) + continue; + + ret = changelog_inject_single_event (this, priv, &cld); + if (ret) + gf_log (this->name, GF_LOG_ERROR, + "failed to inject fsync event"); + } + + return NULL; +} + +/* macros for inode/changelog version checks */ + +#define INODE_VERSION_UPDATE(priv, inode, iver, slice, type) do { \ + LOCK (&inode->lock); \ + { \ + LOCK (&priv->lock); \ + { \ + *iver = slice->changelog_version[type]; \ + } \ + UNLOCK (&priv->lock); \ + } \ + UNLOCK (&inode->lock); \ + } while (0) + +#define INODE_VERSION_EQUALS_SLICE(priv, ver, slice, type, upd) do { \ + LOCK (&priv->lock); \ + { \ + upd = (ver == slice->changelog_version[type]) \ + ? _gf_false : _gf_true; \ + } \ + UNLOCK (&priv->lock); \ + } while (0) + +static int +__changelog_inode_ctx_set (xlator_t *this, + inode_t *inode, changelog_inode_ctx_t *ctx) +{ + uint64_t ctx_addr = (uint64_t) ctx; + return __inode_ctx_set (inode, this, &ctx_addr); +} + +/** + * one shot routine to get the address and the value of a inode version + * for a particular type. + */ +static changelog_inode_ctx_t * +__changelog_inode_ctx_get (xlator_t *this, + inode_t *inode, unsigned long **iver, + unsigned long *version, changelog_log_type type) +{ + int ret = 0; + uint64_t ctx_addr = 0; + changelog_inode_ctx_t *ctx = NULL; + + ret = __inode_ctx_get (inode, this, &ctx_addr); + if (ret < 0) + ctx_addr = 0; + if (ctx_addr != 0) { + ctx = (changelog_inode_ctx_t *) (long)ctx_addr; + goto out; + } + + ctx = GF_CALLOC (1, sizeof (*ctx), gf_changelog_mt_inode_ctx_t); + if (!ctx) + goto out; + + ret = __changelog_inode_ctx_set (this, inode, ctx); + if (ret) { + GF_FREE (ctx); + ctx = NULL; + } + + out: + if (ctx && iver && version) { + *iver = CHANGELOG_INODE_VERSION_TYPE (ctx, type); + *version = **iver; + } + + return ctx; +} + +static changelog_inode_ctx_t * +changelog_inode_ctx_get (xlator_t *this, + inode_t *inode, unsigned long **iver, + unsigned long *version, changelog_log_type type) +{ + changelog_inode_ctx_t *ctx = NULL; + + LOCK (&inode->lock); + { + ctx = __changelog_inode_ctx_get (this, + inode, iver, version, type); + } + UNLOCK (&inode->lock); + + return ctx; +} + +/** + * This is the main update routine. Locking has been made granular so as to + * maximize parallelism of fops - I'll try to explain it below using execution + * timelines. + * + * Basically, the contention is between multiple execution threads of this + * routine and the roll-over thread. So, instead of having a big lock, we hold + * granular locks: inode->lock and priv->lock. Now I'll explain what happens + * when there is an update and a roll-over at just about the same time. + * NOTE: + * - the dispatcher itself synchronizes updates via it's own lock + * - the slice version in incremented by the roll-over thread + * + * Case 1: When the rollover thread wins before the inode version can be + * compared with the slice version. + * + * [updater] | [rollover] + * | + * | <SLICE: 1, 1, 1> + * <changelog_update> | + * <changelog_inode_ctx_get> | + * <CTX: 1, 1, 1> | + * | <dispatch-rollover-event> + * | LOCK (&priv->lock) + * | <SLICE_VERSION_UPDATE> + * | <SLICE: 2, 2, 2> + * | UNLOCK (&priv->lock) + * | + * LOCK (&priv->lock) | + * <INODE_VERSION_EQUALS_SLICE> | + * I: 1 <-> S: 2 | + * update: true | + * UNLOCK (&priv->lock) | + * | + * <if update == true> | + * <dispath-update-event> | + * <INODE_VERSION_UPDATE> | + * LOCK (&inode->lock) | + * LOCK (&priv->lock) | + * <CTX: 2, 1, 1> | + * UNLOCK (&priv->lock) | + * UNLOCK (&inode->lock) | + * + * Therefore, the change gets recorded in the next change (no lost change). If + * the slice version was ahead of the inode version (say I:1, S: 2), then + * anyway the comparison would result in a update (I: 1, S: 3). + * + * If the rollover time is too less, then there is another contention when the + * updater tries to bring up inode version to the slice version (this is also + * the case when the roll-over thread wakes up during INODE_VERSION_UPDATE. + * + * <CTX: 1, 1, 1> | <SLICE: 2, 2, 2> + * | + * | + * <dispath-update-event> | + * <INODE_VERSION_UPDATE> | + * LOCK (&inode->lock) | + * LOCK (&priv->lock) | + * <CTX: 2, 1, 1> | + * UNLOCK (&priv->lock) | + * UNLOCK (&inode->lock) | + * | <dispatch-rollover-event> + * | LOCK (&priv->lock) + * | <SLICE_VERSION_UPDATE> + * | <SLICE: 3, 3, 3> + * | UNLOCK (&priv->lock) + * + * + * Case 2: When the fop thread wins + * + * [updater] | [rollover] + * | + * | <SLICE: 1, 1, 1> + * <changelog_update> | + * <changelog_inode_ctx_get> | + * <CTX: 0, 0, 0> | + * | + * LOCK (&priv->lock) | + * <INODE_VERSION_EQUALS_SLICE> | + * I: 0 <-> S: 1 | + * update: true | + * UNLOCK (&priv->lock) | + * | <dispatch-rollover-event> + * | LOCK (&priv->lock) + * | <SLICE_VERSION_UPDATE> + * | <SLICE: 2, 2, 2> + * | UNLOCK (&priv->lock) + * <if update == true> | + * <dispath-update-event> | + * <INODE_VERSION_UPDATE> | + * LOCK (&inode->lock) | + * LOCK (&priv->lock) | + * <CTX: 2, 0, 0> | + * UNLOCK (&priv->lock) | + * UNLOCK (&inode->lock) | + * + * Here again, if the inode version was equal to the slice version (I: 1, S: 1) + * then there is no need to record an update (as the equality of the two version + * signifies an update was recorded in the current time slice). + */ +inline void +changelog_update (xlator_t *this, changelog_priv_t *priv, + changelog_local_t *local, changelog_log_type type) +{ + int ret = 0; + unsigned long *iver = NULL; + unsigned long version = 0; + inode_t *inode = NULL; + changelog_time_slice_t *slice = NULL; + changelog_inode_ctx_t *ctx = NULL; + changelog_log_data_t *cld_0 = NULL; + changelog_log_data_t *cld_1 = NULL; + changelog_local_t *next_local = NULL; + gf_boolean_t need_upd = _gf_true; + + slice = &priv->slice; + + /** + * for fops that do not require inode version checking + */ + if (local->update_no_check) + goto update; + + inode = local->inode; + + ctx = changelog_inode_ctx_get (this, + inode, &iver, &version, type); + if (!ctx) + goto update; + + INODE_VERSION_EQUALS_SLICE (priv, version, slice, type, need_upd); + + update: + if (need_upd) { + cld_0 = &local->cld; + cld_0->cld_type = type; + + if ( (next_local = local->prev_entry) != NULL ) { + cld_1 = &next_local->cld; + cld_1->cld_type = type; + } + + ret = priv->cd.dispatchfn (this, priv, + priv->cd.cd_data, cld_0, cld_1); + + /** + * update after the dispatcher has successfully done + * it's job. + */ + if (!local->update_no_check && iver && !ret) + INODE_VERSION_UPDATE (priv, inode, iver, slice, type); + } + + return; +} diff --git a/xlators/features/changelog/src/changelog-helpers.h b/xlators/features/changelog/src/changelog-helpers.h new file mode 100644 index 000000000..ad79636b0 --- /dev/null +++ b/xlators/features/changelog/src/changelog-helpers.h @@ -0,0 +1,395 @@ +/* + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef _CHANGELOG_HELPERS_H +#define _CHANGELOG_HELPERS_H + +#include "locking.h" +#include "timer.h" +#include "pthread.h" +#include "iobuf.h" + +#include "changelog-misc.h" + +/** + * the changelog entry + */ +typedef struct changelog_log_data { + /* rollover related */ + unsigned long cld_roll_time; + + /* reopen changelog? */ + gf_boolean_t cld_finale; + + changelog_log_type cld_type; + + /** + * sincd gfid is _always_ a necessity, it's not a part + * of the iobuf. by doing this we do not add any overhead + * for data and metadata related fops. + */ + uuid_t cld_gfid; + + /** + * iobufs are used for optionals records: pargfid, path, + * write offsets etc.. It's the fop implementers job + * to allocate (iobuf_get() in the fop) and get unref'ed + * in the callback (CHANGELOG_STACK_UNWIND). + */ + struct iobuf *cld_iobuf; + +#define cld_ptr cld_iobuf->ptr + + /** + * after allocation you can point this to the length of + * usable data, but make sure it does not exceed the + * the size of the requested iobuf. + */ + size_t cld_iobuf_len; + +#define cld_ptr_len cld_iobuf_len + + /** + * number of optional records + */ + int cld_xtra_records; +} changelog_log_data_t; + +/** + * holder for dispatch function and private data + */ + +typedef struct changelog_priv changelog_priv_t; + +typedef struct changelog_dispatcher { + void *cd_data; + int (*dispatchfn) (xlator_t *, changelog_priv_t *, void *, + changelog_log_data_t *, changelog_log_data_t *); +} changelog_dispatcher_t; + +struct changelog_bootstrap { + changelog_mode_t mode; + int (*ctor) (xlator_t *, changelog_dispatcher_t *); + int (*dtor) (xlator_t *, changelog_dispatcher_t *); +}; + +struct changelog_encoder { + changelog_encoder_t encoder; + int (*encode) (xlator_t *, changelog_log_data_t *); +}; + + +/* xlator private */ + +typedef struct changelog_time_slice { + /** + * just in case we need nanosecond granularity some day. + * field is unused as of now (maybe we'd need it later). + */ + struct timeval tv_start; + + /** + * version of changelog file, incremented each time changes + * rollover. + */ + unsigned long changelog_version[CHANGELOG_MAX_TYPE]; +} changelog_time_slice_t; + +typedef struct changelog_rollover { + /* rollover thread */ + pthread_t rollover_th; + + xlator_t *this; +} changelog_rollover_t; + +typedef struct changelog_fsync { + /* fsync() thread */ + pthread_t fsync_th; + + xlator_t *this; +} changelog_fsync_t; + +# define CHANGELOG_MAX_CLIENTS 5 +typedef struct changelog_notify { + /* reader end of the pipe */ + int rfd; + + /* notifier thread */ + pthread_t notify_th; + + /* unique socket path */ + char sockpath[PATH_MAX]; + + int socket_fd; + + /** + * simple array of accept()'ed fds. Not scalable at all + * for large number of clients, but it's okay as we have + * a ahrd limit in this version (@CHANGELOG_MAX_CLIENTS). + */ + int client_fd[CHANGELOG_MAX_CLIENTS]; + + xlator_t *this; +} changelog_notify_t; + +struct changelog_priv { + gf_boolean_t active; + + /* to generate unique socket file per brick */ + char *changelog_brick; + + /* logging directory */ + char *changelog_dir; + + /* one file for all changelog types */ + int changelog_fd; + + gf_lock_t lock; + + /* writen end of the pipe */ + int wfd; + + /* rollover time */ + int32_t rollover_time; + + /* fsync() interval */ + int32_t fsync_interval; + + /* changelog type maps */ + const char *maps[CHANGELOG_MAX_TYPE]; + + /* time slicer */ + changelog_time_slice_t slice; + + /* context of the updater */ + changelog_dispatcher_t cd; + + /* context of the rollover thread */ + changelog_rollover_t cr; + + /* context of fsync thread */ + changelog_fsync_t cf; + + /* context of the notifier thread */ + changelog_notify_t cn; + + /* operation mode */ + changelog_mode_t op_mode; + + /* bootstrap routine for 'current' logger */ + struct changelog_bootstrap *cb; + + /* encoder mode */ + changelog_encoder_t encode_mode; + + /* encoder */ + struct changelog_encoder *ce; +}; + +struct changelog_local { + inode_t *inode; + gf_boolean_t update_no_check; + + changelog_log_data_t cld; + + /** + * ->prev_entry is used in cases when there needs to be + * additional changelog entry for the parent (eg. rename) + * It's analogous to ->next in single linked list world, + * but we call it as ->prev_entry... ha ha ha + */ + struct changelog_local *prev_entry; +}; + +typedef struct changelog_local changelog_local_t; + +/* inode version is stored in inode ctx */ +typedef struct changelog_inode_ctx { + unsigned long iversion[CHANGELOG_MAX_TYPE]; +} changelog_inode_ctx_t; + +#define CHANGELOG_INODE_VERSION_TYPE(ctx, type) &(ctx->iversion[type]) + +/** + * Optional Records: + * fops that need to save additional information request a array of + * @changelog_opt_t struct. The array is allocated via @iobufs. + */ +typedef enum { + CHANGELOG_OPT_REC_FOP, + CHANGELOG_OPT_REC_ENTRY, +} changelog_optional_rec_type_t; + +struct changelog_entry_fields { + uuid_t cef_uuid; + char *cef_bname; +}; + +typedef struct { + /** + * @co_covert can be used to do post-processing of the record before + * it's persisted to the CHANGELOG. If this is NULL, then the record + * is persisted as per it's in memory format. + */ + size_t (*co_convert) (void *data, char *buffer, gf_boolean_t encode); + + /* release routines */ + void (*co_free) (void *data); + + /* type of the field */ + changelog_optional_rec_type_t co_type; + + /** + * sizeof of the 'valid' field in the union. This field is not used if + * @co_convert is specified. + */ + size_t co_len; + + union { + glusterfs_fop_t co_fop; + struct changelog_entry_fields co_entry; + }; +} changelog_opt_t; + +#define CHANGELOG_OPT_RECORD_LEN sizeof (changelog_opt_t) + +/** + * helpers routines + */ + +void +changelog_thread_cleanup (xlator_t *this, pthread_t thr_id); +inline void * +changelog_get_usable_buffer (changelog_local_t *local); +inline void +changelog_set_usable_record_and_length (changelog_local_t *local, + size_t len, int xr); +void +changelog_local_cleanup (xlator_t *xl, changelog_local_t *local); +changelog_local_t * +changelog_local_init (xlator_t *this, inode_t *inode, uuid_t gfid, + int xtra_records, gf_boolean_t update_flag); +int +changelog_start_next_change (xlator_t *this, + changelog_priv_t *priv, + unsigned long ts, gf_boolean_t finale); +int +changelog_open (xlator_t *this, changelog_priv_t *priv); +int +changelog_fill_rollover_data (changelog_log_data_t *cld, gf_boolean_t is_last); +int +changelog_inject_single_event (xlator_t *this, + changelog_priv_t *priv, + changelog_log_data_t *cld); +inline size_t +changelog_entry_length (); +inline int +changelog_write (int fd, char *buffer, size_t len); +int +changelog_write_change (changelog_priv_t *priv, char *buffer, size_t len); +inline int +changelog_handle_change (xlator_t *this, + changelog_priv_t *priv, changelog_log_data_t *cld); +inline void +changelog_update (xlator_t *this, changelog_priv_t *priv, + changelog_local_t *local, changelog_log_type type); +void * +changelog_rollover (void *data); +void * +changelog_fsync_thread (void *data); +int +changelog_forget (xlator_t *this, inode_t *inode); + +/* macros */ + +#define CHANGELOG_STACK_UNWIND(fop, frame, params ...) do { \ + changelog_local_t *__local = NULL; \ + xlator_t *__xl = NULL; \ + if (frame) { \ + __local = frame->local; \ + __xl = frame->this; \ + frame->local = NULL; \ + } \ + STACK_UNWIND_STRICT (fop, frame, params); \ + changelog_local_cleanup (__xl, __local); \ + if (__local && __local->prev_entry) \ + changelog_local_cleanup (__xl, \ + __local->prev_entry); \ + } while (0) + +#define CHANGELOG_IOBUF_REF(iobuf) do { \ + if (iobuf) \ + iobuf_ref (iobuf); \ + } while (0) + +#define CHANGELOG_IOBUF_UNREF(iobuf) do { \ + if (iobuf) \ + iobuf_unref (iobuf); \ + } while (0) + +#define CHANGELOG_FILL_BUFFER(buffer, off, val, len) do { \ + memcpy (buffer + off, val, len); \ + off += len; \ + } while (0) + +#define SLICE_VERSION_UPDATE(slice) do { \ + int i = 0; \ + for (; i < CHANGELOG_MAX_TYPE; i++) { \ + slice->changelog_version[i]++; \ + } \ + } while (0) + +#define CHANGLOG_FILL_FOP_NUMBER(co, fop, converter, xlen) do { \ + co->co_convert = converter; \ + co->co_free = NULL; \ + co->co_type = CHANGELOG_OPT_REC_FOP; \ + co->co_fop = fop; \ + xlen += sizeof (fop); \ + } while (0) + +#define CHANGELOG_FILL_ENTRY(co, pargfid, bname, \ + converter, freefn, xlen, label) \ + do { \ + co->co_convert = converter; \ + co->co_free = freefn; \ + co->co_type = CHANGELOG_OPT_REC_ENTRY; \ + uuid_copy (co->co_entry.cef_uuid, pargfid); \ + co->co_entry.cef_bname = gf_strdup(bname); \ + if (!co->co_entry.cef_bname) \ + goto label; \ + xlen += (UUID_CANONICAL_FORM_LEN + strlen (bname)); \ + } while (0) + +#define CHANGELOG_INIT(this, local, inode, gfid, xrec) \ + local = changelog_local_init (this, inode, gfid, xrec, _gf_false) + +#define CHANGELOG_INIT_NOCHECK(this, local, inode, gfid, xrec) \ + local = changelog_local_init (this, inode, gfid, xrec, _gf_true) + +#define CHANGELOG_NOT_ACTIVE_THEN_GOTO(frame, priv, label) do { \ + if (!priv->active) \ + goto label; \ + /* ignore rebalance process's activity. */ \ + if (frame->root->pid == GF_CLIENT_PID_DEFRAG) \ + goto label; \ + } while (0) + +/* ignore internal fops */ +#define CHANGELOG_IF_INTERNAL_FOP_THEN_GOTO(dict, label) do { \ + if (dict && dict_get (dict, GLUSTERFS_INTERNAL_FOP_KEY)) \ + goto label; \ + } while (0) + +#define CHANGELOG_COND_GOTO(priv, cond, label) do { \ + if (!priv->active || cond) \ + goto label; \ + } while (0) + +#endif /* _CHANGELOG_HELPERS_H */ diff --git a/xlators/features/changelog/src/changelog-mem-types.h b/xlators/features/changelog/src/changelog-mem-types.h new file mode 100644 index 000000000..d72464eab --- /dev/null +++ b/xlators/features/changelog/src/changelog-mem-types.h @@ -0,0 +1,29 @@ +/* + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef _CHANGELOG_MEM_TYPES_H +#define _CHANGELOG_MEM_TYPES_H + +#include "mem-types.h" + +enum gf_changelog_mem_types { + gf_changelog_mt_priv_t = gf_common_mt_end + 1, + gf_changelog_mt_str_t = gf_common_mt_end + 2, + gf_changelog_mt_batch_t = gf_common_mt_end + 3, + gf_changelog_mt_rt_t = gf_common_mt_end + 4, + gf_changelog_mt_inode_ctx_t = gf_common_mt_end + 5, + gf_changelog_mt_libgfchangelog_t = gf_common_mt_end + 6, + gf_changelog_mt_libgfchangelog_rl_t = gf_common_mt_end + 7, + gf_changelog_mt_libgfchangelog_dirent_t = gf_common_mt_end + 8, + gf_changelog_mt_changelog_buffer_t = gf_common_mt_end + 9, + gf_changelog_mt_end +}; + +#endif diff --git a/xlators/features/changelog/src/changelog-misc.h b/xlators/features/changelog/src/changelog-misc.h new file mode 100644 index 000000000..0712a3771 --- /dev/null +++ b/xlators/features/changelog/src/changelog-misc.h @@ -0,0 +1,101 @@ +/* + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef _CHANGELOG_MISC_H +#define _CHANGELOG_MISC_H + +#include "glusterfs.h" +#include "common-utils.h" + +#define CHANGELOG_MAX_TYPE 3 +#define CHANGELOG_FILE_NAME "CHANGELOG" + +#define CHANGELOG_VERSION_MAJOR 1 +#define CHANGELOG_VERSION_MINOR 0 + +#define CHANGELOG_UNIX_SOCK DEFAULT_VAR_RUN_DIRECTORY"/changelog-%s.sock" + +/** + * header starts with the version and the format of the changelog. + * 'version' not much of a use now. + */ +#define CHANGELOG_HEADER \ + "GlusterFS Changelog | version: v%d.%d | encoding : %d\n" + +#define CHANGELOG_MAKE_SOCKET_PATH(brick_path, sockpath, len) do { \ + char md5_sum[MD5_DIGEST_LENGTH*2+1] = {0,}; \ + md5_wrapper((unsigned char *) brick_path, \ + strlen(brick_path), \ + md5_sum); \ + (void) snprintf (sockpath, len, \ + CHANGELOG_UNIX_SOCK, md5_sum); \ + } while (0) + +/** + * ... used by libgfchangelog. + */ +#define CHANGELOG_GET_ENCODING(fd, buffer, len, enc, enc_len) do { \ + FILE *fp; \ + int fd_dup, maj, min; \ + \ + enc = -1; \ + fd_dup = dup (fd); \ + \ + if (fd_dup != -1) { \ + fp = fdopen (fd_dup, "r"); \ + if (fp) { \ + if (fgets (buffer, len, fp)) { \ + elen = strlen (buffer); \ + sscanf (buffer, \ + CHANGELOG_HEADER, \ + &maj, &min, &enc); \ + } \ + fclose (fp); \ + } else { \ + close (fd_dup); \ + } \ + } \ + } while (0) + +/** + * everything after 'CHANGELOG_TYPE_ENTRY' are internal types + * (ie. none of the fops trigger this type of event), hence + * CHANGELOG_MAX_TYPE = 3 + */ +typedef enum { + CHANGELOG_TYPE_DATA = 0, + CHANGELOG_TYPE_METADATA, + CHANGELOG_TYPE_ENTRY, + CHANGELOG_TYPE_ROLLOVER, + CHANGELOG_TYPE_FSYNC, +} changelog_log_type; + +/* operation modes - RT for now */ +typedef enum { + CHANGELOG_MODE_RT = 0, +} changelog_mode_t; + +/* encoder types */ + +typedef enum { + CHANGELOG_ENCODE_MIN = 0, + CHANGELOG_ENCODE_BINARY, + CHANGELOG_ENCODE_ASCII, + CHANGELOG_ENCODE_MAX, +} changelog_encoder_t; + +#define CHANGELOG_VALID_ENCODING(enc) \ + (enc > CHANGELOG_ENCODE_MIN && enc < CHANGELOG_ENCODE_MAX) + +#define CHANGELOG_TYPE_IS_ENTRY(type) (type == CHANGELOG_TYPE_ENTRY) +#define CHANGELOG_TYPE_IS_ROLLOVER(type) (type == CHANGELOG_TYPE_ROLLOVER) +#define CHANGELOG_TYPE_IS_FSYNC(type) (type == CHANGELOG_TYPE_FSYNC) + +#endif /* _CHANGELOG_MISC_H */ diff --git a/xlators/features/changelog/src/changelog-notifier.c b/xlators/features/changelog/src/changelog-notifier.c new file mode 100644 index 000000000..1f8b31253 --- /dev/null +++ b/xlators/features/changelog/src/changelog-notifier.c @@ -0,0 +1,314 @@ +/* + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#include "changelog-notifier.h" + +#include <pthread.h> + +inline static void +changelog_notify_clear_fd (changelog_notify_t *cn, int i) +{ + cn->client_fd[i] = -1; +} + +inline static void +changelog_notify_save_fd (changelog_notify_t *cn, int i, int fd) +{ + cn->client_fd[i] = fd; +} + +static int +changelog_notify_insert_fd (xlator_t *this, changelog_notify_t *cn, int fd) +{ + int i = 0; + int ret = 0; + + for (; i < CHANGELOG_MAX_CLIENTS; i++) { + if (cn->client_fd[i] == -1) + break; + } + + if (i == CHANGELOG_MAX_CLIENTS) { + /** + * this case should not be hit as listen() would limit + * the number of completely established connections. + */ + gf_log (this->name, GF_LOG_WARNING, + "hit max client limit (%d)", CHANGELOG_MAX_CLIENTS); + ret = -1; + } + else + changelog_notify_save_fd (cn, i, fd); + + return ret; +} + +static void +changelog_notify_fill_rset (changelog_notify_t *cn, fd_set *rset, int *maxfd) +{ + int i = 0; + + FD_ZERO (rset); + + FD_SET (cn->socket_fd, rset); + *maxfd = cn->socket_fd; + + FD_SET (cn->rfd, rset); + *maxfd = max (*maxfd, cn->rfd); + + for (; i < CHANGELOG_MAX_CLIENTS; i++) { + if (cn->client_fd[i] != -1) { + FD_SET (cn->client_fd[i], rset); + *maxfd = max (*maxfd, cn->client_fd[i]); + } + } + + *maxfd = *maxfd + 1; +} + +static int +changelog_notify_client (changelog_notify_t *cn, char *path, ssize_t len) +{ + int i = 0; + int ret = 0; + + for (; i < CHANGELOG_MAX_CLIENTS; i++) { + if (cn->client_fd[i] == -1) + continue; + + if (changelog_write (cn->client_fd[i], + path, len)) { + ret = -1; + + close (cn->client_fd[i]); + changelog_notify_clear_fd (cn, i); + } + } + + return ret; +} + +static void +changelog_notifier_init (changelog_notify_t *cn) +{ + int i = 0; + + cn->socket_fd = -1; + + for (; i < CHANGELOG_MAX_CLIENTS; i++) { + changelog_notify_clear_fd (cn, i); + } +} + +static void +changelog_close_client_conn (changelog_notify_t *cn) +{ + int i = 0; + + for (; i < CHANGELOG_MAX_CLIENTS; i++) { + if (cn->client_fd[i] == -1) + continue; + + close (cn->client_fd[i]); + changelog_notify_clear_fd (cn, i); + } +} + +static void +changelog_notifier_cleanup (void *arg) +{ + changelog_notify_t *cn = NULL; + + cn = (changelog_notify_t *) arg; + + changelog_close_client_conn (cn); + + if (cn->socket_fd != -1) + close (cn->socket_fd); + + if (cn->rfd) + close (cn->rfd); + + if (unlink (cn->sockpath)) + gf_log ("", GF_LOG_WARNING, + "could not unlink changelog socket file" + " %s (reason: %s", cn->sockpath, strerror (errno)); +} + +void * +changelog_notifier (void *data) +{ + int i = 0; + int fd = 0; + int max_fd = 0; + int len = 0; + ssize_t readlen = 0; + xlator_t *this = NULL; + changelog_priv_t *priv = NULL; + changelog_notify_t *cn = NULL; + struct sockaddr_un local = {0,}; + char path[PATH_MAX] = {0,}; + char abspath[PATH_MAX] = {0,}; + + char buffer; + fd_set rset; + + priv = (changelog_priv_t *) data; + + cn = &priv->cn; + this = cn->this; + + pthread_cleanup_push (changelog_notifier_cleanup, cn); + + changelog_notifier_init (cn); + + cn->socket_fd = socket (AF_UNIX, SOCK_STREAM, 0); + if (cn->socket_fd < 0) { + gf_log (this->name, GF_LOG_ERROR, + "changelog socket error (reason: %s)", + strerror (errno)); + goto out; + } + + CHANGELOG_MAKE_SOCKET_PATH (priv->changelog_brick, + cn->sockpath, PATH_MAX); + if (unlink (cn->sockpath) < 0) { + if (errno != ENOENT) { + gf_log (this->name, GF_LOG_ERROR, + "Could not unlink changelog socket file (%s)" + " (reason: %s)", + CHANGELOG_UNIX_SOCK, strerror (errno)); + goto cleanup; + } + } + + local.sun_family = AF_UNIX; + strcpy (local.sun_path, cn->sockpath); + + len = strlen (local.sun_path) + sizeof (local.sun_family); + + /* bind to the unix domain socket */ + if (bind (cn->socket_fd, (struct sockaddr *) &local, len) < 0) { + gf_log (this->name, GF_LOG_ERROR, + "Could not bind to changelog socket (reason: %s)", + strerror (errno)); + goto cleanup; + } + + /* listen for incoming connections */ + if (listen (cn->socket_fd, CHANGELOG_MAX_CLIENTS) < 0) { + gf_log (this->name, GF_LOG_ERROR, + "listen() error on changelog socket (reason: %s)", + strerror (errno)); + goto cleanup; + } + + /** + * simple select() on all to-be-read file descriptors. This method + * though old school works pretty well when you have a handfull of + * fd's to be watched (clients). + * + * Future TODO: move this to epoll based notification facility if + * number of clients increase. + */ + for (;;) { + changelog_notify_fill_rset (cn, &rset, &max_fd); + + if (select (max_fd, &rset, NULL, NULL, NULL) < 0) { + gf_log (this->name, GF_LOG_ERROR, + "select() returned -1 (reason: %s)", + strerror (errno)); + sleep (2); + continue; + } + + if (FD_ISSET (cn->socket_fd, &rset)) { + fd = accept (cn->socket_fd, NULL, NULL); + if (fd < 0) { + gf_log (this->name, GF_LOG_ERROR, + "accept error on changelog socket" + " (reason: %s)", strerror (errno)); + } else if (changelog_notify_insert_fd (this, cn, fd)) { + gf_log (this->name, GF_LOG_ERROR, + "hit max client limit"); + } + } + + if (FD_ISSET (cn->rfd, &rset)) { + /** + * read changelog filename and notify all connected + * clients. + */ + readlen = 0; + while (readlen < PATH_MAX) { + len = read (cn->rfd, &path[readlen++], 1); + if (len == -1) { + break; + } + + if (len == 0) { + gf_log (this->name, GF_LOG_ERROR, + "rollover thread sent EOF" + " on pipe - possibly a crash."); + /* be blunt and close all connections */ + pthread_exit(NULL); + } + + if (path[readlen - 1] == '\0') + break; + } + + /* should we close all client connections here too? */ + if (len < 0 || readlen == PATH_MAX) { + gf_log (this->name, GF_LOG_ERROR, + "Could not get pathname from rollover" + " thread or pathname too long"); + goto process_rest; + } + + (void) snprintf (abspath, PATH_MAX, + "%s/%s", priv->changelog_dir, path); + if (changelog_notify_client (cn, abspath, + strlen (abspath) + 1)) + gf_log (this->name, GF_LOG_ERROR, + "could not notify some clients with new" + " changelogs"); + } + + process_rest: + for (i = 0; i < CHANGELOG_MAX_CLIENTS; i++) { + if ( (fd = cn->client_fd[i]) == -1 ) + continue; + + if (FD_ISSET (fd, &rset)) { + /** + * the only data we accept from the client is a + * disconnect. Anything else is treated as bogus + * and is silently discarded (also warned!!!). + */ + if ( (readlen = read (fd, &buffer, 1)) <= 0 ) { + close (fd); + changelog_notify_clear_fd (cn, i); + } else { + /* silently discard data and log */ + gf_log (this->name, GF_LOG_WARNING, + "misbehaving changelog client"); + } + } + } + + } + + cleanup:; + pthread_cleanup_pop (1); + + out: + return NULL; +} diff --git a/xlators/features/marker/utils/src/procdiggy.h b/xlators/features/changelog/src/changelog-notifier.h index 56dfc4eb2..55e728356 100644 --- a/xlators/features/marker/utils/src/procdiggy.h +++ b/xlators/features/changelog/src/changelog-notifier.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2011-2012 Red Hat, Inc. <http://www.redhat.com> + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. This file is licensed to you under your choice of the GNU Lesser @@ -7,14 +7,13 @@ later), or the GNU General Public License, version 2 (GPLv2), in all cases as published by the Free Software Foundation. */ -#ifdef __NetBSD__ -#include <sys/syslimits.h> -#endif /* __NetBSD__ */ -#define PROC "/proc" +#ifndef _CHANGELOG_NOTIFIER_H +#define _CHANGELOG_NOTIFIER_H -pid_t pidinfo (pid_t pid, char **name); +#include "changelog-helpers.h" -int prociter (int (*proch) (pid_t pid, pid_t ppid, char *name, void *data), - void *data); +void * +changelog_notifier (void *data); +#endif diff --git a/xlators/features/changelog/src/changelog-rt.c b/xlators/features/changelog/src/changelog-rt.c new file mode 100644 index 000000000..c147f68ca --- /dev/null +++ b/xlators/features/changelog/src/changelog-rt.c @@ -0,0 +1,72 @@ +/* + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "xlator.h" +#include "defaults.h" +#include "logging.h" + +#include "changelog-rt.h" +#include "changelog-mem-types.h" + +int +changelog_rt_init (xlator_t *this, changelog_dispatcher_t *cd) +{ + changelog_rt_t *crt = NULL; + + crt = GF_CALLOC (1, sizeof (*crt), + gf_changelog_mt_rt_t); + if (!crt) + return -1; + + LOCK_INIT (&crt->lock); + + cd->cd_data = crt; + cd->dispatchfn = &changelog_rt_enqueue; + + return 0; +} + +int +changelog_rt_fini (xlator_t *this, changelog_dispatcher_t *cd) +{ + changelog_rt_t *crt = NULL; + + crt = cd->cd_data; + + LOCK_DESTROY (&crt->lock); + GF_FREE (crt); + + return 0; +} + +int +changelog_rt_enqueue (xlator_t *this, changelog_priv_t *priv, void *cbatch, + changelog_log_data_t *cld_0, changelog_log_data_t *cld_1) +{ + int ret = 0; + changelog_rt_t *crt = NULL; + + crt = (changelog_rt_t *) cbatch; + + LOCK (&crt->lock); + { + ret = changelog_handle_change (this, priv, cld_0); + if (!ret && cld_1) + ret = changelog_handle_change (this, priv, cld_1); + } + UNLOCK (&crt->lock); + + return ret; +} diff --git a/xlators/features/changelog/src/changelog-rt.h b/xlators/features/changelog/src/changelog-rt.h new file mode 100644 index 000000000..1fc2bbc5b --- /dev/null +++ b/xlators/features/changelog/src/changelog-rt.h @@ -0,0 +1,33 @@ +/* + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef _CHANGELOG_RT_H +#define _CHANGELOG_RT_H + +#include "locking.h" +#include "timer.h" +#include "pthread.h" + +#include "changelog-helpers.h" + +/* unused as of now - may be you would need it later */ +typedef struct changelog_rt { + gf_lock_t lock; +} changelog_rt_t; + +int +changelog_rt_init (xlator_t *this, changelog_dispatcher_t *cd); +int +changelog_rt_fini (xlator_t *this, changelog_dispatcher_t *cd); +int +changelog_rt_enqueue (xlator_t *this, changelog_priv_t *priv, void *cbatch, + changelog_log_data_t *cld_0, changelog_log_data_t *cld_1); + +#endif /* _CHANGELOG_RT_H */ diff --git a/xlators/features/changelog/src/changelog.c b/xlators/features/changelog/src/changelog.c new file mode 100644 index 000000000..cea0e8c70 --- /dev/null +++ b/xlators/features/changelog/src/changelog.c @@ -0,0 +1,1477 @@ +/* + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "xlator.h" +#include "defaults.h" +#include "logging.h" +#include "iobuf.h" + +#include "changelog-rt.h" + +#include "changelog-encoders.h" +#include "changelog-mem-types.h" + +#include <pthread.h> + +#include "changelog-notifier.h" + +static struct changelog_bootstrap +cb_bootstrap[] = { + { + .mode = CHANGELOG_MODE_RT, + .ctor = changelog_rt_init, + .dtor = changelog_rt_fini, + }, +}; + +/* Entry operations - TYPE III */ + +/** + * entry operations do not undergo inode version checking. + */ + +/* {{{ */ + +/* rmdir */ + +int32_t +changelog_rmdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + changelog_priv_t *priv = NULL; + changelog_local_t *local = NULL; + + priv = this->private; + local = frame->local; + + CHANGELOG_COND_GOTO (priv, ((op_ret < 0) || !local), unwind); + + changelog_update (this, priv, local, CHANGELOG_TYPE_ENTRY); + + unwind: + CHANGELOG_STACK_UNWIND (rmdir, frame, op_ret, op_errno, + preparent, postparent, xdata); + return 0; +} + +int32_t +changelog_rmdir (call_frame_t *frame, xlator_t *this, + loc_t *loc, int xflags, dict_t *xdata) +{ + size_t xtra_len = 0; + changelog_priv_t *priv = NULL; + changelog_opt_t *co = NULL; + + priv = this->private; + CHANGELOG_NOT_ACTIVE_THEN_GOTO (frame, priv, wind); + + CHANGELOG_INIT_NOCHECK (this, frame->local, + NULL, loc->inode->gfid, 2); + + co = changelog_get_usable_buffer (frame->local); + if (!co) + goto wind; + + CHANGLOG_FILL_FOP_NUMBER (co, frame->root->op, fop_fn, xtra_len); + + co++; + CHANGELOG_FILL_ENTRY (co, loc->pargfid, loc->name, + entry_fn, entry_free_fn, xtra_len, wind); + + changelog_set_usable_record_and_length (frame->local, xtra_len, 2); + + wind: + STACK_WIND (frame, changelog_rmdir_cbk, + FIRST_CHILD (this), FIRST_CHILD (this)->fops->rmdir, + loc, xflags, xdata); + return 0; +} + +/* unlink */ + +int32_t +changelog_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + changelog_priv_t *priv = NULL; + changelog_local_t *local = NULL; + + priv = this->private; + local = frame->local; + + CHANGELOG_COND_GOTO (priv, ((op_ret < 0) || !local), unwind); + + changelog_update (this, priv, local, CHANGELOG_TYPE_ENTRY); + + unwind: + CHANGELOG_STACK_UNWIND (unlink, frame, op_ret, op_errno, + preparent, postparent, xdata); + return 0; +} + +int32_t +changelog_unlink (call_frame_t *frame, xlator_t *this, + loc_t *loc, int xflags, dict_t *xdata) +{ + size_t xtra_len = 0; + changelog_priv_t *priv = NULL; + changelog_opt_t *co = NULL; + + priv = this->private; + CHANGELOG_NOT_ACTIVE_THEN_GOTO (frame, priv, wind); + CHANGELOG_IF_INTERNAL_FOP_THEN_GOTO (xdata, wind); + + CHANGELOG_INIT_NOCHECK (this, frame->local, NULL, loc->inode->gfid, 2); + + co = changelog_get_usable_buffer (frame->local); + if (!co) + goto wind; + + CHANGLOG_FILL_FOP_NUMBER (co, frame->root->op, fop_fn, xtra_len); + + co++; + CHANGELOG_FILL_ENTRY (co, loc->pargfid, loc->name, + entry_fn, entry_free_fn, xtra_len, wind); + + changelog_set_usable_record_and_length (frame->local, xtra_len, 2); + + wind: + STACK_WIND (frame, changelog_unlink_cbk, + FIRST_CHILD (this), FIRST_CHILD (this)->fops->unlink, + loc, xflags, xdata); + return 0; +} + +/* rename */ + +int32_t +changelog_rename_cbk (call_frame_t *frame, + void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iatt *buf, struct iatt *preoldparent, + struct iatt *postoldparent, struct iatt *prenewparent, + struct iatt *postnewparent, dict_t *xdata) +{ + changelog_priv_t *priv = NULL; + changelog_local_t *local = NULL; + + priv = this->private; + local = frame->local; + + CHANGELOG_COND_GOTO (priv, ((op_ret < 0) || !local), unwind); + + changelog_update (this, priv, local, CHANGELOG_TYPE_ENTRY); + + unwind: + CHANGELOG_STACK_UNWIND (rename, frame, op_ret, op_errno, + buf, preoldparent, postoldparent, + prenewparent, postnewparent, xdata); + return 0; +} + + +int32_t +changelog_rename (call_frame_t *frame, xlator_t *this, + loc_t *oldloc, loc_t *newloc, dict_t *xdata) +{ + size_t xtra_len = 0; + changelog_priv_t *priv = NULL; + changelog_opt_t *co = NULL; + + priv = this->private; + CHANGELOG_NOT_ACTIVE_THEN_GOTO (frame, priv, wind); + + /* 3 == fop + oldloc + newloc */ + CHANGELOG_INIT_NOCHECK (this, frame->local, + NULL, oldloc->inode->gfid, 3); + + co = changelog_get_usable_buffer (frame->local); + if (!co) + goto wind; + + CHANGLOG_FILL_FOP_NUMBER (co, frame->root->op, fop_fn, xtra_len); + + co++; + CHANGELOG_FILL_ENTRY (co, oldloc->pargfid, oldloc->name, + entry_fn, entry_free_fn, xtra_len, wind); + + co++; + CHANGELOG_FILL_ENTRY (co, newloc->pargfid, newloc->name, + entry_fn, entry_free_fn, xtra_len, wind); + + changelog_set_usable_record_and_length (frame->local, xtra_len, 3); + + wind: + STACK_WIND (frame, changelog_rename_cbk, + FIRST_CHILD (this), FIRST_CHILD (this)->fops->rename, + oldloc, newloc, xdata); + return 0; +} + +/* link */ + +int32_t +changelog_link_cbk (call_frame_t *frame, + void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + changelog_priv_t *priv = NULL; + changelog_local_t *local = NULL; + + priv = this->private; + local = frame->local; + + CHANGELOG_COND_GOTO (priv, ((op_ret < 0) || !local), unwind); + + changelog_update (this, priv, local, CHANGELOG_TYPE_ENTRY); + + unwind: + CHANGELOG_STACK_UNWIND (link, frame, op_ret, op_errno, + inode, buf, preparent, postparent, xdata); + return 0; +} + +int32_t +changelog_link (call_frame_t *frame, + xlator_t *this, loc_t *oldloc, + loc_t *newloc, dict_t *xdata) +{ + size_t xtra_len = 0; + changelog_priv_t *priv = NULL; + changelog_opt_t *co = NULL; + + priv = this->private; + + CHANGELOG_NOT_ACTIVE_THEN_GOTO (frame, priv, wind); + CHANGELOG_IF_INTERNAL_FOP_THEN_GOTO (xdata, wind); + + CHANGELOG_INIT_NOCHECK (this, frame->local, NULL, oldloc->gfid, 2); + + co = changelog_get_usable_buffer (frame->local); + if (!co) + goto wind; + + CHANGLOG_FILL_FOP_NUMBER (co, frame->root->op, fop_fn, xtra_len); + + co++; + CHANGELOG_FILL_ENTRY (co, newloc->pargfid, newloc->name, + entry_fn, entry_free_fn, xtra_len, wind); + + changelog_set_usable_record_and_length (frame->local, xtra_len, 2); + + wind: + STACK_WIND (frame, changelog_link_cbk, + FIRST_CHILD (this), FIRST_CHILD (this)->fops->link, + oldloc, newloc, xdata); + return 0; +} + +/* mkdir */ + +int32_t +changelog_mkdir_cbk (call_frame_t *frame, + void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + changelog_priv_t *priv = NULL; + changelog_local_t *local = NULL; + + priv = this->private; + local = frame->local; + + CHANGELOG_COND_GOTO (priv, ((op_ret < 0) || !local), unwind); + + changelog_update (this, priv, local, CHANGELOG_TYPE_ENTRY); + + unwind: + CHANGELOG_STACK_UNWIND (mkdir, frame, op_ret, op_errno, + inode, buf, preparent, postparent, xdata); + return 0; +} + +int32_t +changelog_mkdir (call_frame_t *frame, xlator_t *this, + loc_t *loc, mode_t mode, mode_t umask, dict_t *xdata) +{ + int ret = -1; + uuid_t gfid = {0,}; + void *uuid_req = NULL; + size_t xtra_len = 0; + changelog_priv_t *priv = NULL; + changelog_opt_t *co = NULL; + + priv = this->private; + CHANGELOG_NOT_ACTIVE_THEN_GOTO (frame, priv, wind); + + ret = dict_get_ptr (xdata, "gfid-req", &uuid_req); + if (ret) { + gf_log (this->name, GF_LOG_DEBUG, + "failed to get gfid from dict"); + goto wind; + } + uuid_copy (gfid, uuid_req); + + CHANGELOG_INIT_NOCHECK (this, frame->local, NULL, gfid, 2); + + co = changelog_get_usable_buffer (frame->local); + if (!co) + goto wind; + + CHANGLOG_FILL_FOP_NUMBER (co, frame->root->op, fop_fn, xtra_len); + + co++; + CHANGELOG_FILL_ENTRY (co, loc->pargfid, loc->name, + entry_fn, entry_free_fn, xtra_len, wind); + + changelog_set_usable_record_and_length (frame->local, xtra_len, 2); + + wind: + STACK_WIND (frame, changelog_mkdir_cbk, + FIRST_CHILD (this), FIRST_CHILD (this)->fops->mkdir, + loc, mode, umask, xdata); + return 0; +} + +/* symlink */ + +int32_t +changelog_symlink_cbk (call_frame_t *frame, + void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + inode_t *inode, struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + changelog_priv_t *priv = NULL; + changelog_local_t *local = NULL; + + priv = this->private; + local = frame->local; + + CHANGELOG_COND_GOTO (priv, ((op_ret < 0) || !local), unwind); + + changelog_update (this, priv, local, CHANGELOG_TYPE_ENTRY); + + unwind: + CHANGELOG_STACK_UNWIND (symlink, frame, op_ret, op_errno, + inode, buf, preparent, postparent, xdata); + return 0; +} + +int32_t +changelog_symlink (call_frame_t *frame, xlator_t *this, + const char *linkname, loc_t *loc, + mode_t umask, dict_t *xdata) +{ + int ret = -1; + size_t xtra_len = 0; + uuid_t gfid = {0,}; + void *uuid_req = NULL; + changelog_priv_t *priv = NULL; + changelog_opt_t *co = NULL; + + priv = this->private; + CHANGELOG_NOT_ACTIVE_THEN_GOTO (frame, priv, wind); + + ret = dict_get_ptr (xdata, "gfid-req", &uuid_req); + if (ret) { + gf_log (this->name, GF_LOG_DEBUG, + "failed to get gfid from dict"); + goto wind; + } + uuid_copy (gfid, uuid_req); + + CHANGELOG_INIT_NOCHECK (this, frame->local, NULL, gfid, 2); + + co = changelog_get_usable_buffer (frame->local); + if (!co) + goto wind; + + CHANGLOG_FILL_FOP_NUMBER (co, frame->root->op, fop_fn, xtra_len); + + co++; + CHANGELOG_FILL_ENTRY (co, loc->pargfid, loc->name, + entry_fn, entry_free_fn, xtra_len, wind); + + changelog_set_usable_record_and_length (frame->local, xtra_len, 2); + + wind: + STACK_WIND (frame, changelog_symlink_cbk, + FIRST_CHILD (this), FIRST_CHILD (this)->fops->symlink, + linkname, loc, umask, xdata); + return 0; +} + +/* mknod */ + +int32_t +changelog_mknod_cbk (call_frame_t *frame, + void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + changelog_priv_t *priv = NULL; + changelog_local_t *local = NULL; + + priv = this->private; + local = frame->local; + + CHANGELOG_COND_GOTO (priv, ((op_ret < 0) || !local), unwind); + + changelog_update (this, priv, local, CHANGELOG_TYPE_ENTRY); + + unwind: + CHANGELOG_STACK_UNWIND (mknod, frame, op_ret, op_errno, + inode, buf, preparent, postparent, xdata); + return 0; +} + +int32_t +changelog_mknod (call_frame_t *frame, + xlator_t *this, loc_t *loc, + mode_t mode, dev_t dev, mode_t umask, dict_t *xdata) +{ + int ret = -1; + uuid_t gfid = {0,}; + void *uuid_req = NULL; + size_t xtra_len = 0; + changelog_priv_t *priv = NULL; + changelog_opt_t *co = NULL; + + priv = this->private; + CHANGELOG_NOT_ACTIVE_THEN_GOTO (frame, priv, wind); + + ret = dict_get_ptr (xdata, "gfid-req", &uuid_req); + if (ret) { + gf_log (this->name, GF_LOG_DEBUG, + "failed to get gfid from dict"); + goto wind; + } + uuid_copy (gfid, uuid_req); + + CHANGELOG_INIT_NOCHECK (this, frame->local, NULL, gfid, 2); + + co = changelog_get_usable_buffer (frame->local); + if (!co) + goto wind; + + CHANGLOG_FILL_FOP_NUMBER (co, frame->root->op, fop_fn, xtra_len); + + co++; + CHANGELOG_FILL_ENTRY (co, loc->pargfid, loc->name, + entry_fn, entry_free_fn, xtra_len, wind); + + changelog_set_usable_record_and_length (frame->local, xtra_len, 2); + + wind: + STACK_WIND (frame, changelog_mknod_cbk, + FIRST_CHILD (this), FIRST_CHILD (this)->fops->mknod, + loc, mode, dev, umask, xdata); + return 0; +} + +/* creat */ + +int32_t +changelog_create_cbk (call_frame_t *frame, + void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + fd_t *fd, inode_t *inode, struct iatt *buf, + struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + changelog_priv_t *priv = NULL; + changelog_local_t *local = NULL; + + priv = this->private; + local = frame->local; + + CHANGELOG_COND_GOTO (priv, ((op_ret < 0) || !local), unwind); + + changelog_update (this, priv, local, CHANGELOG_TYPE_ENTRY); + + unwind: + CHANGELOG_STACK_UNWIND (create, frame, + op_ret, op_errno, fd, inode, + buf, preparent, postparent, xdata); + return 0; +} + +int32_t +changelog_create (call_frame_t *frame, xlator_t *this, + loc_t *loc, int32_t flags, mode_t mode, + mode_t umask, fd_t *fd, dict_t *xdata) +{ + int ret = -1; + uuid_t gfid = {0,}; + void *uuid_req = NULL; + changelog_opt_t *co = NULL; + changelog_priv_t *priv = NULL; + size_t xtra_len = 0; + + priv = this->private; + CHANGELOG_NOT_ACTIVE_THEN_GOTO (frame, priv, wind); + + ret = dict_get_ptr (xdata, "gfid-req", &uuid_req); + if (ret) { + gf_log (this->name, GF_LOG_DEBUG, + "failed to get gfid from dict"); + goto wind; + } + uuid_copy (gfid, uuid_req); + + /* init with two extra records */ + CHANGELOG_INIT_NOCHECK (this, frame->local, NULL, gfid, 2); + if (!frame->local) + goto wind; + + co = changelog_get_usable_buffer (frame->local); + if (!co) + goto wind; + + CHANGLOG_FILL_FOP_NUMBER (co, frame->root->op, fop_fn, xtra_len); + + co++; + CHANGELOG_FILL_ENTRY (co, loc->pargfid, loc->name, + entry_fn, entry_free_fn, xtra_len, wind); + + changelog_set_usable_record_and_length (frame->local, xtra_len, 2); + + wind: + STACK_WIND (frame, changelog_create_cbk, + FIRST_CHILD (this), FIRST_CHILD (this)->fops->create, + loc, flags, mode, umask, fd, xdata); + return 0; +} + +/* }}} */ + + +/* Metadata modification fops - TYPE II */ + +/* {{{ */ + +/* {f}setattr */ + +int32_t +changelog_fsetattr_cbk (call_frame_t *frame, + void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, struct iatt *preop_stbuf, + struct iatt *postop_stbuf, dict_t *xdata) +{ + changelog_priv_t *priv = NULL; + changelog_local_t *local = NULL; + + priv = this->private; + local = frame->local; + + CHANGELOG_COND_GOTO (priv, ((op_ret < 0) || !local), unwind); + + changelog_update (this, priv, local, CHANGELOG_TYPE_METADATA); + + unwind: + CHANGELOG_STACK_UNWIND (fsetattr, frame, op_ret, op_errno, + preop_stbuf, postop_stbuf, xdata); + + return 0; + + +} + +int32_t +changelog_fsetattr (call_frame_t *frame, + xlator_t *this, fd_t *fd, + struct iatt *stbuf, int32_t valid, dict_t *xdata) +{ + changelog_priv_t *priv = NULL; + + priv = this->private; + CHANGELOG_NOT_ACTIVE_THEN_GOTO (frame, priv, wind); + + CHANGELOG_INIT (this, frame->local, + fd->inode, fd->inode->gfid, 0); + + wind: + STACK_WIND (frame, changelog_fsetattr_cbk, + FIRST_CHILD (this), FIRST_CHILD (this)->fops->fsetattr, + fd, stbuf, valid, xdata); + return 0; + + +} + +int32_t +changelog_setattr_cbk (call_frame_t *frame, + void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, struct iatt *preop_stbuf, + struct iatt *postop_stbuf, dict_t *xdata) +{ + changelog_priv_t *priv = NULL; + changelog_local_t *local = NULL; + + priv = this->private; + local = frame->local; + + CHANGELOG_COND_GOTO (priv, ((op_ret < 0) || !local), unwind); + + changelog_update (this, priv, local, CHANGELOG_TYPE_METADATA); + + unwind: + CHANGELOG_STACK_UNWIND (setattr, frame, op_ret, op_errno, + preop_stbuf, postop_stbuf, xdata); + + return 0; +} + +int32_t +changelog_setattr (call_frame_t *frame, + xlator_t *this, loc_t *loc, + struct iatt *stbuf, int32_t valid, dict_t *xdata) +{ + changelog_priv_t *priv = NULL; + + priv = this->private; + CHANGELOG_NOT_ACTIVE_THEN_GOTO (frame, priv, wind); + + CHANGELOG_INIT (this, frame->local, + loc->inode, loc->inode->gfid, 0); + + wind: + STACK_WIND (frame, changelog_setattr_cbk, + FIRST_CHILD (this), FIRST_CHILD (this)->fops->setattr, + loc, stbuf, valid, xdata); + return 0; +} + +/* {f}removexattr */ + +int32_t +changelog_fremovexattr_cbk (call_frame_t *frame, + void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + changelog_priv_t *priv = NULL; + changelog_local_t *local = NULL; + + priv = this->private; + local = frame->local; + + CHANGELOG_COND_GOTO (priv, ((op_ret < 0) || !local), unwind); + + changelog_update (this, priv, local, CHANGELOG_TYPE_METADATA); + + unwind: + CHANGELOG_STACK_UNWIND (fremovexattr, frame, op_ret, op_errno, xdata); + + return 0; +} + +int32_t +changelog_fremovexattr (call_frame_t *frame, xlator_t *this, + fd_t *fd, const char *name, dict_t *xdata) +{ + changelog_priv_t *priv = NULL; + + priv = this->private; + CHANGELOG_NOT_ACTIVE_THEN_GOTO (frame, priv, wind); + + CHANGELOG_INIT (this, frame->local, + fd->inode, fd->inode->gfid, 0); + + wind: + STACK_WIND (frame, changelog_fremovexattr_cbk, + FIRST_CHILD (this), FIRST_CHILD (this)->fops->fremovexattr, + fd, name, xdata); + return 0; +} + +int32_t +changelog_removexattr_cbk (call_frame_t *frame, + void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + changelog_priv_t *priv = NULL; + changelog_local_t *local = NULL; + + priv = this->private; + local = frame->local; + + CHANGELOG_COND_GOTO (priv, ((op_ret < 0) || !local), unwind); + + changelog_update (this, priv, local, CHANGELOG_TYPE_METADATA); + + unwind: + CHANGELOG_STACK_UNWIND (removexattr, frame, op_ret, op_errno, xdata); + + return 0; +} + +int32_t +changelog_removexattr (call_frame_t *frame, xlator_t *this, + loc_t *loc, const char *name, dict_t *xdata) +{ + changelog_priv_t *priv = NULL; + + priv = this->private; + CHANGELOG_NOT_ACTIVE_THEN_GOTO (frame, priv, wind); + + CHANGELOG_INIT (this, frame->local, + loc->inode, loc->inode->gfid, 0); + + wind: + STACK_WIND (frame, changelog_removexattr_cbk, + FIRST_CHILD (this), FIRST_CHILD (this)->fops->removexattr, + loc, name, xdata); + return 0; +} + +/* {f}setxattr */ + +int32_t +changelog_setxattr_cbk (call_frame_t *frame, + void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + changelog_priv_t *priv = NULL; + changelog_local_t *local = NULL; + + priv = this->private; + local = frame->local; + + CHANGELOG_COND_GOTO (priv, ((op_ret < 0) || !local), unwind); + + changelog_update (this, priv, local, CHANGELOG_TYPE_METADATA); + + unwind: + CHANGELOG_STACK_UNWIND (setxattr, frame, op_ret, op_errno, xdata); + + return 0; +} + +int32_t +changelog_setxattr (call_frame_t *frame, + xlator_t *this, loc_t *loc, + dict_t *dict, int32_t flags, dict_t *xdata) +{ + changelog_priv_t *priv = NULL; + + priv = this->private; + CHANGELOG_NOT_ACTIVE_THEN_GOTO (frame, priv, wind); + + CHANGELOG_INIT (this, frame->local, + loc->inode, loc->inode->gfid, 0); + + wind: + STACK_WIND (frame, changelog_setxattr_cbk, + FIRST_CHILD (this), FIRST_CHILD (this)->fops->setxattr, + loc, dict, flags, xdata); + return 0; +} + +int32_t +changelog_fsetxattr_cbk (call_frame_t *frame, + void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, dict_t *xdata) +{ + changelog_priv_t *priv = NULL; + changelog_local_t *local = NULL; + + priv = this->private; + local = frame->local; + + CHANGELOG_COND_GOTO (priv, ((op_ret < 0) || !local), unwind); + + changelog_update (this, priv, local, CHANGELOG_TYPE_METADATA); + + unwind: + CHANGELOG_STACK_UNWIND (fsetxattr, frame, op_ret, op_errno, xdata); + + return 0; +} + +int32_t +changelog_fsetxattr (call_frame_t *frame, + xlator_t *this, fd_t *fd, dict_t *dict, + int32_t flags, dict_t *xdata) +{ + changelog_priv_t *priv = NULL; + + priv = this->private; + CHANGELOG_NOT_ACTIVE_THEN_GOTO (frame, priv, wind); + + CHANGELOG_INIT (this, frame->local, + fd->inode, fd->inode->gfid, 0); + + wind: + STACK_WIND (frame, changelog_fsetxattr_cbk, + FIRST_CHILD (this), FIRST_CHILD (this)->fops->fsetxattr, + fd, dict, flags, xdata); + return 0; +} + +/* }}} */ + + +/* Data modification fops - TYPE I */ + +/* {{{ */ + +/* {f}truncate() */ + +int32_t +changelog_truncate_cbk (call_frame_t *frame, + void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + changelog_priv_t *priv = NULL; + changelog_local_t *local = NULL; + + priv = this->private; + local = frame->local; + + CHANGELOG_COND_GOTO (priv, ((op_ret < 0) || !local), unwind); + + changelog_update (this, priv, local, CHANGELOG_TYPE_DATA); + + unwind: + CHANGELOG_STACK_UNWIND (truncate, frame, + op_ret, op_errno, prebuf, postbuf, xdata); + return 0; +} + +int32_t +changelog_truncate (call_frame_t *frame, + xlator_t *this, loc_t *loc, off_t offset, dict_t *xdata) +{ + changelog_priv_t *priv = NULL; + + priv = this->private; + CHANGELOG_NOT_ACTIVE_THEN_GOTO (frame, priv, wind); + + CHANGELOG_INIT (this, frame->local, + loc->inode, loc->inode->gfid, 0); + + wind: + STACK_WIND (frame, changelog_truncate_cbk, + FIRST_CHILD (this), FIRST_CHILD (this)->fops->truncate, + loc, offset, xdata); + return 0; +} + +int32_t +changelog_ftruncate_cbk (call_frame_t *frame, + void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + changelog_priv_t *priv = NULL; + changelog_local_t *local = NULL; + + priv = this->private; + local = frame->local; + + CHANGELOG_COND_GOTO (priv, ((op_ret < 0) || !local), unwind); + + changelog_update (this, priv, local, CHANGELOG_TYPE_DATA); + + unwind: + CHANGELOG_STACK_UNWIND (ftruncate, frame, + op_ret, op_errno, prebuf, postbuf, xdata); + return 0; +} + +int32_t +changelog_ftruncate (call_frame_t *frame, + xlator_t *this, fd_t *fd, off_t offset, dict_t *xdata) +{ + changelog_priv_t *priv = NULL; + + priv = this->private; + CHANGELOG_NOT_ACTIVE_THEN_GOTO (frame, priv, wind); + + CHANGELOG_INIT (this, frame->local, + fd->inode, fd->inode->gfid, 0); + + wind: + STACK_WIND (frame, changelog_ftruncate_cbk, + FIRST_CHILD (this), FIRST_CHILD (this)->fops->ftruncate, + fd, offset, xdata); + return 0; +} + +/* writev() */ + +int32_t +changelog_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, + dict_t *xdata) +{ + changelog_priv_t *priv = NULL; + changelog_local_t *local = NULL; + + priv = this->private; + local = frame->local; + + CHANGELOG_COND_GOTO (priv, ((op_ret <= 0) || !local), unwind); + + changelog_update (this, priv, local, CHANGELOG_TYPE_DATA); + + unwind: + CHANGELOG_STACK_UNWIND (writev, frame, + op_ret, op_errno, prebuf, postbuf, xdata); + return 0; +} + +int32_t +changelog_writev (call_frame_t *frame, + xlator_t *this, fd_t *fd, struct iovec *vector, + int32_t count, off_t offset, uint32_t flags, + struct iobref *iobref, dict_t *xdata) +{ + changelog_priv_t *priv = NULL; + + priv = this->private; + CHANGELOG_NOT_ACTIVE_THEN_GOTO (frame, priv, wind); + + CHANGELOG_INIT (this, frame->local, + fd->inode, fd->inode->gfid, 0); + + wind: + STACK_WIND (frame, changelog_writev_cbk, FIRST_CHILD (this), + FIRST_CHILD (this)->fops->writev, fd, vector, + count, offset, flags, iobref, xdata); + return 0; +} + +/* }}} */ + +/** + * The + * - @init () + * - @fini () + * - @reconfigure () + * ... and helper routines + */ + +/** + * needed if there are more operation modes in the future. + */ +static void +changelog_assign_opmode (changelog_priv_t *priv, char *mode) +{ + if ( strncmp (mode, "realtime", 8) == 0 ) { + priv->op_mode = CHANGELOG_MODE_RT; + } +} + +static void +changelog_assign_encoding (changelog_priv_t *priv, char *enc) +{ + if ( strncmp (enc, "binary", 6) == 0 ) { + priv->encode_mode = CHANGELOG_ENCODE_BINARY; + } else if ( strncmp (enc, "ascii", 5) == 0 ) { + priv->encode_mode = CHANGELOG_ENCODE_ASCII; + } +} + +/* cleanup any helper threads that are running */ +static void +changelog_cleanup_helper_threads (xlator_t *this, changelog_priv_t *priv) +{ + if (priv->cr.rollover_th) { + changelog_thread_cleanup (this, priv->cr.rollover_th); + priv->cr.rollover_th = 0; + } + + if (priv->cf.fsync_th) { + changelog_thread_cleanup (this, priv->cf.fsync_th); + priv->cf.fsync_th = 0; + } +} + +/* spawn helper thread; cleaning up in case of errors */ +static int +changelog_spawn_helper_threads (xlator_t *this, changelog_priv_t *priv) +{ + int ret = 0; + + priv->cr.this = this; + ret = gf_thread_create (&priv->cr.rollover_th, + NULL, changelog_rollover, priv); + if (ret) + goto out; + + if (priv->fsync_interval) { + priv->cf.this = this; + ret = gf_thread_create (&priv->cf.fsync_th, + NULL, changelog_fsync_thread, priv); + } + + if (ret) + changelog_cleanup_helper_threads (this, priv); + + out: + return ret; +} + +/* cleanup the notifier thread */ +static int +changelog_cleanup_notifier (xlator_t *this, changelog_priv_t *priv) +{ + int ret = 0; + + if (priv->cn.notify_th) { + changelog_thread_cleanup (this, priv->cn.notify_th); + priv->cn.notify_th = 0; + + ret = close (priv->wfd); + if (ret) + gf_log (this->name, GF_LOG_ERROR, + "error closing writer end of notifier pipe" + " (reason: %s)", strerror (errno)); + } + + return ret; +} + +/* spawn the notifier thread - nop if already running */ +static int +changelog_spawn_notifier (xlator_t *this, changelog_priv_t *priv) +{ + int ret = 0; + int flags = 0; + int pipe_fd[2] = {0, 0}; + + if (priv->cn.notify_th) + goto out; /* notifier thread already running */ + + ret = pipe (pipe_fd); + if (ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "Cannot create pipe (reason: %s)", strerror (errno)); + goto out; + } + + /* writer is non-blocking */ + flags = fcntl (pipe_fd[1], F_GETFL); + flags |= O_NONBLOCK; + + ret = fcntl (pipe_fd[1], F_SETFL, flags); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "failed to set O_NONBLOCK flag"); + goto out; + } + + priv->wfd = pipe_fd[1]; + + priv->cn.this = this; + priv->cn.rfd = pipe_fd[0]; + + ret = gf_thread_create (&priv->cn.notify_th, + NULL, changelog_notifier, priv); + + out: + return ret; +} + +int32_t +mem_acct_init (xlator_t *this) +{ + int ret = -1; + + if (!this) + return ret; + + ret = xlator_mem_acct_init (this, gf_changelog_mt_end + 1); + + if (ret != 0) { + gf_log (this->name, GF_LOG_WARNING, "Memory accounting" + " init failed"); + return ret; + } + + return ret; +} + +static int +changelog_init (xlator_t *this, changelog_priv_t *priv) +{ + int i = 0; + int ret = -1; + struct timeval tv = {0,}; + changelog_log_data_t cld = {0,}; + + ret = gettimeofday (&tv, NULL); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "gettimeofday() failure"); + goto out; + } + + priv->slice.tv_start = tv; + + priv->maps[CHANGELOG_TYPE_DATA] = "D "; + priv->maps[CHANGELOG_TYPE_METADATA] = "M "; + priv->maps[CHANGELOG_TYPE_ENTRY] = "E "; + + for (; i < CHANGELOG_MAX_TYPE; i++) { + /* start with version 1 */ + priv->slice.changelog_version[i] = 1; + } + + if (!priv->active) + return ret; + + /* spawn the notifier thread */ + ret = changelog_spawn_notifier (this, priv); + if (ret) + goto out; + + /** + * start with a fresh changelog file every time. this is done + * in case there was an encoding change. so... things are kept + * simple here. + */ + ret = changelog_fill_rollover_data (&cld, _gf_false); + if (ret) + goto out; + + LOCK (&priv->lock); + { + ret = changelog_inject_single_event (this, priv, &cld); + } + UNLOCK (&priv->lock); + + /* ... and finally spawn the helpers threads */ + ret = changelog_spawn_helper_threads (this, priv); + + out: + return ret; +} + +int +reconfigure (xlator_t *this, dict_t *options) +{ + int ret = 0; + char *tmp = NULL; + changelog_priv_t *priv = NULL; + gf_boolean_t active_earlier = _gf_true; + gf_boolean_t active_now = _gf_true; + changelog_time_slice_t *slice = NULL; + changelog_log_data_t cld = {0,}; + + priv = this->private; + if (!priv) + goto out; + + ret = -1; + active_earlier = priv->active; + + /* first stop the rollover and the fsync thread */ + changelog_cleanup_helper_threads (this, priv); + + GF_OPTION_RECONF ("changelog-dir", tmp, options, str, out); + if (!tmp) { + gf_log (this->name, GF_LOG_ERROR, + "\"changelog-dir\" option is not set"); + goto out; + } + + GF_FREE (priv->changelog_dir); + priv->changelog_dir = gf_strdup (tmp); + if (!priv->changelog_dir) + goto out; + + ret = mkdir_p (priv->changelog_dir, 0600, _gf_true); + if (ret) + goto out; + + GF_OPTION_RECONF ("changelog", active_now, options, bool, out); + + /** + * changelog_handle_change() handles changes that could possibly + * have been submit changes before changelog deactivation. + */ + if (!active_now) + priv->active = _gf_false; + + GF_OPTION_RECONF ("op-mode", tmp, options, str, out); + changelog_assign_opmode (priv, tmp); + + tmp = NULL; + + GF_OPTION_RECONF ("encoding", tmp, options, str, out); + changelog_assign_encoding (priv, tmp); + + GF_OPTION_RECONF ("rollover-time", + priv->rollover_time, options, int32, out); + GF_OPTION_RECONF ("fsync-interval", + priv->fsync_interval, options, int32, out); + + if (active_now || active_earlier) { + ret = changelog_fill_rollover_data (&cld, !active_now); + if (ret) + goto out; + + slice = &priv->slice; + + LOCK (&priv->lock); + { + ret = changelog_inject_single_event (this, priv, &cld); + if (!ret && active_now) + SLICE_VERSION_UPDATE (slice); + } + UNLOCK (&priv->lock); + + if (ret) + goto out; + + if (active_now) { + ret = changelog_spawn_notifier (this, priv); + if (!ret) + ret = changelog_spawn_helper_threads (this, + priv); + } else + ret = changelog_cleanup_notifier (this, priv); + } + + out: + if (ret) { + ret = changelog_cleanup_notifier (this, priv); + } else { + gf_log (this->name, GF_LOG_DEBUG, + "changelog reconfigured"); + if (active_now) + priv->active = _gf_true; + } + + return ret; +} + +int32_t +init (xlator_t *this) +{ + int ret = -1; + char *tmp = NULL; + changelog_priv_t *priv = NULL; + + GF_VALIDATE_OR_GOTO ("changelog", this, out); + + if (!this->children || this->children->next) { + gf_log (this->name, GF_LOG_ERROR, + "translator needs a single subvolume"); + goto out; + } + + if (!this->parents) { + gf_log (this->name, GF_LOG_ERROR, + "dangling volume. please check volfile"); + goto out; + } + + priv = GF_CALLOC (1, sizeof (*priv), gf_changelog_mt_priv_t); + if (!priv) + goto out; + + this->local_pool = mem_pool_new (changelog_local_t, 64); + if (!this->local_pool) { + gf_log (this->name, GF_LOG_ERROR, + "failed to create local memory pool"); + goto out; + } + + LOCK_INIT (&priv->lock); + + GF_OPTION_INIT ("changelog-brick", tmp, str, out); + if (!tmp) { + gf_log (this->name, GF_LOG_ERROR, + "\"changelog-brick\" option is not set"); + goto out; + } + + priv->changelog_brick = gf_strdup (tmp); + if (!priv->changelog_brick) + goto out; + tmp = NULL; + + GF_OPTION_INIT ("changelog-dir", tmp, str, out); + if (!tmp) { + gf_log (this->name, GF_LOG_ERROR, + "\"changelog-dir\" option is not set"); + goto out; + } + + priv->changelog_dir = gf_strdup (tmp); + if (!priv->changelog_dir) + goto out; + tmp = NULL; + + /** + * create the directory even if change-logging would be inactive + * so that consumers can _look_ into it (finding nothing...) + */ + ret = mkdir_p (priv->changelog_dir, 0600, _gf_true); + if (ret) + goto out; + + GF_OPTION_INIT ("changelog", priv->active, bool, out); + + GF_OPTION_INIT ("op-mode", tmp, str, out); + changelog_assign_opmode (priv, tmp); + + tmp = NULL; + + GF_OPTION_INIT ("encoding", tmp, str, out); + changelog_assign_encoding (priv, tmp); + + GF_OPTION_INIT ("rollover-time", priv->rollover_time, int32, out); + + GF_OPTION_INIT ("fsync-interval", priv->fsync_interval, int32, out); + + changelog_encode_change(priv); + + GF_ASSERT (cb_bootstrap[priv->op_mode].mode == priv->op_mode); + priv->cb = &cb_bootstrap[priv->op_mode]; + + /* ... now bootstrap the logger */ + ret = priv->cb->ctor (this, &priv->cd); + if (ret) + goto out; + + priv->changelog_fd = -1; + ret = changelog_init (this, priv); + if (ret) + goto out; + + gf_log (this->name, GF_LOG_DEBUG, "changelog translator loaded"); + + out: + if (ret) { + if (this->local_pool) + mem_pool_destroy (this->local_pool); + if (priv->cb) { + ret = priv->cb->dtor (this, &priv->cd); + if (ret) + gf_log (this->name, GF_LOG_ERROR, + "error in cleanup during init()"); + } + GF_FREE (priv->changelog_brick); + GF_FREE (priv->changelog_dir); + GF_FREE (priv); + this->private = NULL; + } else + this->private = priv; + + return ret; +} + +void +fini (xlator_t *this) +{ + int ret = -1; + changelog_priv_t *priv = NULL; + + priv = this->private; + + if (priv) { + ret = priv->cb->dtor (this, &priv->cd); + if (ret) + gf_log (this->name, GF_LOG_ERROR, + "error in fini"); + mem_pool_destroy (this->local_pool); + GF_FREE (priv->changelog_brick); + GF_FREE (priv->changelog_dir); + GF_FREE (priv); + } + + this->private = NULL; + + return; +} + +struct xlator_fops fops = { + .mknod = changelog_mknod, + .mkdir = changelog_mkdir, + .create = changelog_create, + .symlink = changelog_symlink, + .writev = changelog_writev, + .truncate = changelog_truncate, + .ftruncate = changelog_ftruncate, + .link = changelog_link, + .rename = changelog_rename, + .unlink = changelog_unlink, + .rmdir = changelog_rmdir, + .setattr = changelog_setattr, + .fsetattr = changelog_fsetattr, + .setxattr = changelog_setxattr, + .fsetxattr = changelog_fsetxattr, + .removexattr = changelog_removexattr, + .fremovexattr = changelog_fremovexattr, +}; + +struct xlator_cbks cbks = { + .forget = changelog_forget, +}; + +struct volume_options options[] = { + {.key = {"changelog"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = "enable/disable change-logging" + }, + {.key = {"changelog-brick"}, + .type = GF_OPTION_TYPE_PATH, + .description = "brick path to generate unique socket file name." + " should be the export directory of the volume strictly." + }, + {.key = {"changelog-dir"}, + .type = GF_OPTION_TYPE_PATH, + .description = "directory for the changelog files" + }, + {.key = {"op-mode"}, + .type = GF_OPTION_TYPE_STR, + .default_value = "realtime", + .value = {"realtime"}, + .description = "operation mode - futuristic operation modes" + }, + {.key = {"encoding"}, + .type = GF_OPTION_TYPE_STR, + .default_value = "ascii", + .value = {"binary", "ascii"}, + .description = "encoding type for changelogs" + }, + {.key = {"rollover-time"}, + .default_value = "60", + .type = GF_OPTION_TYPE_TIME, + .description = "time to switch to a new changelog file (in seconds)" + }, + {.key = {"fsync-interval"}, + .type = GF_OPTION_TYPE_TIME, + .default_value = "0", + .description = "do not open CHANGELOG file with O_SYNC mode." + " instead perform fsync() at specified intervals" + }, + {.key = {NULL} + }, +}; diff --git a/xlators/features/compress/Makefile.am b/xlators/features/compress/Makefile.am new file mode 100644 index 000000000..a985f42a8 --- /dev/null +++ b/xlators/features/compress/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = src + +CLEANFILES = diff --git a/xlators/features/compress/src/Makefile.am b/xlators/features/compress/src/Makefile.am new file mode 100644 index 000000000..4a64b52a9 --- /dev/null +++ b/xlators/features/compress/src/Makefile.am @@ -0,0 +1,17 @@ +xlator_LTLIBRARIES = cdc.la + +xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features + +noinst_HEADERS = cdc.h cdc-mem-types.h + +cdc_la_LDFLAGS = -module -avoidversion $(LIBZ_LIBS) + +cdc_la_SOURCES = cdc.c cdc-helper.c +cdc_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la + +AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -D$(GF_HOST_OS) \ +-shared -nostartfiles $(LIBZ_CFLAGS) + +AM_CFLAGS = -Wall $(GF_CFLAGS) + +CLEANFILES = diff --git a/xlators/features/compress/src/cdc-helper.c b/xlators/features/compress/src/cdc-helper.c new file mode 100644 index 000000000..54432ff45 --- /dev/null +++ b/xlators/features/compress/src/cdc-helper.c @@ -0,0 +1,547 @@ +/* + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include "logging.h" + +#include "cdc.h" +#include "cdc-mem-types.h" + +#ifdef HAVE_LIB_Z +#include "zlib.h" +#endif + +#ifdef HAVE_LIB_Z +/* gzip header looks something like this + * (RFC 1950) + * + * +---+---+---+---+---+---+---+---+---+---+ + * |ID1|ID2|CM |FLG| MTIME |XFL|OS | + * +---+---+---+---+---+---+---+---+---+---+ + * + * Data is usually sent without this header i.e + * Data sent = <compressed-data> + trailer(8) + * The trailer contains the checksum. + * + * gzip_header is added only during debugging. + * Refer to the function cdc_dump_iovec_to_disk + */ +static const char gzip_header[10] = + { + '\037', '\213', Z_DEFLATED, 0, + 0, 0, 0, 0, + 0, GF_CDC_OS_ID + }; + +static int32_t +cdc_next_iovec (xlator_t *this, cdc_info_t *ci) +{ + int ret = -1; + + ci->ncount++; + /* check for iovec overflow -- should not happen */ + if (ci->ncount == MAX_IOVEC) { + gf_log (this->name, GF_LOG_ERROR, + "Zlib output buffer overflow" + " ->ncount (%d) | ->MAX_IOVEC (%d)", + ci->ncount, MAX_IOVEC); + goto out; + } + + ret = 0; + + out: + return ret; +} + +static void +cdc_put_long (unsigned char *string, unsigned long x) +{ + string[0] = (unsigned char) (x & 0xff); + string[1] = (unsigned char) ((x & 0xff00) >> 8); + string[2] = (unsigned char) ((x & 0xff0000) >> 16); + string[3] = (unsigned char) ((x & 0xff000000) >> 24); +} + +static unsigned long +cdc_get_long (unsigned char *buf) +{ + return ((unsigned long) buf[0]) + | (((unsigned long) buf[1]) << 8) + | (((unsigned long) buf[2]) << 16) + | (((unsigned long) buf[3]) << 24); +} + +static int32_t +cdc_init_gzip_trailer (xlator_t *this, cdc_priv_t *priv, cdc_info_t *ci) +{ + int ret = -1; + char *buf = NULL; + + ret = cdc_next_iovec (this, ci); + if (ret) + goto out; + + buf = CURR_VEC(ci).iov_base = + (char *) GF_CALLOC (1, GF_CDC_VALIDATION_SIZE, + gf_cdc_mt_gzip_trailer_t); + + if (!CURR_VEC(ci).iov_base) + goto out; + + CURR_VEC(ci).iov_len = GF_CDC_VALIDATION_SIZE; + + cdc_put_long ((unsigned char *)&buf[0], ci->crc); + cdc_put_long ((unsigned char *)&buf[4], ci->stream.total_in); + + ret = 0; + + out: + return ret; +} + +static int32_t +cdc_alloc_iobuf_and_init_vec (xlator_t *this, + cdc_priv_t *priv, cdc_info_t *ci, + int size) +{ + int ret = -1; + int alloc_len = 0; + struct iobuf *iobuf = NULL; + + ret = cdc_next_iovec (this, ci); + if (ret) + goto out; + + alloc_len = size ? size : ci->buffer_size; + + iobuf = iobuf_get2 (this->ctx->iobuf_pool, alloc_len); + if (!iobuf) + goto out; + + ret = iobref_add (ci->iobref, iobuf); + if (ret) + goto out; + + /* Initialize this iovec */ + CURR_VEC(ci).iov_base = iobuf->ptr; + CURR_VEC(ci).iov_len = alloc_len; + + ret = 0; + + out: + return ret; +} + +static void +cdc_init_zlib_output_stream (cdc_priv_t *priv, cdc_info_t *ci, int size) +{ + ci->stream.next_out = (unsigned char *) CURR_VEC(ci).iov_base; + ci->stream.avail_out = size ? size : ci->buffer_size; +} + +/* This routine is for testing and debugging only. + * Data written = header(10) + <compressed-data> + trailer(8) + * So each gzip dump file is at least 18 bytes in size. + */ +void +cdc_dump_iovec_to_disk (xlator_t *this, cdc_info_t *ci, const char *file) +{ + int i = 0; + int fd = 0; + size_t writen = 0; + size_t total_writen = 0; + + fd = open (file, O_WRONLY|O_CREAT|O_TRUNC, 0777 ); + if (fd < 0) { + gf_log (this->name, GF_LOG_ERROR, + "Cannot open file: %s", file); + return; + } + + writen = write (fd, (char *) gzip_header, 10); + total_writen += writen; + for (i = 0; i < ci->ncount; i++) { + writen = write (fd, (char *) ci->vec[i].iov_base, ci->vec[i].iov_len); + total_writen += writen; + } + + gf_log (this->name, GF_LOG_DEBUG, + "dump'd %zu bytes to %s", total_writen, GF_CDC_DEBUG_DUMP_FILE ); + + close (fd); +} + +static int32_t +cdc_flush_libz_buffer (cdc_priv_t *priv, xlator_t *this, cdc_info_t *ci, + int (*libz_func)(z_streamp, int), + int flush) +{ + int32_t ret = Z_OK; + int done = 0; + unsigned int deflate_len = 0; + + for (;;) { + deflate_len = ci->buffer_size - ci->stream.avail_out; + + if (deflate_len != 0) { + CURR_VEC(ci).iov_len = deflate_len; + + ret = cdc_alloc_iobuf_and_init_vec (this, priv, ci, 0); + if (ret) { + ret = Z_MEM_ERROR; + break; + } + + /* Re-position Zlib output buffer */ + cdc_init_zlib_output_stream (priv, ci, 0); + } + + if (done) { + ci->ncount--; + break; + } + + ret = libz_func (&ci->stream, flush); + + if (ret == Z_BUF_ERROR) { + ret = Z_OK; + ci->ncount--; + break; + } + + done = (ci->stream.avail_out != 0 || ret == Z_STREAM_END); + + if (ret != Z_OK && ret != Z_STREAM_END) + break; + } + + return ret; +} + +static int32_t +do_cdc_compress (struct iovec *vec, xlator_t *this, cdc_priv_t *priv, + cdc_info_t *ci) +{ + int ret = -1; + + /* Initialize defalte */ + ret = deflateInit2 (&ci->stream, priv->cdc_level, Z_DEFLATED, + priv->window_size, priv->mem_level, + Z_DEFAULT_STRATEGY); + + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "unable to init Zlib (retval: %d)", ret); + goto out; + } + + ret = cdc_alloc_iobuf_and_init_vec (this, priv, ci, 0); + if (ret) + goto out; + + /* setup output buffer */ + cdc_init_zlib_output_stream (priv, ci, 0); + + /* setup input buffer */ + ci->stream.next_in = (unsigned char *) vec->iov_base; + ci->stream.avail_in = vec->iov_len; + + ci->crc = crc32 (ci->crc, (const Bytef *) vec->iov_base, vec->iov_len); + + gf_log (this->name, GF_LOG_DEBUG, "crc=%lu len=%d buffer_size=%d", + ci->crc, ci->stream.avail_in, ci->buffer_size); + + /* compress !! */ + while (ci->stream.avail_in != 0) { + if (ci->stream.avail_out == 0) { + + CURR_VEC(ci).iov_len = ci->buffer_size; + + ret = cdc_alloc_iobuf_and_init_vec (this, priv, ci, 0); + if (ret) + break; + + /* Re-position Zlib output buffer */ + cdc_init_zlib_output_stream (priv, ci, 0); + } + + ret = deflate (&ci->stream, Z_NO_FLUSH); + if (ret != Z_OK) + break; + } + + out: + return ret; +} + +int32_t +cdc_compress (xlator_t *this, cdc_priv_t *priv, cdc_info_t *ci, + dict_t **xdata) +{ + int ret = -1; + int i = 0; + + ci->iobref = iobref_new (); + if (!ci->iobref) + goto out; + + if (!*xdata) { + *xdata = dict_new (); + if (!*xdata) { + gf_log (this->name, GF_LOG_ERROR, "Cannot allocate xdata" + " dict"); + goto out; + } + } + + /* data */ + for (i = 0; i < ci->count; i++) { + ret = do_cdc_compress (&ci->vector[i], this, priv, ci); + if (ret != Z_OK) + goto deflate_cleanup_out; + } + + /* flush zlib buffer */ + ret = cdc_flush_libz_buffer (priv, this, ci, deflate, Z_FINISH); + if (!(ret == Z_OK || ret == Z_STREAM_END)) { + gf_log (this->name, GF_LOG_ERROR, + "Compression Error: ret (%d)", ret); + ret = -1; + goto deflate_cleanup_out; + } + + /* trailer */ + ret = cdc_init_gzip_trailer (this, priv, ci); + if (ret) + goto deflate_cleanup_out; + + gf_log (this->name, GF_LOG_DEBUG, + "Compressed %ld to %ld bytes", + ci->stream.total_in, ci->stream.total_out); + + ci->nbytes = ci->stream.total_out + GF_CDC_VALIDATION_SIZE; + + /* set deflated canary value for identification */ + ret = dict_set_int32 (*xdata, GF_CDC_DEFLATE_CANARY_VAL, 1); + if (ret) { + /* Send uncompressed data if we can't _tell_ the client + * that deflated data is on it's way. So, we just log + * the faliure and continue as usual. + */ + gf_log (this->name, GF_LOG_ERROR, + "Data deflated, but could not set canary" + " value in dict for identification"); + } + + /* This is to be used in testing */ + if ( priv->debug ) { + cdc_dump_iovec_to_disk (this, ci, GF_CDC_DEBUG_DUMP_FILE ); + } + + deflate_cleanup_out: + (void) deflateEnd(&ci->stream); + + out: + return ret; +} + + +/* deflate content is checked by the presence of a canary + * value in the dict as the key + */ +static int32_t +cdc_check_content_for_deflate (dict_t *xdata) +{ + return dict_get (xdata, GF_CDC_DEFLATE_CANARY_VAL) ? -1 : 0; +} + +static unsigned long +cdc_extract_crc (char *trailer) +{ + return cdc_get_long ((unsigned char *) &trailer[0]); +} + +static unsigned long +cdc_extract_size (char *trailer) +{ + return cdc_get_long ((unsigned char *) &trailer[4]); +} + +static int32_t +cdc_validate_inflate (cdc_info_t *ci, unsigned long crc, + unsigned long len) +{ + return !((crc == ci->crc) + /* inflated length is hidden inside + * Zlib stream struct */ + && (len == ci->stream.total_out)); +} + +static int32_t +do_cdc_decompress (xlator_t *this, cdc_priv_t *priv, cdc_info_t *ci) +{ + int ret = -1; + int i = 0; + int len = 0; + char *inflte = NULL; + char *trailer = NULL; + struct iovec vec = {0,}; + unsigned long computed_crc = 0; + unsigned long computed_len = 0; + + ret = inflateInit2 (&ci->stream, priv->window_size); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Zlib: Unable to initialize inflate"); + goto out; + } + + vec = THIS_VEC(ci, 0); + + trailer = (char *) (((char *) vec.iov_base) + vec.iov_len + - GF_CDC_VALIDATION_SIZE); + + /* CRC of uncompressed data */ + computed_crc = cdc_extract_crc (trailer); + + /* size of uncomrpessed data */ + computed_len = cdc_extract_size (trailer); + + gf_log (this->name, GF_LOG_DEBUG, "crc=%lu len=%lu buffer_size=%d", + computed_crc, computed_len, ci->buffer_size); + + inflte = vec.iov_base ; + len = vec.iov_len - GF_CDC_VALIDATION_SIZE; + + /* allocate buffer of the original length of the data */ + ret = cdc_alloc_iobuf_and_init_vec (this, priv, ci, 0); + if (ret) + goto out; + + /* setup output buffer */ + cdc_init_zlib_output_stream (priv, ci, 0); + + /* setup input buffer */ + ci->stream.next_in = (unsigned char *) inflte; + ci->stream.avail_in = len; + + while (ci->stream.avail_in != 0) { + if (ci->stream.avail_out == 0) { + CURR_VEC(ci).iov_len = ci->buffer_size; + + ret = cdc_alloc_iobuf_and_init_vec (this, priv, ci, 0); + if (ret) + break; + + /* Re-position Zlib output buffer */ + cdc_init_zlib_output_stream (priv, ci, 0); + } + + ret = inflate (&ci->stream, Z_NO_FLUSH); + if (ret == Z_STREAM_ERROR) + break; + } + + /* flush zlib buffer */ + ret = cdc_flush_libz_buffer (priv, this, ci, inflate, Z_SYNC_FLUSH); + if (!(ret == Z_OK || ret == Z_STREAM_END)) { + gf_log (this->name, GF_LOG_ERROR, + "Decompression Error: ret (%d)", ret); + ret = -1; + goto out; + } + + /* compute CRC of the uncompresses data to check for + * correctness */ + + for (i = 0; i < ci->ncount; i++) { + ci->crc = crc32 (ci->crc, + (const Bytef *) ci->vec[i].iov_base, + ci->vec[i].iov_len); + } + + /* validate inflated data */ + ret = cdc_validate_inflate (ci, computed_crc, computed_len); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Checksum or length mismatched in inflated data"); + } + + out: + return ret; +} + +int32_t +cdc_decompress (xlator_t *this, cdc_priv_t *priv, cdc_info_t *ci, + dict_t *xdata) +{ + int32_t ret = -1; + + /* check for deflate content */ + if (!cdc_check_content_for_deflate (xdata)) { + gf_log (this->name, GF_LOG_DEBUG, + "Content not deflated, passing through ..."); + goto passthrough_out; + } + + ci->iobref = iobref_new (); + if (!ci->iobref) + goto passthrough_out; + + /* do we need to do this? can we assume that one iovec + * will hold per request data everytime? + * + * server/client protocol seems to deal with a single + * iovec even if op_ret > 1M. So, it looks ok to + * assume that a single iovec will contain all the + * data (This saves us a lot from finding the trailer + * and the data since it could have been split-up onto + * two adjacent iovec's. + * + * But, in case this translator is loaded above quick-read + * for some reason, then it's entirely possible that we get + * multiple iovec's... + * + * This case (handled below) is not tested. (by loading the + * xlator below quick-read) + */ + + /* @@ I_HOPE_THIS_IS_NEVER_HIT */ + if (ci->count > 1) { + gf_log (this->name, GF_LOG_WARNING, "unable to handle" + " multiple iovecs (%d in number)", ci->count); + goto inflate_cleanup_out; + /* TODO: coallate all iovecs in one */ + } + + ret = do_cdc_decompress (this, priv, ci); + if (ret) + goto inflate_cleanup_out; + + ci->nbytes = ci->stream.total_out; + + gf_log (this->name, GF_LOG_DEBUG, + "Inflated %ld to %ld bytes", + ci->stream.total_in, ci->stream.total_out); + + inflate_cleanup_out: + (void) inflateEnd (&ci->stream); + + passthrough_out: + return ret; +} + +#endif diff --git a/xlators/features/compress/src/cdc-mem-types.h b/xlators/features/compress/src/cdc-mem-types.h new file mode 100644 index 000000000..efa008059 --- /dev/null +++ b/xlators/features/compress/src/cdc-mem-types.h @@ -0,0 +1,22 @@ +/* + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef __CDC_MEM_TYPES_H +#define __CDC_MEM_TYPES_H + +#include "mem-types.h" + +enum gf_cdc_mem_types { + gf_cdc_mt_priv_t = gf_common_mt_end + 1, + gf_cdc_mt_vec_t = gf_common_mt_end + 2, + gf_cdc_mt_gzip_trailer_t = gf_common_mt_end + 3, +}; + +#endif diff --git a/xlators/features/compress/src/cdc.c b/xlators/features/compress/src/cdc.c new file mode 100644 index 000000000..eb7d87c56 --- /dev/null +++ b/xlators/features/compress/src/cdc.c @@ -0,0 +1,342 @@ +/* + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#include <sys/uio.h> + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "xlator.h" +#include "defaults.h" +#include "logging.h" + +#include "cdc.h" +#include "cdc-mem-types.h" + +static void +cdc_cleanup_iobref (cdc_info_t *ci) +{ + assert(ci->iobref != NULL); + iobref_clear (ci->iobref); +} + +int32_t +cdc_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iovec *vector, int32_t count, + struct iatt *stbuf, struct iobref *iobref, + dict_t *xdata) +{ + int ret = -1; + cdc_priv_t *priv = NULL; + cdc_info_t ci = {0,}; + + GF_VALIDATE_OR_GOTO ("cdc", this, default_out); + GF_VALIDATE_OR_GOTO (this->name, frame, default_out); + + priv = this->private; + + if (op_ret <= 0) + goto default_out; + + if ( (priv->min_file_size != 0) + && (op_ret < priv->min_file_size) ) + goto default_out; + + ci.count = count; + ci.ibytes = op_ret; + ci.vector = vector; + ci.buf = NULL; + ci.iobref = NULL; + ci.ncount = 0; + ci.crc = 0; + ci.buffer_size = GF_CDC_DEF_BUFFERSIZE; + +/* A readv compresses on the server side and decompresses on the client side + */ + if (priv->op_mode == GF_CDC_MODE_SERVER) { + ret = cdc_compress (this, priv, &ci, &xdata); + } else if (priv->op_mode == GF_CDC_MODE_CLIENT) { + ret = cdc_decompress (this, priv, &ci, xdata); + } else { + gf_log (this->name, GF_LOG_ERROR, + "Invalid operation mode (%d)", priv->op_mode); + } + + if (ret) + goto default_out; + + STACK_UNWIND_STRICT (readv, frame, ci.nbytes, op_errno, + ci.vec, ci.ncount, stbuf, iobref, + xdata); + cdc_cleanup_iobref (&ci); + return 0; + + default_out: + STACK_UNWIND_STRICT (readv, frame, op_ret, op_errno, + vector, count, stbuf, iobref, xdata); + return 0; +} + +int32_t +cdc_readv (call_frame_t *frame, xlator_t *this, + fd_t *fd, size_t size, off_t offset, uint32_t flags, + dict_t *xdata) +{ + fop_readv_cbk_t cbk = NULL; + +#ifdef HAVE_LIB_Z + cbk = cdc_readv_cbk; +#else + cbk = default_readv_cbk; +#endif + STACK_WIND (frame, cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readv, + fd, size, offset, flags, xdata); + return 0; +} + +int32_t +cdc_writev_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + + STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno, prebuf, postbuf, xdata); + return 0; +} + +int32_t +cdc_writev (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + struct iovec *vector, + int32_t count, + off_t offset, + uint32_t flags, + struct iobref *iobref, dict_t *xdata) +{ + int ret = -1; + cdc_priv_t *priv = NULL; + cdc_info_t ci = {0,}; + size_t isize = 0; + + GF_VALIDATE_OR_GOTO ("cdc", this, default_out); + GF_VALIDATE_OR_GOTO (this->name, frame, default_out); + + priv = this->private; + + isize = iov_length(vector, count); + + if (isize <= 0) + goto default_out; + + if ( (priv->min_file_size != 0) + && (isize < priv->min_file_size) ) + goto default_out; + + ci.count = count; + ci.ibytes = isize; + ci.vector = vector; + ci.buf = NULL; + ci.iobref = NULL; + ci.ncount = 0; + ci.crc = 0; + ci.buffer_size = GF_CDC_DEF_BUFFERSIZE; + +/* A writev compresses on the client side and decompresses on the server side + */ + if (priv->op_mode == GF_CDC_MODE_CLIENT) { + ret = cdc_compress (this, priv, &ci, &xdata); + } else if (priv->op_mode == GF_CDC_MODE_SERVER) { + ret = cdc_decompress (this, priv, &ci, xdata); + } else { + gf_log (this->name, GF_LOG_ERROR, "Invalid operation mode (%d) ", priv->op_mode); + } + + if (ret) + goto default_out; + + STACK_WIND (frame, + cdc_writev_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->writev, + fd, ci.vec, ci.ncount, offset, flags, + iobref, xdata); + + cdc_cleanup_iobref (&ci); + return 0; + + default_out: + STACK_WIND (frame, + cdc_writev_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->writev, + fd, vector, count, offset, flags, + iobref, xdata); + return 0; +} + +int32_t +init (xlator_t *this) +{ + int ret = -1; + char *temp_str = NULL; + cdc_priv_t *priv = NULL; + + GF_VALIDATE_OR_GOTO ("cdc", this, err); + + if (!this->children || this->children->next) { + gf_log (this->name, GF_LOG_ERROR, + "Need subvolume == 1"); + goto err; + } + + if (!this->parents) { + gf_log (this->name, GF_LOG_WARNING, + "Dangling volume. Check volfile"); + } + + priv = GF_CALLOC (1, sizeof (*priv), gf_cdc_mt_priv_t); + if (!priv) { + goto err; + } + + /* Check if debug mode is turned on */ + GF_OPTION_INIT ("debug", priv->debug, bool, err); + if( priv->debug ) { + gf_log (this->name, GF_LOG_DEBUG, "CDC debug option turned on"); + } + + /* Set Gzip Window Size */ + GF_OPTION_INIT ("window-size", priv->window_size, int32, err); + if ( (priv->window_size > GF_CDC_MAX_WINDOWSIZE) + || (priv->window_size < GF_CDC_DEF_WINDOWSIZE) ) { + gf_log (this->name, GF_LOG_WARNING, + "Invalid gzip window size (%d), using default", + priv->window_size); + priv->window_size = GF_CDC_DEF_WINDOWSIZE; + } + + /* Set Gzip (De)Compression Level */ + GF_OPTION_INIT ("compression-level", priv->cdc_level, int32, err); + if ( ((priv->cdc_level < 1) || (priv->cdc_level > 9)) + && (priv->cdc_level != GF_CDC_DEF_COMPRESSION) ) { + gf_log (this->name, GF_LOG_WARNING, + "Invalid gzip (de)compression level (%d)," + " using default", priv->cdc_level); + priv->cdc_level = GF_CDC_DEF_COMPRESSION; + } + + /* Set Gzip Memory Level */ + GF_OPTION_INIT ("mem-level", priv->mem_level, int32, err); + if ( (priv->mem_level < 1) || (priv->mem_level > 9) ) { + gf_log (this->name, GF_LOG_WARNING, + "Invalid gzip memory level, using the default"); + priv->mem_level = GF_CDC_DEF_MEMLEVEL; + } + + /* Set min file size to enable compression */ + GF_OPTION_INIT ("min-size", priv->min_file_size, int32, err); + + /* Mode of operation - Server/Client */ + ret = dict_get_str (this->options, "mode", &temp_str); + if (ret) { + gf_log (this->name, GF_LOG_CRITICAL, + "Operation mode not specified !!"); + goto err; + } + + if (GF_CDC_MODE_IS_CLIENT (temp_str)) { + priv->op_mode = GF_CDC_MODE_CLIENT; + } else if (GF_CDC_MODE_IS_SERVER (temp_str)) { + priv->op_mode = GF_CDC_MODE_SERVER; + } else { + gf_log (this->name, GF_LOG_CRITICAL, + "Bogus operation mode (%s) specified", temp_str); + goto err; + } + + this->private = priv; + gf_log (this->name, GF_LOG_DEBUG, "CDC xlator loaded in (%s) mode",temp_str); + return 0; + + err: + if (priv) + GF_FREE (priv); + + return -1; +} + +void +fini (xlator_t *this) +{ + cdc_priv_t *priv = this->private; + + if (priv) + GF_FREE (priv); + this->private = NULL; + return; +} + +struct xlator_fops fops = { + .readv = cdc_readv, + .writev = cdc_writev, +}; + +struct xlator_cbks cbks = { +}; + +struct volume_options options[] = { + { .key = {"window-size"}, + .default_value = "-15", + .type = GF_OPTION_TYPE_INT, + .description = "Size of the zlib history buffer." + }, + { .key = {"mem-level"}, + .default_value = "8", + .type = GF_OPTION_TYPE_INT, + .description = "Memory allocated for internal compression state.\ + 1 uses minimum memory but is slow and reduces \ + compression ratio; memLevel=9 uses maximum memory \ + for optimal speed. The default value is 8." + }, + { .key = {"compression-level"}, + .default_value = "-1", + .type = GF_OPTION_TYPE_INT, + .description = "Compression levels \ + 0 : no compression, 1 : best speed, \ + 9 : best compression, -1 : default compression " + }, + { .key = {"min-size"}, + .default_value = "0", + .type = GF_OPTION_TYPE_INT, + .description = "Data is compressed only when its size exceeds this." + }, + { .key = {"mode"}, + .value = {"server", "client"}, + .type = GF_OPTION_TYPE_STR, + .description = "Set on the basis of where the xlator is loaded." + }, + { .key = {"debug"}, + .default_value = "false", + .type = GF_OPTION_TYPE_BOOL, + .description = "This is used in testing. Will dump compressed data \ + to disk as a gzip file." + }, + { .key = {NULL} + }, +}; diff --git a/xlators/features/compress/src/cdc.h b/xlators/features/compress/src/cdc.h new file mode 100644 index 000000000..71f4d2317 --- /dev/null +++ b/xlators/features/compress/src/cdc.h @@ -0,0 +1,107 @@ +/* + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef __CDC_H +#define __CDC_H + +#ifdef HAVE_LIB_Z +#include "zlib.h" +#endif + +#include "xlator.h" + +#ifndef MAX_IOVEC +#define MAX_IOVEC 16 +#endif + +typedef struct cdc_priv { + int window_size; + int mem_level; + int cdc_level; + int min_file_size; + int op_mode; + gf_boolean_t debug; + gf_lock_t lock; +} cdc_priv_t; + +typedef struct cdc_info { + /* input bits */ + int count; + int32_t ibytes; + struct iovec *vector; + struct iatt *buf; + + /* output bits */ + int ncount; + int nbytes; + int buffer_size; + struct iovec vec[MAX_IOVEC]; + struct iobref *iobref; + + /* zlib bits */ +#ifdef HAVE_LIB_Z + z_stream stream; +#endif + unsigned long crc; +} cdc_info_t; + +#define NVEC(ci) (ci->ncount - 1) +#define CURR_VEC(ci) ci->vec[ci->ncount - 1] +#define THIS_VEC(ci, i) ci->vector[i] + +/* Gzip defaults */ +#define GF_CDC_DEF_WINDOWSIZE -15 /* default value */ +#define GF_CDC_MAX_WINDOWSIZE -8 /* max value */ + +#ifdef HAVE_LIB_Z +#define GF_CDC_DEF_COMPRESSION Z_DEFAULT_COMPRESSION +#else +#define GF_CDC_DEF_COMPRESSION -1 +#endif + +#define GF_CDC_DEF_MEMLEVEL 8 +#define GF_CDC_DEF_BUFFERSIZE 262144 // 256K - default compression buffer size + +/* Operation mode + * If xlator is loaded on client, readv decompresses and writev compresses + * If xlator is loaded on server, readv compresses and writev decompresses + */ +#define GF_CDC_MODE_CLIENT 0 +#define GF_CDC_MODE_SERVER 1 + +/* min size of data to do cmpression + * 0 == compress even 1byte + */ +#define GF_CDC_MIN_CHUNK_SIZE 0 + +#define GF_CDC_VALIDATION_SIZE 8 + +#define GF_CDC_OS_ID 0xFF +#define GF_CDC_DEFLATE_CANARY_VAL "deflate" +#define GF_CDC_DEBUG_DUMP_FILE "/tmp/cdcdump.gz" + +#define GF_CDC_MODE_IS_CLIENT(m) \ + (strcmp (m, "client") == 0) + +#define GF_CDC_MODE_IS_SERVER(m) \ + (strcmp (m, "server") == 0) + +int32_t +cdc_compress (xlator_t *this, + cdc_priv_t *priv, + cdc_info_t *ci, + dict_t **xdata); +int32_t +cdc_decompress (xlator_t *this, + cdc_priv_t *priv, + cdc_info_t *ci, + dict_t *xdata); + +#endif diff --git a/xlators/features/gfid-access/Makefile.am b/xlators/features/gfid-access/Makefile.am new file mode 100644 index 000000000..af437a64d --- /dev/null +++ b/xlators/features/gfid-access/Makefile.am @@ -0,0 +1 @@ +SUBDIRS = src diff --git a/xlators/features/gfid-access/src/Makefile.am b/xlators/features/gfid-access/src/Makefile.am new file mode 100644 index 000000000..db53affaa --- /dev/null +++ b/xlators/features/gfid-access/src/Makefile.am @@ -0,0 +1,15 @@ +xlator_LTLIBRARIES = gfid-access.la +xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features + +gfid_access_la_LDFLAGS = -module -avoid-version + +gfid_access_la_SOURCES = gfid-access.c +gfid_access_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la + +noinst_HEADERS = gfid-access.h gfid-access-mem-types.h + +AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src + +AM_CFLAGS = -Wall $(GF_CFLAGS) + +CLEANFILES = diff --git a/xlators/features/gfid-access/src/gfid-access-mem-types.h b/xlators/features/gfid-access/src/gfid-access-mem-types.h new file mode 100644 index 000000000..168d67b43 --- /dev/null +++ b/xlators/features/gfid-access/src/gfid-access-mem-types.h @@ -0,0 +1,23 @@ +/* + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef _GFID_ACCESS_MEM_TYPES_H +#define _GFID_ACCESS_MEM_TYPES_H + +#include "mem-types.h" + +enum gf_changelog_mem_types { + gf_gfid_access_mt_priv_t = gf_common_mt_end + 1, + gf_gfid_access_mt_gfid_t, + gf_gfid_access_mt_end +}; + +#endif + diff --git a/xlators/features/gfid-access/src/gfid-access.c b/xlators/features/gfid-access/src/gfid-access.c new file mode 100644 index 000000000..da0ba7e50 --- /dev/null +++ b/xlators/features/gfid-access/src/gfid-access.c @@ -0,0 +1,1172 @@ +/* + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "gfid-access.h" +#include "inode.h" +#include "byte-order.h" + + + +void +ga_newfile_args_free (ga_newfile_args_t *args) +{ + if (!args) + goto out; + + GF_FREE (args->bname); + + if (S_ISLNK (args->st_mode) && args->args.symlink.linkpath) { + GF_FREE (args->args.symlink.linkpath); + args->args.symlink.linkpath = NULL; + } + + mem_put (args); +out: + return; +} + + +void +ga_heal_args_free (ga_heal_args_t *args) +{ + if (!args) + goto out; + + GF_FREE (args->bname); + + mem_put (args); +out: + return; +} + + +ga_newfile_args_t * +ga_newfile_parse_args (xlator_t *this, data_t *data) +{ + ga_newfile_args_t *args = NULL; + ga_private_t *priv = NULL; + int len = 0; + int blob_len = 0; + int min_len = 0; + void *blob = NULL; + + priv = this->private; + + blob = data->data; + blob_len = data->len; + + min_len = sizeof (args->uid) + sizeof (args->gid) + sizeof (args->gfid) + + sizeof (args->st_mode) + 2 + 2; + if (blob_len < min_len) { + gf_log (this->name, GF_LOG_ERROR, + "Invalid length: Total length is less " + "than minimum length."); + goto err; + } + + args = mem_get0 (priv->newfile_args_pool); + if (args == NULL) + goto err; + + args->uid = ntoh32 (*(uint32_t *)blob); + blob += sizeof (uint32_t); + blob_len -= sizeof (uint32_t); + + args->gid = ntoh32 (*(uint32_t *)blob); + blob += sizeof (uint32_t); + blob_len -= sizeof (uint32_t); + + memcpy (args->gfid, blob, sizeof (args->gfid)); + blob += sizeof (args->gfid); + blob_len -= sizeof (args->gfid); + + args->st_mode = ntoh32 (*(uint32_t *)blob); + blob += sizeof (uint32_t); + blob_len -= sizeof (uint32_t); + + len = strnlen (blob, blob_len); + if (len == blob_len) + if (len == blob_len) { + gf_log (this->name, GF_LOG_ERROR, + "gfid: %s. No null byte present.", + args->gfid); + goto err; + } + + args->bname = GF_CALLOC (1, (len + 1), gf_common_mt_char); + if (args->bname == NULL) + goto err; + + memcpy (args->bname, blob, (len + 1)); + blob += (len + 1); + blob_len -= (len + 1); + + if (S_ISDIR (args->st_mode)) { + if (blob_len < sizeof (uint32_t)) { + gf_log (this->name, GF_LOG_ERROR, + "gfid: %s. Invalid length", + args->gfid); + goto err; + } + args->args.mkdir.mode = ntoh32 (*(uint32_t *)blob); + blob += sizeof (uint32_t); + blob_len -= sizeof (uint32_t); + + if (blob_len < sizeof (uint32_t)) { + gf_log (this->name, GF_LOG_ERROR, + "gfid: %s. Invalid length", + args->gfid); + goto err; + } + args->args.mkdir.umask = ntoh32 (*(uint32_t *)blob); + blob += sizeof (uint32_t); + blob_len -= sizeof (uint32_t); + if (blob_len < 0) { + gf_log (this->name, GF_LOG_ERROR, + "gfid: %s. Invalid length", + args->gfid); + goto err; + } + } else if (S_ISLNK (args->st_mode)) { + len = strnlen (blob, blob_len); + if (len == blob_len) { + gf_log (this->name, GF_LOG_ERROR, + "gfid: %s. Invalid length", + args->gfid); + goto err; + } + args->args.symlink.linkpath = GF_CALLOC (1, len + 1, + gf_common_mt_char); + if (args->args.symlink.linkpath == NULL) + goto err; + + memcpy (args->args.symlink.linkpath, blob, (len + 1)); + blob += (len + 1); + blob_len -= (len + 1); + } else { + if (blob_len < sizeof (uint32_t)) { + gf_log (this->name, GF_LOG_ERROR, + "gfid: %s. Invalid length", + args->gfid); + goto err; + } + args->args.mknod.mode = ntoh32 (*(uint32_t *)blob); + blob += sizeof (uint32_t); + blob_len -= sizeof (uint32_t); + + if (blob_len < sizeof (uint32_t)) { + gf_log (this->name, GF_LOG_ERROR, + "gfid: %s. Invalid length", + args->gfid); + goto err; + } + args->args.mknod.rdev = ntoh32 (*(uint32_t *)blob); + blob += sizeof (uint32_t); + blob_len -= sizeof (uint32_t); + + if (blob_len < sizeof (uint32_t)) { + gf_log (this->name, GF_LOG_ERROR, + "gfid: %s. Invalid length", + args->gfid); + goto err; + } + args->args.mknod.umask = ntoh32 (*(uint32_t *)blob); + blob += sizeof (uint32_t); + blob_len -= sizeof (uint32_t); + } + + if (blob_len) { + gf_log (this->name, GF_LOG_ERROR, + "gfid: %s. Invalid length", + args->gfid); + goto err; + } + + return args; + +err: + if (args) + ga_newfile_args_free (args); + + return NULL; +} + +ga_heal_args_t * +ga_heal_parse_args (xlator_t *this, data_t *data) +{ + ga_heal_args_t *args = NULL; + ga_private_t *priv = NULL; + void *blob = NULL; + int len = 0; + int blob_len = 0; + + blob = data->data; + blob_len = data->len; + + priv = this->private; + + /* bname should at least contain a character */ + if (blob_len < (sizeof (args->gfid) + 2)) + goto err; + + args = mem_get0 (priv->heal_args_pool); + if (!args) + goto err; + + memcpy (args->gfid, blob, sizeof (args->gfid)); + blob += sizeof (args->gfid); + blob_len -= sizeof (args->gfid); + + len = strnlen (blob, blob_len); + if (len == blob_len) + goto err; + + args->bname = GF_CALLOC (1, len + 1, gf_common_mt_char); + if (!args->bname) + goto err; + + memcpy (args->bname, blob, len); + blob_len -= (len + 1); + + if (blob_len) + goto err; + + return args; + +err: + if (args) + ga_heal_args_free (args); + + return NULL; +} + +static int32_t +ga_fill_tmp_loc (loc_t *loc, xlator_t *this, char *gfid, + char *bname, dict_t *xdata, loc_t *new_loc) +{ + int ret = -1; + uint64_t value = 0; + inode_t *parent = NULL; + + parent = loc->inode; + ret = inode_ctx_get (loc->inode, this, &value); + if (!ret) { + parent = (void *)value; + } + + /* parent itself should be looked up */ + uuid_copy (new_loc->pargfid, parent->gfid); + new_loc->parent = inode_ref (parent); + + new_loc->inode = inode_grep (parent->table, parent, bname); + if (!new_loc->inode) + new_loc->inode = inode_new (parent->table); + + loc_path (new_loc, bname); + new_loc->name = basename (new_loc->path); + + /* As GFID would not be set on the entry yet, lets not send entry + gfid in the request */ + /*uuid_copy (new_loc->gfid, (const unsigned char *)gfid); */ + + ret = dict_set_static_bin (xdata, "gfid-req", gfid, 16); + if (ret < 0) + goto out; + + ret = 0; + +out: + return ret; +} + + + +static gf_boolean_t +__is_gfid_access_dir (uuid_t gfid) +{ + uuid_t aux_gfid; + + memset (aux_gfid, 0, 16); + aux_gfid[15] = GF_AUX_GFID; + + if (uuid_compare (gfid, aux_gfid) == 0) + return _gf_true; + + return _gf_false; +} + +int32_t +ga_forget (xlator_t *this, inode_t *inode) +{ + int ret = -1; + uint64_t value = 0; + inode_t *tmp_inode = NULL; + + ret = inode_ctx_del (inode, this, &value); + if (ret) + goto out; + + tmp_inode = (void *)value; + inode_unref (tmp_inode); + +out: + return 0; +} + + +static int +ga_heal_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + inode_t *inode, struct iatt *stat, dict_t *dict, + struct iatt *postparent) +{ + call_frame_t *orig_frame = NULL; + + orig_frame = frame->local; + frame->local = NULL; + + /* don't worry about inode linking and other stuff. They'll happen on + * the next lookup. + */ + STACK_DESTROY (frame->root); + + STACK_UNWIND_STRICT (setxattr, orig_frame, op_ret, op_errno, dict); + + return 0; +} + +static int +ga_newentry_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + inode_t *inode, struct iatt *buf, + struct iatt *preparent, struct iatt *postparent, + dict_t *xdata) +{ + call_frame_t *orig_frame = NULL; + + orig_frame = frame->local; + frame->local = NULL; + + /* don't worry about inode linking and other stuff. They'll happen on + * the next lookup. + */ + STACK_DESTROY (frame->root); + + STACK_UNWIND_STRICT (setxattr, orig_frame, op_ret, op_errno, xdata); + + return 0; +} + +int32_t +ga_new_entry (call_frame_t *frame, xlator_t *this, loc_t *loc, data_t *data, + dict_t *xdata) +{ + int ret = -1; + ga_newfile_args_t *args = NULL; + loc_t tmp_loc = {0,}; + call_frame_t *new_frame = NULL; + mode_t mode = 0; + + args = ga_newfile_parse_args (this, data); + if (!args) + goto out; + + if (!xdata) + xdata = dict_new (); + + ret = ga_fill_tmp_loc (loc, this, args->gfid, + args->bname, xdata, &tmp_loc); + if (ret) + goto out; + + new_frame = copy_frame (frame); + if (!new_frame) + goto out; + new_frame->local = (void *)frame; + + new_frame->root->uid = args->uid; + new_frame->root->gid = args->gid; + + if (S_ISDIR (args->st_mode)) { + STACK_WIND (new_frame, ga_newentry_cbk, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->mkdir, + &tmp_loc, args->args.mkdir.mode, + args->args.mkdir.umask, xdata); + } else if (S_ISLNK (args->st_mode)) { + STACK_WIND (new_frame, ga_newentry_cbk, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->symlink, + args->args.symlink.linkpath, + &tmp_loc, 0, xdata); + } else { + /* use 07777 (4 7s) for considering the Sticky bits etc) */ + mode = (S_IFMT & args->st_mode) | + (07777 | args->args.mknod.mode);; + + STACK_WIND (new_frame, ga_newentry_cbk, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->mknod, + &tmp_loc, mode, + args->args.mknod.rdev, args->args.mknod.umask, + xdata); + } + + ret = 0; +out: + ga_newfile_args_free (args); + + return ret; +} + +int32_t +ga_heal_entry (call_frame_t *frame, xlator_t *this, loc_t *loc, data_t *data, + dict_t *xdata) +{ + int ret = -1; + ga_heal_args_t *args = NULL; + loc_t tmp_loc = {0,}; + call_frame_t *new_frame = NULL; + + args = ga_heal_parse_args (this, data); + if (!args) + goto out; + + if (!xdata) + xdata = dict_new (); + + ret = ga_fill_tmp_loc (loc, this, args->gfid, args->bname, + xdata, &tmp_loc); + if (ret) + goto out; + + new_frame = copy_frame (frame); + if (!new_frame) + goto out; + new_frame->local = (void *)frame; + + STACK_WIND (new_frame, ga_heal_cbk, FIRST_CHILD (this), + FIRST_CHILD(this)->fops->lookup, + &tmp_loc, xdata); + + ret = 0; +out: + if (args) + ga_heal_args_free (args); + + return ret; +} + +int32_t +ga_setxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + dict_t *xdata) +{ + STACK_UNWIND_STRICT (setxattr, frame, op_ret, op_errno, xdata); + return 0; +} + +int32_t +ga_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict, + int32_t flags, dict_t *xdata) +{ + + data_t *data = NULL; + int op_errno = ENOMEM; + int ret = 0; + inode_t *unref = NULL; + + if ((loc->name && !strcmp (GF_GFID_DIR, loc->name)) && + ((loc->parent && + __is_root_gfid (loc->parent->gfid)) || + __is_root_gfid (loc->pargfid))) { + op_errno = EPERM; + goto err; + } + + data = dict_get (dict, GF_FUSE_AUX_GFID_NEWFILE); + if (data) { + ret = ga_new_entry (frame, this, loc, data, xdata); + if (ret) + goto err; + return 0; + } + + data = dict_get (dict, GF_FUSE_AUX_GFID_HEAL); + if (data) { + ret = ga_heal_entry (frame, this, loc, data, xdata); + if (ret) + goto err; + return 0; + } + + //If the inode is a virtual inode change the inode otherwise perform + //the operation on same inode + GFID_ACCESS_GET_VALID_DIR_INODE (this, loc, unref, wind); + +wind: + STACK_WIND (frame, ga_setxattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->setxattr, loc, dict, flags, + xdata); + if (unref) + inode_unref (unref); + + return 0; +err: + STACK_UNWIND_STRICT (setxattr, frame, -1, op_errno, xdata); + return 0; +} + + +int32_t +ga_virtual_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, dict_t *xdata, struct iatt *postparent) +{ + int j = 0; + int i = 0; + int ret = 0; + uint64_t temp_ino = 0; + inode_t *cbk_inode = NULL; + inode_t *true_inode = NULL; + uuid_t random_gfid = {0,}; + + if (frame->local) + cbk_inode = frame->local; + else + cbk_inode = inode; + + frame->local = NULL; + if (op_ret) + goto unwind; + + if (!IA_ISDIR (buf->ia_type)) + goto unwind; + + /* need to send back a different inode for linking in itable */ + if (cbk_inode == inode) { + /* check if the inode is in the 'itable' or + if its just previously discover()'d inode */ + true_inode = inode_find (inode->table, buf->ia_gfid); + if (!true_inode) { + cbk_inode = inode_new (inode->table); + + if (!cbk_inode) { + op_ret = -1; + op_errno = ENOMEM; + goto unwind; + } + /* the inode is not present in itable, ie, the actual + path is not yet looked up. Use the current inode + itself for now */ + inode_ref (inode); + } else { + /* 'inode_ref()' has been done in inode_find() */ + inode = true_inode; + } + + ret = inode_ctx_put (cbk_inode, this, (uint64_t)inode); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, + "failed to set the inode ctx with" + "the actual inode"); + if (inode) + inode_unref (inode); + } + inode = NULL; + } + + if (!uuid_is_null (cbk_inode->gfid)) { + /* if the previous linked inode is used, use the + same gfid */ + uuid_copy (random_gfid, cbk_inode->gfid); + } else { + /* replace the buf->ia_gfid to a random gfid + for directory, for files, what we received is fine */ + uuid_generate (random_gfid); + } + + uuid_copy (buf->ia_gfid, random_gfid); + + for (i = 15; i > (15 - 8); i--) { + temp_ino += (uint64_t)(buf->ia_gfid[i]) << j; + j += 8; + } + buf->ia_ino = temp_ino; + +unwind: + STACK_UNWIND_STRICT (lookup, frame, op_ret, op_errno, cbk_inode, buf, + xdata, postparent); + + return 0; +} + +int32_t +ga_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, dict_t *xdata, struct iatt *postparent) +{ + ga_private_t *priv = NULL; + + /* if the entry in question is not 'root', + then follow the normal path */ + if (op_ret || !__is_root_gfid(buf->ia_gfid)) + goto unwind; + + priv = this->private; + + /* do we need to copy root stbuf everytime? */ + /* mostly yes, as we want to have the 'stat' info show latest + in every _cbk() */ + + /* keep the reference for root stat buf */ + priv->root_stbuf = *buf; + priv->gfiddir_stbuf = priv->root_stbuf; + priv->gfiddir_stbuf.ia_gfid[15] = GF_AUX_GFID; + priv->gfiddir_stbuf.ia_ino = GF_AUX_GFID; + +unwind: + STACK_UNWIND_STRICT (lookup, frame, op_ret, op_errno, inode, buf, + xdata, postparent); + return 0; +} + +int32_t +ga_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) +{ + ga_private_t *priv = NULL; + int ret = -1; + uuid_t tmp_gfid = {0,}; + loc_t tmp_loc = {0,}; + uint64_t value = 0; + inode_t *inode = NULL; + inode_t *true_inode = NULL; + int32_t op_errno = ENOENT; + + /* if its discover(), no need for any action here */ + if (!loc->name) + goto wind; + + /* if its revalidate, and inode is not of type directory, + proceed with 'wind' */ + if (loc->inode && loc->inode->ia_type && + !IA_ISDIR (loc->inode->ia_type)) + goto wind; + + priv = this->private; + + /* need to check if the lookup is on virtual dir */ + if ((loc->name && !strcmp (GF_GFID_DIR, loc->name)) && + ((loc->parent && __is_root_gfid (loc->parent->gfid)) || + __is_root_gfid (loc->pargfid))) { + /* this means, the query is on '/.gfid', return the fake stat, + and say success */ + + STACK_UNWIND_STRICT (lookup, frame, 0, 0, loc->inode, + &priv->gfiddir_stbuf, xdata, + &priv->root_stbuf); + return 0; + } + + /* now, check if the lookup() is on an existing entry, + but on gfid-path */ + if (!((loc->parent && __is_gfid_access_dir (loc->parent->gfid)) || + __is_gfid_access_dir (loc->pargfid))) + goto wind; + + /* make sure the 'basename' is actually a 'canonical-gfid', + otherwise, return error */ + ret = uuid_parse (loc->name, tmp_gfid); + if (ret) + goto err; + + /* if its fresh lookup, go ahead and send it down, if not, + for directory, we need indirection to actual dir inode */ + if (!(loc->inode && loc->inode->ia_type)) + goto discover; + + /* revalidate on directory */ + ret = inode_ctx_get (loc->inode, this, &value); + if (ret) + goto err; + + inode = (void *)value; + + /* valid inode, already looked up, work on that */ + if (inode->ia_type) + goto discover; + + /* check if the inode is in the 'itable' or + if its just previously discover()'d inode */ + true_inode = inode_find (loc->inode->table, tmp_gfid); + if (true_inode) { + /* time do another lookup and update the context + with proper inode */ + op_errno = ESTALE; + goto err; + } + +discover: + /* for the virtual entries, we don't need to send 'gfid-req' key, as + for these entries, we don't want to 'set' a new gfid */ + if (xdata) + dict_del (xdata, "gfid-req"); + + uuid_copy (tmp_loc.gfid, tmp_gfid); + + /* if revalidate, then we need to have the proper reference */ + if (inode) { + tmp_loc.inode = inode_ref (inode); + frame->local = loc->inode; + } else { + tmp_loc.inode = inode_ref (loc->inode); + } + + STACK_WIND (frame, ga_virtual_lookup_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->lookup, &tmp_loc, xdata); + + inode_unref (tmp_loc.inode); + + return 0; + +wind: + /* used for all the normal lookup path */ + STACK_WIND (frame, ga_lookup_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->lookup, loc, xdata); + + return 0; + +err: + STACK_UNWIND_STRICT (lookup, frame, -1, op_errno, loc->inode, + &priv->gfiddir_stbuf, xdata, + &priv->root_stbuf); + return 0; +} + +int +ga_mkdir (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + mode_t umask, dict_t *xdata) +{ + int op_errno = 0; + + GFID_ACCESS_ENTRY_OP_CHECK (loc, op_errno, err); + + STACK_WIND (frame, default_mkdir_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->mkdir, loc, mode, umask, + xdata); + + return 0; + +err: + STACK_UNWIND_STRICT (mkdir, frame, -1, op_errno, loc->inode, + NULL, NULL, NULL, xdata); + return 0; +} + + +int +ga_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, + mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata) +{ + int op_errno = 0; + + GFID_ACCESS_ENTRY_OP_CHECK (loc, op_errno, err); + + STACK_WIND (frame, default_create_cbk, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->create, + loc, flags, mode, umask, fd, xdata); + return 0; +err: + STACK_UNWIND_STRICT (create, frame, -1, op_errno, NULL, + NULL, NULL, NULL, NULL, xdata); + + return 0; + +} + +int +ga_symlink (call_frame_t *frame, xlator_t *this, const char *linkname, + loc_t *loc, mode_t umask, dict_t *xdata) +{ + int op_errno = 0; + + GFID_ACCESS_ENTRY_OP_CHECK (loc, op_errno, err); + + STACK_WIND (frame, default_symlink_cbk, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->symlink, + linkname, loc, umask, xdata); + return 0; +err: + STACK_UNWIND_STRICT (symlink, frame, -1, op_errno, NULL, + NULL, NULL, NULL, xdata); + + return 0; +} + +int +ga_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + dev_t rdev, mode_t umask, dict_t *xdata) +{ + int op_errno = 0; + + GFID_ACCESS_ENTRY_OP_CHECK (loc, op_errno, err); + + STACK_WIND (frame, default_mknod_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->mknod, loc, mode, rdev, + umask, xdata); + + return 0; +err: + STACK_UNWIND_STRICT (mknod, frame, -1, op_errno, NULL, + NULL, NULL, NULL, xdata); + + return 0; +} + +int +ga_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc, int flag, + dict_t *xdata) +{ + int op_errno = 0; + inode_t *unref = NULL; + + GFID_ACCESS_ENTRY_OP_CHECK (loc, op_errno, err); + + GFID_ACCESS_GET_VALID_DIR_INODE (this, loc, unref, wind); + +wind: + STACK_WIND (frame, default_rmdir_cbk, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->rmdir, + loc, flag, xdata); + if (unref) + inode_unref (unref); + + return 0; +err: + STACK_UNWIND_STRICT (rmdir, frame, -1, op_errno, NULL, + NULL, xdata); + + return 0; +} + +int +ga_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t xflag, + dict_t *xdata) +{ + int op_errno = 0; + inode_t *unref = NULL; + + GFID_ACCESS_ENTRY_OP_CHECK (loc, op_errno, err); + + GFID_ACCESS_GET_VALID_DIR_INODE (this, loc, unref, wind); + +wind: + STACK_WIND (frame, default_unlink_cbk, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->unlink, + loc, xflag, xdata); + + if (unref) + inode_unref (unref); + + return 0; +err: + STACK_UNWIND_STRICT (unlink, frame, -1, op_errno, NULL, + NULL, xdata); + + return 0; +} + +int +ga_rename (call_frame_t *frame, xlator_t *this, + loc_t *oldloc, loc_t *newloc, dict_t *xdata) +{ + int op_errno = 0; + inode_t *oldloc_unref = NULL; + inode_t *newloc_unref = NULL; + + GFID_ACCESS_ENTRY_OP_CHECK (oldloc, op_errno, err); + GFID_ACCESS_ENTRY_OP_CHECK (newloc, op_errno, err); + + GFID_ACCESS_GET_VALID_DIR_INODE (this, oldloc, oldloc_unref, + handle_newloc); + +handle_newloc: + GFID_ACCESS_GET_VALID_DIR_INODE (this, newloc, newloc_unref, wind); + +wind: + STACK_WIND (frame, default_rename_cbk, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->rename, + oldloc, newloc, xdata); + + if (oldloc_unref) + inode_unref (oldloc_unref); + + if (newloc_unref) + inode_unref (newloc_unref); + + return 0; +err: + STACK_UNWIND_STRICT (rename, frame, -1, op_errno, NULL, + NULL, NULL, NULL, NULL, xdata); + + return 0; +} + + +int +ga_link (call_frame_t *frame, xlator_t *this, + loc_t *oldloc, loc_t *newloc, dict_t *xdata) +{ + int op_errno = 0; + inode_t *oldloc_unref = NULL; + inode_t *newloc_unref = NULL; + + GFID_ACCESS_ENTRY_OP_CHECK (oldloc, op_errno, err); + GFID_ACCESS_ENTRY_OP_CHECK (newloc, op_errno, err); + + GFID_ACCESS_GET_VALID_DIR_INODE (this, oldloc, oldloc_unref, + handle_newloc); + +handle_newloc: + GFID_ACCESS_GET_VALID_DIR_INODE (this, newloc, newloc_unref, wind); + +wind: + STACK_WIND (frame, default_link_cbk, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->link, + oldloc, newloc, xdata); + + if (oldloc_unref) + inode_unref (oldloc_unref); + + if (newloc_unref) + inode_unref (newloc_unref); + + return 0; +err: + STACK_UNWIND_STRICT (link, frame, -1, op_errno, NULL, + NULL, NULL, NULL, xdata); + + return 0; +} + +int32_t +ga_opendir (call_frame_t *frame, xlator_t *this, loc_t *loc, + fd_t *fd, dict_t *xdata) +{ + int op_errno = 0; + + GFID_ACCESS_ENTRY_OP_CHECK (loc, op_errno, err); + + /* also check if the loc->inode itself is virtual + inode, if yes, return with failure, mainly because we + can't handle all the readdirp and other things on it. */ + if (inode_ctx_get (loc->inode, this, NULL) == 0) { + op_errno = ENOTSUP; + goto err; + } + + STACK_WIND (frame, default_opendir_cbk, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->opendir, + loc, fd, xdata); + return 0; +err: + STACK_UNWIND_STRICT (opendir, frame, -1, op_errno, NULL, xdata); + + return 0; +} + +int32_t +ga_getxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, + const char *name, dict_t *xdata) +{ + inode_t *unref = NULL; + + GFID_ACCESS_GET_VALID_DIR_INODE (this, loc, unref, wind); + +wind: + STACK_WIND (frame, default_getxattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->getxattr, loc, name, xdata); + + if (unref) + inode_unref (unref); + + return 0; +} + +int32_t +ga_stat (call_frame_t *frame, xlator_t *this, loc_t *loc, + dict_t *xdata) +{ + inode_t *unref = NULL; + + GFID_ACCESS_GET_VALID_DIR_INODE (this, loc, unref, wind); + +wind: + STACK_WIND (frame, default_stat_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->stat, loc, xdata); + if (unref) + inode_unref (unref); + + return 0; +} + +int32_t +ga_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc, + struct iatt *stbuf, int32_t valid, + dict_t *xdata) +{ + inode_t *unref = NULL; + + GFID_ACCESS_GET_VALID_DIR_INODE (this, loc, unref, wind); + +wind: + STACK_WIND (frame, default_setattr_cbk, FIRST_CHILD (this), + FIRST_CHILD (this)->fops->setattr, loc, stbuf, valid, + xdata); + if (unref) + inode_unref (unref); + + return 0; +} + +int32_t +ga_removexattr (call_frame_t *frame, xlator_t *this, loc_t *loc, + const char *name, dict_t *xdata) +{ + inode_t *unref = NULL; + + GFID_ACCESS_GET_VALID_DIR_INODE (this, loc, unref, wind); + +wind: + STACK_WIND (frame, default_removexattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->removexattr, loc, name, + xdata); + if (unref) + inode_unref (unref); + + return 0; +} + + +int32_t +mem_acct_init (xlator_t *this) +{ + int ret = -1; + + if (!this) + return ret; + + ret = xlator_mem_acct_init (this, gf_gfid_access_mt_end + 1); + + if (ret != 0) { + gf_log (this->name, GF_LOG_WARNING, "Memory accounting" + " init failed"); + return ret; + } + + return ret; +} + +int32_t +init (xlator_t *this) +{ + ga_private_t *priv = NULL; + int ret = -1; + + if (!this->children || this->children->next) { + gf_log (this->name, GF_LOG_ERROR, + "not configured with exactly one child. exiting"); + goto out; + } + + /* This can be the top of graph in certain cases */ + if (!this->parents) { + gf_log (this->name, GF_LOG_DEBUG, + "dangling volume. check volfile "); + } + + /* TODO: define a mem-type structure */ + priv = GF_CALLOC (1, sizeof (*priv), gf_gfid_access_mt_priv_t); + if (!priv) + goto out; + + priv->newfile_args_pool = mem_pool_new (ga_newfile_args_t, 512); + if (!priv->newfile_args_pool) + goto out; + + priv->heal_args_pool = mem_pool_new (ga_heal_args_t, 512); + if (!priv->heal_args_pool) + goto out; + + this->private = priv; + + ret = 0; +out: + if (ret && priv) { + if (priv->newfile_args_pool) + mem_pool_destroy (priv->newfile_args_pool); + GF_FREE (priv); + } + + return ret; +} + +void +fini (xlator_t *this) +{ + ga_private_t *priv = NULL; + priv = this->private; + this->private = NULL; + + if (priv) { + if (priv->newfile_args_pool) + mem_pool_destroy (priv->newfile_args_pool); + if (priv->heal_args_pool) + mem_pool_destroy (priv->heal_args_pool); + GF_FREE (priv); + } + + return; +} + + +struct xlator_fops fops = { + .lookup = ga_lookup, + + /* entry fops */ + .mkdir = ga_mkdir, + .mknod = ga_mknod, + .create = ga_create, + .symlink = ga_symlink, + .link = ga_link, + .unlink = ga_unlink, + .rmdir = ga_rmdir, + .rename = ga_rename, + + /* handle any other directory operations here */ + .opendir = ga_opendir, + .stat = ga_stat, + .setattr = ga_setattr, + .getxattr = ga_getxattr, + .removexattr = ga_removexattr, + + /* special fop to handle more entry creations */ + .setxattr = ga_setxattr, +}; + +struct xlator_cbks cbks = { + .forget = ga_forget, +}; + +struct volume_options options[] = { + /* This translator doesn't take any options, or provide any options */ + { .key = {NULL} }, +}; diff --git a/xlators/features/gfid-access/src/gfid-access.h b/xlators/features/gfid-access/src/gfid-access.h new file mode 100644 index 000000000..e13c9b724 --- /dev/null +++ b/xlators/features/gfid-access/src/gfid-access.h @@ -0,0 +1,128 @@ +/* + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ +#ifndef __GFID_ACCESS_H__ +#define __GFID_ACCESS_H__ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include "logging.h" +#include "dict.h" +#include "xlator.h" +#include "defaults.h" +#include "gfid-access-mem-types.h" + +#define UUID_CANONICAL_FORM_LEN 36 + +#define GF_FUSE_AUX_GFID_NEWFILE "glusterfs.gfid.newfile" +#define GF_FUSE_AUX_GFID_HEAL "glusterfs.gfid.heal" + +#define GF_GFID_KEY "GLUSTERFS_GFID" +#define GF_GFID_DIR ".gfid" +#define GF_AUX_GFID 0xd + +#define GFID_ACCESS_GET_VALID_DIR_INODE(x,l,unref,lbl) do { \ + int ret = 0; \ + uint64_t value = 0; \ + inode_t *tmp_inode = NULL; \ + \ + /* if its an entry operation, on the virtual */ \ + /* directory inode as parent, we need to handle */ \ + /* it properly */ \ + if (l->parent) { \ + ret = inode_ctx_get (l->parent, x, &value); \ + if (ret) \ + goto lbl; \ + tmp_inode = (inode_t *)value; \ + unref = inode_ref (tmp_inode); \ + l->parent = tmp_inode; \ + /* if parent is virtual, no need to handle */ \ + /* loc->inode */ \ + break; \ + } \ + \ + /* if its an inode operation, on the virtual */ \ + /* directory inode itself, we need to handle */ \ + /* it properly */ \ + if (l->inode) { \ + ret = inode_ctx_get (l->inode, x, &value); \ + if (ret) \ + goto lbl; \ + tmp_inode = (inode_t *)value; \ + unref = inode_ref (tmp_inode); \ + l->inode = tmp_inode; \ + } \ + \ + } while (0) + +#define GFID_ACCESS_ENTRY_OP_CHECK(loc,err,lbl) do { \ + /* need to check if the lookup is on virtual dir */ \ + if ((loc->name && !strcmp (GF_GFID_DIR, loc->name)) && \ + ((loc->parent && \ + __is_root_gfid (loc->parent->gfid)) || \ + __is_root_gfid (loc->pargfid))) { \ + err = EEXIST; \ + goto lbl; \ + } \ + \ + /* now, check if the lookup() is on an existing */ \ + /* entry, but on gfid-path */ \ + if ((loc->parent && \ + __is_gfid_access_dir (loc->parent->gfid)) || \ + __is_gfid_access_dir (loc->pargfid)) { \ + err = EPERM; \ + goto lbl; \ + } \ + } while (0) + + +typedef struct { + unsigned int uid; + unsigned int gid; + char gfid[UUID_CANONICAL_FORM_LEN + 1]; + unsigned int st_mode; + char *bname; + + union { + struct _symlink_in { + char *linkpath; + } __attribute__ ((__packed__)) symlink; + + struct _mknod_in { + unsigned int mode; + unsigned int rdev; + unsigned int umask; + } __attribute__ ((__packed__)) mknod; + + struct _mkdir_in { + unsigned int mode; + unsigned int umask; + } __attribute__ ((__packed__)) mkdir; + } __attribute__ ((__packed__)) args; +} __attribute__((__packed__)) ga_newfile_args_t; + +typedef struct { + char gfid[UUID_CANONICAL_FORM_LEN + 1]; + char *bname; /* a null terminated basename */ +} __attribute__((__packed__)) ga_heal_args_t; + +struct ga_private { + /* root inode's stbuf */ + struct iatt root_stbuf; + struct iatt gfiddir_stbuf; + struct mem_pool *newfile_args_pool; + struct mem_pool *heal_args_pool; +}; +typedef struct ga_private ga_private_t; + +#endif /* __GFID_ACCESS_H__ */ diff --git a/xlators/features/index/src/index.c b/xlators/features/index/src/index.c index 3245076ec..9253120f3 100644 --- a/xlators/features/index/src/index.c +++ b/xlators/features/index/src/index.c @@ -15,8 +15,10 @@ #include "index.h" #include "options.h" #include "glusterfs3-xdr.h" +#include "syncop.h" #define XATTROP_SUBDIR "xattrop" +#define BASE_INDICES_HOLDER_SUBDIR "base_indices_holder" call_stub_t * __index_dequeue (struct list_head *callstubs) @@ -242,20 +244,40 @@ check_delete_stale_index_file (xlator_t *this, char *filename) { int ret = 0; struct stat st = {0}; + struct stat base_index_st = {0}; char filepath[PATH_MAX] = {0}; + char filepath_under_base_indices_holder[PATH_MAX] = {0}; index_priv_t *priv = NULL; priv = this->private; + if (priv->to_be_healed_states != synced_state) + return; + make_file_path (priv->index_basepath, XATTROP_SUBDIR, filename, filepath, sizeof (filepath)); + + make_file_path (priv->index_basepath, BASE_INDICES_HOLDER_SUBDIR, + filename, filepath_under_base_indices_holder, + sizeof (filepath_under_base_indices_holder)); + + + ret = stat (filepath_under_base_indices_holder, &base_index_st); + if (ret) { + gf_log (THIS->name, GF_LOG_ERROR, "Base index is not created" + "under index/base_indices_holder"); + return; + } + ret = stat (filepath, &st); - if (!ret && st.st_nlink == 1) + if (!ret && st.st_nlink == 2) { unlink (filepath); + unlink (filepath_under_base_indices_holder); + } } static int index_fill_readdir (fd_t *fd, DIR *dir, off_t off, - size_t size, gf_dirent_t *entries) + size_t size, gf_dirent_t *entries, readdir_directory type) { off_t in_case = -1; size_t filled = 0; @@ -298,7 +320,8 @@ index_fill_readdir (fd_t *fd, DIR *dir, off_t off, } if (!strncmp (entry->d_name, XATTROP_SUBDIR"-", - strlen (XATTROP_SUBDIR"-"))) { + strlen (XATTROP_SUBDIR"-")) && + (type == INDEX_XATTROP)) { check_delete_stale_index_file (this, entry->d_name); continue; } @@ -337,16 +360,175 @@ out: } int +sync_base_indices (void *index_priv) +{ + index_priv_t *priv = NULL; + DIR *dir_base_holder = NULL; + DIR *xattrop_dir = NULL; + struct dirent *entry = NULL; + char base_indices_holder[PATH_MAX] = {0}; + char xattrop_directory[PATH_MAX] = {0}; + char base_index_path[PATH_MAX] = {0}; + char xattrop_index_path[PATH_MAX] = {0}; + int ret = 0; + + priv = index_priv; + + snprintf (base_indices_holder, PATH_MAX, "%s/%s", priv->index_basepath, + BASE_INDICES_HOLDER_SUBDIR); + snprintf (xattrop_directory, PATH_MAX, "%s/%s", priv->index_basepath, + XATTROP_SUBDIR); + + if ((dir_base_holder = opendir(base_indices_holder)) == NULL) { + ret = -1; + goto out; + } + if ((xattrop_dir = opendir (xattrop_directory)) == NULL) { + ret = -1; + goto out; + } + + priv->to_be_healed_states = sync_started; + while ((entry = readdir(xattrop_dir)) != NULL) { + if (!strcmp (entry->d_name, ".") || + !strcmp (entry->d_name, "..")) { + continue; + } + if (strncmp (entry->d_name, XATTROP_SUBDIR"-", + strlen (XATTROP_SUBDIR"-"))) { + continue; + } + if (!strncmp (entry->d_name, XATTROP_SUBDIR"-", + strlen (XATTROP_SUBDIR"-"))) { + + snprintf (xattrop_index_path, PATH_MAX, "%s/%s", + xattrop_directory, entry->d_name); + + snprintf (base_index_path, PATH_MAX, "%s/%s", + base_indices_holder, entry->d_name); + + ret = link (xattrop_index_path, base_index_path); + if (ret && errno != EEXIST) + goto out; + + } + } + ret = closedir (xattrop_dir); + if (ret) + goto out; + ret = closedir (dir_base_holder); + if (ret) + goto out; + + ret = 0; +out: + return ret; + +} + +int +base_indices_syncing_done (int ret, call_frame_t *frame, void *data) +{ + index_priv_t *priv = NULL; + priv = data; + + if (!priv) + goto out; + + if (ret) { + priv->to_be_healed_states = sync_not_started; + } else { + priv->to_be_healed_states = synced_state; + } + + STACK_DESTROY (frame->root); + +out: + return 0; +} + +int +sync_base_indices_from_xattrop (xlator_t *this) +{ + + index_priv_t *priv = NULL; + char base_indices_holder[PATH_MAX] = {0}; + int ret = 0; + struct stat st = {0}; + DIR *dir = NULL; + struct dirent *entry = NULL; + call_frame_t *frame = NULL; + + priv = this->private; + + if (priv->to_be_healed_states != sync_not_started) { + ret = -1; + goto out; + } + + snprintf (base_indices_holder, PATH_MAX, "%s/%s", priv->index_basepath, + BASE_INDICES_HOLDER_SUBDIR); + + ret = stat (base_indices_holder, &st); + + if (ret && (errno != ENOENT)) { + goto out; + } else if (errno == ENOENT) { + ret = index_dir_create (this, BASE_INDICES_HOLDER_SUBDIR); + if (ret) + goto out; + } else { + if ((dir = opendir (base_indices_holder)) == NULL) { + ret = -1; + goto out; + } + while ((entry = readdir (dir)) != NULL) { + if (!strcmp (entry->d_name, ".") || + !strcmp (entry->d_name,"..")) { + continue; + } + ret = unlink (entry->d_name); + if (ret) + goto out; + } + closedir (dir); + } + + /*At this point of time we have index/base_indicies_holder directory + *is with no entries*/ + + frame = create_frame (this, this->ctx->pool); + if (!frame) { + ret = -1; + goto out; + } + set_lk_owner_from_ptr (&frame->root->lk_owner, frame->root); + + frame->root->pid = LOW_PRIO_PROC_PID; + + ret = synctask_new (this->ctx->env, sync_base_indices, + base_indices_syncing_done,frame, priv); + + + +out: + return ret; + +} + +int index_add (xlator_t *this, uuid_t gfid, const char *subdir) { int32_t op_errno = 0; char gfid_path[PATH_MAX] = {0}; char index_path[PATH_MAX] = {0}; + char base_path[PATH_MAX] = {0}; int ret = 0; uuid_t index = {0}; index_priv_t *priv = NULL; struct stat st = {0}; int fd = 0; + int index_created = 0; priv = this->private; GF_ASSERT_AND_GOTO_WITH_ERROR (this->name, !uuid_is_null (gfid), @@ -364,9 +546,11 @@ index_add (xlator_t *this, uuid_t gfid, const char *subdir) ret = link (index_path, gfid_path); if (!ret || (errno == EEXIST)) { ret = 0; + index_created = 1; goto out; } + op_errno = errno; if (op_errno == ENOENT) { ret = index_dir_create (this, subdir); @@ -398,10 +582,36 @@ index_add (xlator_t *this, uuid_t gfid, const char *subdir) "add to index (%s)", uuid_utoa (gfid), strerror (errno)); goto out; + } else { + index_created = 1; + } + + if (priv->to_be_healed_states != sync_not_started) { + make_index_path (priv->index_basepath, + GF_BASE_INDICES_HOLDER_GFID, + index, base_path, sizeof (base_path)); + ret = link (index_path, base_path); + if (ret) + goto out; } ret = 0; out: + /*If base_indices_holder is not created: create and sync + *If directory is present: delete contents and start syncing + *If syncing is in progress :No need to do any thing + *If syncing is done: No need to do anything*/ + if (!ret) { + switch (priv->to_be_healed_states) { + case sync_not_started: + ret = sync_base_indices_from_xattrop (this); + break; + case sync_started: + case synced_state: + /*No need to do anything*/ + break; + } + } return ret; } @@ -737,8 +947,18 @@ index_getxattr_wrapper (call_frame_t *frame, xlator_t *this, goto done; } - ret = dict_set_static_bin (xattr, (char*)name, priv->xattrop_vgfid, - sizeof (priv->xattrop_vgfid)); + if (!strcmp (name, GF_XATTROP_INDEX_GFID)) { + + ret = dict_set_static_bin (xattr, (char*)name, + priv->xattrop_vgfid, + sizeof (priv->xattrop_vgfid)); + + } else if (!strcmp (name, GF_BASE_INDICES_HOLDER_GFID)) { + + ret = dict_set_static_bin (xattr, (char*)name, + priv->base_indices_holder_vgfid, + sizeof (priv->base_indices_holder_vgfid)); + } if (ret) { ret = -ENOMEM; gf_log (THIS->name, GF_LOG_ERROR, "xattrop index " @@ -782,6 +1002,15 @@ index_lookup_wrapper (call_frame_t *frame, xlator_t *this, } else if (!uuid_compare (loc->pargfid, priv->xattrop_vgfid)) { make_file_path (priv->index_basepath, XATTROP_SUBDIR, loc->name, path, sizeof (path)); + } else if (!uuid_compare (loc->gfid,priv->base_indices_holder_vgfid)){ + make_index_dir_path (priv->index_basepath, + BASE_INDICES_HOLDER_SUBDIR, path, + sizeof (path)); + is_dir = _gf_true; + } else if (!uuid_compare (loc->pargfid, priv->base_indices_holder_vgfid)) { + make_file_path (priv->index_basepath, + BASE_INDICES_HOLDER_SUBDIR,loc->name, path, + sizeof (path)); } ret = lstat (path, &lstatbuf); @@ -803,10 +1032,14 @@ index_lookup_wrapper (call_frame_t *frame, xlator_t *this, } iatt_from_stat (&stbuf, &lstatbuf); - if (is_dir) + if (is_dir && !uuid_compare (loc->gfid, priv->xattrop_vgfid)) { uuid_copy (stbuf.ia_gfid, priv->xattrop_vgfid); - else + } else if (is_dir && + !uuid_compare (loc->gfid, priv->base_indices_holder_vgfid)) { + uuid_copy (stbuf.ia_gfid, priv->base_indices_holder_vgfid); + } else { uuid_generate (stbuf.ia_gfid); + } stbuf.ia_ino = -1; op_ret = 0; done: @@ -818,6 +1051,44 @@ done: } int32_t +base_indices_readdir_wrapper (call_frame_t *frame, xlator_t *this, + fd_t *fd, size_t size, off_t off, dict_t *xdata) +{ + index_priv_t *priv = NULL; + char base_indices_holder[PATH_MAX] = {0}; + DIR *dir = NULL; + int32_t op_ret = -1; + int32_t op_errno = 0; + int count = 0; + gf_dirent_t entries; + + priv = this->private; + + make_index_dir_path (priv->index_basepath, BASE_INDICES_HOLDER_SUBDIR, + base_indices_holder, sizeof (base_indices_holder)); + + dir = opendir (base_indices_holder); + if (!dir) { + op_errno = EINVAL; + goto done; + } + + + INIT_LIST_HEAD (&entries.list); + + count = index_fill_readdir (fd, dir, off, size, &entries, + BASE_INDICES_HOLDER); + /* pick ENOENT to indicate EOF */ + op_errno = errno; + op_ret = count; + closedir (dir); +done: + STACK_UNWIND_STRICT (readdir, frame, op_ret, op_errno, &entries, xdata); + gf_dirent_free (&entries); + return 0; +} + +int32_t index_readdir_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, off_t off, dict_t *xdata) { @@ -848,7 +1119,8 @@ index_readdir_wrapper (call_frame_t *frame, xlator_t *this, goto done; } - count = index_fill_readdir (fd, dir, off, size, &entries); + count = index_fill_readdir (fd, dir, off, size, &entries, + INDEX_XATTROP); /* pick ENOENT to indicate EOF */ op_errno = errno; @@ -915,7 +1187,10 @@ index_getxattr (call_frame_t *frame, xlator_t *this, { call_stub_t *stub = NULL; - if (!name || strcmp (GF_XATTROP_INDEX_GFID, name)) + if (!name) + goto out; + if (strcmp (GF_XATTROP_INDEX_GFID, name) && + strcmp (GF_BASE_INDICES_HOLDER_GFID, name)) goto out; stub = fop_getxattr_stub (frame, index_getxattr_wrapper, loc, name, @@ -942,7 +1217,9 @@ index_lookup (call_frame_t *frame, xlator_t *this, priv = this->private; if (uuid_compare (loc->gfid, priv->xattrop_vgfid) && - uuid_compare (loc->pargfid, priv->xattrop_vgfid)) + uuid_compare (loc->pargfid, priv->xattrop_vgfid) && + uuid_compare (loc->gfid, priv->base_indices_holder_vgfid) && + uuid_compare (loc->pargfid, priv->base_indices_holder_vgfid)) goto normal; stub = fop_lookup_stub (frame, index_lookup_wrapper, loc, xattr_req); @@ -968,10 +1245,19 @@ index_readdir (call_frame_t *frame, xlator_t *this, index_priv_t *priv = NULL; priv = this->private; - if (uuid_compare (fd->inode->gfid, priv->xattrop_vgfid)) + if (uuid_compare (fd->inode->gfid, priv->xattrop_vgfid) && + uuid_compare (fd->inode->gfid, priv->base_indices_holder_vgfid)) goto out; - stub = fop_readdir_stub (frame, index_readdir_wrapper, fd, size, off, - xdata); + + if (!uuid_compare (fd->inode->gfid, priv->xattrop_vgfid)) { + stub = fop_readdir_stub (frame, index_readdir_wrapper, fd, size, + off, xdata); + } else if (!uuid_compare (fd->inode->gfid, + priv->base_indices_holder_vgfid)) { + stub = fop_readdir_stub (frame, base_indices_readdir_wrapper, + fd, size, off, xdata); + } + if (!stub) { STACK_UNWIND_STRICT (readdir, frame, -1, ENOMEM, NULL, NULL); return 0; @@ -1075,10 +1361,13 @@ init (xlator_t *this) GF_OPTION_INIT ("index-base", priv->index_basepath, path, out); uuid_generate (priv->index); uuid_generate (priv->xattrop_vgfid); + /*base_indices_holder is a directory which contains hard links to + * all base indices inside indices/xattrop directory*/ + uuid_generate (priv->base_indices_holder_vgfid); INIT_LIST_HEAD (&priv->callstubs); this->private = priv; - ret = pthread_create (&thread, &w_attr, index_worker, this); + ret = gf_thread_create (&thread, &w_attr, index_worker, this); if (ret) { gf_log (this->name, GF_LOG_WARNING, "Failed to create " "worker thread, aborting"); diff --git a/xlators/features/index/src/index.h b/xlators/features/index/src/index.h index 661dcdbc4..d6dcb1c23 100644 --- a/xlators/features/index/src/index.h +++ b/xlators/features/index/src/index.h @@ -36,14 +36,28 @@ typedef struct index_fd_ctx { DIR *dir; } index_fd_ctx_t; +typedef enum { + sync_not_started, + sync_started, + synced_state, +} to_be_healed_states_t; + +typedef enum { + INDEX_XATTROP, + BASE_INDICES_HOLDER, +} readdir_directory; + typedef struct index_priv { char *index_basepath; uuid_t index; gf_lock_t lock; uuid_t xattrop_vgfid;//virtual gfid of the xattrop index dir + uuid_t base_indices_holder_vgfid; //virtual gfid of the + //to_be_healed_xattrop directory struct list_head callstubs; pthread_mutex_t mutex; pthread_cond_t cond; + to_be_healed_states_t to_be_healed_states; } index_priv_t; #define INDEX_STACK_UNWIND(fop, frame, params ...) \ diff --git a/xlators/features/locks/src/Makefile.am b/xlators/features/locks/src/Makefile.am index 8908c1f52..0f79731b4 100644 --- a/xlators/features/locks/src/Makefile.am +++ b/xlators/features/locks/src/Makefile.am @@ -11,6 +11,7 @@ noinst_HEADERS = locks.h common.h locks-mem-types.h clear.h AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src + AM_CFLAGS = -Wall -fno-strict-aliasing $(GF_CFLAGS) CLEANFILES = diff --git a/xlators/features/locks/src/clear.c b/xlators/features/locks/src/clear.c index 5790a99ce..124b9ad0f 100644 --- a/xlators/features/locks/src/clear.c +++ b/xlators/features/locks/src/clear.c @@ -339,6 +339,7 @@ blkd: -1, EAGAIN); STACK_UNWIND_STRICT (entrylk, elock->frame, -1, EAGAIN, NULL); GF_FREE ((char *) elock->basename); + GF_FREE (elock->connection_id); GF_FREE (elock); } @@ -379,8 +380,8 @@ out: int clrlk_clear_lks_in_all_domains (xlator_t *this, pl_inode_t *pl_inode, - clrlk_args *args, int *blkd, int *granted, - int *op_errno) + clrlk_args *args, int *blkd, int *granted, + int *op_errno) { pl_dom_list_t *dom = NULL; int ret = -1; diff --git a/xlators/features/locks/src/common.c b/xlators/features/locks/src/common.c index 9c21bddb9..b3309580d 100644 --- a/xlators/features/locks/src/common.c +++ b/xlators/features/locks/src/common.c @@ -35,6 +35,7 @@ __insert_and_merge (pl_inode_t *pl_inode, posix_lock_t *lock); static int pl_send_prelock_unlock (xlator_t *this, pl_inode_t *pl_inode, posix_lock_t *old_lock); + static pl_dom_list_t * __allocate_domain (const char *volume) { @@ -75,8 +76,8 @@ get_domain (pl_inode_t *pl_inode, const char *volume) { pl_dom_list_t *dom = NULL; - GF_VALIDATE_OR_GOTO (POSIX_LOCKS, pl_inode, out); - GF_VALIDATE_OR_GOTO (POSIX_LOCKS, volume, out); + GF_VALIDATE_OR_GOTO ("posix-locks", pl_inode, out); + GF_VALIDATE_OR_GOTO ("posix-locks", volume, out); pthread_mutex_lock (&pl_inode->mutex); { @@ -92,9 +93,9 @@ get_domain (pl_inode_t *pl_inode, const char *volume) unlock: pthread_mutex_unlock (&pl_inode->mutex); if (dom) { - gf_log (POSIX_LOCKS, GF_LOG_TRACE, "Domain %s found", volume); + gf_log ("posix-locks", GF_LOG_TRACE, "Domain %s found", volume); } else { - gf_log (POSIX_LOCKS, GF_LOG_TRACE, "Domain %s not found", volume); + gf_log ("posix-locks", GF_LOG_TRACE, "Domain %s not found", volume); } out: return dom; @@ -135,10 +136,10 @@ __pl_inode_is_empty (pl_inode_t *pl_inode) void pl_print_locker (char *str, int size, xlator_t *this, call_frame_t *frame) { - snprintf (str, size, "Pid=%llu, lk-owner=%s, Transport=%p, Frame=%llu", + snprintf (str, size, "Pid=%llu, lk-owner=%s, Client=%p, Frame=%llu", (unsigned long long) frame->root->pid, lkowner_utoa (&frame->root->lk_owner), - (void *)frame->root->trans, + frame->root->client, (unsigned long long) frame->root->unique); } @@ -462,14 +463,14 @@ unlock: /* Create a new posix_lock_t */ posix_lock_t * -new_posix_lock (struct gf_flock *flock, void *transport, pid_t client_pid, +new_posix_lock (struct gf_flock *flock, client_t *client, pid_t client_pid, gf_lkowner_t *owner, fd_t *fd) { posix_lock_t *lock = NULL; - GF_VALIDATE_OR_GOTO (POSIX_LOCKS, flock, out); - GF_VALIDATE_OR_GOTO (POSIX_LOCKS, transport, out); - GF_VALIDATE_OR_GOTO (POSIX_LOCKS, fd, out); + GF_VALIDATE_OR_GOTO ("posix-locks", flock, out); + GF_VALIDATE_OR_GOTO ("posix-locks", client, out); + GF_VALIDATE_OR_GOTO ("posix-locks", fd, out); lock = GF_CALLOC (1, sizeof (posix_lock_t), gf_locks_mt_posix_lock_t); @@ -485,7 +486,7 @@ new_posix_lock (struct gf_flock *flock, void *transport, pid_t client_pid, else lock->fl_end = flock->l_start + flock->l_len - 1; - lock->transport = transport; + lock->client = client; lock->fd_num = fd_to_fdnum (fd); lock->fd = fd; lock->client_pid = client_pid; @@ -565,7 +566,7 @@ same_owner (posix_lock_t *l1, posix_lock_t *l2) { return (is_same_lkowner (&l1->owner, &l2->owner) && - (l1->transport == l2->transport)); + (l1->client == l2->client)); } @@ -694,7 +695,7 @@ subtract_locks (posix_lock_t *big, posix_lock_t *small) } GF_ASSERT (0); - gf_log (POSIX_LOCKS, GF_LOG_ERROR, "Unexpected case in subtract_locks"); + gf_log ("posix-locks", GF_LOG_ERROR, "Unexpected case in subtract_locks"); out: if (v.locks[0]) { @@ -812,7 +813,7 @@ __insert_and_merge (pl_inode_t *pl_inode, posix_lock_t *lock) sum = add_locks (lock, conf); sum->fl_type = lock->fl_type; - sum->transport = lock->transport; + sum->client = lock->client; sum->fd_num = lock->fd_num; sum->client_pid = lock->client_pid; sum->owner = lock->owner; @@ -830,7 +831,7 @@ __insert_and_merge (pl_inode_t *pl_inode, posix_lock_t *lock) sum = add_locks (lock, conf); sum->fl_type = conf->fl_type; - sum->transport = conf->transport; + sum->client = conf->client; sum->fd_num = conf->fd_num; sum->client_pid = conf->client_pid; sum->owner = conf->owner; @@ -988,7 +989,7 @@ pl_send_prelock_unlock (xlator_t *this, pl_inode_t *pl_inode, flock.l_len = old_lock->user_flock.l_len; - unlock_lock = new_posix_lock (&flock, old_lock->transport, + unlock_lock = new_posix_lock (&flock, old_lock->client, old_lock->client_pid, &old_lock->owner, old_lock->fd); GF_VALIDATE_OR_GOTO (this->name, unlock_lock, out); @@ -1097,3 +1098,124 @@ pl_getlk (pl_inode_t *pl_inode, posix_lock_t *lock) return conf; } + + +struct _lock_table * +pl_lock_table_new (void) +{ + struct _lock_table *new = NULL; + + new = GF_CALLOC (1, sizeof (struct _lock_table), gf_common_mt_lock_table); + if (new == NULL) { + goto out; + } + INIT_LIST_HEAD (&new->entrylk_lockers); + INIT_LIST_HEAD (&new->inodelk_lockers); + LOCK_INIT (&new->lock); +out: + return new; +} + + +int +pl_add_locker (struct _lock_table *table, const char *volume, + loc_t *loc, fd_t *fd, pid_t pid, gf_lkowner_t *owner, + glusterfs_fop_t type) +{ + int32_t ret = -1; + struct _locker *new = NULL; + + GF_VALIDATE_OR_GOTO ("lock-table", table, out); + GF_VALIDATE_OR_GOTO ("lock-table", volume, out); + + new = GF_CALLOC (1, sizeof (struct _locker), gf_common_mt_locker); + if (new == NULL) { + goto out; + } + INIT_LIST_HEAD (&new->lockers); + + new->volume = gf_strdup (volume); + + if (fd == NULL) { + loc_copy (&new->loc, loc); + } else { + new->fd = fd_ref (fd); + } + + new->pid = pid; + new->owner = *owner; + + LOCK (&table->lock); + { + if (type == GF_FOP_ENTRYLK) + list_add_tail (&new->lockers, &table->entrylk_lockers); + else + list_add_tail (&new->lockers, &table->inodelk_lockers); + } + UNLOCK (&table->lock); +out: + return ret; +} + +int +pl_del_locker (struct _lock_table *table, const char *volume, + loc_t *loc, fd_t *fd, gf_lkowner_t *owner, glusterfs_fop_t type) +{ + struct _locker *locker = NULL; + struct _locker *tmp = NULL; + int32_t ret = -1; + struct list_head *head = NULL; + struct list_head del; + + GF_VALIDATE_OR_GOTO ("lock-table", table, out); + GF_VALIDATE_OR_GOTO ("lock-table", volume, out); + + INIT_LIST_HEAD (&del); + + LOCK (&table->lock); + { + if (type == GF_FOP_ENTRYLK) { + head = &table->entrylk_lockers; + } else { + head = &table->inodelk_lockers; + } + + list_for_each_entry_safe (locker, tmp, head, lockers) { + if (!is_same_lkowner (&locker->owner, owner) || + strcmp (locker->volume, volume)) + continue; + + /* + * It is possible for inodelk lock to come on anon-fd + * and inodelk unlock to come on normal fd in case of + * client re-opens. So don't check for fds to be equal. + */ + if (locker->fd && fd) + list_move_tail (&locker->lockers, &del); + else if (locker->loc.inode && loc && + (locker->loc.inode == loc->inode)) + list_move_tail (&locker->lockers, &del); + } + } + UNLOCK (&table->lock); + + tmp = NULL; + locker = NULL; + + list_for_each_entry_safe (locker, tmp, &del, lockers) { + list_del_init (&locker->lockers); + if (locker->fd) + fd_unref (locker->fd); + else + loc_wipe (&locker->loc); + + GF_FREE (locker->volume); + GF_FREE (locker); + } + + ret = 0; +out: + return ret; + +} + diff --git a/xlators/features/locks/src/common.h b/xlators/features/locks/src/common.h index 94ef4b2b4..db19ec978 100644 --- a/xlators/features/locks/src/common.h +++ b/xlators/features/locks/src/common.h @@ -14,12 +14,14 @@ /*dump locks format strings */ #define RANGE_FMT "type=%s, whence=%hd, start=%llu, len=%llu" #define ENTRY_FMT "type=%s on basename=%s" -#define DUMP_GEN_FMT "pid = %llu, owner=%s, transport=%p, " +#define DUMP_GEN_FMT "pid = %llu, owner=%s, client=%p" #define GRNTD_AT "granted at %s" #define BLKD_AT "blocked at %s" -#define DUMP_BLKD_FMT DUMP_GEN_FMT", "BLKD_AT -#define DUMP_GRNTD_FMT DUMP_GEN_FMT", "GRNTD_AT -#define DUMP_BLKD_GRNTD_FMT DUMP_GEN_FMT", "BLKD_AT", "GRNTD_AT +#define CONN_ID "connection-id=%s" +#define DUMP_BLKD_FMT DUMP_GEN_FMT", "CONN_ID", "BLKD_AT +#define DUMP_GRNTD_FMT DUMP_GEN_FMT", "CONN_ID", "GRNTD_AT +#define DUMP_BLKD_GRNTD_FMT DUMP_GEN_FMT", "CONN_ID", "BLKD_AT", "GRNTD_AT + #define ENTRY_BLKD_FMT ENTRY_FMT", "DUMP_BLKD_FMT #define ENTRY_GRNTD_FMT ENTRY_FMT", "DUMP_GRNTD_FMT #define ENTRY_BLKD_GRNTD_FMT ENTRY_FMT", "DUMP_BLKD_GRNTD_FMT @@ -29,8 +31,24 @@ #define RANGE_BLKD_GRNTD_FMT RANGE_FMT", "DUMP_BLKD_GRNTD_FMT #define SET_FLOCK_PID(flock, lock) ((flock)->l_pid = lock->client_pid) + +struct _locker { + struct list_head lockers; + char *volume; + loc_t loc; + fd_t *fd; + gf_lkowner_t owner; + pid_t pid; +}; + +struct _lock_table { + struct list_head inodelk_lockers; + struct list_head entrylk_lockers; + gf_lock_t lock; +}; + posix_lock_t * -new_posix_lock (struct gf_flock *flock, void *transport, pid_t client_pid, +new_posix_lock (struct gf_flock *flock, client_t *client, pid_t client_pid, gf_lkowner_t *owner, fd_t *fd); pl_inode_t * @@ -63,7 +81,8 @@ pl_dom_list_t * get_domain (pl_inode_t *pl_inode, const char *volume); void -grant_blocked_inode_locks (xlator_t *this, pl_inode_t *pl_inode, pl_dom_list_t *dom); +grant_blocked_inode_locks (xlator_t *this, pl_inode_t *pl_inode, + pl_dom_list_t *dom); void __delete_inode_lock (pl_inode_lock_t *lock); @@ -78,9 +97,9 @@ grant_blocked_entry_locks (xlator_t *this, pl_inode_t *pl_inode, void pl_update_refkeeper (xlator_t *this, inode_t *inode); int32_t -__get_inodelk_count (xlator_t *this, pl_inode_t *pl_inode); +__get_inodelk_count (xlator_t *this, pl_inode_t *pl_inode, char *domname); int32_t -get_inodelk_count (xlator_t *this, inode_t *inode); +get_inodelk_count (xlator_t *this, inode_t *inode, char *domname); int32_t __get_entrylk_count (xlator_t *this, pl_inode_t *pl_inode); @@ -143,6 +162,26 @@ pl_verify_reservelk (xlator_t *this, pl_inode_t *pl_inode, posix_lock_t *lock, int can_block); int pl_reserve_unlock (xlator_t *this, pl_inode_t *pl_inode, posix_lock_t *reqlock); + uint32_t check_entrylk_on_basename (xlator_t *this, inode_t *parent, char *basename); + +int32_t +pl_add_locker (struct _lock_table *table, const char *volume, + loc_t *loc, + fd_t *fd, + pid_t pid, + gf_lkowner_t *owner, + glusterfs_fop_t type); + +int32_t +pl_del_locker (struct _lock_table *table, const char *volume, + loc_t *loc, + fd_t *fd, + gf_lkowner_t *owner, + glusterfs_fop_t type); + +struct _lock_table * +pl_lock_table_new (void); + #endif /* __COMMON_H__ */ diff --git a/xlators/features/locks/src/entrylk.c b/xlators/features/locks/src/entrylk.c index d934a8b94..0785dc547 100644 --- a/xlators/features/locks/src/entrylk.c +++ b/xlators/features/locks/src/entrylk.c @@ -25,7 +25,7 @@ static pl_entry_lock_t * new_entrylk_lock (pl_inode_t *pinode, const char *basename, entrylk_type type, - void *trans, pid_t client_pid, gf_lkowner_t *owner, + client_t *client, pid_t client_pid, gf_lkowner_t *owner, const char *volume) { @@ -39,7 +39,7 @@ new_entrylk_lock (pl_inode_t *pinode, const char *basename, entrylk_type type, newlock->basename = basename ? gf_strdup (basename) : NULL; newlock->type = type; - newlock->trans = trans; + newlock->trans = client; newlock->volume = volume; newlock->client_pid = client_pid; newlock->owner = *owner; @@ -305,18 +305,15 @@ __find_most_matching_lock (pl_dom_list_t *dom, const char *basename) int __lock_name (pl_inode_t *pinode, const char *basename, entrylk_type type, - call_frame_t *frame, pl_dom_list_t *dom, xlator_t *this, int nonblock) + call_frame_t *frame, pl_dom_list_t *dom, xlator_t *this, + int nonblock, char *conn_id) { pl_entry_lock_t *lock = NULL; pl_entry_lock_t *conf = NULL; - void *trans = NULL; - pid_t client_pid = 0; int ret = -EINVAL; - trans = frame->root->trans; - client_pid = frame->root->pid; - - lock = new_entrylk_lock (pinode, basename, type, trans, client_pid, + lock = new_entrylk_lock (pinode, basename, type, + frame->root->client, frame->root->pid, &frame->root->lk_owner, dom->domain); if (!lock) { ret = -ENOMEM; @@ -325,12 +322,17 @@ __lock_name (pl_inode_t *pinode, const char *basename, entrylk_type type, lock->frame = frame; lock->this = this; - lock->trans = trans; + lock->trans = frame->root->client; + + if (conn_id) { + lock->connection_id = gf_strdup (conn_id); + } conf = __lock_grantable (dom, basename, type); if (conf) { ret = -EAGAIN; if (nonblock){ + GF_FREE (lock->connection_id); GF_FREE ((char *)lock->basename); GF_FREE (lock); goto out; @@ -350,6 +352,7 @@ __lock_name (pl_inode_t *pinode, const char *basename, entrylk_type type, if ( __blocked_lock_conflict (dom, basename, type) && !(__owner_has_lock (dom, lock))) { ret = -EAGAIN; if (nonblock) { + GF_FREE (lock->connection_id); GF_FREE ((char *) lock->basename); GF_FREE (lock); goto out; @@ -480,13 +483,15 @@ __grant_blocked_entry_locks (xlator_t *this, pl_inode_t *pl_inode, pl_inode, bl->basename); bl_ret = __lock_name (pl_inode, bl->basename, bl->type, - bl->frame, dom, bl->this, 0); + bl->frame, dom, bl->this, 0, + bl->connection_id); if (bl_ret == 0) { list_add (&bl->blocked_locks, granted); } else { gf_log (this->name, GF_LOG_DEBUG, "should never happen"); + GF_FREE (bl->connection_id); GF_FREE ((char *)bl->basename); GF_FREE (bl); } @@ -507,7 +512,8 @@ grant_blocked_entry_locks (xlator_t *this, pl_inode_t *pl_inode, pthread_mutex_lock (&pl_inode->mutex); { - __grant_blocked_entry_locks (this, pl_inode, dom, &granted_list); + __grant_blocked_entry_locks (this, pl_inode, dom, + &granted_list); } pthread_mutex_unlock (&pl_inode->mutex); @@ -520,24 +526,26 @@ grant_blocked_entry_locks (xlator_t *this, pl_inode_t *pl_inode, STACK_UNWIND_STRICT (entrylk, lock->frame, 0, 0, NULL); - GF_FREE ((char *)lock->basename); - GF_FREE (lock); + GF_FREE (lock->connection_id); + GF_FREE ((char *)lock->basename); + GF_FREE (lock); } GF_FREE ((char *)unlocked->basename); + GF_FREE (unlocked->connection_id); GF_FREE (unlocked); return; } /** - * release_entry_locks_for_transport: release all entry locks from this - * transport for this loc_t + * release_entry_locks_for_client: release all entry locks from this + * client for this loc_t */ static int -release_entry_locks_for_transport (xlator_t *this, pl_inode_t *pinode, - pl_dom_list_t *dom, void *trans) +release_entry_locks_for_client (xlator_t *this, pl_inode_t *pinode, + pl_dom_list_t *dom, client_t *client) { pl_entry_lock_t *lock = NULL; pl_entry_lock_t *tmp = NULL; @@ -551,14 +559,14 @@ release_entry_locks_for_transport (xlator_t *this, pl_inode_t *pinode, { list_for_each_entry_safe (lock, tmp, &dom->blocked_entrylks, blocked_locks) { - if (lock->trans != trans) + if (lock->trans != client) continue; list_del_init (&lock->blocked_locks); gf_log (this->name, GF_LOG_TRACE, "releasing lock on held by " - "{transport=%p}",trans); + "{client=%p}", client); list_add (&lock->blocked_locks, &released); @@ -566,16 +574,17 @@ release_entry_locks_for_transport (xlator_t *this, pl_inode_t *pinode, list_for_each_entry_safe (lock, tmp, &dom->entrylk_list, domain_list) { - if (lock->trans != trans) + if (lock->trans != client) continue; list_del_init (&lock->domain_list); gf_log (this->name, GF_LOG_TRACE, "releasing lock on held by " - "{transport=%p}",trans); + "{client=%p}", client); GF_FREE ((char *)lock->basename); + GF_FREE (lock->connection_id); GF_FREE (lock); } @@ -591,6 +600,7 @@ release_entry_locks_for_transport (xlator_t *this, pl_inode_t *pinode, STACK_UNWIND_STRICT (entrylk, lock->frame, -1, EAGAIN, NULL); GF_FREE ((char *)lock->basename); + GF_FREE (lock->connection_id); GF_FREE (lock); } @@ -601,6 +611,7 @@ release_entry_locks_for_transport (xlator_t *this, pl_inode_t *pinode, STACK_UNWIND_STRICT (entrylk, lock->frame, 0, 0, NULL); GF_FREE ((char *)lock->basename); + GF_FREE (lock->connection_id); GF_FREE (lock); } @@ -611,19 +622,23 @@ release_entry_locks_for_transport (xlator_t *this, pl_inode_t *pinode, int pl_common_entrylk (call_frame_t *frame, xlator_t *this, const char *volume, inode_t *inode, const char *basename, - entrylk_cmd cmd, entrylk_type type, loc_t *loc, fd_t *fd) -{ - int32_t op_ret = -1; - int32_t op_errno = 0; + entrylk_cmd cmd, entrylk_type type, loc_t *loc, fd_t *fd, + dict_t *xdata) - void * transport = NULL; - - pl_inode_t * pinode = NULL; +{ + int32_t op_ret = -1; + int32_t op_errno = 0; int ret = -1; - pl_entry_lock_t *unlocked = NULL; char unwind = 1; + GF_UNUSED int dict_ret = -1; + pl_inode_t *pinode = NULL; + pl_entry_lock_t *unlocked = NULL; + pl_dom_list_t *dom = NULL; + char *conn_id = NULL; + pl_ctx_t *ctx = NULL; - pl_dom_list_t *dom = NULL; + if (xdata) + dict_ret = dict_get_str (xdata, "connection-id", &conn_id); pinode = pl_inode_get (this, inode); if (!pinode) { @@ -639,18 +654,17 @@ pl_common_entrylk (call_frame_t *frame, xlator_t *this, entrylk_trace_in (this, frame, volume, fd, loc, basename, cmd, type); - transport = frame->root->trans; - if (frame->root->lk_owner.len == 0) { /* this is a special case that means release - all locks from this transport + all locks from this client */ gf_log (this->name, GF_LOG_TRACE, - "Releasing locks for transport %p", transport); + "Releasing locks for client %p", frame->root->client); - release_entry_locks_for_transport (this, pinode, dom, transport); + release_entry_locks_for_client (this, pinode, dom, + frame->root->client); op_ret = 0; goto out; @@ -661,7 +675,7 @@ pl_common_entrylk (call_frame_t *frame, xlator_t *this, pthread_mutex_lock (&pinode->mutex); { ret = __lock_name (pinode, basename, type, - frame, dom, this, 0); + frame, dom, this, 0, conn_id); } pthread_mutex_unlock (&pinode->mutex); @@ -686,7 +700,7 @@ pl_common_entrylk (call_frame_t *frame, xlator_t *this, pthread_mutex_lock (&pinode->mutex); { ret = __lock_name (pinode, basename, type, - frame, dom, this, 1); + frame, dom, this, 1, conn_id); } pthread_mutex_unlock (&pinode->mutex); @@ -723,6 +737,24 @@ out: entrylk_trace_out (this, frame, volume, fd, loc, basename, cmd, type, op_ret, op_errno); + ctx = pl_ctx_get (frame->root->client, this); + + if (ctx == NULL) { + gf_log (this->name, GF_LOG_INFO, "pl_ctx_get() failed"); + goto unwind; + } + + if (cmd == ENTRYLK_UNLOCK) + pl_del_locker (ctx->ltable, volume, loc, fd, + &frame->root->lk_owner, + GF_FOP_ENTRYLK); + else + pl_add_locker (ctx->ltable, volume, loc, fd, + frame->root->pid, + &frame->root->lk_owner, + GF_FOP_ENTRYLK); + +unwind: STACK_UNWIND_STRICT (entrylk, frame, op_ret, op_errno, NULL); } else { entrylk_trace_block (this, frame, volume, fd, loc, basename, @@ -742,10 +774,10 @@ out: int pl_entrylk (call_frame_t *frame, xlator_t *this, const char *volume, loc_t *loc, const char *basename, - entrylk_cmd cmd, entrylk_type type) + entrylk_cmd cmd, entrylk_type type, dict_t *xdata) { - - pl_common_entrylk (frame, this, volume, loc->inode, basename, cmd, type, loc, NULL); + pl_common_entrylk (frame, this, volume, loc->inode, basename, cmd, + type, loc, NULL, xdata); return 0; } @@ -760,10 +792,10 @@ pl_entrylk (call_frame_t *frame, xlator_t *this, int pl_fentrylk (call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd, const char *basename, - entrylk_cmd cmd, entrylk_type type) + entrylk_cmd cmd, entrylk_type type, dict_t *xdata) { - - pl_common_entrylk (frame, this, volume, fd->inode, basename, cmd, type, NULL, fd); + pl_common_entrylk (frame, this, volume, fd->inode, basename, cmd, + type, NULL, fd, xdata); return 0; } diff --git a/xlators/features/locks/src/inodelk.c b/xlators/features/locks/src/inodelk.c index 42350e59a..508523e11 100644 --- a/xlators/features/locks/src/inodelk.c +++ b/xlators/features/locks/src/inodelk.c @@ -39,8 +39,10 @@ inline void __pl_inodelk_unref (pl_inode_lock_t *lock) { lock->ref--; - if (!lock->ref) + if (!lock->ref) { + GF_FREE (lock->connection_id); GF_FREE (lock); + } } /* Check if 2 inodelks are conflicting on type. Only 2 shared locks don't conflict */ @@ -122,7 +124,7 @@ static inline int same_inodelk_owner (pl_inode_lock_t *l1, pl_inode_lock_t *l2) { return (is_same_lkowner (&l1->owner, &l2->owner) && - (l1->transport == l2->transport)); + (l1->client == l2->client)); } /* Returns true if the 2 inodelks conflict with each other */ @@ -292,7 +294,7 @@ __inode_unlock_lock (xlator_t *this, pl_inode_lock_t *lock, pl_dom_list_t *dom) " Matching lock not found for unlock %llu-%llu, by %s " "on %p", (unsigned long long)lock->fl_start, (unsigned long long)lock->fl_end, - lkowner_utoa (&lock->owner), lock->transport); + lkowner_utoa (&lock->owner), lock->client); goto out; } __delete_inode_lock (conf); @@ -300,7 +302,7 @@ __inode_unlock_lock (xlator_t *this, pl_inode_lock_t *lock, pl_dom_list_t *dom) " Matching lock found for unlock %llu-%llu, by %s on %p", (unsigned long long)lock->fl_start, (unsigned long long)lock->fl_end, lkowner_utoa (&lock->owner), - lock->transport); + lock->client); out: return conf; @@ -333,7 +335,8 @@ __grant_blocked_inode_locks (xlator_t *this, pl_inode_t *pl_inode, /* Grant all inodelks blocked on a lock */ void -grant_blocked_inode_locks (xlator_t *this, pl_inode_t *pl_inode, pl_dom_list_t *dom) +grant_blocked_inode_locks (xlator_t *this, pl_inode_t *pl_inode, + pl_dom_list_t *dom) { struct list_head granted; pl_inode_lock_t *lock; @@ -372,10 +375,10 @@ grant_blocked_inode_locks (xlator_t *this, pl_inode_t *pl_inode, pl_dom_list_t * pthread_mutex_unlock (&pl_inode->mutex); } -/* Release all inodelks from this transport */ +/* Release all inodelks from this client */ static int -release_inode_locks_of_transport (xlator_t *this, pl_dom_list_t *dom, - inode_t *inode, void *trans) +release_inode_locks_of_client (xlator_t *this, pl_dom_list_t *dom, + inode_t *inode, client_t *client) { pl_inode_lock_t *tmp = NULL; pl_inode_lock_t *l = NULL; @@ -395,7 +398,7 @@ release_inode_locks_of_transport (xlator_t *this, pl_dom_list_t *dom, { list_for_each_entry_safe (l, tmp, &dom->blocked_inodelks, blocked_locks) { - if (l->transport != trans) + if (l->client != client) continue; list_del_init (&l->blocked_locks); @@ -408,8 +411,8 @@ release_inode_locks_of_transport (xlator_t *this, pl_dom_list_t *dom, gf_log (this->name, GF_LOG_DEBUG, "releasing blocking lock on %s held by " - "{transport=%p, pid=%"PRId64" lk-owner=%s}", - file, trans, (uint64_t) l->client_pid, + "{client=%p, pid=%"PRId64" lk-owner=%s}", + file, client, (uint64_t) l->client_pid, lkowner_utoa (&l->owner)); list_add (&l->blocked_locks, &released); @@ -420,7 +423,7 @@ release_inode_locks_of_transport (xlator_t *this, pl_dom_list_t *dom, } list_for_each_entry_safe (l, tmp, &dom->inodelk_list, list) { - if (l->transport != trans) + if (l->client != client) continue; inode_path (inode, NULL, &path); @@ -431,8 +434,8 @@ release_inode_locks_of_transport (xlator_t *this, pl_dom_list_t *dom, gf_log (this->name, GF_LOG_DEBUG, "releasing granted lock on %s held by " - "{transport=%p, pid=%"PRId64" lk-owner=%s}", - file, trans, (uint64_t) l->client_pid, + "{client=%p, pid=%"PRId64" lk-owner=%s}", + file, client, (uint64_t) l->client_pid, lkowner_utoa (&l->owner)); if (path) { @@ -515,8 +518,9 @@ out: /* Create a new inode_lock_t */ pl_inode_lock_t * -new_inode_lock (struct gf_flock *flock, void *transport, pid_t client_pid, - gf_lkowner_t *owner, const char *volume) +new_inode_lock (struct gf_flock *flock, client_t *client, pid_t client_pid, + call_frame_t *frame, xlator_t *this, const char *volume, + char *conn_id) { pl_inode_lock_t *lock = NULL; @@ -535,10 +539,16 @@ new_inode_lock (struct gf_flock *flock, void *transport, pid_t client_pid, else lock->fl_end = flock->l_start + flock->l_len - 1; - lock->transport = transport; + lock->client = client; lock->client_pid = client_pid; lock->volume = volume; - lock->owner = *owner; + lock->owner = frame->root->lk_owner; + lock->frame = frame; + lock->this = this; + + if (conn_id) { + lock->connection_id = gf_strdup (conn_id); + } INIT_LIST_HEAD (&lock->list); INIT_LIST_HEAD (&lock->blocked_locks); @@ -547,21 +557,58 @@ new_inode_lock (struct gf_flock *flock, void *transport, pid_t client_pid, return lock; } +int32_t +_pl_convert_volume (const char *volume, char **res) +{ + char *mdata_vol = NULL; + int ret = 0; + + mdata_vol = strrchr (volume, ':'); + //if the volume already ends with :metadata don't bother + if (mdata_vol && (strcmp (mdata_vol, ":metadata") == 0)) + return 0; + + ret = gf_asprintf (res, "%s:metadata", volume); + if (ret <= 0) + return ENOMEM; + return 0; +} + +int32_t +_pl_convert_volume_for_special_range (struct gf_flock *flock, + const char *volume, char **res) +{ + int32_t ret = 0; + + if ((flock->l_start == LLONG_MAX -1) && + (flock->l_len == 0)) { + ret = _pl_convert_volume (volume, res); + } + + return ret; +} + /* Common inodelk code called from pl_inodelk and pl_finodelk */ int pl_common_inodelk (call_frame_t *frame, xlator_t *this, const char *volume, inode_t *inode, int32_t cmd, - struct gf_flock *flock, loc_t *loc, fd_t *fd) + struct gf_flock *flock, loc_t *loc, fd_t *fd, dict_t *xdata) { int32_t op_ret = -1; int32_t op_errno = 0; int ret = -1; + GF_UNUSED int dict_ret = -1; int can_block = 0; - pid_t client_pid = -1; - void * transport = NULL; pl_inode_t * pinode = NULL; pl_inode_lock_t * reqlock = NULL; pl_dom_list_t * dom = NULL; + char *res = NULL; + char *res1 = NULL; + char *conn_id = NULL; + pl_ctx_t *ctx = NULL; + + if (xdata) + dict_ret = dict_get_str (xdata, "connection-id", &conn_id); VALIDATE_OR_GOTO (frame, out); VALIDATE_OR_GOTO (inode, unwind); @@ -572,10 +619,13 @@ pl_common_inodelk (call_frame_t *frame, xlator_t *this, goto unwind; } - pl_trace_in (this, frame, fd, loc, cmd, flock, volume); + op_errno = _pl_convert_volume_for_special_range (flock, volume, &res); + if (op_errno) + goto unwind; + if (res) + volume = res; - transport = frame->root->trans; - client_pid = frame->root->pid; + pl_trace_in (this, frame, fd, loc, cmd, flock, volume); pinode = pl_inode_get (this, inode); if (!pinode) { @@ -592,19 +642,26 @@ pl_common_inodelk (call_frame_t *frame, xlator_t *this, if (frame->root->lk_owner.len == 0) { /* special case: this means release all locks - from this transport + from this client */ gf_log (this->name, GF_LOG_TRACE, - "Releasing all locks from transport %p", transport); - - release_inode_locks_of_transport (this, dom, inode, transport); + "Releasing all locks from client %p", frame->root->client); + + release_inode_locks_of_client (this, dom, inode, frame->root->client); + _pl_convert_volume (volume, &res1); + if (res1) { + dom = get_domain (pinode, res1); + if (dom) + release_inode_locks_of_client (this, dom, + inode, frame->root->client); + } op_ret = 0; goto unwind; } - reqlock = new_inode_lock (flock, transport, client_pid, - &frame->root->lk_owner, volume); + reqlock = new_inode_lock (flock, frame->root->client, frame->root->pid, + frame, this, volume, conn_id); if (!reqlock) { op_ret = -1; @@ -612,14 +669,10 @@ pl_common_inodelk (call_frame_t *frame, xlator_t *this, goto unwind; } - reqlock->frame = frame; - reqlock->this = this; switch (cmd) { case F_SETLKW: can_block = 1; - reqlock->frame = frame; - reqlock->this = this; /* fall through */ @@ -651,6 +704,23 @@ pl_common_inodelk (call_frame_t *frame, xlator_t *this, op_ret = 0; + ctx = pl_ctx_get (frame->root->client, this); + + if (ctx == NULL) { + gf_log (this->name, GF_LOG_INFO, "pl_ctx_get() failed"); + goto unwind; + } + + if (flock->l_type == F_UNLCK) + pl_del_locker (ctx->ltable, volume, loc, fd, + &frame->root->lk_owner, + GF_FOP_INODELK); + else + pl_add_locker (ctx->ltable, volume, loc, fd, + frame->root->pid, + &frame->root->lk_owner, + GF_FOP_INODELK); + unwind: if ((inode != NULL) && (flock !=NULL)) { pl_update_refkeeper (this, inode); @@ -659,53 +729,78 @@ unwind: STACK_UNWIND_STRICT (inodelk, frame, op_ret, op_errno, NULL); out: + GF_FREE (res); + GF_FREE (res1); return 0; } int pl_inodelk (call_frame_t *frame, xlator_t *this, - const char *volume, loc_t *loc, int32_t cmd, struct gf_flock *flock) + const char *volume, loc_t *loc, int32_t cmd, struct gf_flock *flock, + dict_t *xdata) { - - pl_common_inodelk (frame, this, volume, loc->inode, cmd, flock, loc, NULL); + pl_common_inodelk (frame, this, volume, loc->inode, cmd, flock, + loc, NULL, xdata); return 0; } int pl_finodelk (call_frame_t *frame, xlator_t *this, - const char *volume, fd_t *fd, int32_t cmd, struct gf_flock *flock) + const char *volume, fd_t *fd, int32_t cmd, struct gf_flock *flock, + dict_t *xdata) { - - pl_common_inodelk (frame, this, volume, fd->inode, cmd, flock, NULL, fd); + pl_common_inodelk (frame, this, volume, fd->inode, cmd, flock, + NULL, fd, xdata); return 0; } +static inline int32_t +__get_inodelk_dom_count (pl_dom_list_t *dom) +{ + pl_inode_lock_t *lock = NULL; + int32_t count = 0; + + list_for_each_entry (lock, &dom->inodelk_list, list) { + count++; + } + list_for_each_entry (lock, &dom->blocked_inodelks, blocked_locks) { + count++; + } + return count; +} +/* Returns the no. of locks (blocked/granted) held on a given domain name + * If @domname is NULL, returns the no. of locks in all the domains present. + * If @domname is non-NULL and non-existent, returns 0 */ int32_t -__get_inodelk_count (xlator_t *this, pl_inode_t *pl_inode) +__get_inodelk_count (xlator_t *this, pl_inode_t *pl_inode, char *domname) { int32_t count = 0; - pl_inode_lock_t *lock = NULL; pl_dom_list_t *dom = NULL; list_for_each_entry (dom, &pl_inode->dom_list, inode_list) { - list_for_each_entry (lock, &dom->inodelk_list, list) { - count++; - } - list_for_each_entry (lock, &dom->blocked_inodelks, blocked_locks) { - count++; - } + if (domname) { + if (strcmp (domname, dom->domain) == 0) { + count = __get_inodelk_dom_count (dom); + goto out; + } + + } else { + /* Counting locks from all domains */ + count += __get_inodelk_dom_count (dom); + } } +out: return count; } int32_t -get_inodelk_count (xlator_t *this, inode_t *inode) +get_inodelk_count (xlator_t *this, inode_t *inode, char *domname) { pl_inode_t *pl_inode = NULL; uint64_t tmp_pl_inode = 0; @@ -721,7 +816,7 @@ get_inodelk_count (xlator_t *this, inode_t *inode) pthread_mutex_lock (&pl_inode->mutex); { - count = __get_inodelk_count (this, pl_inode); + count = __get_inodelk_count (this, pl_inode, domname); } pthread_mutex_unlock (&pl_inode->mutex); diff --git a/xlators/features/locks/src/locks.h b/xlators/features/locks/src/locks.h index 7ffc67e1b..76fc941d7 100644 --- a/xlators/features/locks/src/locks.h +++ b/xlators/features/locks/src/locks.h @@ -19,10 +19,10 @@ #include "stack.h" #include "call-stub.h" #include "locks-mem-types.h" +#include "client_t.h" #include "lkowner.h" -#define POSIX_LOCKS "posix-locks" struct __pl_fd; struct __posix_lock { @@ -33,7 +33,7 @@ struct __posix_lock { off_t fl_end; short blocked; /* waiting to acquire */ - struct gf_flock user_flock; /* the flock supplied by the user */ + struct gf_flock user_flock; /* the flock supplied by the user */ xlator_t *this; /* required for blocked locks */ unsigned long fd_num; @@ -46,7 +46,7 @@ struct __posix_lock { /* These two together serve to uniquely identify each process across nodes */ - void *transport; /* to identify client node */ + void *client; /* to identify client node */ gf_lkowner_t owner; pid_t client_pid; /* pid of client process */ }; @@ -63,7 +63,7 @@ struct __pl_inode_lock { const char *volume; - struct gf_flock user_flock; /* the flock supplied by the user */ + struct gf_flock user_flock; /* the flock supplied by the user */ xlator_t *this; /* required for blocked locks */ fd_t *fd; @@ -75,9 +75,11 @@ struct __pl_inode_lock { /* These two together serve to uniquely identify each process across nodes */ - void *transport; /* to identify client node */ + void *client; /* to identify client node */ gf_lkowner_t owner; pid_t client_pid; /* pid of client process */ + + char *connection_id; /* stores the client connection id */ }; typedef struct __pl_inode_lock pl_inode_lock_t; @@ -115,7 +117,9 @@ struct __entry_lock { void *trans; gf_lkowner_t owner; - pid_t client_pid; /* pid of client process */ + pid_t client_pid; /* pid of client process */ + + char *connection_id; /* stores the client connection id */ }; typedef struct __entry_lock pl_entry_lock_t; @@ -152,9 +156,11 @@ typedef struct { char *brickname; } posix_locks_private_t; + typedef struct { gf_boolean_t entrylk_count_req; gf_boolean_t inodelk_count_req; + gf_boolean_t inodelk_dom_count_req; gf_boolean_t posixlk_count_req; gf_boolean_t parent_entrylk_req; @@ -166,8 +172,21 @@ typedef struct { enum {TRUNCATE, FTRUNCATE} op; } pl_local_t; + typedef struct { struct list_head locks_list; } pl_fdctx_t; + +typedef struct _locks_ctx { + gf_lock_t ltable_lock; /* only for replace, + ltable has its own internal + lock for operations */ + struct _lock_table *ltable; +} pl_ctx_t; + + +pl_ctx_t * +pl_ctx_get (client_t *client, xlator_t *xlator); + #endif /* __POSIX_LOCKS_H__ */ diff --git a/xlators/features/locks/src/posix.c b/xlators/features/locks/src/posix.c index 2bc5f8581..7bfb38a51 100644 --- a/xlators/features/locks/src/posix.c +++ b/xlators/features/locks/src/posix.c @@ -51,7 +51,7 @@ pl_new_fdctx () fdctx = GF_CALLOC (1, sizeof (*fdctx), gf_locks_mt_pl_fdctx_t); - GF_VALIDATE_OR_GOTO (POSIX_LOCKS, fdctx, out); + GF_VALIDATE_OR_GOTO ("posix-locks", fdctx, out); INIT_LIST_HEAD (&fdctx->locks_list); @@ -66,7 +66,7 @@ pl_check_n_create_fdctx (xlator_t *this, fd_t *fd) uint64_t tmp = 0; pl_fdctx_t *fdctx = NULL; - GF_VALIDATE_OR_GOTO (POSIX_LOCKS, this, out); + GF_VALIDATE_OR_GOTO ("posix-locks", this, out); GF_VALIDATE_OR_GOTO (this->name, fd, out); LOCK (&fd->lock); @@ -119,7 +119,7 @@ pl_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, static int truncate_allowed (pl_inode_t *pl_inode, - void *transport, pid_t client_pid, + client_t *client, pid_t client_pid, gf_lkowner_t *owner, off_t offset) { posix_lock_t *l = NULL; @@ -128,7 +128,7 @@ truncate_allowed (pl_inode_t *pl_inode, region.fl_start = offset; region.fl_end = LLONG_MAX; - region.transport = transport; + region.client = client; region.client_pid = client_pid; region.owner = *owner; @@ -139,7 +139,7 @@ truncate_allowed (pl_inode_t *pl_inode, && locks_overlap (®ion, l) && !same_owner (®ion, l)) { ret = 0; - gf_log (POSIX_LOCKS, GF_LOG_TRACE, "Truncate " + gf_log ("posix-locks", GF_LOG_TRACE, "Truncate " "allowed"); break; } @@ -186,7 +186,7 @@ truncate_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, if (priv->mandatory && pl_inode->mandatory - && !truncate_allowed (pl_inode, frame->root->trans, + && !truncate_allowed (pl_inode, frame->root->client, frame->root->pid, &frame->root->lk_owner, local->offset)) { op_ret = -1; @@ -347,7 +347,7 @@ delete_locks_of_fd (xlator_t *this, pl_inode_t *pl_inode, fd_t *fd) static void __delete_locks_of_owner (pl_inode_t *pl_inode, - void *transport, gf_lkowner_t *owner) + client_t *client, gf_lkowner_t *owner) { posix_lock_t *tmp = NULL; posix_lock_t *l = NULL; @@ -357,7 +357,7 @@ __delete_locks_of_owner (pl_inode_t *pl_inode, list_for_each_entry_safe (l, tmp, &pl_inode->ext_list, list) { if (l->blocked) continue; - if ((l->transport == transport) && + if ((l->client == client) && is_same_lkowner (&l->owner, owner)) { gf_log ("posix-locks", GF_LOG_TRACE, " Flushing lock" @@ -810,7 +810,7 @@ pl_migrate_locks (call_frame_t *frame, fd_t *newfd, uint64_t oldfd_num, list_for_each_entry (l, &pl_inode->ext_list, list) { if (l->fd_num == oldfd_num) { l->fd_num = newfd_num; - l->transport = frame->root->trans; + l->client = frame->root->client; } } } @@ -983,7 +983,7 @@ pl_flush (call_frame_t *frame, xlator_t *this, } pthread_mutex_lock (&pl_inode->mutex); { - __delete_locks_of_owner (pl_inode, frame->root->trans, + __delete_locks_of_owner (pl_inode, frame->root->client, &frame->root->lk_owner); } pthread_mutex_unlock (&pl_inode->mutex); @@ -1178,7 +1178,7 @@ pl_readv (call_frame_t *frame, xlator_t *this, if (priv->mandatory && pl_inode->mandatory) { region.fl_start = offset; region.fl_end = offset + size - 1; - region.transport = frame->root->trans; + region.client = frame->root->client; region.fd_num = fd_to_fdnum(fd); region.client_pid = frame->root->pid; region.owner = frame->root->lk_owner; @@ -1272,7 +1272,7 @@ pl_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, if (priv->mandatory && pl_inode->mandatory) { region.fl_start = offset; region.fl_end = offset + iov_length (vector, count) - 1; - region.transport = frame->root->trans; + region.client = frame->root->client; region.fd_num = fd_to_fdnum(fd); region.client_pid = frame->root->pid; region.owner = frame->root->lk_owner; @@ -1353,7 +1353,7 @@ lock_dup (posix_lock_t *lock) { posix_lock_t *new_lock = NULL; - new_lock = new_posix_lock (&lock->user_flock, lock->transport, + new_lock = new_posix_lock (&lock->user_flock, lock->client, lock->client_pid, &lock->owner, (fd_t *)lock->fd_num); return new_lock; @@ -1513,8 +1513,6 @@ int pl_lk (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd, struct gf_flock *flock, dict_t *xdata) { - void *transport = NULL; - pid_t client_pid = 0; pl_inode_t *pl_inode = NULL; int op_ret = 0; int op_errno = 0; @@ -1523,9 +1521,6 @@ pl_lk (call_frame_t *frame, xlator_t *this, posix_lock_t *conf = NULL; int ret = 0; - transport = frame->root->trans; - client_pid = frame->root->pid; - if ((flock->l_start < 0) || (flock->l_len < 0)) { op_ret = -1; op_errno = EINVAL; @@ -1539,7 +1534,7 @@ pl_lk (call_frame_t *frame, xlator_t *this, goto unwind; } - reqlock = new_posix_lock (flock, transport, client_pid, + reqlock = new_posix_lock (flock, frame->root->client, frame->root->pid, &frame->root->lk_owner, fd); if (!reqlock) { @@ -1764,6 +1759,7 @@ pl_forget (xlator_t *this, list_del_init (&entry_l->domain_list); GF_FREE ((char *)entry_l->basename); + GF_FREE (entry_l->connection_id); GF_FREE (entry_l); } @@ -1797,6 +1793,7 @@ pl_forget (xlator_t *this, STACK_UNWIND_STRICT (entrylk, entry_l->frame, -1, 0, NULL); GF_FREE ((char *)entry_l->basename); + GF_FREE (entry_l->connection_id); GF_FREE (entry_l); } @@ -1946,19 +1943,34 @@ pl_entrylk_xattr_fill (xlator_t *this, inode_t *inode, } void -pl_inodelk_xattr_fill (xlator_t *this, inode_t *inode, - dict_t *dict) +pl_inodelk_xattr_fill (xlator_t *this, inode_t *inode, dict_t *dict, + gf_boolean_t per_dom) { int32_t count = 0; int ret = -1; + char *domname = NULL; + + + if (per_dom){ + ret = dict_get_str (dict, GLUSTERFS_INODELK_DOM_COUNT, + &domname); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to get " + "value for key %s",GLUSTERFS_INODELK_DOM_COUNT); + goto out; + } + } + + count = get_inodelk_count (this, inode, domname); - count = get_inodelk_count (this, inode); ret = dict_set_int32 (dict, GLUSTERFS_INODELK_COUNT, count); if (ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - " dict_set failed on key %s", GLUSTERFS_INODELK_COUNT); + gf_log (this->name, GF_LOG_DEBUG, "Failed to set count for " + "key %s", GLUSTERFS_INODELK_COUNT); } +out: + return; } void @@ -2003,7 +2015,9 @@ pl_lookup_cbk (call_frame_t *frame, if (local->entrylk_count_req) pl_entrylk_xattr_fill (this, inode, xdata); if (local->inodelk_count_req) - pl_inodelk_xattr_fill (this, inode, xdata); + pl_inodelk_xattr_fill (this, inode, xdata, _gf_false); + if (local->inodelk_dom_count_req) + pl_inodelk_xattr_fill (this, inode, xdata, _gf_true); if (local->posixlk_count_req) pl_posixlk_xattr_fill (this, inode, xdata); @@ -2050,6 +2064,8 @@ pl_lookup (call_frame_t *frame, local->entrylk_count_req = 1; if (dict_get (xdata, GLUSTERFS_INODELK_COUNT)) local->inodelk_count_req = 1; + if (dict_get (xdata, GLUSTERFS_INODELK_DOM_COUNT)) + local->inodelk_dom_count_req = 1; if (dict_get (xdata, GLUSTERFS_POSIXLK_COUNT)) local->posixlk_count_req = 1; if (dict_get (xdata, GLUSTERFS_PARENT_ENTRYLK)) @@ -2088,7 +2104,11 @@ pl_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, if (local->entrylk_count_req) pl_entrylk_xattr_fill (this, entry->inode, entry->dict); if (local->inodelk_count_req) - pl_inodelk_xattr_fill (this, entry->inode, entry->dict); + pl_inodelk_xattr_fill (this, entry->inode, entry->dict, + _gf_false); + if (local->inodelk_dom_count_req) + pl_inodelk_xattr_fill (this, entry->inode, entry->dict, + _gf_true); if (local->posixlk_count_req) pl_posixlk_xattr_fill (this, entry->inode, entry->dict); } @@ -2117,6 +2137,8 @@ pl_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, local->entrylk_count_req = 1; if (dict_get (dict, GLUSTERFS_INODELK_COUNT)) local->inodelk_count_req = 1; + if (dict_get (dict, GLUSTERFS_INODELK_DOM_COUNT)) + local->inodelk_dom_count_req = 1; if (dict_get (dict, GLUSTERFS_POSIXLK_COUNT)) local->posixlk_count_req = 1; } @@ -2136,8 +2158,8 @@ out: void pl_dump_lock (char *str, int size, struct gf_flock *flock, - gf_lkowner_t *owner, void *trans, time_t *granted_time, - time_t *blkd_time, gf_boolean_t active) + gf_lkowner_t *owner, void *trans, char *conn_id, + time_t *granted_time, time_t *blkd_time, gf_boolean_t active) { char *type_str = NULL; char granted[32] = {0,}; @@ -2165,7 +2187,7 @@ pl_dump_lock (char *str, int size, struct gf_flock *flock, (unsigned long long) flock->l_start, (unsigned long long) flock->l_len, (unsigned long long) flock->l_pid, - lkowner_utoa (owner), trans, + lkowner_utoa (owner), trans, conn_id, ctime_r (granted_time, granted)); } else { snprintf (str, size, RANGE_BLKD_GRNTD_FMT, @@ -2173,7 +2195,7 @@ pl_dump_lock (char *str, int size, struct gf_flock *flock, (unsigned long long) flock->l_start, (unsigned long long) flock->l_len, (unsigned long long) flock->l_pid, - lkowner_utoa (owner), trans, + lkowner_utoa (owner), trans, conn_id, ctime_r (blkd_time, blocked), ctime_r (granted_time, granted)); } @@ -2184,7 +2206,7 @@ pl_dump_lock (char *str, int size, struct gf_flock *flock, (unsigned long long) flock->l_start, (unsigned long long) flock->l_len, (unsigned long long) flock->l_pid, - lkowner_utoa (owner), trans, + lkowner_utoa (owner), trans, conn_id, ctime_r (blkd_time, blocked)); } @@ -2222,6 +2244,7 @@ __dump_entrylks (pl_inode_t *pl_inode) "ENTRYLK_WRLCK", lock->basename, (unsigned long long) lock->client_pid, lkowner_utoa (&lock->owner), lock->trans, + lock->connection_id, ctime_r (&lock->granted_time.tv_sec, granted)); } else { snprintf (tmp, 256, ENTRY_BLKD_GRNTD_FMT, @@ -2229,6 +2252,7 @@ __dump_entrylks (pl_inode_t *pl_inode) "ENTRYLK_WRLCK", lock->basename, (unsigned long long) lock->client_pid, lkowner_utoa (&lock->owner), lock->trans, + lock->connection_id, ctime_r (&lock->blkd_time.tv_sec, blocked), ctime_r (&lock->granted_time.tv_sec, granted)); } @@ -2248,6 +2272,7 @@ __dump_entrylks (pl_inode_t *pl_inode) "ENTRYLK_WRLCK", lock->basename, (unsigned long long) lock->client_pid, lkowner_utoa (&lock->owner), lock->trans, + lock->connection_id, ctime_r (&lock->blkd_time.tv_sec, blocked)); gf_proc_dump_write(key, tmp); @@ -2298,7 +2323,7 @@ __dump_inodelks (pl_inode_t *pl_inode) SET_FLOCK_PID (&lock->user_flock, lock); pl_dump_lock (tmp, 256, &lock->user_flock, &lock->owner, - lock->transport, + lock->client, lock->connection_id, &lock->granted_time.tv_sec, &lock->blkd_time.tv_sec, _gf_true); @@ -2315,7 +2340,7 @@ __dump_inodelks (pl_inode_t *pl_inode) SET_FLOCK_PID (&lock->user_flock, lock); pl_dump_lock (tmp, 256, &lock->user_flock, &lock->owner, - lock->transport, + lock->client, lock->connection_id, 0, &lock->blkd_time.tv_sec, _gf_false); gf_proc_dump_write(key, tmp); @@ -2356,7 +2381,7 @@ __dump_posixlks (pl_inode_t *pl_inode) count, lock->blocked ? "BLOCKED" : "ACTIVE"); pl_dump_lock (tmp, 256, &lock->user_flock, - &lock->owner, lock->transport, + &lock->owner, lock->client, NULL, &lock->granted_time.tv_sec, &lock->blkd_time.tv_sec, (lock->blocked)? _gf_false: _gf_true); gf_proc_dump_write(key, tmp); @@ -2433,7 +2458,7 @@ unlock: __dump_entrylks (pl_inode); } - count = __get_inodelk_count (this, pl_inode); + count = __get_inodelk_count (this, pl_inode, NULL); if (count) { gf_proc_dump_write("inodelk-count", "%d", count); __dump_inodelks (pl_inode); @@ -2480,6 +2505,124 @@ mem_acct_init (xlator_t *this) return ret; } + +pl_ctx_t* +pl_ctx_get (client_t *client, xlator_t *xlator) +{ + void *tmp = NULL; + pl_ctx_t *ctx = NULL; + + client_ctx_get (client, xlator, &tmp); + + ctx = tmp; + + if (ctx != NULL) + goto out; + + ctx = GF_CALLOC (1, sizeof (pl_ctx_t), gf_locks_mt_posix_lock_t); + + if (ctx == NULL) + goto out; + + ctx->ltable = pl_lock_table_new(); + + if (ctx->ltable == NULL) { + GF_FREE (ctx); + ctx = NULL; + goto out; + } + + LOCK_INIT (&ctx->ltable_lock); + + if (client_ctx_set (client, xlator, ctx) != 0) { + LOCK_DESTROY (&ctx->ltable_lock); + GF_FREE (ctx->ltable); + GF_FREE (ctx); + ctx = NULL; + } +out: + return ctx; +} + +static void +ltable_delete_locks (struct _lock_table *ltable) +{ + struct _locker *locker = NULL; + struct _locker *tmp = NULL; + + list_for_each_entry_safe (locker, tmp, <able->inodelk_lockers, lockers) { + if (locker->fd) + pl_del_locker (ltable, locker->volume, &locker->loc, + locker->fd, &locker->owner, + GF_FOP_INODELK); + GF_FREE (locker->volume); + GF_FREE (locker); + } + + list_for_each_entry_safe (locker, tmp, <able->entrylk_lockers, lockers) { + if (locker->fd) + pl_del_locker (ltable, locker->volume, &locker->loc, + locker->fd, &locker->owner, + GF_FOP_ENTRYLK); + GF_FREE (locker->volume); + GF_FREE (locker); + } + GF_FREE (ltable); +} + + +static int32_t +destroy_cbk (xlator_t *this, client_t *client) +{ + void *tmp = NULL; + pl_ctx_t *locks_ctx = NULL; + + client_ctx_del (client, this, &tmp); + + if (tmp == NULL) + return 0 +; + locks_ctx = tmp; + if (locks_ctx->ltable) + ltable_delete_locks (locks_ctx->ltable); + + LOCK_DESTROY (&locks_ctx->ltable_lock); + GF_FREE (locks_ctx); + + return 0; +} + + +static int32_t +disconnect_cbk (xlator_t *this, client_t *client) +{ + int32_t ret = 0; + pl_ctx_t *locks_ctx = NULL; + struct _lock_table *ltable = NULL; + + locks_ctx = pl_ctx_get (client, this); + if (locks_ctx == NULL) { + gf_log (this->name, GF_LOG_INFO, "pl_ctx_get() failed"); + goto out; + } + + LOCK (&locks_ctx->ltable_lock); + { + if (locks_ctx->ltable) { + ltable = locks_ctx->ltable; + locks_ctx->ltable = pl_lock_table_new (); + } + } + UNLOCK (&locks_ctx->ltable_lock); + + if (ltable) + ltable_delete_locks (ltable); + +out: + return ret; +} + + int init (xlator_t *this) { @@ -2508,7 +2651,7 @@ init (xlator_t *this) gf_log (this->name, GF_LOG_CRITICAL, "'locks' translator is not loaded over a storage " "translator"); - goto out;; + goto out; } priv = GF_CALLOC (1, sizeof (*priv), @@ -2610,9 +2753,11 @@ struct xlator_dumpops dumpops = { }; struct xlator_cbks cbks = { - .forget = pl_forget, - .release = pl_release, - .releasedir = pl_releasedir, + .forget = pl_forget, + .release = pl_release, + .releasedir = pl_releasedir, + .client_destroy = destroy_cbk, + .client_disconnect = disconnect_cbk, }; diff --git a/xlators/features/marker/Makefile.am b/xlators/features/marker/Makefile.am index a6ba2de16..a985f42a8 100644 --- a/xlators/features/marker/Makefile.am +++ b/xlators/features/marker/Makefile.am @@ -1,3 +1,3 @@ -SUBDIRS = src @SYNCDAEMON_SUBDIR@ +SUBDIRS = src CLEANFILES = diff --git a/xlators/features/marker/src/marker.c b/xlators/features/marker/src/marker.c index 82d9066d5..6a2c85691 100644 --- a/xlators/features/marker/src/marker.c +++ b/xlators/features/marker/src/marker.c @@ -185,6 +185,8 @@ marker_local_unref (marker_local_t *local) loc_wipe (&local->loc); loc_wipe (&local->parent_loc); + if (local->xdata) + dict_unref (local->xdata); if (local->oplocal) { marker_local_unref (local->oplocal); @@ -466,11 +468,16 @@ marker_create_frame (xlator_t *this, marker_local_t *local) int32_t marker_xtime_update_marks (xlator_t *this, marker_local_t *local) { + marker_conf_t *priv = NULL; + GF_VALIDATE_OR_GOTO ("marker", this, out); GF_VALIDATE_OR_GOTO (this->name, local, out); - if ((local->pid == GF_CLIENT_PID_GSYNCD) || - (local->pid == GF_CLIENT_PID_DEFRAG)) + priv = this->private; + + if ((local->pid == GF_CLIENT_PID_GSYNCD + && !(priv->feature_enabled & GF_XTIME_GSYNC_FORCE)) + || (local->pid == GF_CLIENT_PID_DEFRAG)) goto out; marker_gettimeofday (local); @@ -833,7 +840,7 @@ marker_unlink_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, STACK_WIND (frame, marker_unlink_cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->unlink, &local->loc, local->xflag, - NULL); + local->xdata); return 0; err: frame->local = NULL; @@ -858,6 +865,8 @@ marker_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag, local = mem_get0 (this->local_pool); local->xflag = xflag; + if (xdata) + local->xdata = dict_ref (xdata); MARKER_INIT_LOCAL (frame, local); ret = loc_copy (&local->loc, loc); @@ -1808,6 +1817,210 @@ err: } +int32_t +marker_fallocate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + marker_local_t *local = NULL; + marker_conf_t *priv = NULL; + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_TRACE, "%s occurred while " + "fallocating a file ", strerror (op_errno)); + } + + local = (marker_local_t *) frame->local; + + frame->local = NULL; + + STACK_UNWIND_STRICT (fallocate, frame, op_ret, op_errno, prebuf, + postbuf, xdata); + + if (op_ret == -1 || local == NULL) + goto out; + + priv = this->private; + + if (priv->feature_enabled & GF_QUOTA) + mq_initiate_quota_txn (this, &local->loc); + + if (priv->feature_enabled & GF_XTIME) + marker_xtime_update_marks (this, local); +out: + marker_local_unref (local); + + return 0; +} + +int32_t +marker_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode, + off_t offset, size_t len, dict_t *xdata) +{ + int32_t ret = 0; + marker_local_t *local = NULL; + marker_conf_t *priv = NULL; + + priv = this->private; + + if (priv->feature_enabled == 0) + goto wind; + + local = mem_get0 (this->local_pool); + + MARKER_INIT_LOCAL (frame, local); + + ret = marker_inode_loc_fill (fd->inode, &local->loc); + + if (ret == -1) + goto err; +wind: + STACK_WIND (frame, marker_fallocate_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fallocate, fd, mode, offset, len, + xdata); + return 0; +err: + STACK_UNWIND_STRICT (fallocate, frame, -1, ENOMEM, NULL, NULL, NULL); + + return 0; +} + + +int32_t +marker_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + marker_local_t *local = NULL; + marker_conf_t *priv = NULL; + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_TRACE, "%s occurred during discard", + strerror (op_errno)); + } + + local = (marker_local_t *) frame->local; + + frame->local = NULL; + + STACK_UNWIND_STRICT (discard, frame, op_ret, op_errno, prebuf, + postbuf, xdata); + + if (op_ret == -1 || local == NULL) + goto out; + + priv = this->private; + + if (priv->feature_enabled & GF_QUOTA) + mq_initiate_quota_txn (this, &local->loc); + + if (priv->feature_enabled & GF_XTIME) + marker_xtime_update_marks (this, local); +out: + marker_local_unref (local); + + return 0; +} + +int32_t +marker_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) +{ + int32_t ret = 0; + marker_local_t *local = NULL; + marker_conf_t *priv = NULL; + + priv = this->private; + + if (priv->feature_enabled == 0) + goto wind; + + local = mem_get0 (this->local_pool); + + MARKER_INIT_LOCAL (frame, local); + + ret = marker_inode_loc_fill (fd->inode, &local->loc); + + if (ret == -1) + goto err; +wind: + STACK_WIND (frame, marker_discard_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->discard, fd, offset, len, xdata); + return 0; +err: + STACK_UNWIND_STRICT (discard, frame, -1, ENOMEM, NULL, NULL, NULL); + + return 0; +} + +int32_t +marker_zerofill_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + marker_local_t *local = NULL; + marker_conf_t *priv = NULL; + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_TRACE, "%s occurred during zerofill", + strerror (op_errno)); + } + + local = (marker_local_t *) frame->local; + + frame->local = NULL; + + STACK_UNWIND_STRICT (zerofill, frame, op_ret, op_errno, prebuf, + postbuf, xdata); + + if (op_ret == -1 || local == NULL) + goto out; + + priv = this->private; + + if (priv->feature_enabled & GF_QUOTA) + mq_initiate_quota_txn (this, &local->loc); + + if (priv->feature_enabled & GF_XTIME) + marker_xtime_update_marks (this, local); +out: + marker_local_unref (local); + + return 0; +} + +int32_t +marker_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) +{ + int32_t ret = 0; + marker_local_t *local = NULL; + marker_conf_t *priv = NULL; + + priv = this->private; + + if (priv->feature_enabled == 0) + goto wind; + + local = mem_get0 (this->local_pool); + + MARKER_INIT_LOCAL (frame, local); + + ret = marker_inode_loc_fill (fd->inode, &local->loc); + + if (ret == -1) + goto err; +wind: + STACK_WIND (frame, marker_zerofill_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->zerofill, fd, offset, len, xdata); + return 0; +err: + STACK_UNWIND_STRICT (zerofill, frame, -1, ENOMEM, NULL, NULL, NULL); + + return 0; +} + + /* when a call from the special client is received on * key trusted.glusterfs.volume-mark with value "RESET" * or if the value is 0length, update the change the @@ -2452,7 +2665,7 @@ out: int32_t reconfigure (xlator_t *this, dict_t *options) { - int32_t ret = -1; + int32_t ret = 0; data_t *data = NULL; gf_boolean_t flag = _gf_false; marker_conf_t *priv = NULL; @@ -2493,11 +2706,17 @@ reconfigure (xlator_t *this, dict_t *options) "xtime updation will fail"); } else { priv->feature_enabled |= GF_XTIME; + data = dict_get (options, "gsync-force-xtime"); + if (!data) + goto out; + ret = gf_string2boolean (data->data, &flag); + if (ret == 0 && flag) + priv->feature_enabled |= GF_XTIME_GSYNC_FORCE; } } } out: - return 0; + return ret; } @@ -2553,9 +2772,16 @@ init (xlator_t *this) goto err; priv->feature_enabled |= GF_XTIME; + data = dict_get (options, "gsync-force-xtime"); + if (!data) + goto cont; + ret = gf_string2boolean (data->data, &flag); + if (ret == 0 && flag) + priv->feature_enabled |= GF_XTIME_GSYNC_FORCE; } } + cont: this->local_pool = mem_pool_new (marker_local_t, 128); if (!this->local_pool) { gf_log (this->name, GF_LOG_ERROR, @@ -2617,6 +2843,9 @@ struct xlator_fops fops = { .removexattr = marker_removexattr, .getxattr = marker_getxattr, .readdirp = marker_readdirp, + .fallocate = marker_fallocate, + .discard = marker_discard, + .zerofill = marker_zerofill, }; struct xlator_cbks cbks = { @@ -2628,5 +2857,6 @@ struct volume_options options[] = { {.key = {"timestamp-file"}}, {.key = {"quota"}}, {.key = {"xtime"}}, + {.key = {"gsync-force-xtime"}}, {.key = {NULL}} }; diff --git a/xlators/features/marker/src/marker.h b/xlators/features/marker/src/marker.h index 63491ab37..1a58f8cfc 100644 --- a/xlators/features/marker/src/marker.h +++ b/xlators/features/marker/src/marker.h @@ -28,8 +28,9 @@ #define TIMESTAMP_FILE "timestamp-file" enum { - GF_QUOTA=1, - GF_XTIME=2 + GF_QUOTA = 1, + GF_XTIME = 2, + GF_XTIME_GSYNC_FORCE = 4, }; /*initialize the local variable*/ @@ -110,6 +111,7 @@ struct marker_local{ inode_contribution_t *contri; int xflag; + dict_t *xdata; }; typedef struct marker_local marker_local_t; diff --git a/xlators/features/marker/utils/Makefile.am b/xlators/features/marker/utils/Makefile.am deleted file mode 100644 index 556951d9f..000000000 --- a/xlators/features/marker/utils/Makefile.am +++ /dev/null @@ -1,3 +0,0 @@ -SUBDIRS = syncdaemon src - -CLEANFILES = diff --git a/xlators/features/marker/utils/src/Makefile.am b/xlators/features/marker/utils/src/Makefile.am deleted file mode 100644 index 9e410cda6..000000000 --- a/xlators/features/marker/utils/src/Makefile.am +++ /dev/null @@ -1,26 +0,0 @@ -gsyncddir = $(libexecdir)/glusterfs - -gsyncd_PROGRAMS = gsyncd - -gsyncd_SOURCES = gsyncd.c procdiggy.c - -gsyncd_LDADD = $(top_builddir)/libglusterfs/src/libglusterfs.la \ - $(GF_GLUSTERFS_LIBS) - -gsyncd_LDFLAGS = $(GF_LDFLAGS) - -noinst_HEADERS = procdiggy.h - -AM_CPPFLAGS = $(GF_CPPFLAGS) \ - -I$(top_srcdir)/libglusterfs/src\ - -DGSYNCD_PREFIX=\"$(libexecdir)/glusterfs\"\ - -DUSE_LIBGLUSTERFS\ - -DSBIN_DIR=\"$(sbindir)\" -DPYTHON=\"$(PYTHON)\" - -AM_CFLAGS = -Wall $(GF_CFLAGS) - - -CLEANFILES = - -$(top_builddir)/libglusterfs/src/libglusterfs.la: - $(MAKE) -C $(top_builddir)/libglusterfs/src/ all diff --git a/xlators/features/marker/utils/src/gsyncd.c b/xlators/features/marker/utils/src/gsyncd.c deleted file mode 100644 index 9c4a5bdff..000000000 --- a/xlators/features/marker/utils/src/gsyncd.c +++ /dev/null @@ -1,367 +0,0 @@ -/* - Copyright (c) 2011-2012 Red Hat, Inc. <http://www.redhat.com> - This file is part of GlusterFS. - - This file is licensed to you under your choice of the GNU Lesser - General Public License, version 3 or any later version (LGPLv3 or - later), or the GNU General Public License, version 2 (GPLv2), in all - cases as published by the Free Software Foundation. -*/ - -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include <stdlib.h> -#include <stdio.h> -#include <unistd.h> -#include <string.h> -#include <sys/param.h> /* for PATH_MAX */ - -/* NOTE (USE_LIBGLUSTERFS): - * ------------------------ - * When USE_LIBGLUSTERFS debugging sumbol is passed; perform - * glusterfs translator like initialization so that glusterfs - * globals, contexts are valid when glustefs api's are invoked. - * We unconditionally pass then while building gsyncd binary. - */ -#ifdef USE_LIBGLUSTERFS -#include "glusterfs.h" -#include "globals.h" -#endif - -#include "common-utils.h" -#include "run.h" -#include "procdiggy.h" - -#define _GLUSTERD_CALLED_ "_GLUSTERD_CALLED_" -#define _GSYNCD_DISPATCHED_ "_GSYNCD_DISPATCHED_" -#define GSYNCD_CONF "geo-replication/gsyncd.conf" -#define GSYNCD_PY "gsyncd.py" -#define RSYNC "rsync" - -int restricted = 0; - -static int -duplexpand (void **buf, size_t tsiz, size_t *len) -{ - size_t osiz = tsiz * *len; - char *p = realloc (*buf, osiz << 1); - if (!p) { - free(*buf); - return -1; - } - - memset (p + osiz, 0, osiz); - *buf = p; - *len <<= 1; - - return 0; -} - -static int -str2argv (char *str, char ***argv) -{ - char *p = NULL; - char *savetok = NULL; - int argc = 0; - size_t argv_len = 32; - int ret = 0; - - assert (str); - str = strdup (str); - if (!str) - return -1; - - *argv = calloc (argv_len, sizeof (**argv)); - if (!*argv) - goto error; - - while ((p = strtok_r (str, " ", &savetok))) { - str = NULL; - - argc++; - if (argc == argv_len) { - ret = duplexpand ((void *)argv, - sizeof (**argv), - &argv_len); - if (ret == -1) - goto error; - } - (*argv)[argc - 1] = p; - } - - return argc; - - error: - fprintf (stderr, "out of memory\n"); - return -1; -} - -static int -invoke_gsyncd (int argc, char **argv) -{ - char config_file[PATH_MAX] = {0,}; - size_t gluster_workdir_len = 0; - runner_t runner = {0,}; - int i = 0; - int j = 0; - char *nargv[argc + 4]; - char *python = NULL; - - if (restricted) { - size_t len; - /* in restricted mode we forcibly use the system-wide config */ - runinit (&runner); - runner_add_args (&runner, SBIN_DIR"/gluster", - "--log-file=-", "system::", "getwd", - NULL); - runner_redir (&runner, STDOUT_FILENO, RUN_PIPE); - if (runner_start (&runner) == 0 && - fgets (config_file, PATH_MAX, - runner_chio (&runner, STDOUT_FILENO)) != NULL && - (len = strlen (config_file)) && - config_file[len - 1] == '\n' && - runner_end (&runner) == 0) - gluster_workdir_len = len - 1; - - if (gluster_workdir_len) { - if (gluster_workdir_len + 1 + strlen (GSYNCD_CONF) + 1 > - PATH_MAX) - goto error; - config_file[gluster_workdir_len] = '/'; - strcat (config_file, GSYNCD_CONF); - } else - goto error; - - if (setenv ("_GSYNCD_RESTRICTED_", "1", 1) == -1) - goto error; - } - - if (chdir ("/") == -1) - goto error; - - j = 0; - python = getenv("PYTHON"); - if(!python) - python = PYTHON; - nargv[j++] = python; - nargv[j++] = GSYNCD_PREFIX"/python/syncdaemon/"GSYNCD_PY; - for (i = 1; i < argc; i++) - nargv[j++] = argv[i]; - if (config_file[0]) { - nargv[j++] = "-c"; - nargv[j++] = config_file; - } - nargv[j++] = NULL; - - execvp (python, nargv); - - fprintf (stderr, "exec of '%s' failed\n", python); - return 127; - - error: - fprintf (stderr, "gsyncd initializaion failed\n"); - return 1; -} - - -static int -find_gsyncd (pid_t pid, pid_t ppid, char *name, void *data) -{ - char buf[NAME_MAX * 2] = {0,}; - char path[PATH_MAX] = {0,}; - char *p = NULL; - int zeros = 0; - int ret = 0; - int fd = -1; - pid_t *pida = (pid_t *)data; - - if (ppid != pida[0]) - return 0; - - sprintf (path, PROC"/%d/cmdline", pid); - fd = open (path, O_RDONLY); - if (fd == -1) - return 0; - ret = read (fd, buf, sizeof (buf)); - close (fd); - if (ret == -1) - return 0; - for (zeros = 0, p = buf; zeros < 2 && p < buf + ret; p++) - zeros += !*p; - - ret = 0; - switch (zeros) { - case 2: - if ((strcmp (basename (buf), basename (PYTHON)) || - strcmp (basename (buf + strlen (buf) + 1), GSYNCD_PY)) == 0) { - ret = 1; - break; - } - /* fallthrough */ - case 1: - if (strcmp (basename (buf), GSYNCD_PY) == 0) - ret = 1; - } - - if (ret == 1) { - if (pida[1] != -1) { - fprintf (stderr, GSYNCD_PY" sibling is not unique"); - return -1; - } - pida[1] = pid; - } - - return 0; -} - -static int -invoke_rsync (int argc, char **argv) -{ - int i = 0; - char path[PATH_MAX] = {0,}; - pid_t pid = -1; - pid_t ppid = -1; - pid_t pida[] = {-1, -1}; - char *name = NULL; - char buf[PATH_MAX + 1] = {0,}; - int ret = 0; - - assert (argv[argc] == NULL); - - if (argc < 2 || strcmp (argv[1], "--server") != 0) - goto error; - - for (i = 2; i < argc && argv[i][0] == '-'; i++); - - if (!(i == argc - 2 && strcmp (argv[i], ".") == 0 && argv[i + 1][0] == '/')) { - fprintf (stderr, "need an rsync invocation without protected args\n"); - goto error; - } - - /* look up sshd we are spawned from */ - for (pid = getpid () ;; pid = ppid) { - ppid = pidinfo (pid, &name); - if (ppid < 0) { - fprintf (stderr, "sshd ancestor not found\n"); - goto error; - } - if (strcmp (name, "sshd") == 0) { - GF_FREE (name); - break; - } - GF_FREE (name); - } - /* look up "ssh-sibling" gsyncd */ - pida[0] = pid; - ret = prociter (find_gsyncd, pida); - if (ret == -1 || pida[1] == -1) { - fprintf (stderr, "gsyncd sibling not found\n"); - goto error; - } - /* check if rsync target matches gsyncd target */ - sprintf (path, PROC"/%d/cwd", pida[1]); - ret = readlink (path, buf, sizeof (buf)); - if (ret == -1 || ret == sizeof (buf)) - goto error; - if (strcmp (argv[argc - 1], "/") == 0 /* root dir cannot be a target */ || - (strcmp (argv[argc - 1], path) /* match against gluster target */ && - strcmp (argv[argc - 1], buf) /* match against file target */) != 0) { - fprintf (stderr, "rsync target does not match "GEOREP" session\n"); - goto error; - } - - argv[0] = RSYNC; - - execvp (RSYNC, argv); - - fprintf (stderr, "exec of "RSYNC" failed\n"); - return 127; - - error: - fprintf (stderr, "disallowed "RSYNC" invocation\n"); - return 1; -} - - -struct invocable { - char *name; - int (*invoker) (int argc, char **argv); -}; - -struct invocable invocables[] = { - { "rsync", invoke_rsync }, - { "gsyncd", invoke_gsyncd }, - { NULL, NULL} -}; - -int -main (int argc, char **argv) -{ - char *evas = NULL; - struct invocable *i = NULL; - char *b = NULL; - char *sargv = NULL; - -#ifdef USE_LIBGLUSTERFS - glusterfs_ctx_t *ctx = NULL; - - ctx = glusterfs_ctx_new (); - if (!ctx) - return ENOMEM; - - if (glusterfs_globals_init (ctx)) - return 1; - - THIS->ctx = ctx; -#endif - - evas = getenv (_GLUSTERD_CALLED_); - if (evas && strcmp (evas, "1") == 0) - /* OK, we know glusterd called us, no need to look for further config - * ... altough this conclusion should not inherit to our children - */ - unsetenv (_GLUSTERD_CALLED_); - else { - /* we regard all gsyncd invocations unsafe - * that do not come from glusterd and - * therefore restrict it - */ - restricted = 1; - - if (!getenv (_GSYNCD_DISPATCHED_)) { - evas = getenv ("SSH_ORIGINAL_COMMAND"); - if (evas) - sargv = evas; - else { - evas = getenv ("SHELL"); - if (evas && strcmp (basename (evas), "gsyncd") == 0 && - argc == 3 && strcmp (argv[1], "-c") == 0) - sargv = argv[2]; - } - } - - } - - if (!(sargv && restricted)) - return invoke_gsyncd (argc, argv); - - argc = str2argv (sargv, &argv); - if (argc == -1 || setenv (_GSYNCD_DISPATCHED_, "1", 1) == -1) { - fprintf (stderr, "internal error\n"); - return 1; - } - - b = basename (argv[0]); - for (i = invocables; i->name; i++) { - if (strcmp (b, i->name) == 0) - return i->invoker (argc, argv); - } - - fprintf (stderr, "invoking %s in restricted SSH session is not allowed\n", - b); - - return 1; -} diff --git a/xlators/features/marker/utils/src/procdiggy.c b/xlators/features/marker/utils/src/procdiggy.c deleted file mode 100644 index 1eba414c1..000000000 --- a/xlators/features/marker/utils/src/procdiggy.c +++ /dev/null @@ -1,121 +0,0 @@ -/* - Copyright (c) 2011-2012 Red Hat, Inc. <http://www.redhat.com> - This file is part of GlusterFS. - - This file is licensed to you under your choice of the GNU Lesser - General Public License, version 3 or any later version (LGPLv3 or - later), or the GNU General Public License, version 2 (GPLv2), in all - cases as published by the Free Software Foundation. -*/ - -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include <stdlib.h> -#include <stdio.h> -#include <unistd.h> -#include <string.h> -#include <ctype.h> -#include <sys/param.h> /* for PATH_MAX */ - -#include "common-utils.h" -#include "procdiggy.h" - -pid_t -pidinfo (pid_t pid, char **name) -{ - char buf[NAME_MAX * 2] = {0,}; - FILE *f = NULL; - char path[PATH_MAX] = {0,}; - char *p = NULL; - int ret = 0; - - sprintf (path, PROC"/%d/status", pid); - - f = fopen (path, "r"); - if (!f) - return -1; - - if (name) - *name = NULL; - for (;;) { - size_t len; - memset (buf, 0, sizeof (buf)); - if (fgets (buf, sizeof (buf), f) == NULL || - (len = strlen (buf)) == 0 || - buf[len - 1] != '\n') { - pid = -1; - goto out; - } - buf[len - 1] = '\0'; - - if (name && !*name) { - p = strtail (buf, "Name:"); - if (p) { - while (isspace (*++p)); - *name = gf_strdup (p); - if (!*name) { - pid = -2; - goto out; - } - continue; - } - } - - p = strtail (buf, "PPid:"); - if (p) - break; - } - - while (isspace (*++p)); - ret = gf_string2int (p, &pid); - if (ret == -1) - pid = -1; - - out: - fclose (f); - if (pid == -1 && name && *name) - GF_FREE (name); - if (pid == -2) - fprintf (stderr, "out of memory\n"); - return pid; -} - -int -prociter (int (*proch) (pid_t pid, pid_t ppid, char *tmpname, void *data), - void *data) -{ - char *name = NULL; - DIR *d = NULL; - struct dirent *de = NULL; - pid_t pid = -1; - pid_t ppid = -1; - int ret = 0; - - d = opendir (PROC); - if (!d) - return -1; - while (errno = 0, de = readdir (d)) { - if (gf_string2int (de->d_name, &pid) != -1 && pid >= 0) { - ppid = pidinfo (pid, &name); - switch (ppid) { - case -1: continue; - case -2: ret = -1; break; - } - ret = proch (pid, ppid, name, data); - GF_FREE (name); - if (ret) - break; - } - } - closedir (d); - if (!de && errno) { - fprintf (stderr, "failed to traverse "PROC" (%s)\n", - strerror (errno)); - ret = -1; - } - - return ret; -} diff --git a/xlators/features/marker/utils/syncdaemon/Makefile.am b/xlators/features/marker/utils/syncdaemon/Makefile.am deleted file mode 100644 index cc7cee102..000000000 --- a/xlators/features/marker/utils/syncdaemon/Makefile.am +++ /dev/null @@ -1,6 +0,0 @@ -syncdaemondir = $(libexecdir)/glusterfs/python/syncdaemon - -syncdaemon_PYTHON = gconf.py gsyncd.py __init__.py master.py README.md repce.py resource.py configinterface.py syncdutils.py monitor.py libcxattr.py \ - $(top_builddir)/contrib/ipaddr-py/ipaddr.py - -CLEANFILES = diff --git a/xlators/features/marker/utils/syncdaemon/README.md b/xlators/features/marker/utils/syncdaemon/README.md deleted file mode 100644 index d45006932..000000000 --- a/xlators/features/marker/utils/syncdaemon/README.md +++ /dev/null @@ -1,81 +0,0 @@ -gsycnd, the Gluster Syncdaemon -============================== - -REQUIREMENTS ------------- - -_gsyncd_ is a program which can operate either in _master_ or in _slave_ mode. -Requirements are categorized according to this. - -* supported OS is GNU/Linux -* Python >= 2.5, or 2.4 with Ctypes (see below) (both) -* OpenSSH >= 4.0 (master) / SSH2 compliant sshd (eg. openssh) (slave) -* rsync (both) -* glusterfs with marker support (master); glusterfs (optional on slave) -* FUSE; for supported versions consult glusterfs - -INSTALLATION ------------- - -As of now, the supported way of operation is running from the source directory. - -If you use Python 2.4.x, you need to install the [Ctypes module](http://python.net/crew/theller/ctypes/). - -CONFIGURATION -------------- - -gsyncd tunables are a subset of the long command-line options; for listing them, -type - - gsyncd.py --help - -and see the long options up to "--config-file". (The leading double dash should be omitted; -interim underscores and dashes are interchangeable.) The set of options bear some resemblance -to those of glusterfs and rsync. - -The config file format matches the following syntax: - - <option1>: <value1> - <option2>: <value2> - # comment - -By default (unless specified by the option `-c`), gsyncd looks for config file at _conf/gsyncd.conf_ -in the source tree. - -USAGE ------ - -gsyncd is a utilitly for continous mirroring, ie. it mirrors master to slave incrementally. -Assume we have a gluster volume _pop_ at localhost. We try to set up the following mirrors -for it with gysncd: - -1. _/data/mirror_ -2. local gluster volume _yow_ -3. _/data/far_mirror_ at example.com -4. gluster volume _moz_ at example.com - -The respective gsyncd invocations are (demoing some syntax sugaring): - -1. - - gsyncd.py gluster://localhost:pop file:///data/mirror - - or short form - - gsyncd.py :pop /data/mirror - -2. `gsyncd :pop :yow` -3. - - gsyncd.py :pop ssh://example.com:/data/far_mirror - - or short form - - gsyncd.py :pop example.com:/data/far_mirror - -4. `gsyncd.py :pop example.com::moz` - -gsyncd has to be available on both sides; it's location on the remote side has to be specified -via the "--remote-gsyncd" option (or "remote-gsyncd" config file parameter). (This option can also be -used for setting options on the remote side, although the suggested mode of operation is to -set parameters like log file / pid file in the configuration file.) diff --git a/xlators/features/marker/utils/syncdaemon/__codecheck.py b/xlators/features/marker/utils/syncdaemon/__codecheck.py deleted file mode 100644 index e3386afba..000000000 --- a/xlators/features/marker/utils/syncdaemon/__codecheck.py +++ /dev/null @@ -1,46 +0,0 @@ -import os -import os.path -import sys -import tempfile -import shutil - -ipd = tempfile.mkdtemp(prefix = 'codecheck-aux') - -try: - # add a fake ipaddr module, we don't want to - # deal with the real one (just test our code) - f = open(os.path.join(ipd, 'ipaddr.py'), 'w') - f.write(""" -class IPAddress(object): - pass -class IPNetwork(list): - pass -""") - f.close() - sys.path.append(ipd) - - fl = os.listdir(os.path.dirname(sys.argv[0]) or '.') - fl.sort() - for f in fl: - if f[-3:] != '.py' or f[0] == '_': - continue - m = f[:-3] - sys.stdout.write('importing %s ...' % m) - __import__(m) - print(' OK.') - - def sys_argv_set(a): - sys.argv = sys.argv[:1] + a - - gsyncd = sys.modules['gsyncd'] - for a in [['--help'], ['--version'], ['--canonicalize-escape-url', '/foo']]: - print('>>> invoking program with args: %s' % ' '.join(a)) - pid = os.fork() - if not pid: - sys_argv_set(a) - gsyncd.main() - _, r = os.waitpid(pid, 0) - if r: - raise RuntimeError('invocation failed') -finally: - shutil.rmtree(ipd) diff --git a/xlators/features/marker/utils/syncdaemon/__init__.py b/xlators/features/marker/utils/syncdaemon/__init__.py deleted file mode 100644 index e69de29bb..000000000 --- a/xlators/features/marker/utils/syncdaemon/__init__.py +++ /dev/null diff --git a/xlators/features/marker/utils/syncdaemon/configinterface.py b/xlators/features/marker/utils/syncdaemon/configinterface.py deleted file mode 100644 index e55bec519..000000000 --- a/xlators/features/marker/utils/syncdaemon/configinterface.py +++ /dev/null @@ -1,224 +0,0 @@ -try: - import ConfigParser -except ImportError: - # py 3 - import configparser as ConfigParser -import re -from string import Template - -from syncdutils import escape, unescape, norm, update_file, GsyncdError - -SECT_ORD = '__section_order__' -SECT_META = '__meta__' -config_version = 2.0 - -re_type = type(re.compile('')) - - -class MultiDict(object): - """a virtual dict-like class which functions as the union of underlying dicts""" - - def __init__(self, *dd): - self.dicts = dd - - def __getitem__(self, key): - val = None - for d in self.dicts: - if d.get(key): - val = d[key] - if not val: - raise KeyError(key) - return val - - -class GConffile(object): - """A high-level interface to ConfigParser which flattens the two-tiered - config layout by implenting automatic section dispatch based on initial - parameters. - - Also ensure section ordering in terms of their time of addition -- a compat - hack for Python < 2.7. - """ - - def _normconfig(self): - """normalize config keys by s/-/_/g""" - for n, s in self.config._sections.items(): - if n.find('__') == 0: - continue - s2 = type(s)() - for k, v in s.items(): - if k.find('__') != 0: - k = norm(k) - s2[k] = v - self.config._sections[n] = s2 - - def __init__(self, path, peers, *dd): - """ - - .path: location of config file - - .config: underlying ConfigParser instance - - .peers: on behalf of whom we flatten .config - (master, or master-slave url pair) - - .auxdicts: template subtituents - """ - self.peers = peers - self.path = path - self.auxdicts = dd - self.config = ConfigParser.RawConfigParser() - self.config.read(path) - self._normconfig() - - def section(self, rx=False): - """get the section name of the section representing .peers in .config""" - peers = self.peers - if not peers: - peers = ['.', '.'] - rx = True - if rx: - st = 'peersrx' - else: - st = 'peers' - return ' '.join([st] + [escape(u) for u in peers]) - - @staticmethod - def parse_section(section): - """retrieve peers sequence encoded by section name - (as urls or regexen, depending on section type) - """ - sl = section.split() - st = sl.pop(0) - sl = [unescape(u) for u in sl] - if st == 'peersrx': - sl = [re.compile(u) for u in sl] - return sl - - def ord_sections(self): - """Return an ordered list of sections. - - Ordering happens based on the auxiliary - SECT_ORD section storing indices for each - section added through the config API. - - To not to go corrupt in case of manually - written config files, we take care to append - also those sections which are not registered - in SECT_ORD. - - Needed for python 2.{4,5,6} where ConfigParser - cannot yet order sections/options internally. - """ - so = {} - if self.config.has_section(SECT_ORD): - so = self.config._sections[SECT_ORD] - so2 = {} - for k, v in so.items(): - if k != '__name__': - so2[k] = int(v) - tv = 0 - if so2: - tv = max(so2.values()) + 1 - ss = [s for s in self.config.sections() if s.find('__') != 0] - for s in ss: - if s in so.keys(): - continue - so2[s] = tv - tv += 1 - def scmp(x, y): - return cmp(*(so2[s] for s in (x, y))) - ss.sort(scmp) - return ss - - def update_to(self, dct, allow_unresolved=False): - """update @dct from key/values of ours. - - key/values are collected from .config by filtering the regexp sections - according to match, and from .section. The values are treated as templates, - which are substituted from .auxdicts and (in case of regexp sections) - match groups. - """ - if not self.peers: - raise GsyncdError('no peers given, cannot select matching options') - def update_from_sect(sect, mud): - for k, v in self.config._sections[sect].items(): - if k == '__name__': - continue - if allow_unresolved: - dct[k] = Template(v).safe_substitute(mud) - else: - dct[k] = Template(v).substitute(mud) - for sect in self.ord_sections(): - sp = self.parse_section(sect) - if isinstance(sp[0], re_type) and len(sp) == len(self.peers): - match = True - mad = {} - for i in range(len(sp)): - m = sp[i].search(self.peers[i]) - if not m: - match = False - break - for j in range(len(m.groups())): - mad['match%d_%d' % (i+1, j+1)] = m.groups()[j] - if match: - update_from_sect(sect, MultiDict(dct, mad, *self.auxdicts)) - if self.config.has_section(self.section()): - update_from_sect(self.section(), MultiDict(dct, *self.auxdicts)) - - def get(self, opt=None): - """print the matching key/value pairs from .config, - or if @opt given, the value for @opt (according to the - logic described in .update_to) - """ - d = {} - self.update_to(d, allow_unresolved = True) - if opt: - opt = norm(opt) - v = d.get(opt) - if v: - print(v) - else: - for k, v in d.iteritems(): - if k == '__name__': - continue - print("%s: %s" % (k, v)) - - def write(self, trfn, opt, *a, **kw): - """update on-disk config transactionally - - @trfn is the transaction function - """ - def mergeconf(f): - self.config = ConfigParser.RawConfigParser() - self.config.readfp(f) - self._normconfig() - if not self.config.has_section(SECT_META): - self.config.add_section(SECT_META) - self.config.set(SECT_META, 'version', config_version) - return trfn(norm(opt), *a, **kw) - def updateconf(f): - self.config.write(f) - update_file(self.path, updateconf, mergeconf) - - def _set(self, opt, val, rx=False): - """set @opt to @val in .section""" - sect = self.section(rx) - if not self.config.has_section(sect): - self.config.add_section(sect) - # regarding SECT_ORD, cf. ord_sections - if not self.config.has_section(SECT_ORD): - self.config.add_section(SECT_ORD) - self.config.set(SECT_ORD, sect, len(self.config._sections[SECT_ORD])) - self.config.set(sect, opt, val) - return True - - def set(self, opt, *a, **kw): - """perform ._set transactionally""" - self.write(self._set, opt, *a, **kw) - - def _delete(self, opt, rx=False): - """delete @opt from .section""" - sect = self.section(rx) - if self.config.has_section(sect): - return self.config.remove_option(sect, opt) - - def delete(self, opt, *a, **kw): - """perform ._delete transactionally""" - self.write(self._delete, opt, *a, **kw) diff --git a/xlators/features/marker/utils/syncdaemon/gconf.py b/xlators/features/marker/utils/syncdaemon/gconf.py deleted file mode 100644 index 146c72a18..000000000 --- a/xlators/features/marker/utils/syncdaemon/gconf.py +++ /dev/null @@ -1,20 +0,0 @@ -import os - -class GConf(object): - """singleton class to store globals - shared between gsyncd modules""" - - ssh_ctl_dir = None - ssh_ctl_args = None - cpid = None - pid_file_owned = False - log_exit = False - permanent_handles = [] - log_metadata = {} - - @classmethod - def setup_ssh_ctl(cls, ctld): - cls.ssh_ctl_dir = ctld - cls.ssh_ctl_args = ["-oControlMaster=auto", "-S", os.path.join(ctld, "gsycnd-ssh-%r@%h:%p")] - -gconf = GConf() diff --git a/xlators/features/marker/utils/syncdaemon/gsyncd.py b/xlators/features/marker/utils/syncdaemon/gsyncd.py deleted file mode 100644 index 387900e6c..000000000 --- a/xlators/features/marker/utils/syncdaemon/gsyncd.py +++ /dev/null @@ -1,419 +0,0 @@ -#!/usr/bin/env python - -import os -import os.path -import sys -import time -import logging -import signal -import optparse -import fcntl -import fnmatch -from optparse import OptionParser, SUPPRESS_HELP -from logging import Logger -from errno import EEXIST, ENOENT - -from ipaddr import IPAddress, IPNetwork - -from gconf import gconf -from syncdutils import FreeObject, norm, grabpidfile, finalize, log_raise_exception -from syncdutils import GsyncdError, select, set_term_handler, privileged -from configinterface import GConffile -import resource -from monitor import monitor - -class GLogger(Logger): - """Logger customizations for gsyncd. - - It implements a log format similar to that of glusterfs. - """ - - def makeRecord(self, name, level, *a): - rv = Logger.makeRecord(self, name, level, *a) - rv.nsecs = (rv.created - int(rv.created)) * 1000000 - fr = sys._getframe(4) - callee = fr.f_locals.get('self') - if callee: - ctx = str(type(callee)).split("'")[1].split('.')[-1] - else: - ctx = '<top>' - if not hasattr(rv, 'funcName'): - rv.funcName = fr.f_code.co_name - rv.lvlnam = logging.getLevelName(level)[0] - rv.ctx = ctx - return rv - - @classmethod - def setup(cls, **kw): - lbl = kw.get('label', "") - if lbl: - lbl = '(' + lbl + ')' - lprm = {'datefmt': "%Y-%m-%d %H:%M:%S", - 'format': "[%(asctime)s.%(nsecs)d] %(lvlnam)s [%(module)s" + lbl + ":%(lineno)s:%(funcName)s] %(ctx)s: %(message)s"} - lprm.update(kw) - lvl = kw.get('level', logging.INFO) - lprm['level'] = lvl - logging.root = cls("root", lvl) - logging.setLoggerClass(cls) - logging.getLogger().handlers = [] - logging.basicConfig(**lprm) - - @classmethod - def _gsyncd_loginit(cls, **kw): - lkw = {} - if gconf.log_level: - lkw['level'] = gconf.log_level - if kw.get('log_file'): - if kw['log_file'] in ('-', '/dev/stderr'): - lkw['stream'] = sys.stderr - elif kw['log_file'] == '/dev/stdout': - lkw['stream'] = sys.stdout - else: - lkw['filename'] = kw['log_file'] - - cls.setup(label=kw.get('label'), **lkw) - - lkw.update({'saved_label': kw.get('label')}) - gconf.log_metadata = lkw - gconf.log_exit = True - -def startup(**kw): - """set up logging, pidfile grabbing, daemonization""" - if getattr(gconf, 'pid_file', None) and kw.get('go_daemon') != 'postconn': - if not grabpidfile(): - sys.stderr.write("pidfile is taken, exiting.\n") - sys.exit(2) - gconf.pid_file_owned = True - - if kw.get('go_daemon') == 'should': - x, y = os.pipe() - gconf.cpid = os.fork() - if gconf.cpid: - os.close(x) - sys.exit() - os.close(y) - os.setsid() - dn = os.open(os.devnull, os.O_RDWR) - for f in (sys.stdin, sys.stdout, sys.stderr): - os.dup2(dn, f.fileno()) - if getattr(gconf, 'pid_file', None): - if not grabpidfile(gconf.pid_file + '.tmp'): - raise GsyncdError("cannot grab temporary pidfile") - os.rename(gconf.pid_file + '.tmp', gconf.pid_file) - # wait for parent to terminate - # so we can start up with - # no messing from the dirty - # ol' bustard - select((x,), (), ()) - os.close(x) - - GLogger._gsyncd_loginit(**kw) - -def main(): - """main routine, signal/exception handling boilerplates""" - gconf.starttime = time.time() - set_term_handler() - GLogger.setup() - excont = FreeObject(exval = 0) - try: - try: - main_i() - except: - log_raise_exception(excont) - finally: - finalize(exval = excont.exval) - -def main_i(): - """internal main routine - - parse command line, decide what action will be taken; - we can either: - - query/manipulate configuration - - format gsyncd urls using gsyncd's url parsing engine - - start service in following modes, in given stages: - - monitor: startup(), monitor() - - master: startup(), connect_remote(), connect(), service_loop() - - slave: startup(), connect(), service_loop() - """ - rconf = {'go_daemon': 'should'} - - def store_abs(opt, optstr, val, parser): - if val and val != '-': - val = os.path.abspath(val) - setattr(parser.values, opt.dest, val) - def store_local(opt, optstr, val, parser): - rconf[opt.dest] = val - def store_local_curry(val): - return lambda o, oo, vx, p: store_local(o, oo, val, p) - def store_local_obj(op, dmake): - return lambda o, oo, vx, p: store_local(o, oo, FreeObject(op=op, **dmake(vx)), p) - - op = OptionParser(usage="%prog [options...] <master> <slave>", version="%prog 0.0.1") - op.add_option('--gluster-command-dir', metavar='DIR', default='') - op.add_option('--gluster-log-file', metavar='LOGF', default=os.devnull, type=str, action='callback', callback=store_abs) - op.add_option('--gluster-log-level', metavar='LVL') - op.add_option('--gluster-params', metavar='PRMS', default='') - op.add_option('--gluster-cli-options', metavar='OPTS', default='--log-file=-') - op.add_option('--mountbroker', metavar='LABEL') - op.add_option('-p', '--pid-file', metavar='PIDF', type=str, action='callback', callback=store_abs) - op.add_option('-l', '--log-file', metavar='LOGF', type=str, action='callback', callback=store_abs) - op.add_option('--log-file-mbr', metavar='LOGF', type=str, action='callback', callback=store_abs) - op.add_option('--state-file', metavar='STATF', type=str, action='callback', callback=store_abs) - op.add_option('--ignore-deletes', default=False, action='store_true') - op.add_option('--use-rsync-xattrs', default=False, action='store_true') - op.add_option('-L', '--log-level', metavar='LVL') - op.add_option('-r', '--remote-gsyncd', metavar='CMD', default=os.path.abspath(sys.argv[0])) - op.add_option('--volume-id', metavar='UUID') - op.add_option('--session-owner', metavar='ID') - op.add_option('-s', '--ssh-command', metavar='CMD', default='ssh') - op.add_option('--rsync-command', metavar='CMD', default='rsync') - op.add_option('--rsync-options', metavar='OPTS', default='--sparse') - op.add_option('--rsync-ssh-options', metavar='OPTS', default='--compress') - op.add_option('--timeout', metavar='SEC', type=int, default=120) - op.add_option('--connection-timeout', metavar='SEC', type=int, default=60, help=SUPPRESS_HELP) - op.add_option('--sync-jobs', metavar='N', type=int, default=3) - op.add_option('--turns', metavar='N', type=int, default=0, help=SUPPRESS_HELP) - op.add_option('--allow-network', metavar='IPS', default='') - op.add_option('--socketdir', metavar='DIR') - op.add_option('--state-socket-unencoded', metavar='SOCKF', type=str, action='callback', callback=store_abs) - op.add_option('--checkpoint', metavar='LABEL', default='') - # tunables for failover/failback mechanism: - # None - gsyncd behaves as normal - # blind - gsyncd works with xtime pairs to identify - # candidates for synchronization - # wrapup - same as normal mode but does not assign - # xtimes to orphaned files - # see crawl() for usage of the above tunables - op.add_option('--special-sync-mode', type=str, help=SUPPRESS_HELP) - - op.add_option('-c', '--config-file', metavar='CONF', type=str, action='callback', callback=store_local) - # duh. need to specify dest or value will be mapped to None :S - op.add_option('--monitor', dest='monitor', action='callback', callback=store_local_curry(True)) - op.add_option('--feedback-fd', dest='feedback_fd', type=int, help=SUPPRESS_HELP, action='callback', callback=store_local) - op.add_option('--listen', dest='listen', help=SUPPRESS_HELP, action='callback', callback=store_local_curry(True)) - op.add_option('-N', '--no-daemon', dest="go_daemon", action='callback', callback=store_local_curry('dont')) - op.add_option('--debug', dest="go_daemon", action='callback', callback=lambda *a: (store_local_curry('dont')(*a), - setattr(a[-1].values, 'log_file', '-'), - setattr(a[-1].values, 'log_level', 'DEBUG'))), - - for a in ('check', 'get'): - op.add_option('--config-' + a, metavar='OPT', type=str, dest='config', action='callback', - callback=store_local_obj(a, lambda vx: {'opt': vx})) - op.add_option('--config-get-all', dest='config', action='callback', callback=store_local_obj('get', lambda vx: {'opt': None})) - for m in ('', '-rx', '-glob'): - # call this code 'Pythonic' eh? - # have to define a one-shot local function to be able to inject (a value depending on the) - # iteration variable into the inner lambda - def conf_mod_opt_regex_variant(rx): - op.add_option('--config-set' + m, metavar='OPT VAL', type=str, nargs=2, dest='config', action='callback', - callback=store_local_obj('set', lambda vx: {'opt': vx[0], 'val': vx[1], 'rx': rx})) - op.add_option('--config-del' + m, metavar='OPT', type=str, dest='config', action='callback', - callback=store_local_obj('del', lambda vx: {'opt': vx, 'rx': rx})) - conf_mod_opt_regex_variant(m and m[1:] or False) - - op.add_option('--normalize-url', dest='url_print', action='callback', callback=store_local_curry('normal')) - op.add_option('--canonicalize-url', dest='url_print', action='callback', callback=store_local_curry('canon')) - op.add_option('--canonicalize-escape-url', dest='url_print', action='callback', callback=store_local_curry('canon_esc')) - - tunables = [ norm(o.get_opt_string()[2:]) for o in op.option_list if o.callback in (store_abs, 'store_true', None) and o.get_opt_string() not in ('--version', '--help') ] - remote_tunables = [ 'listen', 'go_daemon', 'timeout', 'session_owner', 'config_file', 'use_rsync_xattrs' ] - rq_remote_tunables = { 'listen': True } - - # precedence for sources of values: 1) commandline, 2) cfg file, 3) defaults - # -- for this to work out we need to tell apart defaults from explicitly set - # options... so churn out the defaults here and call the parser with virgin - # values container. - defaults = op.get_default_values() - opts, args = op.parse_args(values=optparse.Values()) - confdata = rconf.get('config') - if not (len(args) == 2 or \ - (len(args) == 1 and rconf.get('listen')) or \ - (len(args) <= 2 and confdata) or \ - rconf.get('url_print')): - sys.stderr.write("error: incorrect number of arguments\n\n") - sys.stderr.write(op.get_usage() + "\n") - sys.exit(1) - - restricted = os.getenv('_GSYNCD_RESTRICTED_') - - if restricted: - allopts = {} - allopts.update(opts.__dict__) - allopts.update(rconf) - bannedtuns = set(allopts.keys()) - set(remote_tunables) - if bannedtuns: - raise GsyncdError('following tunables cannot be set with restricted SSH invocaton: ' + \ - ', '.join(bannedtuns)) - for k, v in rq_remote_tunables.items(): - if not k in allopts or allopts[k] != v: - raise GsyncdError('tunable %s is not set to value %s required for restricted SSH invocaton' % \ - (k, v)) - - confrx = getattr(confdata, 'rx', None) - if confrx: - # peers are regexen, don't try to parse them - if confrx == 'glob': - args = [ '\A' + fnmatch.translate(a) for a in args ] - canon_peers = args - namedict = {} - else: - rscs = [resource.parse_url(u) for u in args] - dc = rconf.get('url_print') - if dc: - for r in rscs: - print(r.get_url(**{'normal': {}, - 'canon': {'canonical': True}, - 'canon_esc': {'canonical': True, 'escaped': True}}[dc])) - return - local = remote = None - if rscs: - local = rscs[0] - if len(rscs) > 1: - remote = rscs[1] - if not local.can_connect_to(remote): - raise GsyncdError("%s cannot work with %s" % (local.path, remote and remote.path)) - pa = ([], [], []) - urlprms = ({}, {'canonical': True}, {'canonical': True, 'escaped': True}) - for x in rscs: - for i in range(len(pa)): - pa[i].append(x.get_url(**urlprms[i])) - peers, canon_peers, canon_esc_peers = pa - # creating the namedict, a dict representing various ways of referring to / repreenting - # peers to be fillable in config templates - mods = (lambda x: x, lambda x: x[0].upper() + x[1:], lambda x: 'e' + x[0].upper() + x[1:]) - if remote: - rmap = { local: ('local', 'master'), remote: ('remote', 'slave') } - else: - rmap = { local: ('local', 'slave') } - namedict = {} - for i in range(len(rscs)): - x = rscs[i] - for name in rmap[x]: - for j in range(3): - namedict[mods[j](name)] = pa[j][i] - if x.scheme == 'gluster': - namedict[name + 'vol'] = x.volume - if not 'config_file' in rconf: - rconf['config_file'] = os.path.join(os.path.dirname(sys.argv[0]), "conf/gsyncd.conf") - gcnf = GConffile(rconf['config_file'], canon_peers, defaults.__dict__, opts.__dict__, namedict) - - checkpoint_change = False - if confdata: - opt_ok = norm(confdata.opt) in tunables + [None] - if confdata.op == 'check': - if opt_ok: - sys.exit(0) - else: - sys.exit(1) - elif not opt_ok: - raise GsyncdError("not a valid option: " + confdata.opt) - if confdata.op == 'get': - gcnf.get(confdata.opt) - elif confdata.op == 'set': - gcnf.set(confdata.opt, confdata.val, confdata.rx) - elif confdata.op == 'del': - gcnf.delete(confdata.opt, confdata.rx) - # when modifying checkpoint, it's important to make a log - # of that, so in that case we go on to set up logging even - # if its just config invocation - if confdata.opt == 'checkpoint' and confdata.op in ('set', 'del') and \ - not confdata.rx: - checkpoint_change = True - if not checkpoint_change: - return - - gconf.__dict__.update(defaults.__dict__) - gcnf.update_to(gconf.__dict__) - gconf.__dict__.update(opts.__dict__) - gconf.configinterface = gcnf - - if restricted and gconf.allow_network: - ssh_conn = os.getenv('SSH_CONNECTION') - if not ssh_conn: - #legacy env var - ssh_conn = os.getenv('SSH_CLIENT') - if ssh_conn: - allowed_networks = [ IPNetwork(a) for a in gconf.allow_network.split(',') ] - client_ip = IPAddress(ssh_conn.split()[0]) - allowed = False - for nw in allowed_networks: - if client_ip in nw: - allowed = True - break - if not allowed: - raise GsyncdError("client IP address is not allowed") - - ffd = rconf.get('feedback_fd') - if ffd: - fcntl.fcntl(ffd, fcntl.F_SETFD, fcntl.FD_CLOEXEC) - - #normalize loglevel - lvl0 = gconf.log_level - if isinstance(lvl0, str): - lvl1 = lvl0.upper() - lvl2 = logging.getLevelName(lvl1) - # I have _never_ _ever_ seen such an utterly braindead - # error condition - if lvl2 == "Level " + lvl1: - raise GsyncdError('cannot recognize log level "%s"' % lvl0) - gconf.log_level = lvl2 - - if not privileged() and gconf.log_file_mbr: - gconf.log_file = gconf.log_file_mbr - - if checkpoint_change: - try: - GLogger._gsyncd_loginit(log_file=gconf.log_file, label='conf') - if confdata.op == 'set': - logging.info('checkpoint %s set' % confdata.val) - elif confdata.op == 'del': - logging.info('checkpoint info was reset') - except IOError: - if sys.exc_info()[1].errno == ENOENT: - # directory of log path is not present, - # which happens if we get here from - # a peer-multiplexed "config-set checkpoint" - # (as that directory is created only on the - # original node) - pass - else: - raise - return - - go_daemon = rconf['go_daemon'] - be_monitor = rconf.get('monitor') - - if not be_monitor and isinstance(remote, resource.SSH) and \ - go_daemon == 'should': - go_daemon = 'postconn' - log_file = None - else: - log_file = gconf.log_file - if be_monitor: - label = 'monitor' - elif remote: - #master - label = '' - else: - label = 'slave' - startup(go_daemon=go_daemon, log_file=log_file, label=label) - - if be_monitor: - return monitor() - - logging.info("syncing: %s" % " -> ".join(peers)) - resource.Popen.init_errhandler() - if remote: - go_daemon = remote.connect_remote(go_daemon=go_daemon) - if go_daemon: - startup(go_daemon=go_daemon, log_file=gconf.log_file) - # complete remote connection in child - remote.connect_remote(go_daemon='done') - local.connect() - if ffd: - os.close(ffd) - local.service_loop(*[r for r in [remote] if r]) - - -if __name__ == "__main__": - main() diff --git a/xlators/features/marker/utils/syncdaemon/libcxattr.py b/xlators/features/marker/utils/syncdaemon/libcxattr.py deleted file mode 100644 index f0a9d2292..000000000 --- a/xlators/features/marker/utils/syncdaemon/libcxattr.py +++ /dev/null @@ -1,72 +0,0 @@ -import os -from ctypes import * -from ctypes.util import find_library - -class Xattr(object): - """singleton that wraps the extended attribues system - interface for python using ctypes - - Just implement it to the degree we need it, in particular - - we need just the l*xattr variants, ie. we never want symlinks to be - followed - - don't need size discovery for getxattr, as we always know the exact - sizes we expect - """ - - libc = CDLL(find_library("libc")) - - @classmethod - def geterrno(cls): - return c_int.in_dll(cls.libc, 'errno').value - - @classmethod - def raise_oserr(cls): - errn = cls.geterrno() - raise OSError(errn, os.strerror(errn)) - - @classmethod - def _query_xattr(cls, path, siz, syscall, *a): - if siz: - buf = create_string_buffer('\0' * siz) - else: - buf = None - ret = getattr(cls.libc, syscall)(*((path,) + a + (buf, siz))) - if ret == -1: - cls.raise_oserr() - if siz: - return buf.raw[:ret] - else: - return ret - - @classmethod - def lgetxattr(cls, path, attr, siz=0): - return cls._query_xattr( path, siz, 'lgetxattr', attr) - - @classmethod - def llistxattr(cls, path, siz=0): - ret = cls._query_xattr(path, siz, 'llistxattr') - if isinstance(ret, str): - ret = ret.split('\0') - return ret - - @classmethod - def lsetxattr(cls, path, attr, val): - ret = cls.libc.lsetxattr(path, attr, val, len(val), 0) - if ret == -1: - cls.raise_oserr() - - @classmethod - def lremovexattr(cls, path, attr): - ret = cls.libc.lremovexattr(path, attr) - if ret == -1: - cls.raise_oserr() - - @classmethod - def llistxattr_buf(cls, path): - """listxattr variant with size discovery""" - size = cls.llistxattr(path) - if size == -1: - cls.raise_oserr() - if size == 0: - return [] - return cls.llistxattr(path, size) diff --git a/xlators/features/marker/utils/syncdaemon/master.py b/xlators/features/marker/utils/syncdaemon/master.py deleted file mode 100644 index f903f3059..000000000 --- a/xlators/features/marker/utils/syncdaemon/master.py +++ /dev/null @@ -1,961 +0,0 @@ -import os -import sys -import time -import stat -import random -import signal -import logging -import socket -import errno -import re -from errno import ENOENT, ENODATA, EPIPE -from threading import currentThread, Condition, Lock -from datetime import datetime -try: - from hashlib import md5 as md5 -except ImportError: - # py 2.4 - from md5 import new as md5 - -from gconf import gconf -from syncdutils import FreeObject, Thread, GsyncdError, boolify, \ - escape, unescape, select - -URXTIME = (-1, 0) - -# Utility functions to help us to get to closer proximity -# of the DRY principle (no, don't look for elevated or -# perspectivistic things here) - -def _xtime_now(): - t = time.time() - sec = int(t) - nsec = int((t - sec) * 1000000) - return (sec, nsec) - -def _volinfo_hook_relax_foreign(self): - volinfo_sys = self.get_sys_volinfo() - fgn_vi = volinfo_sys[self.KFGN] - if fgn_vi: - expiry = fgn_vi['timeout'] - int(time.time()) + 1 - logging.info('foreign volume info found, waiting %d sec for expiry' % \ - expiry) - time.sleep(expiry) - volinfo_sys = self.get_sys_volinfo() - self.volinfo_state, state_change = self.volinfo_state_machine(self.volinfo_state, - volinfo_sys) - if self.inter_master: - raise GsyncdError("cannot be intermediate master in special mode") - return (volinfo_sys, state_change) - - -# The API! - -def gmaster_builder(): - """produce the GMaster class variant corresponding - to sync mode""" - this = sys.modules[__name__] - modemixin = gconf.special_sync_mode - if not modemixin: - modemixin = 'normal' - logging.info('setting up master for %s sync mode' % modemixin) - modemixin = getattr(this, modemixin.capitalize() + 'Mixin') - sendmarkmixin = boolify(gconf.use_rsync_xattrs) and SendmarkRsyncMixin or SendmarkNormalMixin - purgemixin = boolify(gconf.ignore_deletes) and PurgeNoopMixin or PurgeNormalMixin - class _GMaster(GMasterBase, modemixin, sendmarkmixin, purgemixin): - pass - return _GMaster - - -# Mixin classes that implement the data format -# and logic particularities of the certain -# sync modes - -class NormalMixin(object): - """normal geo-rep behavior""" - - minus_infinity = URXTIME - - # following staticmethods ideally would be - # methods of an xtime object (in particular, - # implementing the hooks needed for comparison - # operators), but at this point we don't yet - # have a dedicated xtime class - - @staticmethod - def serialize_xtime(xt): - return "%d.%d" % tuple(xt) - - @staticmethod - def deserialize_xtime(xt): - return tuple(int(x) for x in xt.split(".")) - - @staticmethod - def native_xtime(xt): - return xt - - @staticmethod - def xtime_geq(xt0, xt1): - return xt0 >= xt1 - - def make_xtime_opts(self, is_master, opts): - if not 'create' in opts: - opts['create'] = is_master and not self.inter_master - if not 'default_xtime' in opts: - if is_master and self.inter_master: - opts['default_xtime'] = ENODATA - else: - opts['default_xtime'] = URXTIME - - def xtime_low(self, server, path, **opts): - xt = server.xtime(path, self.uuid) - if isinstance(xt, int) and xt != ENODATA: - return xt - if xt == ENODATA or xt < self.volmark: - if opts['create']: - xt = _xtime_now() - server.set_xtime(path, self.uuid, xt) - else: - xt = opts['default_xtime'] - return xt - - def keepalive_payload_hook(self, timo, gap): - # first grab a reference as self.volinfo - # can be changed in main thread - vi = self.volinfo - if vi: - # then have a private copy which we can mod - vi = vi.copy() - vi['timeout'] = int(time.time()) + timo - else: - # send keep-alives more frequently to - # avoid a delay in announcing our volume info - # to slave if it becomes established in the - # meantime - gap = min(10, gap) - return (vi, gap) - - def volinfo_hook(self): - volinfo_sys = self.get_sys_volinfo() - self.volinfo_state, state_change = self.volinfo_state_machine(self.volinfo_state, - volinfo_sys) - return (volinfo_sys, state_change) - - def xtime_reversion_hook(self, path, xtl, xtr): - if xtr > xtl: - raise GsyncdError("timestamp corruption for " + path) - - def need_sync(self, e, xte, xtrd): - return xte > xtrd - - def set_slave_xtime(self, path, mark): - self.slave.server.set_xtime(path, self.uuid, mark) - -class WrapupMixin(NormalMixin): - """a variant that differs from normal in terms - of ignoring non-indexed files""" - - @staticmethod - def make_xtime_opts(is_master, opts): - if not 'create' in opts: - opts['create'] = False - if not 'default_xtime' in opts: - opts['default_xtime'] = URXTIME - - @staticmethod - def keepalive_payload_hook(timo, gap): - return (None, gap) - - def volinfo_hook(self): - return _volinfo_hook_relax_foreign(self) - -class BlindMixin(object): - """Geo-rep flavor using vectored xtime. - - Coordinates are the master, slave uuid pair; - in master coordinate behavior is normal, - in slave coordinate we force synchronization - on any value difference (these are in disjunctive - relation, ie. if either orders the entry to be - synced, it shall be synced. - """ - - minus_infinity = (URXTIME, None) - - @staticmethod - def serialize_xtime(xt): - a = [] - for x in xt: - if not x: - x = ('None', '') - a.extend(x) - return '.'.join(str(n) for n in a) - - @staticmethod - def deserialize_xtime(xt): - a = xt.split(".") - a = (tuple(a[0:2]), tuple(a[3:4])) - b = [] - for p in a: - if p[0] == 'None': - p = None - else: - p = tuple(int(x) for x in p) - b.append(p) - return tuple(b) - - @staticmethod - def native_xtime(xt): - return xt[0] - - @staticmethod - def xtime_geq(xt0, xt1): - return (not xt1[0] or xt0[0] >= xt1[0]) and \ - (not xt1[1] or xt0[1] >= xt1[1]) - - @property - def ruuid(self): - if self.volinfo_r: - return self.volinfo_r['uuid'] - - @staticmethod - def make_xtime_opts(is_master, opts): - if not 'create' in opts: - opts['create'] = is_master - if not 'default_xtime' in opts: - opts['default_xtime'] = URXTIME - - def xtime_low(self, server, path, **opts): - xtd = server.xtime_vec(path, self.uuid, self.ruuid) - if isinstance(xtd, int): - return xtd - xt = (xtd[self.uuid], xtd[self.ruuid]) - if not xt[1] and (not xt[0] or xt[0] < self.volmark): - if opts['create']: - # not expected, but can happen if file originates - # from interrupted gsyncd transfer - logging.warn('have to fix up missing xtime on ' + path) - xt0 = _xtime_now() - server.set_xtime(path, self.uuid, xt0) - else: - xt0 = opts['default_xtime'] - xt = (xt0, xt[1]) - return xt - - @staticmethod - def keepalive_payload_hook(timo, gap): - return (None, gap) - - def volinfo_hook(self): - res = _volinfo_hook_relax_foreign(self) - volinfo_r_new = self.slave.server.native_volume_info() - if volinfo_r_new['retval']: - raise GsyncdError("slave is corrupt") - if getattr(self, 'volinfo_r', None): - if self.volinfo_r['uuid'] != volinfo_r_new['uuid']: - raise GsyncdError("uuid mismatch on slave") - self.volinfo_r = volinfo_r_new - return res - - def xtime_reversion_hook(self, path, xtl, xtr): - if not isinstance(xtr[0], int) and \ - (isinstance(xtl[0], int) or xtr[0] > xtl[0]): - raise GsyncdError("timestamp corruption for " + path) - - def need_sync(self, e, xte, xtrd): - if xte[0]: - if not xtrd[0] or xte[0] > xtrd[0]: - # there is outstanding diff at 0th pos, - # we can short-cut to true - return True - # we arrived to this point by either of these - # two possiblilites: - # - no outstanding difference at 0th pos, - # wanna see 1st pos if he raises veto - # against "no need to sync" proposal - # - no data at 0th pos, 1st pos will have - # to decide (due to xtime assignment, - # in this case 1st pos does carry data - # -- iow, if 1st pos did not have data, - # and 0th neither, 0th would have been - # force-feeded) - if not xte[1]: - # no data, no veto - return False - # the hard work: for 1st pos, - # the conduct is fetch corresponding - # slave data and do a "blind" comparison - # (ie. do not care who is newer, we trigger - # sync on non-identical xitmes) - xtr = self.xtime(e, self.slave) - return isinstance(xtr, int) or xte[1] != xtr[1] - - def set_slave_xtime(self, path, mark): - xtd = {} - for (u, t) in zip((self.uuid, self.ruuid), mark): - if t: - xtd[u] = t - self.slave.server.set_xtime_vec(path, xtd) - - -# Further mixins for certain tunable behaviors - -class SendmarkNormalMixin(object): - - def sendmark_regular(self, *a, **kw): - return self.sendmark(*a, **kw) - -class SendmarkRsyncMixin(object): - - def sendmark_regular(self, *a, **kw): - pass - - -class PurgeNormalMixin(object): - - def purge_missing(self, path, names): - self.slave.server.purge(path, names) - -class PurgeNoopMixin(object): - - def purge_missing(self, path, names): - pass - - - -class GMasterBase(object): - """abstract class impementling master role""" - - KFGN = 0 - KNAT = 1 - - def get_sys_volinfo(self): - """query volume marks on fs root - - err out on multiple foreign masters - """ - fgn_vis, nat_vi = self.master.server.foreign_volume_infos(), \ - self.master.server.native_volume_info() - fgn_vi = None - if fgn_vis: - if len(fgn_vis) > 1: - raise GsyncdError("cannot work with multiple foreign masters") - fgn_vi = fgn_vis[0] - return fgn_vi, nat_vi - - @property - def uuid(self): - if self.volinfo: - return self.volinfo['uuid'] - - @property - def volmark(self): - if self.volinfo: - return self.volinfo['volume_mark'] - - @property - def inter_master(self): - """decide if we are an intermediate master - in a cascading setup - """ - return self.volinfo_state[self.KFGN] and True or False - - def xtime(self, path, *a, **opts): - """get amended xtime - - as of amending, we can create missing xtime, or - determine a valid value if what we get is expired - (as of the volume mark expiry); way of amendig - depends on @opts and on subject of query (master - or slave). - """ - if a: - rsc = a[0] - else: - rsc = self.master - self.make_xtime_opts(rsc == self.master, opts) - return self.xtime_low(rsc.server, path, **opts) - - def __init__(self, master, slave): - self.master = master - self.slave = slave - self.jobtab = {} - self.syncer = Syncer(slave) - # crawls vs. turns: - # - self.crawls is simply the number of crawl() invocations on root - # - one turn is a maximal consecutive sequence of crawls so that each - # crawl in it detects a change to be synced - # - self.turns is the number of turns since start - # - self.total_turns is a limit so that if self.turns reaches it, then - # we exit (for diagnostic purposes) - # so, eg., if the master fs changes unceasingly, self.turns will remain 0. - self.crawls = 0 - self.turns = 0 - self.total_turns = int(gconf.turns) - self.lastreport = {'crawls': 0, 'turns': 0} - self.start = None - self.change_seen = None - self.syncTime=0 - self.lastSyncTime=0 - self.crawlStartTime=0 - self.crawlTime=0 - self.filesSynced=0 - self.bytesSynced=0 - # the authoritative (foreign, native) volinfo pair - # which lets us deduce what to do when we refetch - # the volinfos from system - uuid_preset = getattr(gconf, 'volume_id', None) - self.volinfo_state = (uuid_preset and {'uuid': uuid_preset}, None) - # the actual volinfo we make use of - self.volinfo = None - self.terminate = False - self.checkpoint_thread = None - - @classmethod - def _checkpt_param(cls, chkpt, prm, xtimish=True): - """use config backend to lookup a parameter belonging to - checkpoint @chkpt""" - cprm = getattr(gconf, 'checkpoint_' + prm, None) - if not cprm: - return - chkpt_mapped, val = cprm.split(':', 1) - if unescape(chkpt_mapped) != chkpt: - return - if xtimish: - val = cls.deserialize_xtime(val) - return val - - @classmethod - def _set_checkpt_param(cls, chkpt, prm, val, xtimish=True): - """use config backend to store a parameter associated - with checkpoint @chkpt""" - if xtimish: - val = cls.serialize_xtime(val) - gconf.configinterface.set('checkpoint_' + prm, "%s:%s" % (escape(chkpt), val)) - - @staticmethod - def humantime(*tpair): - """format xtime-like (sec, nsec) pair to human readable format""" - ts = datetime.fromtimestamp(float('.'.join(str(n) for n in tpair))).\ - strftime("%Y-%m-%d %H:%M:%S") - if len(tpair) > 1: - ts += '.' + str(tpair[1]) - return ts - - def get_extra_info(self): - str_info="\nFile synced : %d" %(self.filesSynced) - str_info+="\nBytes Synced : %d KB" %(self.syncer.bytesSynced) - str_info+="\nSync Time : %f seconds" %(self.syncTime) - self.crawlTime=datetime.now()-self.crawlStartTime - years , days =divmod(self.crawlTime.days,365.25) - years=int(years) - days=int(days) - - date="" - m, s = divmod(self.crawlTime.seconds, 60) - h, m = divmod(m, 60) - - if years!=0 : - date+=str(years)+" year " - if days!=0 : - date+=str(days)+" day " - if h!=0 : - date+=str(h)+" H : " - if m!=0 or h!=0 : - date+=str(m)+" M : " - - date+=str(s)+" S" - self.crawlTime=date - str_info+="\nCrawl Time : %s" %(str(self.crawlTime)) - str_info+="\n\0" - return str_info - - def checkpt_service(self, chan, chkpt, tgt): - """checkpoint service loop - - monitor and verify checkpoint status for @chkpt, and listen - for incoming requests for whom we serve a pretty-formatted - status report""" - if not chkpt: - # dummy loop for the case when there is no checkpt set - while True: - select([chan], [], []) - conn, _ = chan.accept() - conn.send(self.get_extra_info()) - conn.close() - completed = self._checkpt_param(chkpt, 'completed', xtimish=False) - if completed: - completed = tuple(int(x) for x in completed.split('.')) - while True: - s,_,_ = select([chan], [], [], (not completed) and 5 or None) - # either request made and we re-check to not - # give back stale data, or we still hunting for completion - if self.native_xtime(tgt) and self.native_xtime(tgt) < self.volmark: - # indexing has been reset since setting the checkpoint - status = "is invalid" - else: - xtr = self.xtime('.', self.slave) - if isinstance(xtr, int): - raise GsyncdError("slave root directory is unaccessible (%s)", - os.strerror(xtr)) - ncompleted = self.xtime_geq(xtr, tgt) - if completed and not ncompleted: # stale data - logging.warn("completion time %s for checkpoint %s became stale" % \ - (self.humantime(*completed), chkpt)) - completed = None - gconf.confdata.delete('checkpoint-completed') - if ncompleted and not completed: # just reaching completion - completed = "%.6f" % time.time() - self._set_checkpt_param(chkpt, 'completed', completed, xtimish=False) - completed = tuple(int(x) for x in completed.split('.')) - logging.info("checkpoint %s completed" % chkpt) - status = completed and \ - "completed at " + self.humantime(completed[0]) or \ - "not reached yet" - if s: - conn = None - try: - conn, _ = chan.accept() - try: - conn.send(" | checkpoint %s %s %s" % (chkpt, status,self.get_extra_info())) - except: - exc = sys.exc_info()[1] - if (isinstance(exc, OSError) or isinstance(exc, IOError)) and \ - exc.errno == EPIPE: - logging.debug('checkpoint client disconnected') - else: - raise - finally: - if conn: - conn.close() - - def start_checkpoint_thread(self): - """prepare and start checkpoint service""" - if self.checkpoint_thread or not ( - getattr(gconf, 'state_socket_unencoded', None) and getattr(gconf, 'socketdir', None) - ): - return - chan = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) - state_socket = os.path.join(gconf.socketdir, md5(gconf.state_socket_unencoded).hexdigest() + ".socket") - try: - os.unlink(state_socket) - except: - if sys.exc_info()[0] == OSError: - pass - chan.bind(state_socket) - chan.listen(1) - checkpt_tgt = None - if gconf.checkpoint: - checkpt_tgt = self._checkpt_param(gconf.checkpoint, 'target') - if not checkpt_tgt: - checkpt_tgt = self.xtime('.') - if isinstance(checkpt_tgt, int): - raise GsyncdError("master root directory is unaccessible (%s)", - os.strerror(checkpt_tgt)) - self._set_checkpt_param(gconf.checkpoint, 'target', checkpt_tgt) - logging.debug("checkpoint target %s has been determined for checkpoint %s" % \ - (repr(checkpt_tgt), gconf.checkpoint)) - t = Thread(target=self.checkpt_service, args=(chan, gconf.checkpoint, checkpt_tgt)) - t.start() - self.checkpoint_thread = t - - def crawl_loop(self): - """start the keep-alive thread and iterate .crawl""" - timo = int(gconf.timeout or 0) - if timo > 0: - def keep_alive(): - while True: - vi, gap = self.keepalive_payload_hook(timo, timo * 0.5) - self.slave.server.keep_alive(vi) - time.sleep(gap) - t = Thread(target=keep_alive) - t.start() - self.lastreport['time'] = time.time() - self.crawlStartTime=datetime.now() - while not self.terminate: - self.crawl() - - def add_job(self, path, label, job, *a, **kw): - """insert @job function to job table at @path with @label""" - if self.jobtab.get(path) == None: - self.jobtab[path] = [] - self.jobtab[path].append((label, a, lambda : job(*a, **kw))) - - def add_failjob(self, path, label): - """invoke .add_job with a job that does nothing just fails""" - logging.debug('salvaged: ' + label) - self.add_job(path, label, lambda: False) - - def wait(self, path, *args): - """perform jobs registered for @path - - Reset jobtab entry for @path, - determine success as the conjuction of - success of all the jobs. In case of - success, call .sendmark on @path - """ - jobs = self.jobtab.pop(path, []) - succeed = True - for j in jobs: - ret = j[-1]() - if not ret: - succeed = False - if succeed: - self.sendmark(path, *args) - return succeed - - def sendmark(self, path, mark, adct=None): - """update slave side xtime for @path to master side xtime - - also can send a setattr payload (see Server.setattr). - """ - if adct: - self.slave.server.setattr(path, adct) - self.set_slave_xtime(path, mark) - - @staticmethod - def volinfo_state_machine(volinfo_state, volinfo_sys): - """compute new volinfo_state from old one and incoming - as of current system state, also indicating if there was a - change regarding which volume mark is the authoritative one - - @volinfo_state, @volinfo_sys are pairs of volume mark dicts - (foreign, native). - - Note this method is marked as static, ie. the computation is - pure, without reliance on any excess implicit state. State - transitions which are deemed as ambiguous or banned will raise - an exception. - - """ - # store the value below "boxed" to emulate proper closures - # (variables of the enclosing scope are available inner functions - # provided they are no reassigned; mutation is OK). - param = FreeObject(relax_mismatch = False, state_change = None, index=-1) - def select_vi(vi0, vi): - param.index += 1 - if vi and (not vi0 or vi0['uuid'] == vi['uuid']): - if not vi0 and not param.relax_mismatch: - param.state_change = param.index - # valid new value found; for the rest, we are graceful about - # uuid mismatch - param.relax_mismatch = True - return vi - if vi0 and vi and vi0['uuid'] != vi['uuid'] and not param.relax_mismatch: - # uuid mismatch for master candidate, bail out - raise GsyncdError("aborting on uuid change from %s to %s" % \ - (vi0['uuid'], vi['uuid'])) - # fall back to old - return vi0 - newstate = tuple(select_vi(*vip) for vip in zip(volinfo_state, volinfo_sys)) - srep = lambda vi: vi and vi['uuid'][0:8] - logging.debug('(%s, %s) << (%s, %s) -> (%s, %s)' % \ - tuple(srep(vi) for vi in volinfo_state + volinfo_sys + newstate)) - return newstate, param.state_change - - def crawl(self, path='.', xtl=None): - """crawling... - - Standing around - All the right people - Crawling - Tennis on Tuesday - The ladder is long - It is your nature - You've gotta suntan - Football on Sunday - Society boy - - Recursively walk the master side tree and check if updates are - needed due to xtime differences. One invocation of crawl checks - children of @path and do a recursive enter only on - those directory children where there is an update needed. - - Way of updates depend on file type: - - for symlinks, sync them directy and synchronously - - for regular children, register jobs for @path (cf. .add_job) to start - and wait on their rsync - - for directory children, register a job for @path which waits (.wait) - on jobs for the given child - (other kind of filesystem nodes are not considered) - - Those slave side children which do not exist on master are simply - purged (see Server.purge). - - Behavior is fault tolerant, synchronization is adaptive: if some action fails, - just go on relentlessly, adding a fail job (see .add_failjob) which will prevent - the .sendmark on @path, so when the next crawl will arrive to @path it will not - see it as up-to-date and will try to sync it again. While this semantics can be - supported by funky design principles (http://c2.com/cgi/wiki?LazinessImpatienceHubris), - the ultimate reason which excludes other possibilities is simply transience: we cannot - assert that the file systems (master / slave) underneath do not change and actions - taken upon some condition will not lose their context by the time they are performed. - """ - if path == '.': - if self.start: - self.crawls += 1 - logging.debug("... crawl #%d done, took %.6f seconds" % \ - (self.crawls, time.time() - self.start)) - time.sleep(1) - self.start = time.time() - should_display_info = self.start - self.lastreport['time'] >= 60 - if should_display_info: - logging.info("completed %d crawls, %d turns", - self.crawls - self.lastreport['crawls'], - self.turns - self.lastreport['turns']) - self.lastreport.update(crawls = self.crawls, - turns = self.turns, - time = self.start) - volinfo_sys, state_change = self.volinfo_hook() - if self.inter_master: - self.volinfo = volinfo_sys[self.KFGN] - else: - self.volinfo = volinfo_sys[self.KNAT] - if state_change == self.KFGN or (state_change == self.KNAT and not self.inter_master): - logging.info('new master is %s', self.uuid) - if self.volinfo: - logging.info("%s master with volume id %s ..." % \ - (self.inter_master and "intermediate" or "primary", - self.uuid)) - if state_change == self.KFGN: - gconf.configinterface.set('volume_id', self.uuid) - if self.volinfo: - if self.volinfo['retval']: - raise GsyncdError ("master is corrupt") - self.start_checkpoint_thread() - else: - if should_display_info or self.crawls == 0: - if self.inter_master: - logging.info("waiting for being synced from %s ..." % \ - self.volinfo_state[self.KFGN]['uuid']) - else: - logging.info("waiting for volume info ...") - return - logging.debug("entering " + path) - if not xtl: - xtl = self.xtime(path) - if isinstance(xtl, int): - self.add_failjob(path, 'no-local-node') - return - xtr = self.xtime(path, self.slave) - if isinstance(xtr, int): - if xtr != ENOENT: - self.slave.server.purge(path) - try: - self.slave.server.mkdir(path) - except OSError: - self.add_failjob(path, 'no-remote-node') - return - xtr = self.minus_infinity - else: - self.xtime_reversion_hook(path, xtl, xtr) - if xtl == xtr: - if path == '.' and self.change_seen: - self.turns += 1 - self.change_seen = False - if self.total_turns: - logging.info("finished turn #%s/%s" % \ - (self.turns, self.total_turns)) - if self.turns == self.total_turns: - logging.info("reached turn limit") - self.terminate = True - return - if path == '.': - self.change_seen = True - try: - dem = self.master.server.entries(path) - except OSError: - self.add_failjob(path, 'local-entries-fail') - return - random.shuffle(dem) - try: - des = self.slave.server.entries(path) - except OSError: - self.slave.server.purge(path) - try: - self.slave.server.mkdir(path) - des = self.slave.server.entries(path) - except OSError: - self.add_failjob(path, 'remote-entries-fail') - return - dd = set(des) - set(dem) - if dd: - self.purge_missing(path, dd) - chld = [] - for e in dem: - e = os.path.join(path, e) - xte = self.xtime(e) - if isinstance(xte, int): - logging.warn("irregular xtime for %s: %s" % (e, errno.errorcode[xte])) - elif self.need_sync(e, xte, xtr): - chld.append((e, xte)) - def indulgently(e, fnc, blame=None): - if not blame: - blame = path - try: - return fnc(e) - except (IOError, OSError): - ex = sys.exc_info()[1] - if ex.errno == ENOENT: - logging.warn("salvaged ENOENT for " + e) - self.add_failjob(blame, 'by-indulgently') - return False - else: - raise - for e, xte in chld: - st = indulgently(e, lambda e: os.lstat(e)) - if st == False: - continue - mo = st.st_mode - adct = {'own': (st.st_uid, st.st_gid)} - if stat.S_ISLNK(mo): - if indulgently(e, lambda e: self.slave.server.symlink(os.readlink(e), e)) == False: - continue - self.sendmark(e, xte, adct) - elif stat.S_ISREG(mo): - logging.debug("syncing %s ..." % e) - pb = self.syncer.add(e) - timeA=datetime.now() - def regjob(e, xte, pb): - if pb.wait(): - logging.debug("synced " + e) - self.sendmark_regular(e, xte) - - timeB=datetime.now() - self.lastSyncTime=timeB-timeA - self.syncTime=(self.syncTime+self.lastSyncTime.microseconds)/(10.0**6) - self.filesSynced=self.filesSynced+1 - return True - else: - logging.warn("failed to sync " + e) - self.add_job(path, 'reg', regjob, e, xte, pb) - elif stat.S_ISDIR(mo): - adct['mode'] = mo - if indulgently(e, lambda e: (self.add_job(path, 'cwait', self.wait, e, xte, adct), - self.crawl(e, xte), - True)[-1], blame=e) == False: - continue - else: - # ignore fifos, sockets and special files - pass - if path == '.': - self.wait(path, xtl) - -class BoxClosedErr(Exception): - pass - -class PostBox(list): - """synchronized collection for storing things thought of as "requests" """ - - def __init__(self, *a): - list.__init__(self, *a) - # too bad Python stdlib does not have read/write locks... - # it would suffivce to grab the lock in .append as reader, in .close as writer - self.lever = Condition() - self.open = True - self.done = False - - def wait(self): - """wait on requests to be processed""" - self.lever.acquire() - if not self.done: - self.lever.wait() - self.lever.release() - return self.result - - def wakeup(self, data): - """wake up requestors with the result""" - self.result = data - self.lever.acquire() - self.done = True - self.lever.notifyAll() - self.lever.release() - - def append(self, e): - """post a request""" - self.lever.acquire() - if not self.open: - raise BoxClosedErr - list.append(self, e) - self.lever.release() - - def close(self): - """prohibit the posting of further requests""" - self.lever.acquire() - self.open = False - self.lever.release() - -class Syncer(object): - """a staged queue to relay rsync requests to rsync workers - - By "staged queue" its meant that when a consumer comes to the - queue, it takes _all_ entries, leaving the queue empty. - (I don't know if there is an official term for this pattern.) - - The queue uses a PostBox to accumulate incoming items. - When a consumer (rsync worker) comes, a new PostBox is - set up and the old one is passed on to the consumer. - - Instead of the simplistic scheme of having one big lock - which synchronizes both the addition of new items and - PostBox exchanges, use a separate lock to arbitrate consumers, - and rely on PostBox's synchronization mechanisms take - care about additions. - - There is a corner case racy situation, producers vs. consumers, - which is not handled by this scheme: namely, when the PostBox - exchange occurs in between being passed to the producer for posting - and the post placement. But that's what Postbox.close is for: - such a posting will find the PostBox closed, in which case - the producer can re-try posting against the actual PostBox of - the queue. - - To aid accumlation of items in the PostBoxen before grabbed - by an rsync worker, the worker goes to sleep a bit after - each completed syncjob. - """ - - def __init__(self, slave): - """spawn worker threads""" - self.slave = slave - self.lock = Lock() - self.pb = PostBox() - self.bytesSynced=0 - for i in range(int(gconf.sync_jobs)): - t = Thread(target=self.syncjob) - t.start() - - def syncjob(self): - """the life of a worker""" - while True: - pb = None - while True: - self.lock.acquire() - if self.pb: - pb, self.pb = self.pb, PostBox() - self.lock.release() - if pb: - break - time.sleep(0.5) - pb.close() - po = self.slave.rsync(pb) - if po.returncode == 0: - regEx=re.search('\ *total\ *transferred\ *file\ *size:\ *(\d+)\ *bytes\ *',po.stdout.read(),re.IGNORECASE) - if regEx: - self.bytesSynced+=(int(regEx.group(1)))/1024 - ret = True - elif po.returncode in (23, 24): - # partial transfer (cf. rsync(1)), that's normal - ret = False - else: - po.errfail() - pb.wakeup(ret) - - def add(self, e): - while True: - pb = self.pb - try: - pb.append(e) - return pb - except BoxClosedErr: - pass diff --git a/xlators/features/marker/utils/syncdaemon/monitor.py b/xlators/features/marker/utils/syncdaemon/monitor.py deleted file mode 100644 index b8956dcc2..000000000 --- a/xlators/features/marker/utils/syncdaemon/monitor.py +++ /dev/null @@ -1,129 +0,0 @@ -import os -import sys -import time -import signal -import logging -from gconf import gconf -from syncdutils import update_file, select, waitpid, set_term_handler - -class Monitor(object): - """class which spawns and manages gsyncd workers""" - - def __init__(self): - self.state = None - - def set_state(self, state): - """set the state that can be used by external agents - like glusterd for status reporting""" - if state == self.state: - return - self.state = state - logging.info('new state: %s' % state) - if getattr(gconf, 'state_file', None): - update_file(gconf.state_file, lambda f: f.write(state + '\n')) - - def monitor(self): - """the monitor loop - - Basic logic is a blantantly simple blunt heuristics: - if spawned client survives 60 secs, it's considered OK. - This servers us pretty well as it's not vulneralbe to - any kind of irregular behavior of the child... - - ... well, except for one: if children is hung up on - waiting for some event, it can survive aeons, still - will be defunct. So we tweak the above logic to - expect the worker to send us a signal within 60 secs - (in the form of closing its end of a pipe). The worker - does this when it's done with the setup stage - ready to enter the service loop (note it's the setup - stage which is vulnerable to hangs -- the full - blown worker blows up on EPIPE if the net goes down, - due to the keep-alive thread) - """ - def sigcont_handler(*a): - """ - Re-init logging and send group kill signal - """ - md = gconf.log_metadata - logging.shutdown() - lcls = logging.getLoggerClass() - lcls.setup(label=md.get('saved_label'), **md) - pid = os.getpid() - os.kill(-pid, signal.SIGUSR1) - signal.signal(signal.SIGUSR1, lambda *a: ()) - signal.signal(signal.SIGCONT, sigcont_handler) - - argv = sys.argv[:] - for o in ('-N', '--no-daemon', '--monitor'): - while o in argv: - argv.remove(o) - argv.extend(('-N', '-p', '')) - argv.insert(0, os.path.basename(sys.executable)) - - self.set_state('starting...') - ret = 0 - def nwait(p, o=0): - p2, r = waitpid(p, o) - if not p2: - return - return r - def exit_signalled(s): - """ child teminated due to receipt of SIGUSR1 """ - return (os.WIFSIGNALED(s) and (os.WTERMSIG(s) == signal.SIGUSR1)) - def exit_status(s): - if os.WIFEXITED(s): - return os.WEXITSTATUS(s) - return 1 - conn_timeout = int(gconf.connection_timeout) - while ret in (0, 1): - logging.info('-' * conn_timeout) - logging.info('starting gsyncd worker') - pr, pw = os.pipe() - cpid = os.fork() - if cpid == 0: - os.close(pr) - os.execv(sys.executable, argv + ['--feedback-fd', str(pw)]) - os.close(pw) - t0 = time.time() - so = select((pr,), (), (), conn_timeout)[0] - os.close(pr) - if so: - ret = nwait(cpid, os.WNOHANG) - if ret != None: - logging.debug("worker died before establishing connection") - else: - logging.debug("worker seems to be connected (?? racy check)") - while time.time() < t0 + conn_timeout: - ret = nwait(cpid, os.WNOHANG) - if ret != None: - logging.debug("worker died in startup phase") - break - time.sleep(1) - else: - logging.debug("worker not confirmed in %d sec, aborting it" % \ - conn_timeout) - # relax one SIGTERM by setting a handler that sets back - # standard handler - set_term_handler(lambda *a: set_term_handler()) - # give a chance to graceful exit - os.kill(-os.getpid(), signal.SIGTERM) - time.sleep(1) - os.kill(cpid, signal.SIGKILL) - ret = nwait(cpid) - if ret == None: - self.set_state('OK') - ret = nwait(cpid) - if exit_signalled(ret): - ret = 0 - else: - ret = exit_status(ret) - if ret in (0,1): - self.set_state('faulty') - time.sleep(10) - self.set_state('inconsistent') - return ret - -def monitor(): - """oh yeah, actually Monitor is used as singleton, too""" - return Monitor().monitor() diff --git a/xlators/features/marker/utils/syncdaemon/repce.py b/xlators/features/marker/utils/syncdaemon/repce.py deleted file mode 100644 index 755fb61df..000000000 --- a/xlators/features/marker/utils/syncdaemon/repce.py +++ /dev/null @@ -1,225 +0,0 @@ -import os -import sys -import time -import logging -from threading import Condition -try: - import thread -except ImportError: - # py 3 - import _thread as thread -try: - from Queue import Queue -except ImportError: - # py 3 - from queue import Queue -try: - import cPickle as pickle -except ImportError: - # py 3 - import pickle - -from syncdutils import Thread, select - -pickle_proto = -1 -repce_version = 1.0 - -def ioparse(i, o): - if isinstance(i, int): - i = os.fdopen(i) - # rely on duck typing for recognizing - # streams as that works uniformly - # in py2 and py3 - if hasattr(o, 'fileno'): - o = o.fileno() - return (i, o) - -def send(out, *args): - """pickle args and write out wholly in one syscall - - ie. not use the ability of pickle to dump directly to - a stream, as that would potentially mess up messages - by interleaving them - """ - os.write(out, pickle.dumps(args, pickle_proto)) - -def recv(inf): - """load an object from input stream""" - return pickle.load(inf) - - -class RepceServer(object): - """RePCe is Hungarian for canola, http://hu.wikipedia.org/wiki/Repce - - ... also our homebrewed RPC backend where the transport layer is - reduced to a pair of filehandles. - - This is the server component. - """ - - def __init__(self, obj, i, o, wnum=6): - """register a backend object .obj to which incoming messages - are dispatched, also incoming/outcoming streams - """ - self.obj = obj - self.inf, self.out = ioparse(i, o) - self.wnum = wnum - self.q = Queue() - - def service_loop(self): - """fire up worker threads, get messages and dispatch among them""" - for i in range(self.wnum): - t = Thread(target=self.worker) - t.start() - try: - while True: - self.q.put(recv(self.inf)) - except EOFError: - logging.info("terminating on reaching EOF.") - - def worker(self): - """life of a worker - - Get message, extract its id, method name and arguments - (kwargs not supported), call method on .obj. - Send back message id + return value. - If method call throws an exception, rescue it, and send - back the exception as result (with flag marking it as - exception). - """ - while True: - in_data = self.q.get(True) - rid = in_data[0] - rmeth = in_data[1] - exc = False - if rmeth == '__repce_version__': - res = repce_version - else: - try: - res = getattr(self.obj, rmeth)(*in_data[2:]) - except: - res = sys.exc_info()[1] - exc = True - logging.exception("call failed: ") - send(self.out, rid, exc, res) - - -class RepceJob(object): - """class representing message status we can use - for waiting on reply""" - - def __init__(self, cbk): - """ - - .rid: (process-wise) unique id - - .cbk: what we do upon receiving reply - """ - self.rid = (os.getpid(), thread.get_ident(), time.time()) - self.cbk = cbk - self.lever = Condition() - self.done = False - - def __repr__(self): - return ':'.join([str(x) for x in self.rid]) - - def wait(self): - self.lever.acquire() - if not self.done: - self.lever.wait() - self.lever.release() - return self.result - - def wakeup(self, data): - self.result = data - self.lever.acquire() - self.done = True - self.lever.notify() - self.lever.release() - - -class RepceClient(object): - """RePCe is Hungarian for canola, http://hu.wikipedia.org/wiki/Repce - - ... also our homebrewed RPC backend where the transport layer is - reduced to a pair of filehandles. - - This is the client component. - """ - - def __init__(self, i, o): - self.inf, self.out = ioparse(i, o) - self.jtab = {} - t = Thread(target = self.listen) - t.start() - - def listen(self): - while True: - select((self.inf,), (), ()) - rid, exc, res = recv(self.inf) - rjob = self.jtab.pop(rid) - if rjob.cbk: - rjob.cbk(rjob, [exc, res]) - - def push(self, meth, *args, **kw): - """wrap arguments in a RepceJob, send them to server - and return the RepceJob - - @cbk to pass on RepceJob can be given as kwarg. - """ - cbk = kw.get('cbk') - if not cbk: - def cbk(rj, res): - if res[0]: - raise res[1] - rjob = RepceJob(cbk) - self.jtab[rjob.rid] = rjob - logging.debug("call %s %s%s ..." % (repr(rjob), meth, repr(args))) - send(self.out, rjob.rid, meth, *args) - return rjob - - def __call__(self, meth, *args): - """RePCe client is callabe, calling it implements a synchronous remote call - - We do a .push with a cbk which does a wakeup upon receiving anwser, then wait - on the RepceJob. - """ - rjob = self.push(meth, *args, **{'cbk': lambda rj, res: rj.wakeup(res)}) - exc, res = rjob.wait() - if exc: - logging.error('call %s (%s) failed on peer with %s' % (repr(rjob), meth, str(type(res).__name__))) - raise res - logging.debug("call %s %s -> %s" % (repr(rjob), meth, repr(res))) - return res - - class mprx(object): - """method proxy, standard trick to implement rubyesque method_missing - in Python - - A class is a closure factory, you know what I mean, or go read some SICP. - """ - def __init__(self, ins, meth): - self.ins = ins - self.meth = meth - - def __call__(self, *a): - return self.ins(self.meth, *a) - - def __getattr__(self, meth): - """this implements transparent method dispatch to remote object, - so that you don't need to call the RepceClient instance like - - rclient('how_old_are_you_if_born_in', 1979) - - but you can make it into an ordinary method call like - - rclient.how_old_are_you_if_born_in(1979) - """ - return self.mprx(self, meth) - - def __version__(self): - """used in handshake to verify compatibility""" - d = {'proto': self('__repce_version__')} - try: - d['object'] = self('version') - except AttributeError: - pass - return d diff --git a/xlators/features/marker/utils/syncdaemon/resource.py b/xlators/features/marker/utils/syncdaemon/resource.py deleted file mode 100644 index 73102fbcb..000000000 --- a/xlators/features/marker/utils/syncdaemon/resource.py +++ /dev/null @@ -1,972 +0,0 @@ -import re -import os -import sys -import stat -import time -import fcntl -import errno -import struct -import socket -import logging -import tempfile -import threading -import subprocess -from errno import EEXIST, ENOENT, ENODATA, ENOTDIR, ELOOP, EISDIR -from select import error as SelectError - -from gconf import gconf -import repce -from repce import RepceServer, RepceClient -from master import gmaster_builder -import syncdutils -from syncdutils import GsyncdError, select, privileged, boolify - -UrlRX = re.compile('\A(\w+)://([^ *?[]*)\Z') -HostRX = re.compile('[a-z\d](?:[a-z\d.-]*[a-z\d])?', re.I) -UserRX = re.compile("[\w!\#$%&'*+-\/=?^_`{|}~]+") - -def sup(x, *a, **kw): - """a rubyesque "super" for python ;) - - invoke caller method in parent class with given args. - """ - return getattr(super(type(x), x), sys._getframe(1).f_code.co_name)(*a, **kw) - -def desugar(ustr): - """transform sugared url strings to standard <scheme>://<urlbody> form - - parsing logic enforces the constraint that sugared forms should contatin - a ':' or a '/', which ensures that sugared urls do not conflict with - gluster volume names. - """ - m = re.match('([^:]*):(.*)', ustr) - if m: - if not m.groups()[0]: - return "gluster://localhost" + ustr - elif '@' in m.groups()[0] or re.search('[:/]', m.groups()[1]): - return "ssh://" + ustr - else: - return "gluster://" + ustr - else: - if ustr[0] != '/': - raise GsyncdError("cannot resolve sugared url '%s'" % ustr) - ap = os.path.normpath(ustr) - if ap.startswith('//'): - ap = ap[1:] - return "file://" + ap - -def gethostbyname(hnam): - """gethostbyname wrapper""" - try: - return socket.gethostbyname(hnam) - except socket.gaierror: - ex = sys.exc_info()[1] - raise GsyncdError("failed to resolve %s: %s" % \ - (hnam, ex.strerror)) - -def parse_url(ustr): - """instantiate an url object by scheme-to-class dispatch - - The url classes taken into consideration are the ones in - this module whose names are full-caps. - """ - m = UrlRX.match(ustr) - if not m: - ustr = desugar(ustr) - m = UrlRX.match(ustr) - if not m: - raise GsyncdError("malformed url") - sch, path = m.groups() - this = sys.modules[__name__] - if not hasattr(this, sch.upper()): - raise GsyncdError("unknown url scheme " + sch) - return getattr(this, sch.upper())(path) - - -class _MetaXattr(object): - """singleton class, a lazy wrapper around the - libcxattr module - - libcxattr (a heavy import due to ctypes) is - loaded only when when the single - instance is tried to be used. - - This reduces runtime for those invocations - which do not need filesystem manipulation - (eg. for config, url parsing) - """ - - def __getattr__(self, meth): - from libcxattr import Xattr as LXattr - xmeth = [ m for m in dir(LXattr) if m[0] != '_' ] - if not meth in xmeth: - return - for m in xmeth: - setattr(self, m, getattr(LXattr, m)) - return getattr(self, meth) - -Xattr = _MetaXattr() - - -class Popen(subprocess.Popen): - """customized subclass of subprocess.Popen with a ring - buffer for children error output""" - - @classmethod - def init_errhandler(cls): - """start the thread which handles children's error output""" - cls.errstore = {} - def tailer(): - while True: - errstore = cls.errstore.copy() - try: - poe, _ ,_ = select([po.stderr for po in errstore], [], [], 1) - except (ValueError, SelectError): - continue - for po in errstore: - if po.stderr not in poe: - continue - po.lock.acquire() - try: - if po.on_death_row: - continue - la = errstore[po] - try: - fd = po.stderr.fileno() - except ValueError: # file is already closed - continue - l = os.read(fd, 1024) - if not l: - continue - tots = len(l) - for lx in la: - tots += len(lx) - while tots > 1<<20 and la: - tots -= len(la.pop(0)) - la.append(l) - finally: - po.lock.release() - t = syncdutils.Thread(target = tailer) - t.start() - cls.errhandler = t - - @classmethod - def fork(cls): - """fork wrapper that restarts errhandler thread in child""" - pid = os.fork() - if not pid: - cls.init_errhandler() - return pid - - def __init__(self, args, *a, **kw): - """customizations for subprocess.Popen instantiation - - - 'close_fds' is taken to be the default - - if child's stderr is chosen to be managed, - register it with the error handler thread - """ - self.args = args - if 'close_fds' not in kw: - kw['close_fds'] = True - self.lock = threading.Lock() - self.on_death_row = False - try: - sup(self, args, *a, **kw) - except: - ex = sys.exc_info()[1] - if not isinstance(ex, OSError): - raise - raise GsyncdError("""execution of "%s" failed with %s (%s)""" % \ - (args[0], errno.errorcode[ex.errno], os.strerror(ex.errno))) - if kw.get('stderr') == subprocess.PIPE: - assert(getattr(self, 'errhandler', None)) - self.errstore[self] = [] - - def errlog(self): - """make a log about child's failure event""" - filling = "" - if self.elines: - filling = ", saying:" - logging.error("""command "%s" returned with %s%s""" % \ - (" ".join(self.args), repr(self.returncode), filling)) - lp = '' - def logerr(l): - logging.error(self.args[0] + "> " + l) - for l in self.elines: - ls = l.split('\n') - ls[0] = lp + ls[0] - lp = ls.pop() - for ll in ls: - logerr(ll) - if lp: - logerr(lp) - - def errfail(self): - """fail nicely if child did not terminate with success""" - self.errlog() - syncdutils.finalize(exval = 1) - - def terminate_geterr(self, fail_on_err = True): - """kill child, finalize stderr harvesting (unregister - from errhandler, set up .elines), fail on error if - asked for - """ - self.lock.acquire() - try: - self.on_death_row = True - finally: - self.lock.release() - elines = self.errstore.pop(self) - if self.poll() == None: - self.terminate() - if self.poll() == None: - time.sleep(0.1) - self.kill() - self.wait() - while True: - if not select([self.stderr],[],[],0.1)[0]: - break - b = os.read(self.stderr.fileno(), 1024) - if b: - elines.append(b) - else: - break - self.stderr.close() - self.elines = elines - if fail_on_err and self.returncode != 0: - self.errfail() - - -class Server(object): - """singleton implemening those filesystem access primitives - which are needed for geo-replication functionality - - (Singleton in the sense it's a class which has only static - and classmethods and is used directly, without instantiation.) - """ - - GX_NSPACE = (privileged() and "trusted" or "system") + ".glusterfs" - NTV_FMTSTR = "!" + "B"*19 + "II" - FRGN_XTRA_FMT = "I" - FRGN_FMTSTR = NTV_FMTSTR + FRGN_XTRA_FMT - - def _pathguard(f): - """decorator method that checks - the path argument of the decorated - functions to make sure it does not - point out of the managed tree - """ - - fc = getattr(f, 'func_code', None) - if not fc: - # python 3 - fc = f.__code__ - pi = list(fc.co_varnames).index('path') - def ff(*a): - path = a[pi] - ps = path.split('/') - if path[0] == '/' or '..' in ps: - raise ValueError('unsafe path') - return f(*a) - return ff - - @staticmethod - @_pathguard - def entries(path): - """directory entries in an array""" - # prevent symlinks being followed - if not stat.S_ISDIR(os.lstat(path).st_mode): - raise OSError(ENOTDIR, os.strerror(ENOTDIR)) - return os.listdir(path) - - @classmethod - @_pathguard - def purge(cls, path, entries=None): - """force-delete subtrees - - If @entries is not specified, delete - the whole subtree under @path (including - @path). - - Otherwise, @entries should be a - a sequence of children of @path, and - the effect is identical with a joint - @entries-less purge on them, ie. - - for e in entries: - cls.purge(os.path.join(path, e)) - """ - me_also = entries == None - if not entries: - try: - # if it's a symlink, prevent - # following it - try: - os.unlink(path) - return - except OSError: - ex = sys.exc_info()[1] - if ex.errno == EISDIR: - entries = os.listdir(path) - else: - raise - except OSError: - ex = sys.exc_info()[1] - if ex.errno in (ENOTDIR, ENOENT, ELOOP): - try: - os.unlink(path) - return - except OSError: - ex = sys.exc_info()[1] - if ex.errno == ENOENT: - return - raise - else: - raise - for e in entries: - cls.purge(os.path.join(path, e)) - if me_also: - os.rmdir(path) - - @classmethod - @_pathguard - def _create(cls, path, ctor): - """path creation backend routine""" - try: - ctor(path) - except OSError: - ex = sys.exc_info()[1] - if ex.errno == EEXIST: - cls.purge(path) - return ctor(path) - raise - - @classmethod - @_pathguard - def mkdir(cls, path): - cls._create(path, os.mkdir) - - @classmethod - @_pathguard - def symlink(cls, lnk, path): - cls._create(path, lambda p: os.symlink(lnk, p)) - - @classmethod - @_pathguard - def xtime(cls, path, uuid): - """query xtime extended attribute - - Return xtime of @path for @uuid as a pair of integers. - "Normal" errors due to non-existent @path or extended attribute - are tolerated and errno is returned in such a case. - """ - - try: - return struct.unpack('!II', Xattr.lgetxattr(path, '.'.join([cls.GX_NSPACE, uuid, 'xtime']), 8)) - except OSError: - ex = sys.exc_info()[1] - if ex.errno in (ENOENT, ENODATA, ENOTDIR): - return ex.errno - else: - raise - - @classmethod - def xtime_vec(cls, path, *uuids): - """vectored version of @xtime - - accepts a list of uuids and returns a dictionary - with uuid as key(s) and xtime as value(s) - """ - xt = {} - for uuid in uuids: - xtu = cls.xtime(path, uuid) - if xtu == ENODATA: - xtu = None - if isinstance(xtu, int): - return xtu - xt[uuid] = xtu - return xt - - @classmethod - @_pathguard - def set_xtime(cls, path, uuid, mark): - """set @mark as xtime for @uuid on @path""" - Xattr.lsetxattr(path, '.'.join([cls.GX_NSPACE, uuid, 'xtime']), struct.pack('!II', *mark)) - - @classmethod - def set_xtime_vec(cls, path, mark_dct): - """vectored (or dictered) version of set_xtime - - ignore values that match @ignore - """ - for u,t in mark_dct.items(): - cls.set_xtime(path, u, t) - - @staticmethod - @_pathguard - def setattr(path, adct): - """set file attributes - - @adct is a dict, where 'own', 'mode' and 'times' - keys are looked for and values used to perform - chown, chmod or utimes on @path. - """ - own = adct.get('own') - if own: - os.lchown(path, *own) - mode = adct.get('mode') - if mode: - os.chmod(path, stat.S_IMODE(mode)) - times = adct.get('times') - if times: - os.utime(path, times) - - @staticmethod - def pid(): - return os.getpid() - - last_keep_alive = 0 - @classmethod - def keep_alive(cls, dct): - """process keepalive messages. - - Return keep-alive counter (number of received keep-alive - messages). - - Now the "keep-alive" message can also have a payload which is - used to set a foreign volume-mark on the underlying file system. - """ - if dct: - key = '.'.join([cls.GX_NSPACE, 'volume-mark', dct['uuid']]) - val = struct.pack(cls.FRGN_FMTSTR, - *(dct['version'] + - tuple(int(x,16) for x in re.findall('(?:[\da-f]){2}', dct['uuid'])) + - (dct['retval'],) + dct['volume_mark'][0:2] + (dct['timeout'],))) - Xattr.lsetxattr('.', key, val) - cls.last_keep_alive += 1 - return cls.last_keep_alive - - @staticmethod - def version(): - """version used in handshake""" - return 1.0 - - -class SlaveLocal(object): - """mix-in class to implement some factes of a slave server - - ("mix-in" is sort of like "abstract class", ie. it's not - instantiated just included in the ancesty DAG. I use "mix-in" - to indicate that it's not used as an abstract base class, - rather just taken in to implement additional functionality - on the basis of the assumed availability of certain interfaces.) - """ - - def can_connect_to(self, remote): - """determine our position in the connectibility matrix""" - return not remote - - def service_loop(self): - """start a RePCe server serving self's server - - stop servicing if a timeout is configured and got no - keep-alime in that inteval - """ - - if boolify(gconf.use_rsync_xattrs) and not privileged(): - raise GsyncdError("using rsync for extended attributes is not supported") - - repce = RepceServer(self.server, sys.stdin, sys.stdout, int(gconf.sync_jobs)) - t = syncdutils.Thread(target=lambda: (repce.service_loop(), - syncdutils.finalize())) - t.start() - logging.info("slave listening") - if gconf.timeout and int(gconf.timeout) > 0: - while True: - lp = self.server.last_keep_alive - time.sleep(int(gconf.timeout)) - if lp == self.server.last_keep_alive: - logging.info("connection inactive for %d seconds, stopping" % int(gconf.timeout)) - break - else: - select((), (), ()) - -class SlaveRemote(object): - """mix-in class to implement an interface to a remote slave""" - - def connect_remote(self, rargs=[], **opts): - """connects to a remote slave - - Invoke an auxiliary utility (slave gsyncd, possibly wrapped) - which sets up the connection and set up a RePCe client to - communicate throuh its stdio. - """ - slave = opts.get('slave', self.url) - extra_opts = [] - so = getattr(gconf, 'session_owner', None) - if so: - extra_opts += ['--session-owner', so] - if boolify(gconf.use_rsync_xattrs): - extra_opts.append('--use-rsync-xattrs') - po = Popen(rargs + gconf.remote_gsyncd.split() + extra_opts + \ - ['-N', '--listen', '--timeout', str(gconf.timeout), slave], - stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - gconf.transport = po - return self.start_fd_client(po.stdout, po.stdin, **opts) - - def start_fd_client(self, i, o, **opts): - """set up RePCe client, handshake with server - - It's cut out as a separate method to let - subclasses hook into client startup - """ - self.server = RepceClient(i, o) - rv = self.server.__version__() - exrv = {'proto': repce.repce_version, 'object': Server.version()} - da0 = (rv, exrv) - da1 = ({}, {}) - for i in range(2): - for k, v in da0[i].iteritems(): - da1[i][k] = int(v) - if da1[0] != da1[1]: - raise GsyncdError("RePCe major version mismatch: local %s, remote %s" % (exrv, rv)) - - def rsync(self, files, *args): - """invoke rsync""" - if not files: - raise GsyncdError("no files to sync") - logging.debug("files: " + ", ".join(files)) - argv = gconf.rsync_command.split() + \ - ['-aR0', '--files-from=-', '--super','--stats', '--numeric-ids', '--no-implied-dirs'] + \ - gconf.rsync_options.split() + (boolify(gconf.use_rsync_xattrs) and ['--xattrs'] or []) + \ - ['.'] + list(args) - po = Popen(argv, stdin=subprocess.PIPE,stdout=subprocess.PIPE,stderr=subprocess.PIPE) - for f in files: - po.stdin.write(f) - po.stdin.write('\0') - - po.stdin.close() - po.wait() - po.terminate_geterr(fail_on_err = False) - - return po - - -class AbstractUrl(object): - """abstract base class for url scheme classes""" - - def __init__(self, path, pattern): - m = re.search(pattern, path) - if not m: - raise GsyncdError("malformed path") - self.path = path - return m.groups() - - @property - def scheme(self): - return type(self).__name__.lower() - - def canonical_path(self): - return self.path - - def get_url(self, canonical=False, escaped=False): - """format self's url in various styles""" - if canonical: - pa = self.canonical_path() - else: - pa = self.path - u = "://".join((self.scheme, pa)) - if escaped: - u = syncdutils.escape(u) - return u - - @property - def url(self): - return self.get_url() - - - ### Concrete resource classes ### - - -class FILE(AbstractUrl, SlaveLocal, SlaveRemote): - """scheme class for file:// urls - - can be used to represent a file slave server - on slave side, or interface to a remote file - file server on master side - """ - - class FILEServer(Server): - """included server flavor""" - pass - - server = FILEServer - - def __init__(self, path): - sup(self, path, '^/') - - def connect(self): - """inhibit the resource beyond""" - os.chdir(self.path) - - def rsync(self, files): - return sup(self, files, self.path) - - -class GLUSTER(AbstractUrl, SlaveLocal, SlaveRemote): - """scheme class for gluster:// urls - - can be used to represent a gluster slave server - on slave side, or interface to a remote gluster - slave on master side, or to represent master - (slave-ish features come from the mixins, master - functionality is outsourced to GMaster from master) - """ - - class GLUSTERServer(Server): - "server enhancements for a glusterfs backend""" - - @classmethod - def _attr_unpack_dict(cls, xattr, extra_fields = ''): - """generic volume mark fetching/parsing backed""" - fmt_string = cls.NTV_FMTSTR + extra_fields - buf = Xattr.lgetxattr('.', xattr, struct.calcsize(fmt_string)) - vm = struct.unpack(fmt_string, buf) - m = re.match('(.{8})(.{4})(.{4})(.{4})(.{12})', "".join(['%02x' % x for x in vm[2:18]])) - uuid = '-'.join(m.groups()) - volinfo = { 'version': vm[0:2], - 'uuid' : uuid, - 'retval' : vm[18], - 'volume_mark': vm[19:21], - } - if extra_fields: - return volinfo, vm[-len(extra_fields):] - else: - return volinfo - - @classmethod - def foreign_volume_infos(cls): - """return list of valid (not expired) foreign volume marks""" - dict_list = [] - xattr_list = Xattr.llistxattr_buf('.') - for ele in xattr_list: - if ele.find('.'.join([cls.GX_NSPACE, 'volume-mark', ''])) == 0: - d, x = cls._attr_unpack_dict(ele, cls.FRGN_XTRA_FMT) - now = int(time.time()) - if x[0] > now: - logging.debug("volinfo[%s] expires: %d (%d sec later)" % \ - (d['uuid'], x[0], x[0] - now)) - d['timeout'] = x[0] - dict_list.append(d) - else: - try: - Xattr.lremovexattr('.', ele) - except OSError: - pass - return dict_list - - @classmethod - def native_volume_info(cls): - """get the native volume mark of the underlying gluster volume""" - try: - return cls._attr_unpack_dict('.'.join([cls.GX_NSPACE, 'volume-mark'])) - except OSError: - ex = sys.exc_info()[1] - if ex.errno != ENODATA: - raise - - server = GLUSTERServer - - def __init__(self, path): - self.host, self.volume = sup(self, path, '^(%s):(.+)' % HostRX.pattern) - - def canonical_path(self): - return ':'.join([gethostbyname(self.host), self.volume]) - - def can_connect_to(self, remote): - """determine our position in the connectibility matrix""" - return True - - class Mounter(object): - """Abstract base class for mounter backends""" - - def __init__(self, params): - self.params = params - self.mntpt = None - - @classmethod - def get_glusterprog(cls): - return os.path.join(gconf.gluster_command_dir, cls.glusterprog) - - def umount_l(self, d): - """perform lazy umount""" - po = Popen(self.make_umount_argv(d), stderr=subprocess.PIPE) - po.wait() - return po - - @classmethod - def make_umount_argv(cls, d): - raise NotImplementedError - - def make_mount_argv(self, *a): - raise NotImplementedError - - def cleanup_mntpt(self, *a): - pass - - def handle_mounter(self, po): - po.wait() - - def inhibit(self, *a): - """inhibit a gluster filesystem - - Mount glusterfs over a temporary mountpoint, - change into the mount, and lazy unmount the - filesystem. - """ - - mpi, mpo = os.pipe() - mh = Popen.fork() - if mh: - os.close(mpi) - fcntl.fcntl(mpo, fcntl.F_SETFD, fcntl.FD_CLOEXEC) - d = None - margv = self.make_mount_argv(*a) - if self.mntpt: - # mntpt is determined pre-mount - d = self.mntpt - os.write(mpo, d + '\0') - po = Popen(margv, **self.mountkw) - self.handle_mounter(po) - po.terminate_geterr() - logging.debug('auxiliary glusterfs mount in place') - if not d: - # mntpt is determined during mount - d = self.mntpt - os.write(mpo, d + '\0') - os.write(mpo, 'M') - t = syncdutils.Thread(target=lambda: os.chdir(d)) - t.start() - tlim = gconf.starttime + int(gconf.connection_timeout) - while True: - if not t.isAlive(): - break - if time.time() >= tlim: - syncdutils.finalize(exval = 1) - time.sleep(1) - os.close(mpo) - _, rv = syncdutils.waitpid(mh, 0) - if rv: - rv = (os.WIFEXITED(rv) and os.WEXITSTATUS(rv) or 0) - \ - (os.WIFSIGNALED(rv) and os.WTERMSIG(rv) or 0) - logging.warn('stale mount possibly left behind on ' + d) - raise GsyncdError("cleaning up temp mountpoint %s failed with status %d" % \ - (d, rv)) - else: - rv = 0 - try: - os.setsid() - os.close(mpo) - mntdata = '' - while True: - c = os.read(mpi, 1) - if not c: - break - mntdata += c - if mntdata: - mounted = False - if mntdata[-1] == 'M': - mntdata = mntdata[:-1] - assert(mntdata) - mounted = True - assert(mntdata[-1] == '\0') - mntpt = mntdata[:-1] - assert(mntpt) - if mounted: - po = self.umount_l(mntpt) - po.terminate_geterr(fail_on_err = False) - if po.returncode != 0: - po.errlog() - rv = po.returncode - self.cleanup_mntpt(mntpt) - except: - logging.exception('mount cleanup failure:') - rv = 200 - os._exit(rv) - logging.debug('auxiliary glusterfs mount prepared') - - class DirectMounter(Mounter): - """mounter backend which calls mount(8), umount(8) directly""" - - mountkw = {'stderr': subprocess.PIPE} - glusterprog = 'glusterfs' - - @staticmethod - def make_umount_argv(d): - return ['umount', '-l', d] - - def make_mount_argv(self): - self.mntpt = tempfile.mkdtemp(prefix = 'gsyncd-aux-mount-') - return [self.get_glusterprog()] + ['--' + p for p in self.params] + [self.mntpt] - - def cleanup_mntpt(self, mntpt = None): - if not mntpt: - mntpt = self.mntpt - os.rmdir(mntpt) - - class MountbrokerMounter(Mounter): - """mounter backend using the mountbroker gluster service""" - - mountkw = {'stderr': subprocess.PIPE, 'stdout': subprocess.PIPE} - glusterprog = 'gluster' - - @classmethod - def make_cli_argv(cls): - return [cls.get_glusterprog()] + gconf.gluster_cli_options.split() + ['system::'] - - @classmethod - def make_umount_argv(cls, d): - return cls.make_cli_argv() + ['umount', d, 'lazy'] - - def make_mount_argv(self, label): - return self.make_cli_argv() + \ - ['mount', label, 'user-map-root=' + syncdutils.getusername()] + self.params - - def handle_mounter(self, po): - self.mntpt = po.stdout.readline()[:-1] - po.stdout.close() - sup(self, po) - if po.returncode != 0: - # if cli terminated with error due to being - # refused by glusterd, what it put - # out on stdout is a diagnostic message - logging.error('glusterd answered: %s' % self.mntpt) - - def connect(self): - """inhibit the resource beyond - - Choose mounting backend (direct or mountbroker), - set up glusterfs parameters and perform the mount - with given backend - """ - - label = getattr(gconf, 'mountbroker', None) - if not label and not privileged(): - label = syncdutils.getusername() - mounter = label and self.MountbrokerMounter or self.DirectMounter - params = gconf.gluster_params.split() + \ - (gconf.gluster_log_level and ['log-level=' + gconf.gluster_log_level] or []) + \ - ['log-file=' + gconf.gluster_log_file, 'volfile-server=' + self.host, - 'volfile-id=' + self.volume, 'client-pid=-1'] - mounter(params).inhibit(*[l for l in [label] if l]) - - def connect_remote(self, *a, **kw): - sup(self, *a, **kw) - self.slavedir = "/proc/%d/cwd" % self.server.pid() - - def service_loop(self, *args): - """enter service loop - - - if slave given, instantiate GMaster and - pass control to that instance, which implements - master behavior - - else do that's what's inherited - """ - if args: - gmaster_builder()(self, args[0]).crawl_loop() - else: - sup(self, *args) - - def rsync(self, files): - return sup(self, files, self.slavedir) - - -class SSH(AbstractUrl, SlaveRemote): - """scheme class for ssh:// urls - - interface to remote slave on master side - implementing an ssh based proxy - """ - - def __init__(self, path): - self.remote_addr, inner_url = sup(self, path, - '^((?:%s@)?%s):(.+)' % tuple([ r.pattern for r in (UserRX, HostRX) ])) - self.inner_rsc = parse_url(inner_url) - - def canonical_path(self): - m = re.match('([^@]+)@(.+)', self.remote_addr) - if m: - u, h = m.groups() - else: - u, h = syncdutils.getusername(), self.remote_addr - remote_addr = '@'.join([u, gethostbyname(h)]) - return ':'.join([remote_addr, self.inner_rsc.get_url(canonical=True)]) - - def can_connect_to(self, remote): - """determine our position in the connectibility matrix""" - return False - - def start_fd_client(self, *a, **opts): - """customizations for client startup - - - be a no-op if we are to daemonize (client startup is deferred - to post-daemon stage) - - determine target url for rsync after consulting server - """ - if opts.get('deferred'): - return a - sup(self, *a) - ityp = type(self.inner_rsc) - if ityp == FILE: - slavepath = self.inner_rsc.path - elif ityp == GLUSTER: - slavepath = "/proc/%d/cwd" % self.server.pid() - else: - raise NotImplementedError - self.slaveurl = ':'.join([self.remote_addr, slavepath]) - - def connect_remote(self, go_daemon=None): - """connect to inner slave url through outer ssh url - - Wrap the connecting utility in ssh. - - Much care is put into daemonizing: in that case - ssh is started before daemonization, but - RePCe client is to be created after that (as ssh - interactive password auth would be defeated by - a daemonized ssh, while client should be present - only in the final process). In that case the action - is taken apart to two parts, this method is ivoked - once pre-daemon, once post-daemon. Use @go_daemon - to deiced what part to perform. - - [NB. ATM gluster product does not makes use of interactive - authentication.] - """ - if go_daemon == 'done': - return self.start_fd_client(*self.fd_pair) - gconf.setup_ssh_ctl(tempfile.mkdtemp(prefix='gsyncd-aux-ssh-')) - deferred = go_daemon == 'postconn' - ret = sup(self, gconf.ssh_command.split() + gconf.ssh_ctl_args + [self.remote_addr], slave=self.inner_rsc.url, deferred=deferred) - if deferred: - # send a message to peer so that we can wait for - # the answer from which we know connection is - # established and we can proceed with daemonization - # (doing that too early robs the ssh passwd prompt...) - # However, we'd better not start the RepceClient - # before daemonization (that's not preserved properly - # in daemon), we just do a an ad-hoc linear put/get. - i, o = ret - inf = os.fdopen(i) - repce.send(o, None, '__repce_version__') - select((inf,), (), ()) - repce.recv(inf) - # hack hack hack: store a global reference to the file - # to save it from getting GC'd which implies closing it - gconf.permanent_handles.append(inf) - self.fd_pair = (i, o) - return 'should' - - def rsync(self, files): - return sup(self, files, '-e', " ".join(gconf.ssh_command.split() + gconf.ssh_ctl_args), - *(gconf.rsync_ssh_options.split() + [self.slaveurl])) diff --git a/xlators/features/marker/utils/syncdaemon/syncdutils.py b/xlators/features/marker/utils/syncdaemon/syncdutils.py deleted file mode 100644 index 0764c0790..000000000 --- a/xlators/features/marker/utils/syncdaemon/syncdutils.py +++ /dev/null @@ -1,288 +0,0 @@ -import os -import sys -import pwd -import time -import fcntl -import shutil -import logging -from threading import Lock, Thread as baseThread -from errno import EACCES, EAGAIN, EPIPE, ENOTCONN, ECONNABORTED, EINTR, errorcode -from signal import signal, SIGTERM, SIGKILL -from time import sleep -import select as oselect -from os import waitpid as owaitpid -try: - from cPickle import PickleError -except ImportError: - # py 3 - from pickle import PickleError - -from gconf import gconf - -try: - # py 3 - from urllib import parse as urllib -except ImportError: - import urllib - -def escape(s): - """the chosen flavor of string escaping, used all over - to turn whatever data to creatable representation""" - return urllib.quote_plus(s) - -def unescape(s): - """inverse of .escape""" - return urllib.unquote_plus(s) - -def norm(s): - if s: - return s.replace('-', '_') - -def update_file(path, updater, merger = lambda f: True): - """update a file in a transaction-like manner""" - - fr = fw = None - try: - fd = os.open(path, os.O_CREAT|os.O_RDWR) - try: - fr = os.fdopen(fd, 'r+b') - except: - os.close(fd) - raise - fcntl.lockf(fr, fcntl.LOCK_EX) - if not merger(fr): - return - - tmpp = path + '.tmp.' + str(os.getpid()) - fd = os.open(tmpp, os.O_CREAT|os.O_EXCL|os.O_WRONLY) - try: - fw = os.fdopen(fd, 'wb', 0) - except: - os.close(fd) - raise - updater(fw) - os.fsync(fd) - os.rename(tmpp, path) - finally: - for fx in (fr, fw): - if fx: - fx.close() - -def grabfile(fname, content=None): - """open @fname + contest for its fcntl lock - - @content: if given, set the file content to it - """ - # damn those messy open() mode codes - fd = os.open(fname, os.O_CREAT|os.O_RDWR) - f = os.fdopen(fd, 'r+b', 0) - try: - fcntl.lockf(f, fcntl.LOCK_EX|fcntl.LOCK_NB) - except: - ex = sys.exc_info()[1] - f.close() - if isinstance(ex, IOError) and ex.errno in (EACCES, EAGAIN): - # cannot grab, it's taken - return - raise - if content: - try: - f.truncate() - f.write(content) - except: - f.close() - raise - gconf.permanent_handles.append(f) - return f - -def grabpidfile(fname=None, setpid=True): - """.grabfile customization for pid files""" - if not fname: - fname = gconf.pid_file - content = None - if setpid: - content = str(os.getpid()) + '\n' - return grabfile(fname, content=content) - -final_lock = Lock() - -def finalize(*a, **kw): - """all those messy final steps we go trough upon termination - - Do away with pidfile, ssh control dir and logging. - """ - final_lock.acquire() - if getattr(gconf, 'pid_file', None): - rm_pidf = gconf.pid_file_owned - if gconf.cpid: - # exit path from parent branch of daemonization - rm_pidf = False - while True: - f = grabpidfile(setpid=False) - if not f: - # child has already taken over pidfile - break - if os.waitpid(gconf.cpid, os.WNOHANG)[0] == gconf.cpid: - # child has terminated - rm_pidf = True - break; - time.sleep(0.1) - if rm_pidf: - try: - os.unlink(gconf.pid_file) - except: - ex = sys.exc_info()[1] - if ex.errno == ENOENT: - pass - else: - raise - if gconf.ssh_ctl_dir and not gconf.cpid: - shutil.rmtree(gconf.ssh_ctl_dir) - if getattr(gconf, 'state_socket', None): - try: - os.unlink(gconf.state_socket) - except: - if sys.exc_info()[0] == OSError: - pass - if gconf.log_exit: - logging.info("exiting.") - sys.stdout.flush() - sys.stderr.flush() - os._exit(kw.get('exval', 0)) - -def log_raise_exception(excont): - """top-level exception handler - - Try to some fancy things to cover up we face with an error. - Translate some weird sounding but well understood exceptions - into human-friendly lingo - """ - is_filelog = False - for h in logging.getLogger().handlers: - fno = getattr(getattr(h, 'stream', None), 'fileno', None) - if fno and not os.isatty(fno()): - is_filelog = True - - exc = sys.exc_info()[1] - if isinstance(exc, SystemExit): - excont.exval = exc.code or 0 - raise - else: - logtag = None - if isinstance(exc, GsyncdError): - if is_filelog: - logging.error(exc.args[0]) - sys.stderr.write('failure: ' + exc.args[0] + '\n') - elif isinstance(exc, PickleError) or isinstance(exc, EOFError) or \ - ((isinstance(exc, OSError) or isinstance(exc, IOError)) and \ - exc.errno == EPIPE): - logging.error('connection to peer is broken') - if hasattr(gconf, 'transport'): - gconf.transport.wait() - if gconf.transport.returncode == 127: - logging.warn("!!!!!!!!!!!!!") - logging.warn('!!! getting "No such file or directory" errors ' - "is most likely due to MISCONFIGURATION, please consult " - "http://access.redhat.com/knowledge/docs/en-US/Red_Hat_Storage/2.0/html/Administration_Guide/chap-User_Guide-Geo_Rep-Preparation-Settingup_Environment.html") - logging.warn("!!!!!!!!!!!!!") - gconf.transport.terminate_geterr() - elif isinstance(exc, OSError) and exc.errno in (ENOTCONN, ECONNABORTED): - logging.error('glusterfs session went down [%s]', errorcode[exc.errno]) - else: - logtag = "FAIL" - if not logtag and logging.getLogger().isEnabledFor(logging.DEBUG): - logtag = "FULL EXCEPTION TRACE" - if logtag: - logging.exception(logtag + ": ") - sys.stderr.write("failed with %s.\n" % type(exc).__name__) - excont.exval = 1 - sys.exit(excont.exval) - - -class FreeObject(object): - """wildcard class for which any attribute can be set""" - - def __init__(self, **kw): - for k,v in kw.items(): - setattr(self, k, v) - -class Thread(baseThread): - """thread class flavor for gsyncd - - - always a daemon thread - - force exit for whole program if thread - function coughs up an exception - """ - def __init__(self, *a, **kw): - tf = kw.get('target') - if tf: - def twrap(*aa): - excont = FreeObject(exval = 0) - try: - tf(*aa) - except: - try: - log_raise_exception(excont) - finally: - finalize(exval = excont.exval) - kw['target'] = twrap - baseThread.__init__(self, *a, **kw) - self.setDaemon(True) - -class GsyncdError(Exception): - pass - -def getusername(uid = None): - if uid == None: - uid = os.geteuid() - return pwd.getpwuid(uid).pw_name - -def privileged(): - return os.geteuid() == 0 - -def boolify(s): - """ - Generic string to boolean converter - - return - - Quick return if string 's' is of type bool - - True if it's in true_list - - False if it's in false_list - - Warn if it's not present in either and return False - """ - true_list = ['true', 'yes', '1', 'on'] - false_list = ['false', 'no', '0', 'off'] - - if isinstance(s, bool): - return s - - rv = False - lstr = s.lower() - if lstr in true_list: - rv = True - elif not lstr in false_list: - logging.warn("Unknown string (%s) in string to boolean conversion defaulting to False\n" % (s)) - - return rv - -def eintr_wrap(func, exc, *a): - """ - wrapper around syscalls resilient to interrupt caused - by signals - """ - while True: - try: - return func(*a) - except exc: - ex = sys.exc_info()[1] - if not ex.args[0] == EINTR: - raise - -def select(*a): - return eintr_wrap(oselect.select, oselect.error, *a) - -def waitpid (*a): - return eintr_wrap(owaitpid, OSError, *a) - -def set_term_handler(hook=lambda *a: finalize(*a, **{'exval': 1})): - signal(SIGTERM, hook) diff --git a/xlators/features/qemu-block/Makefile.am b/xlators/features/qemu-block/Makefile.am new file mode 100644 index 000000000..af437a64d --- /dev/null +++ b/xlators/features/qemu-block/Makefile.am @@ -0,0 +1 @@ +SUBDIRS = src diff --git a/xlators/features/qemu-block/src/Makefile.am b/xlators/features/qemu-block/src/Makefile.am new file mode 100644 index 000000000..08a7b62a0 --- /dev/null +++ b/xlators/features/qemu-block/src/Makefile.am @@ -0,0 +1,155 @@ +if ENABLE_QEMU_BLOCK +xlator_LTLIBRARIES = qemu-block.la +xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features + +qemu_block_la_LDFLAGS = -module -avoid-version +qemu_block_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la $(GLIB_LIBS) -lz -lrt + +qemu_block_la_SOURCES_qemu = \ + $(CONTRIBDIR)/qemu/qemu-coroutine.c \ + $(CONTRIBDIR)/qemu/qemu-coroutine-lock.c \ + $(CONTRIBDIR)/qemu/qemu-coroutine-sleep.c \ + $(CONTRIBDIR)/qemu/coroutine-ucontext.c \ + $(CONTRIBDIR)/qemu/block.c \ + $(CONTRIBDIR)/qemu/nop-symbols.c + +qemu_block_la_SOURCES_qemu_util = \ + $(CONTRIBDIR)/qemu/util/aes.c \ + $(CONTRIBDIR)/qemu/util/bitmap.c \ + $(CONTRIBDIR)/qemu/util/bitops.c \ + $(CONTRIBDIR)/qemu/util/cutils.c \ + $(CONTRIBDIR)/qemu/util/error.c \ + $(CONTRIBDIR)/qemu/util/hbitmap.c \ + $(CONTRIBDIR)/qemu/util/iov.c \ + $(CONTRIBDIR)/qemu/util/module.c \ + $(CONTRIBDIR)/qemu/util/oslib-posix.c \ + $(CONTRIBDIR)/qemu/util/qemu-option.c \ + $(CONTRIBDIR)/qemu/util/qemu-error.c \ + $(CONTRIBDIR)/qemu/util/qemu-thread-posix.c \ + $(CONTRIBDIR)/qemu/util/unicode.c \ + $(CONTRIBDIR)/qemu/util/hexdump.c + +qemu_block_la_SOURCES_qemu_block = \ + $(CONTRIBDIR)/qemu/block/snapshot.c \ + $(CONTRIBDIR)/qemu/block/qcow2-cache.c \ + $(CONTRIBDIR)/qemu/block/qcow2-cluster.c \ + $(CONTRIBDIR)/qemu/block/qcow2-refcount.c \ + $(CONTRIBDIR)/qemu/block/qcow2-snapshot.c \ + $(CONTRIBDIR)/qemu/block/qcow2.c \ + $(CONTRIBDIR)/qemu/block/qed-check.c \ + $(CONTRIBDIR)/qemu/block/qed-cluster.c \ + $(CONTRIBDIR)/qemu/block/qed-gencb.c \ + $(CONTRIBDIR)/qemu/block/qed-l2-cache.c \ + $(CONTRIBDIR)/qemu/block/qed-table.c \ + $(CONTRIBDIR)/qemu/block/qed.c + +qemu_block_la_SOURCES_qemu_qobject = \ + $(CONTRIBDIR)/qemu/qobject/json-lexer.c \ + $(CONTRIBDIR)/qemu/qobject/json-parser.c \ + $(CONTRIBDIR)/qemu/qobject/json-streamer.c \ + $(CONTRIBDIR)/qemu/qobject/qbool.c \ + $(CONTRIBDIR)/qemu/qobject/qdict.c \ + $(CONTRIBDIR)/qemu/qobject/qerror.c \ + $(CONTRIBDIR)/qemu/qobject/qfloat.c \ + $(CONTRIBDIR)/qemu/qobject/qint.c \ + $(CONTRIBDIR)/qemu/qobject/qjson.c \ + $(CONTRIBDIR)/qemu/qobject/qlist.c \ + $(CONTRIBDIR)/qemu/qobject/qstring.c + +qemu_block_la_SOURCES = \ + $(qemu_block_la_SOURCES_qemu) \ + $(qemu_block_la_SOURCES_qemu_util) \ + $(qemu_block_la_SOURCES_qemu_block) \ + $(qemu_block_la_SOURCES_qemu_qobject) \ + bdrv-xlator.c \ + coroutine-synctask.c \ + bh-syncop.c \ + monitor-logging.c \ + clock-timer.c \ + qemu-block.c \ + qb-coroutines.c + +noinst_HEADERS_qemu = \ + $(CONTRIBDIR)/qemu/config-host.h \ + $(CONTRIBDIR)/qemu/qapi-types.h \ + $(CONTRIBDIR)/qemu/qmp-commands.h \ + $(CONTRIBDIR)/qemu/trace/generated-tracers.h \ + $(CONTRIBDIR)/qemu/include/config.h \ + $(CONTRIBDIR)/qemu/include/glib-compat.h \ + $(CONTRIBDIR)/qemu/include/qemu-common.h \ + $(CONTRIBDIR)/qemu/include/trace.h \ + $(CONTRIBDIR)/qemu/include/block/coroutine.h \ + $(CONTRIBDIR)/qemu/include/block/aio.h \ + $(CONTRIBDIR)/qemu/include/block/block.h \ + $(CONTRIBDIR)/qemu/include/block/block_int.h \ + $(CONTRIBDIR)/qemu/include/block/blockjob.h \ + $(CONTRIBDIR)/qemu/include/block/coroutine.h \ + $(CONTRIBDIR)/qemu/include/block/coroutine_int.h \ + $(CONTRIBDIR)/qemu/include/block/snapshot.h \ + $(CONTRIBDIR)/qemu/include/exec/cpu-common.h \ + $(CONTRIBDIR)/qemu/include/exec/hwaddr.h \ + $(CONTRIBDIR)/qemu/include/exec/poison.h \ + $(CONTRIBDIR)/qemu/include/fpu/softfloat.h \ + $(CONTRIBDIR)/qemu/include/migration/migration.h \ + $(CONTRIBDIR)/qemu/include/migration/qemu-file.h \ + $(CONTRIBDIR)/qemu/include/migration/vmstate.h \ + $(CONTRIBDIR)/qemu/include/monitor/monitor.h \ + $(CONTRIBDIR)/qemu/include/monitor/readline.h \ + $(CONTRIBDIR)/qemu/include/qapi/error.h \ + $(CONTRIBDIR)/qemu/include/qapi/qmp/json-lexer.h \ + $(CONTRIBDIR)/qemu/include/qapi/qmp/json-parser.h \ + $(CONTRIBDIR)/qemu/include/qapi/qmp/json-streamer.h \ + $(CONTRIBDIR)/qemu/include/qapi/qmp/qbool.h \ + $(CONTRIBDIR)/qemu/include/qapi/qmp/qdict.h \ + $(CONTRIBDIR)/qemu/include/qapi/qmp/qerror.h \ + $(CONTRIBDIR)/qemu/include/qapi/qmp/qfloat.h \ + $(CONTRIBDIR)/qemu/include/qapi/qmp/qint.h \ + $(CONTRIBDIR)/qemu/include/qapi/qmp/qjson.h \ + $(CONTRIBDIR)/qemu/include/qapi/qmp/qlist.h \ + $(CONTRIBDIR)/qemu/include/qapi/qmp/qobject.h \ + $(CONTRIBDIR)/qemu/include/qapi/qmp/qstring.h \ + $(CONTRIBDIR)/qemu/include/qapi/qmp/types.h \ + $(CONTRIBDIR)/qemu/include/qemu/aes.h \ + $(CONTRIBDIR)/qemu/include/qemu/atomic.h \ + $(CONTRIBDIR)/qemu/include/qemu/bitmap.h \ + $(CONTRIBDIR)/qemu/include/qemu/bitops.h \ + $(CONTRIBDIR)/qemu/include/qemu/bswap.h \ + $(CONTRIBDIR)/qemu/include/qemu/compiler.h \ + $(CONTRIBDIR)/qemu/include/qemu/error-report.h \ + $(CONTRIBDIR)/qemu/include/qemu/event_notifier.h \ + $(CONTRIBDIR)/qemu/include/qemu/hbitmap.h \ + $(CONTRIBDIR)/qemu/include/qemu/host-utils.h \ + $(CONTRIBDIR)/qemu/include/qemu/iov.h \ + $(CONTRIBDIR)/qemu/include/qemu/main-loop.h \ + $(CONTRIBDIR)/qemu/include/qemu/module.h \ + $(CONTRIBDIR)/qemu/include/qemu/notify.h \ + $(CONTRIBDIR)/qemu/include/qemu/option.h \ + $(CONTRIBDIR)/qemu/include/qemu/option_int.h \ + $(CONTRIBDIR)/qemu/include/qemu/osdep.h \ + $(CONTRIBDIR)/qemu/include/qemu/queue.h \ + $(CONTRIBDIR)/qemu/include/qemu/sockets.h \ + $(CONTRIBDIR)/qemu/include/qemu/thread-posix.h \ + $(CONTRIBDIR)/qemu/include/qemu/thread.h \ + $(CONTRIBDIR)/qemu/include/qemu/timer.h \ + $(CONTRIBDIR)/qemu/include/qemu/typedefs.h \ + $(CONTRIBDIR)/qemu/include/sysemu/sysemu.h \ + $(CONTRIBDIR)/qemu/include/sysemu/os-posix.h \ + $(CONTRIBDIR)/qemu/block/qcow2.h \ + $(CONTRIBDIR)/qemu/block/qed.h + +noinst_HEADERS = \ + $(noinst_HEADERS_qemu) \ + qemu-block.h \ + qemu-block-memory-types.h \ + qb-coroutines.h + +AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \ + -I$(CONTRIBDIR)/qemu \ + -I$(CONTRIBDIR)/qemu/include \ + -DGLUSTER_XLATOR + +AM_CFLAGS = -fno-strict-aliasing -Wall $(GF_CFLAGS) $(GLIB_CFLAGS) + +CLEANFILES = + +endif diff --git a/xlators/features/qemu-block/src/bdrv-xlator.c b/xlators/features/qemu-block/src/bdrv-xlator.c new file mode 100644 index 000000000..106c59775 --- /dev/null +++ b/xlators/features/qemu-block/src/bdrv-xlator.c @@ -0,0 +1,397 @@ +/* + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "inode.h" +#include "syncop.h" +#include "qemu-block.h" +#include "block/block_int.h" + +typedef struct BDRVGlusterState { + inode_t *inode; +} BDRVGlusterState; + +static QemuOptsList runtime_opts = { + .name = "gluster", + .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head), + .desc = { + { + .name = "filename", + .type = QEMU_OPT_STRING, + .help = "GFID of file", + }, + { /* end of list */ } + }, +}; + +inode_t * +qb_inode_from_filename (const char *filename) +{ + const char *iptr = NULL; + inode_t *inode = NULL; + + iptr = filename + 17; + sscanf (iptr, "%p", &inode); + + return inode; +} + + +int +qb_inode_to_filename (inode_t *inode, char *filename, int size) +{ + return snprintf (filename, size, "gluster://inodep:%p", inode); +} + + +static fd_t * +fd_from_bs (BlockDriverState *bs) +{ + BDRVGlusterState *s = bs->opaque; + + return fd_anonymous (s->inode); +} + + +static int +qemu_gluster_open (BlockDriverState *bs, QDict *options, int bdrv_flags) +{ + inode_t *inode = NULL; + BDRVGlusterState *s = bs->opaque; + QemuOpts *opts = NULL; + Error *local_err = NULL; + const char *filename = NULL; + char gfid_str[128]; + int ret; + qb_conf_t *conf = THIS->private; + + opts = qemu_opts_create_nofail(&runtime_opts); + qemu_opts_absorb_qdict(opts, options, &local_err); + if (error_is_set(&local_err)) { + qerror_report_err(local_err); + error_free(local_err); + return -EINVAL; + } + + filename = qemu_opt_get(opts, "filename"); + + /* + * gfid:<gfid> format means we're opening a backing image. + */ + ret = sscanf(filename, "gluster://gfid:%s", gfid_str); + if (ret) { + loc_t loc = {0,}; + struct iatt buf = {0,}; + uuid_t gfid; + + uuid_parse(gfid_str, gfid); + + loc.inode = inode_find(conf->root_inode->table, gfid); + if (!loc.inode) { + loc.inode = inode_new(conf->root_inode->table); + uuid_copy(loc.inode->gfid, gfid); + } + + uuid_copy(loc.gfid, loc.inode->gfid); + ret = syncop_lookup(FIRST_CHILD(THIS), &loc, NULL, &buf, NULL, + NULL); + if (ret) { + loc_wipe(&loc); + return -errno; + } + + s->inode = inode_ref(loc.inode); + loc_wipe(&loc); + } else { + inode = qb_inode_from_filename (filename); + if (!inode) + return -EINVAL; + + s->inode = inode_ref(inode); + } + + return 0; +} + + +static int +qemu_gluster_create (const char *filename, QEMUOptionParameter *options) +{ + uint64_t total_size = 0; + inode_t *inode = NULL; + fd_t *fd = NULL; + struct iatt stat = {0, }; + int ret = 0; + + inode = qb_inode_from_filename (filename); + if (!inode) + return -EINVAL; + + while (options && options->name) { + if (!strcmp(options->name, BLOCK_OPT_SIZE)) { + total_size = options->value.n / BDRV_SECTOR_SIZE; + } + options++; + } + + fd = fd_anonymous (inode); + if (!fd) + return -ENOMEM; + + ret = syncop_fstat (FIRST_CHILD(THIS), fd, &stat); + if (ret) { + fd_unref (fd); + return -errno; + } + + if (stat.ia_size) { + /* format ONLY if the filesize is 0 bytes */ + fd_unref (fd); + return -EFBIG; + } + + if (total_size) { + ret = syncop_ftruncate (FIRST_CHILD(THIS), fd, total_size); + if (ret) { + fd_unref (fd); + return -errno; + } + } + + fd_unref (fd); + return 0; +} + + +static int +qemu_gluster_co_readv (BlockDriverState *bs, int64_t sector_num, int nb_sectors, + QEMUIOVector *qiov) +{ + fd_t *fd = NULL; + off_t offset = 0; + size_t size = 0; + struct iovec *iov = NULL; + int count = 0; + struct iobref *iobref = NULL; + int ret = 0; + + fd = fd_from_bs (bs); + if (!fd) + return -EIO; + + offset = sector_num * BDRV_SECTOR_SIZE; + size = nb_sectors * BDRV_SECTOR_SIZE; + + ret = syncop_readv (FIRST_CHILD(THIS), fd, size, offset, 0, + &iov, &count, &iobref); + if (ret < 0) { + ret = -errno; + goto out; + } + + iov_copy (qiov->iov, qiov->niov, iov, count); /* *choke!* */ + +out: + GF_FREE (iov); + if (iobref) + iobref_unref (iobref); + fd_unref (fd); + return ret; +} + + +static int +qemu_gluster_co_writev (BlockDriverState *bs, int64_t sector_num, int nb_sectors, + QEMUIOVector *qiov) +{ + fd_t *fd = NULL; + off_t offset = 0; + size_t size = 0; + struct iobref *iobref = NULL; + struct iobuf *iobuf = NULL; + struct iovec iov = {0, }; + int ret = -ENOMEM; + + fd = fd_from_bs (bs); + if (!fd) + return -EIO; + + offset = sector_num * BDRV_SECTOR_SIZE; + size = nb_sectors * BDRV_SECTOR_SIZE; + + iobuf = iobuf_get2 (THIS->ctx->iobuf_pool, size); + if (!iobuf) + goto out; + + iobref = iobref_new (); + if (!iobref) { + iobuf_unref (iobuf); + goto out; + } + + iobref_add (iobref, iobuf); + + iov_unload (iobuf_ptr (iobuf), qiov->iov, qiov->niov); /* *choke!* */ + + iov.iov_base = iobuf_ptr (iobuf); + iov.iov_len = size; + + ret = syncop_writev (FIRST_CHILD(THIS), fd, &iov, 1, offset, iobref, 0); + if (ret < 0) + ret = -errno; + +out: + if (iobuf) + iobuf_unref (iobuf); + if (iobref) + iobref_unref (iobref); + fd_unref (fd); + return ret; +} + + +static int +qemu_gluster_co_flush (BlockDriverState *bs) +{ + fd_t *fd = NULL; + int ret = 0; + + fd = fd_from_bs (bs); + + ret = syncop_flush (FIRST_CHILD(THIS), fd); + + fd_unref (fd); + + return ret; +} + + +static int +qemu_gluster_co_fsync (BlockDriverState *bs) +{ + fd_t *fd = NULL; + int ret = 0; + + fd = fd_from_bs (bs); + + ret = syncop_fsync (FIRST_CHILD(THIS), fd, 0); + + fd_unref (fd); + + return ret; +} + + +static int +qemu_gluster_truncate (BlockDriverState *bs, int64_t offset) +{ + fd_t *fd = NULL; + int ret = 0; + + fd = fd_from_bs (bs); + + ret = syncop_ftruncate (FIRST_CHILD(THIS), fd, offset); + + fd_unref (fd); + + if (ret < 0) + return ret; + + return ret; +} + + +static int64_t +qemu_gluster_getlength (BlockDriverState *bs) +{ + fd_t *fd = NULL; + int ret = 0; + struct iatt iatt = {0, }; + + fd = fd_from_bs (bs); + + ret = syncop_fstat (FIRST_CHILD(THIS), fd, &iatt); + if (ret < 0) + return -1; + + return iatt.ia_size; +} + + +static int64_t +qemu_gluster_allocated_file_size (BlockDriverState *bs) +{ + fd_t *fd = NULL; + int ret = 0; + struct iatt iatt = {0, }; + + fd = fd_from_bs (bs); + + ret = syncop_fstat (FIRST_CHILD(THIS), fd, &iatt); + if (ret < 0) + return -1; + + return iatt.ia_blocks * 512; +} + + +static void +qemu_gluster_close (BlockDriverState *bs) +{ + BDRVGlusterState *s = NULL; + + s = bs->opaque; + + inode_unref (s->inode); + + return; +} + + +static QEMUOptionParameter qemu_gluster_create_options[] = { + { + .name = BLOCK_OPT_SIZE, + .type = OPT_SIZE, + .help = "Virtual disk size" + }, + { NULL } +}; + + +static BlockDriver bdrv_gluster = { + .format_name = "gluster", + .protocol_name = "gluster", + .instance_size = sizeof(BDRVGlusterState), + .bdrv_file_open = qemu_gluster_open, + .bdrv_close = qemu_gluster_close, + .bdrv_create = qemu_gluster_create, + .bdrv_getlength = qemu_gluster_getlength, + .bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size, + .bdrv_co_readv = qemu_gluster_co_readv, + .bdrv_co_writev = qemu_gluster_co_writev, + .bdrv_co_flush_to_os = qemu_gluster_co_flush, + .bdrv_co_flush_to_disk = qemu_gluster_co_fsync, + .bdrv_truncate = qemu_gluster_truncate, + .create_options = qemu_gluster_create_options, +}; + + +static void bdrv_gluster_init(void) +{ + bdrv_register(&bdrv_gluster); +} + + +block_init(bdrv_gluster_init); diff --git a/xlators/features/qemu-block/src/bh-syncop.c b/xlators/features/qemu-block/src/bh-syncop.c new file mode 100644 index 000000000..e8686f6d4 --- /dev/null +++ b/xlators/features/qemu-block/src/bh-syncop.c @@ -0,0 +1,48 @@ +/* + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include "logging.h" +#include "dict.h" +#include "xlator.h" +#include "syncop.h" +#include "qemu-block-memory-types.h" + +#include "block/aio.h" + +void +qemu_bh_schedule (QEMUBH *bh) +{ + return; +} + +void +qemu_bh_cancel (QEMUBH *bh) +{ + return; +} + +void +qemu_bh_delete (QEMUBH *bh) +{ + +} + +QEMUBH * +qemu_bh_new (QEMUBHFunc *cb, void *opaque) +{ + return NULL; +} diff --git a/xlators/features/qemu-block/src/clock-timer.c b/xlators/features/qemu-block/src/clock-timer.c new file mode 100644 index 000000000..fcbec6ad1 --- /dev/null +++ b/xlators/features/qemu-block/src/clock-timer.c @@ -0,0 +1,60 @@ +/* + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include "logging.h" +#include "dict.h" +#include "xlator.h" +#include "syncop.h" +#include "qemu-block-memory-types.h" + +#include "qemu/timer.h" + +QEMUClock *vm_clock; +int use_rt_clock = 0; + +QEMUTimer *qemu_new_timer (QEMUClock *clock, int scale, + QEMUTimerCB *cb, void *opaque) +{ + return NULL; +} + +int64_t qemu_get_clock_ns (QEMUClock *clock) +{ + return 0; +} + +void qemu_mod_timer (QEMUTimer *ts, int64_t expire_time) +{ + return; +} + +void qemu_free_timer (QEMUTimer *ts) +{ + +} + +void qemu_del_timer (QEMUTimer *ts) +{ + +} + +bool qemu_aio_wait() +{ + synctask_wake (synctask_get()); + synctask_yield (synctask_get()); + return 0; +} diff --git a/xlators/features/qemu-block/src/coroutine-synctask.c b/xlators/features/qemu-block/src/coroutine-synctask.c new file mode 100644 index 000000000..e43988a95 --- /dev/null +++ b/xlators/features/qemu-block/src/coroutine-synctask.c @@ -0,0 +1,116 @@ +/* + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include "logging.h" +#include "dict.h" +#include "xlator.h" +#include "syncop.h" +#include "qemu-block-memory-types.h" + +#include "qemu-block.h" + +/* + * This code serves as the bridge from the main glusterfs context to the qemu + * coroutine context via synctask. We create a single threaded syncenv with a + * single synctask responsible for processing a queue of coroutines. The qemu + * code invoked from within the synctask function handlers uses the ucontext + * coroutine implementation and scheduling logic internal to qemu. This + * effectively donates a thread of execution to qemu and its internal coroutine + * management. + * + * NOTE: The existence of concurrent synctasks has proven quite racy with regard + * to qemu coroutine management, particularly related to the lifecycle + * differences with top-level synctasks and internally created coroutines and + * interactions with qemu-internal queues (and locks, in turn). We explicitly + * disallow this scenario, via the queue, until it is more well supported. + */ + +static struct { + struct list_head queue; + gf_lock_t lock; + struct synctask *task; +} qb_co; + +static void +init_qbco() +{ + INIT_LIST_HEAD(&qb_co.queue); + LOCK_INIT(&qb_co.lock); +} + +static int +synctask_nop_cbk (int ret, call_frame_t *frame, void *opaque) +{ + return 0; +} + +static int +qb_synctask_wrap (void *opaque) +{ + qb_local_t *qb_local, *tmp; + + LOCK(&qb_co.lock); + + while (!list_empty(&qb_co.queue)) { + list_for_each_entry_safe(qb_local, tmp, &qb_co.queue, list) { + list_del_init(&qb_local->list); + break; + } + + UNLOCK(&qb_co.lock); + + qb_local->synctask_fn(qb_local); + /* qb_local is now unwound and gone! */ + + LOCK(&qb_co.lock); + } + + qb_co.task = NULL; + + UNLOCK(&qb_co.lock); + + return 0; +} + +int +qb_coroutine (call_frame_t *frame, synctask_fn_t fn) +{ + qb_local_t *qb_local = NULL; + qb_conf_t *qb_conf = NULL; + static int init = 0; + + qb_local = frame->local; + qb_local->synctask_fn = fn; + qb_conf = frame->this->private; + + if (!init) { + init = 1; + init_qbco(); + } + + LOCK(&qb_co.lock); + + if (!qb_co.task) + qb_co.task = synctask_create(qb_conf->env, qb_synctask_wrap, + synctask_nop_cbk, frame, NULL); + + list_add_tail(&qb_local->list, &qb_co.queue); + + UNLOCK(&qb_co.lock); + + return 0; +} diff --git a/xlators/features/qemu-block/src/monitor-logging.c b/xlators/features/qemu-block/src/monitor-logging.c new file mode 100644 index 000000000..d37c37f0f --- /dev/null +++ b/xlators/features/qemu-block/src/monitor-logging.c @@ -0,0 +1,50 @@ +/* + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include "logging.h" +#include "dict.h" +#include "xlator.h" +#include "qemu-block-memory-types.h" + +#include "block/block_int.h" + +Monitor *cur_mon; + +int +monitor_cur_is_qmp() +{ + /* No QMP support here */ + return 0; +} + +void +monitor_set_error (Monitor *mon, QError *qerror) +{ + /* NOP here */ + return; +} + + +void +monitor_vprintf(Monitor *mon, const char *fmt, va_list ap) +{ + char buf[4096]; + + vsnprintf(buf, sizeof(buf), fmt, ap); + + gf_log (THIS->name, GF_LOG_ERROR, "%s", buf); +} diff --git a/xlators/features/qemu-block/src/qb-coroutines.c b/xlators/features/qemu-block/src/qb-coroutines.c new file mode 100644 index 000000000..7c52adb21 --- /dev/null +++ b/xlators/features/qemu-block/src/qb-coroutines.c @@ -0,0 +1,662 @@ +/* + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include "logging.h" +#include "dict.h" +#include "xlator.h" +#include "inode.h" +#include "call-stub.h" +#include "defaults.h" +#include "qemu-block-memory-types.h" +#include "qemu-block.h" +#include "qb-coroutines.h" + + +int +qb_format_and_resume (void *opaque) +{ + qb_local_t *local = NULL; + call_frame_t *frame = NULL; + call_stub_t *stub = NULL; + inode_t *inode = NULL; + char filename[64]; + char base_filename[128]; + int use_base = 0; + qb_inode_t *qb_inode = NULL; + Error *local_err = NULL; + fd_t *fd = NULL; + dict_t *xattr = NULL; + qb_conf_t *qb_conf = NULL; + int ret = -1; + + local = opaque; + frame = local->frame; + stub = local->stub; + inode = local->inode; + qb_conf = frame->this->private; + + qb_inode_to_filename (inode, filename, 64); + + qb_inode = qb_inode_ctx_get (frame->this, inode); + + /* + * See if the caller specified a backing image. + */ + if (!uuid_is_null(qb_inode->backing_gfid) || qb_inode->backing_fname) { + loc_t loc = {0,}; + char gfid_str[64]; + struct iatt buf; + + if (!uuid_is_null(qb_inode->backing_gfid)) { + loc.inode = inode_find(qb_conf->root_inode->table, + qb_inode->backing_gfid); + if (!loc.inode) { + loc.inode = inode_new(qb_conf->root_inode->table); + uuid_copy(loc.inode->gfid, + qb_inode->backing_gfid); + } + uuid_copy(loc.gfid, loc.inode->gfid); + } else if (qb_inode->backing_fname) { + loc.inode = inode_new(qb_conf->root_inode->table); + loc.name = qb_inode->backing_fname; + loc.parent = inode_parent(inode, NULL, NULL); + loc_path(&loc, loc.name); + } + + /* + * Lookup the backing image. Verify existence and/or get the + * gfid if we don't already have it. + */ + ret = syncop_lookup(FIRST_CHILD(frame->this), &loc, NULL, &buf, + NULL, NULL); + GF_FREE(qb_inode->backing_fname); + if (ret) { + loc_wipe(&loc); + ret = errno; + goto err; + } + + uuid_copy(qb_inode->backing_gfid, buf.ia_gfid); + loc_wipe(&loc); + + /* + * We pass the filename of the backing image into the qemu block + * subsystem as the associated gfid. This is embedded into the + * clone image and passed along to the gluster bdrv backend when + * the block subsystem needs to operate on the backing image on + * behalf of the clone. + */ + uuid_unparse(qb_inode->backing_gfid, gfid_str); + snprintf(base_filename, sizeof(base_filename), + "gluster://gfid:%s", gfid_str); + use_base = 1; + } + + bdrv_img_create (filename, qb_inode->fmt, + use_base ? base_filename : NULL, 0, 0, qb_inode->size, + 0, &local_err, true); + + if (error_is_set (&local_err)) { + gf_log (frame->this->name, GF_LOG_ERROR, "%s", + error_get_pretty (local_err)); + error_free (local_err); + QB_STUB_UNWIND (stub, -1, EIO); + return 0; + } + + fd = fd_anonymous (inode); + if (!fd) { + gf_log (frame->this->name, GF_LOG_ERROR, + "could not create anonymous fd for %s", + uuid_utoa (inode->gfid)); + QB_STUB_UNWIND (stub, -1, ENOMEM); + return 0; + } + + xattr = dict_new (); + if (!xattr) { + gf_log (frame->this->name, GF_LOG_ERROR, + "could not allocate xattr dict for %s", + uuid_utoa (inode->gfid)); + QB_STUB_UNWIND (stub, -1, ENOMEM); + fd_unref (fd); + return 0; + } + + ret = dict_set_str (xattr, qb_conf->qb_xattr_key, local->fmt); + if (ret) { + gf_log (frame->this->name, GF_LOG_ERROR, + "could not dict_set for %s", + uuid_utoa (inode->gfid)); + QB_STUB_UNWIND (stub, -1, ENOMEM); + fd_unref (fd); + dict_unref (xattr); + return 0; + } + + ret = syncop_fsetxattr (FIRST_CHILD(THIS), fd, xattr, 0); + if (ret) { + ret = errno; + gf_log (frame->this->name, GF_LOG_ERROR, + "failed to setxattr for %s", + uuid_utoa (inode->gfid)); + QB_STUB_UNWIND (stub, -1, ret); + fd_unref (fd); + dict_unref (xattr); + return 0; + } + + fd_unref (fd); + dict_unref (xattr); + + QB_STUB_UNWIND (stub, 0, 0); + + return 0; + +err: + QB_STUB_UNWIND(stub, -1, ret); + return 0; +} + + +static BlockDriverState * +qb_bs_create (inode_t *inode, const char *fmt) +{ + char filename[64]; + BlockDriverState *bs = NULL; + BlockDriver *drv = NULL; + int op_errno = 0; + int ret = 0; + + bs = bdrv_new (uuid_utoa (inode->gfid)); + if (!bs) { + op_errno = ENOMEM; + gf_log (THIS->name, GF_LOG_ERROR, + "could not allocate @bdrv for gfid:%s", + uuid_utoa (inode->gfid)); + goto err; + } + + drv = bdrv_find_format (fmt); + if (!drv) { + op_errno = EINVAL; + gf_log (THIS->name, GF_LOG_ERROR, + "Unknown file format: %s for gfid:%s", + fmt, uuid_utoa (inode->gfid)); + goto err; + } + + qb_inode_to_filename (inode, filename, 64); + + ret = bdrv_open (bs, filename, NULL, BDRV_O_RDWR, drv); + if (ret < 0) { + op_errno = -ret; + gf_log (THIS->name, GF_LOG_ERROR, + "Unable to bdrv_open() gfid:%s (%s)", + uuid_utoa (inode->gfid), strerror (op_errno)); + goto err; + } + + return bs; +err: + errno = op_errno; + return NULL; +} + + +int +qb_co_open (void *opaque) +{ + qb_local_t *local = NULL; + call_frame_t *frame = NULL; + call_stub_t *stub = NULL; + inode_t *inode = NULL; + qb_inode_t *qb_inode = NULL; + + local = opaque; + frame = local->frame; + stub = local->stub; + inode = local->inode; + + qb_inode = qb_inode_ctx_get (frame->this, inode); + if (!qb_inode->bs) { + /* FIXME: we need locks around this when + enabling multithreaded syncop/coroutine + for qemu-block + */ + + qb_inode->bs = qb_bs_create (inode, qb_inode->fmt); + if (!qb_inode->bs) { + QB_STUB_UNWIND (stub, -1, errno); + return 0; + } + } + qb_inode->refcnt++; + + QB_STUB_RESUME (stub); + + return 0; +} + + +int +qb_co_writev (void *opaque) +{ + qb_local_t *local = NULL; + call_frame_t *frame = NULL; + call_stub_t *stub = NULL; + inode_t *inode = NULL; + qb_inode_t *qb_inode = NULL; + QEMUIOVector qiov = {0, }; + int ret = 0; + + local = opaque; + frame = local->frame; + stub = local->stub; + inode = local->inode; + + qb_inode = qb_inode_ctx_get (frame->this, inode); + if (!qb_inode->bs) { + /* FIXME: we need locks around this when + enabling multithreaded syncop/coroutine + for qemu-block + */ + + qb_inode->bs = qb_bs_create (inode, qb_inode->fmt); + if (!qb_inode->bs) { + QB_STUB_UNWIND (stub, -1, errno); + return 0; + } + } + + qemu_iovec_init_external (&qiov, stub->args.vector, stub->args.count); + + ret = bdrv_pwritev (qb_inode->bs, stub->args.offset, &qiov); + + if (ret < 0) { + QB_STUB_UNWIND (stub, -1, -ret); + } else { + QB_STUB_UNWIND (stub, ret, 0); + } + + return 0; +} + + +int +qb_co_readv (void *opaque) +{ + qb_local_t *local = NULL; + call_frame_t *frame = NULL; + call_stub_t *stub = NULL; + inode_t *inode = NULL; + qb_inode_t *qb_inode = NULL; + struct iobuf *iobuf = NULL; + struct iobref *iobref = NULL; + struct iovec iov = {0, }; + int ret = 0; + + local = opaque; + frame = local->frame; + stub = local->stub; + inode = local->inode; + + qb_inode = qb_inode_ctx_get (frame->this, inode); + if (!qb_inode->bs) { + /* FIXME: we need locks around this when + enabling multithreaded syncop/coroutine + for qemu-block + */ + + qb_inode->bs = qb_bs_create (inode, qb_inode->fmt); + if (!qb_inode->bs) { + QB_STUB_UNWIND (stub, -1, errno); + return 0; + } + } + + if (stub->args.offset >= qb_inode->size) { + QB_STUB_UNWIND (stub, 0, 0); + return 0; + } + + iobuf = iobuf_get2 (frame->this->ctx->iobuf_pool, stub->args.size); + if (!iobuf) { + QB_STUB_UNWIND (stub, -1, ENOMEM); + return 0; + } + + iobref = iobref_new (); + if (!iobref) { + QB_STUB_UNWIND (stub, -1, ENOMEM); + iobuf_unref (iobuf); + return 0; + } + + if (iobref_add (iobref, iobuf) < 0) { + iobuf_unref (iobuf); + iobref_unref (iobref); + QB_STUB_UNWIND (stub, -1, ENOMEM); + return 0; + } + + ret = bdrv_pread (qb_inode->bs, stub->args.offset, iobuf_ptr (iobuf), + stub->args.size); + + if (ret < 0) { + QB_STUB_UNWIND (stub, -1, -ret); + iobref_unref (iobref); + return 0; + } + + iov.iov_base = iobuf_ptr (iobuf); + iov.iov_len = ret; + + stub->args_cbk.vector = iov_dup (&iov, 1); + stub->args_cbk.count = 1; + stub->args_cbk.iobref = iobref; + + QB_STUB_UNWIND (stub, ret, 0); + + return 0; +} + + +int +qb_co_fsync (void *opaque) +{ + qb_local_t *local = NULL; + call_frame_t *frame = NULL; + call_stub_t *stub = NULL; + inode_t *inode = NULL; + qb_inode_t *qb_inode = NULL; + int ret = 0; + + local = opaque; + frame = local->frame; + stub = local->stub; + inode = local->inode; + + qb_inode = qb_inode_ctx_get (frame->this, inode); + if (!qb_inode->bs) { + /* FIXME: we need locks around this when + enabling multithreaded syncop/coroutine + for qemu-block + */ + + qb_inode->bs = qb_bs_create (inode, qb_inode->fmt); + if (!qb_inode->bs) { + QB_STUB_UNWIND (stub, -1, errno); + return 0; + } + } + + ret = bdrv_flush (qb_inode->bs); + + if (ret < 0) { + QB_STUB_UNWIND (stub, -1, -ret); + } else { + QB_STUB_UNWIND (stub, ret, 0); + } + + return 0; +} + + +static void +qb_update_size_xattr (xlator_t *this, fd_t *fd, const char *fmt, off_t offset) +{ + char val[QB_XATTR_VAL_MAX]; + qb_conf_t *qb_conf = NULL; + dict_t *xattr = NULL; + + qb_conf = this->private; + + snprintf (val, QB_XATTR_VAL_MAX, "%s:%llu", + fmt, (long long unsigned) offset); + + xattr = dict_new (); + if (!xattr) + return; + + if (dict_set_str (xattr, qb_conf->qb_xattr_key, val) != 0) { + dict_unref (xattr); + return; + } + + syncop_fsetxattr (FIRST_CHILD(this), fd, xattr, 0); + dict_unref (xattr); +} + + +int +qb_co_truncate (void *opaque) +{ + qb_local_t *local = NULL; + call_frame_t *frame = NULL; + call_stub_t *stub = NULL; + inode_t *inode = NULL; + qb_inode_t *qb_inode = NULL; + int ret = 0; + off_t offset = 0; + xlator_t *this = NULL; + + this = THIS; + + local = opaque; + frame = local->frame; + stub = local->stub; + inode = local->inode; + + qb_inode = qb_inode_ctx_get (frame->this, inode); + if (!qb_inode->bs) { + /* FIXME: we need locks around this when + enabling multithreaded syncop/coroutine + for qemu-block + */ + + qb_inode->bs = qb_bs_create (inode, qb_inode->fmt); + if (!qb_inode->bs) { + QB_STUB_UNWIND (stub, -1, errno); + return 0; + } + } + + syncop_fstat (FIRST_CHILD(this), local->fd, &stub->args_cbk.prestat); + stub->args_cbk.prestat.ia_size = qb_inode->size; + + ret = bdrv_truncate (qb_inode->bs, stub->args.offset); + if (ret < 0) + goto out; + + offset = bdrv_getlength (qb_inode->bs); + + qb_inode->size = offset; + + syncop_fstat (FIRST_CHILD(this), local->fd, &stub->args_cbk.poststat); + stub->args_cbk.poststat.ia_size = qb_inode->size; + + qb_update_size_xattr (this, local->fd, qb_inode->fmt, qb_inode->size); + +out: + if (ret < 0) { + QB_STUB_UNWIND (stub, -1, -ret); + } else { + QB_STUB_UNWIND (stub, ret, 0); + } + + return 0; +} + + +int +qb_co_close (void *opaque) +{ + qb_local_t *local = NULL; + call_frame_t *frame = NULL; + inode_t *inode = NULL; + qb_inode_t *qb_inode = NULL; + BlockDriverState *bs = NULL; + + local = opaque; + inode = local->inode; + + qb_inode = qb_inode_ctx_get (THIS, inode); + + if (!--qb_inode->refcnt) { + bs = qb_inode->bs; + qb_inode->bs = NULL; + bdrv_delete (bs); + } + + frame = local->frame; + frame->local = NULL; + qb_local_free (THIS, local); + STACK_DESTROY (frame->root); + + return 0; +} + + +int +qb_snapshot_create (void *opaque) +{ + qb_local_t *local = NULL; + call_frame_t *frame = NULL; + call_stub_t *stub = NULL; + inode_t *inode = NULL; + qb_inode_t *qb_inode = NULL; + QEMUSnapshotInfo sn; + struct timeval tv = {0, }; + int ret = 0; + + local = opaque; + frame = local->frame; + stub = local->stub; + inode = local->inode; + + qb_inode = qb_inode_ctx_get (frame->this, inode); + if (!qb_inode->bs) { + /* FIXME: we need locks around this when + enabling multithreaded syncop/coroutine + for qemu-block + */ + + qb_inode->bs = qb_bs_create (inode, qb_inode->fmt); + if (!qb_inode->bs) { + QB_STUB_UNWIND (stub, -1, errno); + return 0; + } + } + + memset (&sn, 0, sizeof (sn)); + pstrcpy (sn.name, sizeof(sn.name), local->name); + gettimeofday (&tv, NULL); + sn.date_sec = tv.tv_sec; + sn.date_nsec = tv.tv_usec * 1000; + + ret = bdrv_snapshot_create (qb_inode->bs, &sn); + if (ret < 0) { + QB_STUB_UNWIND (stub, -1, -ret); + } else { + QB_STUB_UNWIND (stub, ret, 0); + } + + return 0; +} + + +int +qb_snapshot_delete (void *opaque) +{ + qb_local_t *local = NULL; + call_frame_t *frame = NULL; + call_stub_t *stub = NULL; + inode_t *inode = NULL; + qb_inode_t *qb_inode = NULL; + int ret = 0; + + local = opaque; + frame = local->frame; + stub = local->stub; + inode = local->inode; + + qb_inode = qb_inode_ctx_get (frame->this, inode); + if (!qb_inode->bs) { + /* FIXME: we need locks around this when + enabling multithreaded syncop/coroutine + for qemu-block + */ + + qb_inode->bs = qb_bs_create (inode, qb_inode->fmt); + if (!qb_inode->bs) { + QB_STUB_UNWIND (stub, -1, errno); + return 0; + } + } + + ret = bdrv_snapshot_delete (qb_inode->bs, local->name); + + if (ret < 0) { + QB_STUB_UNWIND (stub, -1, -ret); + } else { + QB_STUB_UNWIND (stub, ret, 0); + } + + return 0; +} + + +int +qb_snapshot_goto (void *opaque) +{ + qb_local_t *local = NULL; + call_frame_t *frame = NULL; + call_stub_t *stub = NULL; + inode_t *inode = NULL; + qb_inode_t *qb_inode = NULL; + int ret = 0; + + local = opaque; + frame = local->frame; + stub = local->stub; + inode = local->inode; + + qb_inode = qb_inode_ctx_get (frame->this, inode); + if (!qb_inode->bs) { + /* FIXME: we need locks around this when + enabling multithreaded syncop/coroutine + for qemu-block + */ + + qb_inode->bs = qb_bs_create (inode, qb_inode->fmt); + if (!qb_inode->bs) { + QB_STUB_UNWIND (stub, -1, errno); + return 0; + } + } + + ret = bdrv_snapshot_goto (qb_inode->bs, local->name); + + if (ret < 0) { + QB_STUB_UNWIND (stub, -1, -ret); + } else { + QB_STUB_UNWIND (stub, ret, 0); + } + + return 0; +} diff --git a/xlators/features/qemu-block/src/qb-coroutines.h b/xlators/features/qemu-block/src/qb-coroutines.h new file mode 100644 index 000000000..583319f3b --- /dev/null +++ b/xlators/features/qemu-block/src/qb-coroutines.h @@ -0,0 +1,30 @@ +/* + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef __QB_COROUTINES_H +#define __QB_COROUTINES_H + +#include "syncop.h" +#include "call-stub.h" +#include "block/block_int.h" +#include "monitor/monitor.h" + +int qb_format_and_resume (void *opaque); +int qb_snapshot_create (void *opaque); +int qb_snapshot_delete (void *opaque); +int qb_snapshot_goto (void *opaque); +int qb_co_open (void *opaque); +int qb_co_close (void *opaque); +int qb_co_writev (void *opaque); +int qb_co_readv (void *opaque); +int qb_co_fsync (void *opaque); +int qb_co_truncate (void *opaque); + +#endif /* __QB_COROUTINES_H */ diff --git a/xlators/features/qemu-block/src/qemu-block-memory-types.h b/xlators/features/qemu-block/src/qemu-block-memory-types.h new file mode 100644 index 000000000..267b3893f --- /dev/null +++ b/xlators/features/qemu-block/src/qemu-block-memory-types.h @@ -0,0 +1,25 @@ +/* + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + + +#ifndef __QB_MEM_TYPES_H__ +#define __QB_MEM_TYPES_H__ + +#include "mem-types.h" + +enum gf_qb_mem_types_ { + gf_qb_mt_qb_conf_t = gf_common_mt_end + 1, + gf_qb_mt_qb_inode_t, + gf_qb_mt_qb_local_t, + gf_qb_mt_coroutinesynctask_t, + gf_qb_mt_end +}; +#endif + diff --git a/xlators/features/qemu-block/src/qemu-block.c b/xlators/features/qemu-block/src/qemu-block.c new file mode 100644 index 000000000..48bbf3140 --- /dev/null +++ b/xlators/features/qemu-block/src/qemu-block.c @@ -0,0 +1,1140 @@ +/* + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include "logging.h" +#include "dict.h" +#include "xlator.h" +#include "inode.h" +#include "call-stub.h" +#include "defaults.h" +#include "qemu-block-memory-types.h" +#include "qemu-block.h" +#include "qb-coroutines.h" + + +qb_inode_t * +__qb_inode_ctx_get (xlator_t *this, inode_t *inode) +{ + uint64_t value = 0; + qb_inode_t *qb_inode = NULL; + + __inode_ctx_get (inode, this, &value); + qb_inode = (qb_inode_t *)(unsigned long) value; + + return qb_inode; +} + + +qb_inode_t * +qb_inode_ctx_get (xlator_t *this, inode_t *inode) +{ + qb_inode_t *qb_inode = NULL; + + LOCK (&inode->lock); + { + qb_inode = __qb_inode_ctx_get (this, inode); + } + UNLOCK (&inode->lock); + + return qb_inode; +} + + +qb_inode_t * +qb_inode_ctx_del (xlator_t *this, inode_t *inode) +{ + uint64_t value = 0; + qb_inode_t *qb_inode = NULL; + + inode_ctx_del (inode, this, &value); + qb_inode = (qb_inode_t *)(unsigned long) value; + + return qb_inode; +} + + +int +qb_inode_cleanup (xlator_t *this, inode_t *inode, int warn) +{ + qb_inode_t *qb_inode = NULL; + + qb_inode = qb_inode_ctx_del (this, inode); + + if (!qb_inode) + return 0; + + if (warn) + gf_log (this->name, GF_LOG_WARNING, + "inode %s no longer block formatted", + uuid_utoa (inode->gfid)); + + /* free (qb_inode->bs); */ + + GF_FREE (qb_inode); + + return 0; +} + + +int +qb_iatt_fixup (xlator_t *this, inode_t *inode, struct iatt *iatt) +{ + qb_inode_t *qb_inode = NULL; + + qb_inode = qb_inode_ctx_get (this, inode); + if (!qb_inode) + return 0; + + iatt->ia_size = qb_inode->size; + + return 0; +} + + +int +qb_format_extract (xlator_t *this, char *format, inode_t *inode) +{ + char *s, *save; + uint64_t size = 0; + char fmt[QB_XATTR_VAL_MAX+1] = {0, }; + qb_inode_t *qb_inode = NULL; + char *formatstr = NULL; + uuid_t gfid = {0,}; + char gfid_str[64] = {0,}; + int ret; + + strncpy(fmt, format, QB_XATTR_VAL_MAX); + + s = strtok_r(fmt, ":", &save); + if (!s) + goto invalid; + formatstr = gf_strdup(s); + + s = strtok_r(NULL, ":", &save); + if (!s) + goto invalid; + if (gf_string2bytesize (s, &size)) + goto invalid; + if (!size) + goto invalid; + + s = strtok_r(NULL, "\0", &save); + if (s && !strncmp(s, "<gfid:", strlen("<gfid:"))) { + /* + * Check for valid gfid backing image specifier. + */ + if (strlen(s) + 1 > sizeof(gfid_str)) + goto invalid; + ret = sscanf(s, "<gfid:%[^>]s", gfid_str); + if (ret == 1) { + ret = uuid_parse(gfid_str, gfid); + if (ret < 0) + goto invalid; + } + } + + qb_inode = qb_inode_ctx_get (this, inode); + if (!qb_inode) + qb_inode = GF_CALLOC (1, sizeof (*qb_inode), + gf_qb_mt_qb_inode_t); + if (!qb_inode) { + GF_FREE(formatstr); + return ENOMEM; + } + + strncpy(qb_inode->fmt, formatstr, QB_XATTR_VAL_MAX); + qb_inode->size = size; + + /* + * If a backing gfid was not specified, interpret any remaining bytes + * associated with a backing image as a filename local to the parent + * directory. The format processing will validate further. + */ + if (!uuid_is_null(gfid)) + uuid_copy(qb_inode->backing_gfid, gfid); + else if (s) + qb_inode->backing_fname = gf_strdup(s); + + inode_ctx_set (inode, this, (void *)&qb_inode); + + GF_FREE(formatstr); + + return 0; + +invalid: + GF_FREE(formatstr); + + gf_log (this->name, GF_LOG_WARNING, + "invalid format '%s' in inode %s", format, + uuid_utoa (inode->gfid)); + return EINVAL; +} + + +void +qb_local_free (xlator_t *this, qb_local_t *local) +{ + if (local->inode) + inode_unref (local->inode); + if (local->fd) + fd_unref (local->fd); + GF_FREE (local); +} + + +int +qb_local_init (call_frame_t *frame) +{ + qb_local_t *qb_local = NULL; + + qb_local = GF_CALLOC (1, sizeof (*qb_local), gf_qb_mt_qb_local_t); + if (!qb_local) + return -1; + INIT_LIST_HEAD(&qb_local->list); + + qb_local->frame = frame; + frame->local = qb_local; + + return 0; +} + + +int +qb_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, inode_t *inode, struct iatt *buf, + dict_t *xdata, struct iatt *postparent) +{ + char *format = NULL; + qb_conf_t *conf = NULL; + + conf = this->private; + + if (op_ret == -1) + goto out; + + /* + * Cache the root inode for dealing with backing images. The format + * coroutine and the gluster qemu backend driver both use the root inode + * table to verify and/or redirect I/O to the backing image via + * anonymous fd's. + */ + if (!conf->root_inode && __is_root_gfid(inode->gfid)) + conf->root_inode = inode_ref(inode); + + if (!xdata) + goto out; + + if (dict_get_str (xdata, conf->qb_xattr_key, &format)) + goto out; + + if (!format) { + qb_inode_cleanup (this, inode, 1); + goto out; + } + + op_errno = qb_format_extract (this, format, inode); + if (op_errno) + op_ret = -1; + + qb_iatt_fixup (this, inode, buf); +out: + QB_STACK_UNWIND (lookup, frame, op_ret, op_errno, inode, buf, + xdata, postparent); + return 0; +} + + +int +qb_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) +{ + qb_conf_t *conf = NULL; + + conf = this->private; + + xdata = xdata ? dict_ref (xdata) : dict_new (); + + if (!xdata) + goto enomem; + + if (dict_set_int32 (xdata, conf->qb_xattr_key, 0)) + goto enomem; + + STACK_WIND (frame, qb_lookup_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->lookup, loc, xdata); + dict_unref (xdata); + return 0; +enomem: + QB_STACK_UNWIND (lookup, frame, -1, ENOMEM, 0, 0, 0, 0); + if (xdata) + dict_unref (xdata); + return 0; +} + + +int +qb_setxattr_format (call_frame_t *frame, xlator_t *this, call_stub_t *stub, + dict_t *xattr, inode_t *inode) +{ + char *format = NULL; + int op_errno = 0; + qb_local_t *qb_local = NULL; + data_t *data = NULL; + qb_inode_t *qb_inode; + + if (!(data = dict_get (xattr, "trusted.glusterfs.block-format"))) { + QB_STUB_RESUME (stub); + return 0; + } + + format = alloca (data->len + 1); + memcpy (format, data->data, data->len); + format[data->len] = 0; + + op_errno = qb_format_extract (this, format, inode); + if (op_errno) { + QB_STUB_UNWIND (stub, -1, op_errno); + return 0; + } + qb_inode = qb_inode_ctx_get(this, inode); + + qb_local = frame->local; + + qb_local->stub = stub; + qb_local->inode = inode_ref (inode); + + snprintf(qb_local->fmt, QB_XATTR_VAL_MAX, "%s:%lu", qb_inode->fmt, + qb_inode->size); + + qb_coroutine (frame, qb_format_and_resume); + + return 0; +} + + +int +qb_setxattr_snapshot_create (call_frame_t *frame, xlator_t *this, + call_stub_t *stub, dict_t *xattr, inode_t *inode) +{ + qb_local_t *qb_local = NULL; + char *name = NULL; + data_t *data = NULL; + + if (!(data = dict_get (xattr, "trusted.glusterfs.block-snapshot-create"))) { + QB_STUB_RESUME (stub); + return 0; + } + + name = alloca (data->len + 1); + memcpy (name, data->data, data->len); + name[data->len] = 0; + + qb_local = frame->local; + + qb_local->stub = stub; + qb_local->inode = inode_ref (inode); + strncpy (qb_local->name, name, 128); + + qb_coroutine (frame, qb_snapshot_create); + + return 0; +} + + +int +qb_setxattr_snapshot_delete (call_frame_t *frame, xlator_t *this, + call_stub_t *stub, dict_t *xattr, inode_t *inode) +{ + qb_local_t *qb_local = NULL; + char *name = NULL; + data_t *data = NULL; + + if (!(data = dict_get (xattr, "trusted.glusterfs.block-snapshot-delete"))) { + QB_STUB_RESUME (stub); + return 0; + } + + name = alloca (data->len + 1); + memcpy (name, data->data, data->len); + name[data->len] = 0; + + qb_local = frame->local; + + qb_local->stub = stub; + qb_local->inode = inode_ref (inode); + strncpy (qb_local->name, name, 128); + + qb_coroutine (frame, qb_snapshot_delete); + + return 0; +} + +int +qb_setxattr_snapshot_goto (call_frame_t *frame, xlator_t *this, + call_stub_t *stub, dict_t *xattr, inode_t *inode) +{ + qb_local_t *qb_local = NULL; + char *name = NULL; + data_t *data = NULL; + + if (!(data = dict_get (xattr, "trusted.glusterfs.block-snapshot-goto"))) { + QB_STUB_RESUME (stub); + return 0; + } + + name = alloca (data->len + 1); + memcpy (name, data->data, data->len); + name[data->len] = 0; + + qb_local = frame->local; + + qb_local->stub = stub; + qb_local->inode = inode_ref (inode); + strncpy (qb_local->name, name, 128); + + qb_coroutine (frame, qb_snapshot_goto); + + return 0; +} + + +int +qb_setxattr_common (call_frame_t *frame, xlator_t *this, call_stub_t *stub, + dict_t *xattr, inode_t *inode) +{ + data_t *data = NULL; + + if ((data = dict_get (xattr, "trusted.glusterfs.block-format"))) { + qb_setxattr_format (frame, this, stub, xattr, inode); + return 0; + } + + if ((data = dict_get (xattr, "trusted.glusterfs.block-snapshot-create"))) { + qb_setxattr_snapshot_create (frame, this, stub, xattr, inode); + return 0; + } + + if ((data = dict_get (xattr, "trusted.glusterfs.block-snapshot-delete"))) { + qb_setxattr_snapshot_delete (frame, this, stub, xattr, inode); + return 0; + } + + if ((data = dict_get (xattr, "trusted.glusterfs.block-snapshot-goto"))) { + qb_setxattr_snapshot_goto (frame, this, stub, xattr, inode); + return 0; + } + + QB_STUB_RESUME (stub); + + return 0; +} + + +int +qb_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr, + int flags, dict_t *xdata) +{ + call_stub_t *stub = NULL; + + if (qb_local_init (frame) != 0) + goto enomem; + + stub = fop_setxattr_stub (frame, default_setxattr_resume, loc, xattr, + flags, xdata); + if (!stub) + goto enomem; + + qb_setxattr_common (frame, this, stub, xattr, loc->inode); + + return 0; +enomem: + QB_STACK_UNWIND (setxattr, frame, -1, ENOMEM, 0); + return 0; +} + + +int +qb_fsetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xattr, + int flags, dict_t *xdata) +{ + call_stub_t *stub = NULL; + + if (qb_local_init (frame) != 0) + goto enomem; + + stub = fop_fsetxattr_stub (frame, default_fsetxattr_resume, fd, xattr, + flags, xdata); + if (!stub) + goto enomem; + + qb_setxattr_common (frame, this, stub, xattr, fd->inode); + + return 0; +enomem: + QB_STACK_UNWIND (fsetxattr, frame, -1, ENOMEM, 0); + return 0; +} + + +int +qb_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, fd_t *fd, dict_t *xdata) +{ + call_stub_t *stub = NULL; + qb_local_t *qb_local = NULL; + + qb_local = frame->local; + + if (op_ret < 0) + goto unwind; + + if (!qb_inode_ctx_get (this, qb_local->inode)) + goto unwind; + + stub = fop_open_cbk_stub (frame, NULL, op_ret, op_errno, fd, xdata); + if (!stub) { + op_ret = -1; + op_errno = ENOMEM; + goto unwind; + } + + qb_local->stub = stub; + + qb_coroutine (frame, qb_co_open); + + return 0; +unwind: + QB_STACK_UNWIND (open, frame, op_ret, op_errno, fd, xdata); + return 0; +} + + +int +qb_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, + fd_t *fd, dict_t *xdata) +{ + qb_local_t *qb_local = NULL; + qb_inode_t *qb_inode = NULL; + + qb_inode = qb_inode_ctx_get (this, loc->inode); + if (!qb_inode) { + STACK_WIND (frame, default_open_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->open, loc, flags, fd, + xdata); + return 0; + } + + if (qb_local_init (frame) != 0) + goto enomem; + + qb_local = frame->local; + + qb_local->inode = inode_ref (loc->inode); + qb_local->fd = fd_ref (fd); + + STACK_WIND (frame, qb_open_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->open, loc, flags, fd, xdata); + return 0; +enomem: + QB_STACK_UNWIND (open, frame, -1, ENOMEM, 0, 0); + return 0; +} + + +int +qb_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector, + int count, off_t offset, uint32_t flags, struct iobref *iobref, + dict_t *xdata) +{ + qb_local_t *qb_local = NULL; + qb_inode_t *qb_inode = NULL; + + qb_inode = qb_inode_ctx_get (this, fd->inode); + if (!qb_inode) { + STACK_WIND (frame, default_writev_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->writev, fd, vector, count, + offset, flags, iobref, xdata); + return 0; + } + + if (qb_local_init (frame) != 0) + goto enomem; + + qb_local = frame->local; + + qb_local->inode = inode_ref (fd->inode); + qb_local->fd = fd_ref (fd); + + qb_local->stub = fop_writev_stub (frame, NULL, fd, vector, count, + offset, flags, iobref, xdata); + if (!qb_local->stub) + goto enomem; + + qb_coroutine (frame, qb_co_writev); + + return 0; +enomem: + QB_STACK_UNWIND (writev, frame, -1, ENOMEM, 0, 0, 0); + return 0; +} + + +int +qb_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, uint32_t flags, dict_t *xdata) +{ + qb_local_t *qb_local = NULL; + qb_inode_t *qb_inode = NULL; + + qb_inode = qb_inode_ctx_get (this, fd->inode); + if (!qb_inode) { + STACK_WIND (frame, default_readv_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readv, fd, size, offset, + flags, xdata); + return 0; + } + + if (qb_local_init (frame) != 0) + goto enomem; + + qb_local = frame->local; + + qb_local->inode = inode_ref (fd->inode); + qb_local->fd = fd_ref (fd); + + qb_local->stub = fop_readv_stub (frame, NULL, fd, size, offset, + flags, xdata); + if (!qb_local->stub) + goto enomem; + + qb_coroutine (frame, qb_co_readv); + + return 0; +enomem: + QB_STACK_UNWIND (readv, frame, -1, ENOMEM, 0, 0, 0, 0, 0); + return 0; +} + + +int +qb_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int dsync, + dict_t *xdata) +{ + qb_local_t *qb_local = NULL; + qb_inode_t *qb_inode = NULL; + + qb_inode = qb_inode_ctx_get (this, fd->inode); + if (!qb_inode) { + STACK_WIND (frame, default_fsync_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsync, fd, dsync, xdata); + return 0; + } + + if (qb_local_init (frame) != 0) + goto enomem; + + qb_local = frame->local; + + qb_local->inode = inode_ref (fd->inode); + qb_local->fd = fd_ref (fd); + + qb_local->stub = fop_fsync_stub (frame, NULL, fd, dsync, xdata); + + if (!qb_local->stub) + goto enomem; + + qb_coroutine (frame, qb_co_fsync); + + return 0; +enomem: + QB_STACK_UNWIND (fsync, frame, -1, ENOMEM, 0, 0, 0); + return 0; +} + + +int +qb_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) +{ + qb_local_t *qb_local = NULL; + qb_inode_t *qb_inode = NULL; + + qb_inode = qb_inode_ctx_get (this, fd->inode); + if (!qb_inode) { + STACK_WIND (frame, default_flush_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->flush, fd, xdata); + return 0; + } + + if (qb_local_init (frame) != 0) + goto enomem; + + qb_local = frame->local; + + qb_local->inode = inode_ref (fd->inode); + qb_local->fd = fd_ref (fd); + + qb_local->stub = fop_flush_stub (frame, NULL, fd, xdata); + + if (!qb_local->stub) + goto enomem; + + qb_coroutine (frame, qb_co_fsync); + + return 0; +enomem: + QB_STACK_UNWIND (flush, frame, -1, ENOMEM, 0); + return 0; +} + +static int32_t +qb_readdirp_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, gf_dirent_t *entries, + dict_t *xdata) +{ + qb_conf_t *conf = this->private; + gf_dirent_t *entry; + char *format; + + list_for_each_entry(entry, &entries->list, list) { + if (!entry->inode || !entry->dict) + continue; + + format = NULL; + if (dict_get_str(entry->dict, conf->qb_xattr_key, &format)) + continue; + + if (!format) { + qb_inode_cleanup(this, entry->inode, 1); + continue; + } + + if (qb_format_extract(this, format, entry->inode)) + continue; + + qb_iatt_fixup(this, entry->inode, &entry->d_stat); + } + + STACK_UNWIND_STRICT(readdirp, frame, op_ret, op_errno, entries, xdata); + return 0; +} + +static int32_t +qb_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t off, dict_t *xdata) +{ + qb_conf_t *conf = this->private; + + xdata = xdata ? dict_ref(xdata) : dict_new(); + if (!xdata) + goto enomem; + + if (dict_set_int32 (xdata, conf->qb_xattr_key, 0)) + goto enomem; + + STACK_WIND(frame, qb_readdirp_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readdirp, fd, size, off, xdata); + + dict_unref(xdata); + return 0; + +enomem: + QB_STACK_UNWIND(readdirp, frame, -1, ENOMEM, NULL, NULL); + if (xdata) + dict_unref(xdata); + return 0; +} + +int +qb_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset, + dict_t *xdata) +{ + qb_local_t *qb_local = NULL; + qb_inode_t *qb_inode = NULL; + + qb_inode = qb_inode_ctx_get (this, loc->inode); + if (!qb_inode) { + STACK_WIND (frame, default_truncate_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->truncate, loc, offset, + xdata); + return 0; + } + + if (qb_local_init (frame) != 0) + goto enomem; + + qb_local = frame->local; + + qb_local->inode = inode_ref (loc->inode); + qb_local->fd = fd_anonymous (loc->inode); + + qb_local->stub = fop_truncate_stub (frame, NULL, loc, offset, xdata); + + if (!qb_local->stub) + goto enomem; + + qb_coroutine (frame, qb_co_truncate); + + return 0; +enomem: + QB_STACK_UNWIND (truncate, frame, -1, ENOMEM, 0, 0, 0); + return 0; +} + + +int +qb_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + dict_t *xdata) +{ + qb_local_t *qb_local = NULL; + qb_inode_t *qb_inode = NULL; + + qb_inode = qb_inode_ctx_get (this, fd->inode); + if (!qb_inode) { + STACK_WIND (frame, default_ftruncate_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->ftruncate, fd, offset, + xdata); + return 0; + } + + if (qb_local_init (frame) != 0) + goto enomem; + + qb_local = frame->local; + + qb_local->inode = inode_ref (fd->inode); + qb_local->fd = fd_ref (fd); + + qb_local->stub = fop_ftruncate_stub (frame, NULL, fd, offset, xdata); + + if (!qb_local->stub) + goto enomem; + + qb_coroutine (frame, qb_co_truncate); + + return 0; +enomem: + QB_STACK_UNWIND (ftruncate, frame, -1, ENOMEM, 0, 0, 0); + return 0; +} + + +int +qb_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *iatt, dict_t *xdata) +{ + inode_t *inode = NULL; + + inode = frame->local; + frame->local = NULL; + + if (inode) { + qb_iatt_fixup (this, inode, iatt); + inode_unref (inode); + } + + QB_STACK_UNWIND (stat, frame, op_ret, op_errno, iatt, xdata); + + return 0; +} + +int +qb_stat (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) +{ + if (qb_inode_ctx_get (this, loc->inode)) + frame->local = inode_ref (loc->inode); + + STACK_WIND (frame, qb_stat_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->stat, loc, xdata); + return 0; +} + + +int +qb_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *iatt, dict_t *xdata) +{ + inode_t *inode = NULL; + + inode = frame->local; + frame->local = NULL; + + if (inode) { + qb_iatt_fixup (this, inode, iatt); + inode_unref (inode); + } + + QB_STACK_UNWIND (fstat, frame, op_ret, op_errno, iatt, xdata); + + return 0; +} + + +int +qb_fstat (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) +{ + if (qb_inode_ctx_get (this, fd->inode)) + frame->local = inode_ref (fd->inode); + + STACK_WIND (frame, qb_fstat_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fstat, fd, xdata); + return 0; +} + + +int +qb_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *pre, struct iatt *post, + dict_t *xdata) +{ + inode_t *inode = NULL; + + inode = frame->local; + frame->local = NULL; + + if (inode) { + qb_iatt_fixup (this, inode, pre); + qb_iatt_fixup (this, inode, post); + inode_unref (inode); + } + + QB_STACK_UNWIND (setattr, frame, op_ret, op_errno, pre, post, xdata); + + return 0; +} + + +int +qb_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *buf, + int valid, dict_t *xdata) +{ + if (qb_inode_ctx_get (this, loc->inode)) + frame->local = inode_ref (loc->inode); + + STACK_WIND (frame, qb_setattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->setattr, loc, buf, valid, xdata); + return 0; +} + + +int +qb_fsetattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *pre, struct iatt *post, + dict_t *xdata) +{ + inode_t *inode = NULL; + + inode = frame->local; + frame->local = NULL; + + if (inode) { + qb_iatt_fixup (this, inode, pre); + qb_iatt_fixup (this, inode, post); + inode_unref (inode); + } + + QB_STACK_UNWIND (fsetattr, frame, op_ret, op_errno, pre, post, xdata); + + return 0; +} + + +int +qb_fsetattr (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iatt *buf, + int valid, dict_t *xdata) +{ + if (qb_inode_ctx_get (this, fd->inode)) + frame->local = inode_ref (fd->inode); + + STACK_WIND (frame, qb_setattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsetattr, fd, buf, valid, xdata); + return 0; +} + + +int +qb_forget (xlator_t *this, inode_t *inode) +{ + return qb_inode_cleanup (this, inode, 0); +} + + +int +qb_release (xlator_t *this, fd_t *fd) +{ + call_frame_t *frame = NULL; + + frame = create_frame (this, this->ctx->pool); + if (!frame) { + gf_log (this->name, GF_LOG_ERROR, + "Could not allocate frame. " + "Leaking QEMU BlockDriverState"); + return -1; + } + + if (qb_local_init (frame) != 0) { + gf_log (this->name, GF_LOG_ERROR, + "Could not allocate local. " + "Leaking QEMU BlockDriverState"); + STACK_DESTROY (frame->root); + return -1; + } + + if (qb_coroutine (frame, qb_co_close) != 0) { + gf_log (this->name, GF_LOG_ERROR, + "Could not allocate coroutine. " + "Leaking QEMU BlockDriverState"); + qb_local_free (this, frame->local); + frame->local = NULL; + STACK_DESTROY (frame->root); + } + + return 0; +} + +int +mem_acct_init (xlator_t *this) +{ + int ret = -1; + + ret = xlator_mem_acct_init (this, gf_qb_mt_end + 1); + + if (ret) + gf_log (this->name, GF_LOG_ERROR, "Memory accounting init " + "failed"); + return ret; +} + + +int +reconfigure (xlator_t *this, dict_t *options) +{ + return 0; +} + + +int +init (xlator_t *this) +{ + qb_conf_t *conf = NULL; + int32_t ret = -1; + static int bdrv_inited = 0; + + if (!this->children || this->children->next) { + gf_log (this->name, GF_LOG_ERROR, + "FATAL: qemu-block (%s) not configured with exactly " + "one child", this->name); + goto out; + } + + conf = GF_CALLOC (1, sizeof (*conf), gf_qb_mt_qb_conf_t); + if (!conf) + goto out; + + /* configure 'option window-size <size>' */ + GF_OPTION_INIT ("default-password", conf->default_password, str, out); + + /* qemu coroutines use "co_mutex" for synchronizing among themselves. + However "co_mutex" itself is not threadsafe if the coroutine framework + is multithreaded (which usually is not). However synctasks are + fundamentally multithreaded, so for now create a syncenv which has + scaling limits set to max 1 thread so that the qemu coroutines can + execute "safely". + + Future work: provide an implementation of "co_mutex" which is + threadsafe and use the global multithreaded ctx->env syncenv. + */ + conf->env = syncenv_new (0, 1, 1); + + this->private = conf; + + ret = 0; + + snprintf (conf->qb_xattr_key, QB_XATTR_KEY_MAX, QB_XATTR_KEY_FMT, + this->name); + + cur_mon = (void *) 1; + + if (!bdrv_inited) { + bdrv_init (); + bdrv_inited = 1; + } + +out: + if (ret) + GF_FREE (conf); + + return ret; +} + + +void +fini (xlator_t *this) +{ + qb_conf_t *conf = NULL; + + conf = this->private; + + this->private = NULL; + + if (conf->root_inode) + inode_unref(conf->root_inode); + GF_FREE (conf); + + return; +} + + +struct xlator_fops fops = { + .lookup = qb_lookup, + .fsetxattr = qb_fsetxattr, + .setxattr = qb_setxattr, + .open = qb_open, + .writev = qb_writev, + .readv = qb_readv, + .fsync = qb_fsync, + .truncate = qb_truncate, + .ftruncate = qb_ftruncate, + .stat = qb_stat, + .fstat = qb_fstat, + .setattr = qb_setattr, + .fsetattr = qb_fsetattr, + .flush = qb_flush, +/* + .getxattr = qb_getxattr, + .fgetxattr = qb_fgetxattr +*/ + .readdirp = qb_readdirp, +}; + + +struct xlator_cbks cbks = { + .forget = qb_forget, + .release = qb_release, +}; + + +struct xlator_dumpops dumpops = { +}; + + +struct volume_options options[] = { + { .key = {"default-password"}, + .type = GF_OPTION_TYPE_STR, + .default_value = "", + .description = "Default password for the AES encrypted block images." + }, + { .key = {NULL} }, +}; diff --git a/xlators/features/qemu-block/src/qemu-block.h b/xlators/features/qemu-block/src/qemu-block.h new file mode 100644 index 000000000..c95f2799a --- /dev/null +++ b/xlators/features/qemu-block/src/qemu-block.h @@ -0,0 +1,109 @@ +/* + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef __QEMU_BLOCK_H +#define __QEMU_BLOCK_H + +#include "syncop.h" +#include "call-stub.h" +#include "block/block_int.h" +#include "monitor/monitor.h" + +/* QB_XATTR_KEY_FMT is the on-disk xattr stored in the inode which + indicates that the file must be "interpreted" by the block format + logic. The value of the key is of the pattern: + + "format:virtual_size" + + e.g + + "qcow2:20GB" or "qed:100GB" + + The format and virtual size are colon separated. The format is + a case sensitive string which qemu recognizes. virtual_size is + specified as a size which glusterfs recognizes as size (i.e., + value accepted by gf_string2bytesize()) +*/ +#define QB_XATTR_KEY_FMT "trusted.glusterfs.%s.format" + +#define QB_XATTR_KEY_MAX 64 + +#define QB_XATTR_VAL_MAX 64 + + +typedef struct qb_inode { + char fmt[QB_XATTR_VAL_MAX]; /* this is only the format, not "format:size" */ + size_t size; /* virtual size in bytes */ + BlockDriverState *bs; + int refcnt; + uuid_t backing_gfid; + char *backing_fname; +} qb_inode_t; + + +typedef struct qb_conf { + Monitor *mon; + struct syncenv *env; + char qb_xattr_key[QB_XATTR_KEY_MAX]; + char *default_password; + inode_t *root_inode; +} qb_conf_t; + + +typedef struct qb_local { + call_frame_t *frame; /* backpointer */ + call_stub_t *stub; + inode_t *inode; + fd_t *fd; + char fmt[QB_XATTR_VAL_MAX+1]; + char name[256]; + synctask_fn_t synctask_fn; + struct list_head list; +} qb_local_t; + +void qb_local_free (xlator_t *this, qb_local_t *local); +int qb_coroutine (call_frame_t *frame, synctask_fn_t fn); +inode_t *qb_inode_from_filename (const char *filename); +int qb_inode_to_filename (inode_t *inode, char *filename, int size); +int qb_format_extract (xlator_t *this, char *format, inode_t *inode); + +qb_inode_t *qb_inode_ctx_get (xlator_t *this, inode_t *inode); + +#define QB_STACK_UNWIND(typ, frame, args ...) do { \ + qb_local_t *__local = frame->local; \ + xlator_t *__this = frame->this; \ + \ + frame->local = NULL; \ + STACK_UNWIND_STRICT (typ, frame, args); \ + if (__local) \ + qb_local_free (__this, __local); \ + } while (0) + +#define QB_STUB_UNWIND(stub, op_ret, op_errno) do { \ + qb_local_t *__local = stub->frame->local; \ + xlator_t *__this = stub->frame->this; \ + \ + stub->frame->local = NULL; \ + call_unwind_error (stub, op_ret, op_errno); \ + if (__local) \ + qb_local_free (__this, __local); \ + } while (0) + +#define QB_STUB_RESUME(stub_errno) do { \ + qb_local_t *__local = stub->frame->local; \ + xlator_t *__this = stub->frame->this; \ + \ + stub->frame->local = NULL; \ + call_resume (stub); \ + if (__local) \ + qb_local_free (__this, __local); \ + } while (0) + +#endif /* !__QEMU_BLOCK_H */ diff --git a/xlators/features/quiesce/src/quiesce.c b/xlators/features/quiesce/src/quiesce.c index 73eb91947..24c7dc6ed 100644 --- a/xlators/features/quiesce/src/quiesce.c +++ b/xlators/features/quiesce/src/quiesce.c @@ -111,7 +111,7 @@ void gf_quiesce_enqueue (xlator_t *this, call_stub_t *stub) { quiesce_priv_t *priv = NULL; - struct timeval timeout = {0,}; + struct timespec timeout = {0,}; priv = this->private; if (!priv) { @@ -129,7 +129,7 @@ gf_quiesce_enqueue (xlator_t *this, call_stub_t *stub) if (!priv->timer) { timeout.tv_sec = 20; - timeout.tv_usec = 0; + timeout.tv_nsec = 0; priv->timer = gf_timer_call_after (this->ctx, timeout, @@ -2492,7 +2492,7 @@ notify (xlator_t *this, int event, void *data, ...) { int ret = 0; quiesce_priv_t *priv = NULL; - struct timeval timeout = {0,}; + struct timespec timeout = {0,}; priv = this->private; if (!priv) @@ -2525,7 +2525,7 @@ notify (xlator_t *this, int event, void *data, ...) if (priv->timer) break; timeout.tv_sec = 20; - timeout.tv_usec = 0; + timeout.tv_nsec = 0; priv->timer = gf_timer_call_after (this->ctx, timeout, diff --git a/xlators/features/quota/src/quota.c b/xlators/features/quota/src/quota.c index 4ea54cca8..c527e7ca7 100644 --- a/xlators/features/quota/src/quota.c +++ b/xlators/features/quota/src/quota.c @@ -2969,6 +2969,177 @@ err: return 0; } +int32_t +quota_fallocate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + int32_t ret = 0; + uint64_t ctx_int = 0; + quota_inode_ctx_t *ctx = NULL; + quota_local_t *local = NULL; + quota_dentry_t *dentry = NULL; + int64_t delta = 0; + + local = frame->local; + + if ((op_ret < 0) || (local == NULL)) { + goto out; + } + + ret = inode_ctx_get (local->loc.inode, this, &ctx_int); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, + "%s: failed to get the context", local->loc.path); + goto out; + } + + ctx = (quota_inode_ctx_t *)(unsigned long) ctx_int; + + if (ctx == NULL) { + gf_log (this->name, GF_LOG_WARNING, + "quota context not set in %s (gfid:%s)", + local->loc.path, uuid_utoa (local->loc.inode->gfid)); + goto out; + } + + LOCK (&ctx->lock); + { + ctx->buf = *postbuf; + } + UNLOCK (&ctx->lock); + + list_for_each_entry (dentry, &ctx->parents, next) { + delta = (postbuf->ia_blocks - prebuf->ia_blocks) * 512; + quota_update_size (this, local->loc.inode, + dentry->name, dentry->par, delta); + } + +out: + QUOTA_STACK_UNWIND (fallocate, frame, op_ret, op_errno, prebuf, postbuf, + xdata); + + return 0; +} + +int32_t +quota_fallocate_helper(call_frame_t *frame, xlator_t *this, fd_t *fd, + int32_t mode, off_t offset, size_t len, dict_t *xdata) +{ + quota_local_t *local = NULL; + int32_t op_errno = EINVAL; + + local = frame->local; + if (local == NULL) { + gf_log (this->name, GF_LOG_WARNING, "local is NULL"); + goto unwind; + } + + if (local->op_ret == -1) { + op_errno = local->op_errno; + goto unwind; + } + + STACK_WIND (frame, quota_fallocate_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fallocate, fd, mode, offset, len, + xdata); + return 0; + +unwind: + QUOTA_STACK_UNWIND (fallocate, frame, -1, op_errno, NULL, NULL, NULL); + return 0; +} + +int32_t +quota_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode, + off_t offset, size_t len, dict_t *xdata) +{ + int32_t ret = -1, op_errno = EINVAL; + int32_t parents = 0; + quota_local_t *local = NULL; + quota_inode_ctx_t *ctx = NULL; + quota_priv_t *priv = NULL; + call_stub_t *stub = NULL; + quota_dentry_t *dentry = NULL; + + GF_ASSERT (frame); + GF_VALIDATE_OR_GOTO ("quota", this, unwind); + GF_VALIDATE_OR_GOTO (this->name, fd, unwind); + + local = quota_local_new (); + if (local == NULL) { + goto unwind; + } + + frame->local = local; + local->loc.inode = inode_ref (fd->inode); + + ret = quota_inode_ctx_get (fd->inode, -1, this, NULL, NULL, &ctx, 0); + if (ctx == NULL) { + gf_log (this->name, GF_LOG_WARNING, + "quota context not set in inode (gfid:%s)", + uuid_utoa (fd->inode->gfid)); + goto unwind; + } + + stub = fop_fallocate_stub(frame, quota_fallocate_helper, fd, mode, offset, len, + xdata); + if (stub == NULL) { + op_errno = ENOMEM; + goto unwind; + } + + priv = this->private; + GF_VALIDATE_OR_GOTO (this->name, priv, unwind); + + LOCK (&ctx->lock); + { + list_for_each_entry (dentry, &ctx->parents, next) { + parents++; + } + } + UNLOCK (&ctx->lock); + + /* + * Note that by using len as the delta we're assuming the range from + * offset to offset+len has not already been allocated. This can result + * in ENOSPC errors attempting to allocate an already allocated range. + */ + local->delta = len; + local->stub = stub; + local->link_count = parents; + + list_for_each_entry (dentry, &ctx->parents, next) { + ret = quota_check_limit (frame, fd->inode, this, dentry->name, + dentry->par); + if (ret == -1) { + break; + } + } + + stub = NULL; + + LOCK (&local->lock); + { + local->link_count = 0; + if (local->validate_count == 0) { + stub = local->stub; + local->stub = NULL; + } + } + UNLOCK (&local->lock); + + if (stub != NULL) { + call_resume (stub); + } + + return 0; + +unwind: + QUOTA_STACK_UNWIND (fallocate, frame, -1, op_errno, NULL, NULL, NULL); + return 0; +} + int32_t mem_acct_init (xlator_t *this) @@ -3304,6 +3475,7 @@ struct xlator_fops fops = { .removexattr = quota_removexattr, .fremovexattr = quota_fremovexattr, .readdirp = quota_readdirp, + .fallocate = quota_fallocate, }; struct xlator_cbks cbks = { |
