diff options
Diffstat (limited to 'xlators/storage')
| -rw-r--r-- | xlators/storage/Makefile.am | 3 | ||||
| -rw-r--r-- | xlators/storage/bdb/Makefile.am | 3 | ||||
| -rw-r--r-- | xlators/storage/bdb/src/Makefile.am | 18 | ||||
| -rw-r--r-- | xlators/storage/bdb/src/bctx.c | 394 | ||||
| -rw-r--r-- | xlators/storage/bdb/src/bdb-ll.c | 1455 | ||||
| -rw-r--r-- | xlators/storage/bdb/src/bdb.c | 3371 | ||||
| -rw-r--r-- | xlators/storage/bdb/src/bdb.h | 439 | ||||
| -rw-r--r-- | xlators/storage/posix/Makefile.am | 3 | ||||
| -rw-r--r-- | xlators/storage/posix/src/Makefile.am | 17 | ||||
| -rw-r--r-- | xlators/storage/posix/src/posix.c | 3715 | ||||
| -rw-r--r-- | xlators/storage/posix/src/posix.h | 110 | ||||
| -rw-r--r-- | xlators/storage/posix/src/xattr-cache.c | 521 | ||||
| -rw-r--r-- | xlators/storage/posix/src/xattr-cache.h | 65 | 
13 files changed, 10114 insertions, 0 deletions
diff --git a/xlators/storage/Makefile.am b/xlators/storage/Makefile.am new file mode 100644 index 00000000000..59b9689699e --- /dev/null +++ b/xlators/storage/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = posix $(BDB_SUBDIR) + +CLEANFILES =  diff --git a/xlators/storage/bdb/Makefile.am b/xlators/storage/bdb/Makefile.am new file mode 100644 index 00000000000..d471a3f9243 --- /dev/null +++ b/xlators/storage/bdb/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = src + +CLEANFILES =  diff --git a/xlators/storage/bdb/src/Makefile.am b/xlators/storage/bdb/src/Makefile.am new file mode 100644 index 00000000000..c0ab394bc58 --- /dev/null +++ b/xlators/storage/bdb/src/Makefile.am @@ -0,0 +1,18 @@ + +xlator_LTLIBRARIES = bdb.la +xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/storage + +bdb_la_LDFLAGS = -module -avoidversion + +bdb_la_SOURCES = bctx.c bdb-ll.c bdb.c +bdb_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la  + +noinst_HEADERS = bdb.h  + +AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D__USE_FILE_OFFSET64 -D_GNU_SOURCE -D$(GF_HOST_OS) -Wall \ +	-I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS) + +AM_LDFLAGS = -ldb + +CLEANFILES =  + diff --git a/xlators/storage/bdb/src/bctx.c b/xlators/storage/bdb/src/bctx.c new file mode 100644 index 00000000000..2bfa3ea8762 --- /dev/null +++ b/xlators/storage/bdb/src/bctx.c @@ -0,0 +1,394 @@ +/* +  Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com> +  This file is part of GlusterFS. + +  GlusterFS is free software; you can redistribute it and/or modify +  it under the terms of the GNU General Public License as published +  by the Free Software Foundation; either version 3 of the License, +  or (at your option) any later version. + +  GlusterFS is distributed in the hope that it will be useful, but +  WITHOUT ANY WARRANTY; without even the implied warranty of +  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU +  General Public License for more details. 
+ +  You should have received a copy of the GNU General Public License +  along with this program.  If not, see +  <http://www.gnu.org/licenses/>. +*/ + +#include <list.h> +#include <bdb.h> +#include <libgen.h> /* for dirname */ + +static void +__destroy_bctx (bctx_t *bctx) +{ +	if (bctx->directory) +		FREE (bctx->directory); +   +	if (bctx->db_path) +		FREE (bctx->db_path); +   +	FREE (bctx); +} + +static void +__unhash_bctx (bctx_t *bctx) +{ +	list_del_init (&bctx->b_hash); +} + +static int32_t +bctx_table_prune (bctx_table_t *table) +{ +	int32_t ret = 0; +	struct list_head purge = {0,}; +	struct list_head *next = NULL; +	bctx_t *entry = NULL; +	bctx_t *del = NULL, *tmp = NULL; +   +	if (!table) +		return 0; +   +	INIT_LIST_HEAD (&purge); +   +	LOCK (&table->lock); +	{ +		if ((table->lru_limit) && +		    (table->lru_size > table->lru_limit)) { +			while (table->lru_size > table->lru_limit) { +				next = table->b_lru.next; +				entry = list_entry (next, bctx_t, list); +	 +				list_move_tail (next, &table->purge); +				__unhash_bctx (entry); +	 +				table->lru_size--; +				ret++; +			} +		} +		list_move_tail (&purge, &table->purge); +		list_del_init (&table->purge); +	} +	UNLOCK (&table->lock); +   +	{ +		list_for_each_entry_safe (del, tmp, &purge, list) { +			list_del_init (&del->list); +			if (del->dbp) { +				ret = del->dbp->close (del->dbp, 0); +				if (ret != 0) { +					gf_log (table->this->name, GF_LOG_ERROR, +						"failed to close db on path (%s): %s",  +						del->directory, db_strerror (ret)); +				} else { +					gf_log (table->this->name, GF_LOG_WARNING, +						"close db for path %s; table->lru_count = %d",  +						del->directory, table->lru_size); +				} +			} +			__destroy_bctx (del); +		} +	} + +	return ret; +} + + +/* struct bdb_ctx related */ +static inline uint32_t +bdb_key_hash (char *key, uint32_t hash_size) +{ +	uint32_t hash = 0; +   +	hash = *key; +   +	if (hash) { +		for (key += 1; *key != '\0'; key++) { +			hash = (hash << 5) - hash + *key; 
+		} +	} +   +	return (hash + *key) % hash_size; +} + +static void +__hash_bctx (bctx_t *bctx) +{ +	bctx_table_t *table = NULL; +	char *key = NULL; + +	table = bctx->table; + +	MAKE_KEY_FROM_PATH (key, bctx->directory); +	bctx->key_hash = bdb_key_hash (key, table->hash_size); + +	list_del_init (&bctx->b_hash); +	list_add (&bctx->b_hash, &table->b_hash[bctx->key_hash]); +} + +static inline bctx_t * +__bctx_passivate (bctx_t *bctx) +{ +	if (bctx->dbp) { +		list_move_tail (&bctx->list, &(bctx->table->b_lru)); +		bctx->table->lru_size++; +	} else { +		list_move_tail (&bctx->list, &bctx->table->purge); +		__unhash_bctx (bctx); +	} +	return bctx; +} + +static inline bctx_t * +__bctx_activate (bctx_t *bctx) +{ +	list_move (&bctx->list, &bctx->table->active); +	bctx->table->lru_size--; +   +	return bctx; +} + +static bctx_t * +__bdb_ctx_unref (bctx_t *bctx) +{ +	assert (bctx->ref); +   +	--bctx->ref; +   +	if (!bctx->ref) +		bctx = __bctx_passivate (bctx); +   +	return bctx; +} + + +bctx_t * +bctx_unref (bctx_t *bctx) +{ +	bctx_table_t *table = NULL; + +	if (!bctx && !bctx->table) +		return NULL; +   +	table = bctx->table; + +	LOCK (&table->lock);     +	{ +		bctx = __bdb_ctx_unref (bctx); +	} +	UNLOCK (&table->lock); +   +	bctx_table_prune (table); + +	return bctx; +} + +/* + * NOTE: __bdb_ctx_ref() is called only after holding table->lock and bctx->lock, in that order + */ +static inline bctx_t * +__bctx_ref (bctx_t *bctx) +{ +	if (!bctx->ref) +		__bctx_activate (bctx); + +	bctx->ref++; + +	return bctx; +} + +bctx_t * +bctx_ref (bctx_t *bctx) +{ +	LOCK (&(bctx->table->lock)); +	{ +		__bctx_ref (bctx); +	} +	UNLOCK (&(bctx->table->lock)); + +	return bctx; +} + + +#define BDB_THIS(table) (table->this) + +static inline bctx_t * +__create_bctx (bctx_table_t *table, +	       const char *path) +{ +	bctx_t *bctx = NULL; +	char *db_path = NULL; + +	bctx = CALLOC (1, sizeof (*bctx)); +	GF_VALIDATE_OR_GOTO ("bctx", bctx, out); + +	bctx->table = table; +	bctx->directory = strdup 
(path); +	GF_VALIDATE_OR_GOTO ("bctx", bctx->directory, out); + +	MAKE_REAL_PATH_TO_STORAGE_DB (db_path, BDB_THIS (table), path); + +	bctx->db_path = strdup (db_path); +	GF_VALIDATE_OR_GOTO ("bctx", bctx->directory, out); + +	INIT_LIST_HEAD (&bctx->c_list); +	INIT_LIST_HEAD (&bctx->list); +	INIT_LIST_HEAD (&bctx->b_hash); + +	LOCK_INIT (&bctx->lock); + +	__hash_bctx (bctx); + +	list_add (&bctx->list, &table->b_lru); +	table->lru_size++; + +out: +	return bctx; +} + +/* bctx_lookup - lookup bctx_t for the directory @directory. (see description of bctx_t in bdb.h) + * + * @table:     bctx_table_t for this instance of bdb. + * @directory: directory for which bctx_t is being looked up.  + */ +bctx_t * +bctx_lookup (bctx_table_t *table,  +	     const char *directory) +{ +	char    *key = NULL; +	uint32_t key_hash = 0; +	bctx_t  *trav = NULL, *bctx = NULL, *tmp = NULL; +	int32_t  need_break = 0; +	 +	GF_VALIDATE_OR_GOTO ("bctx", table, out); +	GF_VALIDATE_OR_GOTO ("bctx", directory, out); + +	MAKE_KEY_FROM_PATH (key, directory); +	key_hash = bdb_key_hash (key, table->hash_size); + +	LOCK (&table->lock); +	{ +		if (!list_empty (&table->b_hash[key_hash])) { +			list_for_each_entry_safe (trav, tmp, &table->b_hash[key_hash], b_hash) { +				LOCK(&trav->lock); +				if (!strcmp(trav->directory, directory)) { +					bctx = __bctx_ref (trav); +					need_break = 1; +				}  +				UNLOCK(&trav->lock); +				if (need_break) +					break; +			} +		} +     +		if (!bctx) { +			bctx = __create_bctx (table, directory); +			bctx = __bctx_ref (bctx); +		}  +	} +	UNLOCK (&table->lock); +out: +	return bctx; +} + + +bctx_t * +bctx_parent (bctx_table_t *table, +	     const char *path) +{ +	char   *pathname = NULL, *directory = NULL; +	bctx_t *bctx = NULL; + +	GF_VALIDATE_OR_GOTO ("bctx", table, out); +	GF_VALIDATE_OR_GOTO ("bctx", path, out); +   +	pathname = strdup (path); +	GF_VALIDATE_OR_GOTO ("bctx", pathname, out); +	directory = dirname (pathname); + +	bctx = bctx_lookup (table, directory); +	
GF_VALIDATE_OR_GOTO ("bctx", bctx, out); +   +out: +	if (pathname) +		free (pathname); +	return bctx; +} + +inline int32_t +bdb_db_rename (bctx_table_t *table,  +	       const char *oldpath,  +	       const char *newpath) +{ +	DB_ENV *dbenv = NULL; +	int32_t ret = -1; +	 +	GF_VALIDATE_OR_GOTO ("bctx", table, out); +	GF_VALIDATE_OR_GOTO ("bctx", oldpath, out); +	GF_VALIDATE_OR_GOTO ("bctx", newpath, out); + +	dbenv = table->dbenv; +	GF_VALIDATE_OR_GOTO ("bctx", dbenv, out); + +	LOCK (&table->lock); +	{ +		ret = dbenv->dbrename (dbenv, NULL, oldpath, NULL, newpath, 0); +     +		if (ret != 0) { +			gf_log ("bctx", +				GF_LOG_ERROR, +				"failed to rename %s to %s: %s",  +				oldpath, newpath, db_strerror (ret)); +		} else { +			gf_log ("bctx", +				GF_LOG_DEBUG, +				"successfully renamed %s to %s: %s", +				oldpath, newpath, db_strerror (ret)); +		} +	} +	UNLOCK (&table->lock); + +out: +	return ret; +} + +bctx_t * +bctx_rename (bctx_t *bctx,  +	     const char *db_newpath) +{ +	bctx_table_t *table = NULL; +	int32_t ret = -1; + +	table = bctx->table; + +	LOCK (&table->lock); +	{ +		__unhash_bctx (bctx); +		list_del_init (&bctx->list); +		if (bctx->dbp) { +			ret = bctx->dbp->close (bctx->dbp, 0); +			if (ret != 0) { +				gf_log ("bdb-ll", +					GF_LOG_ERROR, +					"failed to close db for directory %s (%s)", +					bctx->directory, db_strerror (ret)); +			} +			bctx->dbp = NULL; +		} +	} +	UNLOCK (&table->lock); +   +	ret = bdb_db_rename (table, bctx->db_path, db_newpath); +	 +	if (ret != 0) { +		gf_log ("bctx", +			GF_LOG_ERROR, +			"bdb_db_rename failed for directory %s", +			bctx->directory); +		bctx = NULL; +	}  + +	return bctx; +} diff --git a/xlators/storage/bdb/src/bdb-ll.c b/xlators/storage/bdb/src/bdb-ll.c new file mode 100644 index 00000000000..40e7d187759 --- /dev/null +++ b/xlators/storage/bdb/src/bdb-ll.c @@ -0,0 +1,1455 @@ +/* +  Copyright (c) 2008, 2009 Z RESEARCH, Inc. <http://www.zresearch.com> +  This file is part of GlusterFS. 
+ +  GlusterFS is free software; you can redistribute it and/or modify +  it under the terms of the GNU General Public License as published +  by the Free Software Foundation; either version 3 of the License, +  or (at your option) any later version. + +  GlusterFS is distributed in the hope that it will be useful, but +  WITHOUT ANY WARRANTY; without even the implied warranty of +  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU +  General Public License for more details. + +  You should have received a copy of the GNU General Public License +  along with this program.  If not, see +  <http://www.gnu.org/licenses/>. +*/ + +#include <libgen.h> +#include "bdb.h" +#include <list.h> +/*  + * implement the procedures to interact with bdb */ + +/**************************************************************** + * + * General wrappers and utility procedures for bdb xlator + * + ****************************************************************/ +#define BDB_LL_PAGE_SIZE_DEFAULT    4096 +#define BDB_LL_PAGE_SIZE_MIN    4096 +#define BDB_LL_PAGE_SIZE_MAX    65536 + +ino_t +bdb_inode_transform (ino_t parent, +                     bctx_t *bctx) +{ +	struct bdb_private *private = NULL; +	ino_t               ino = -1; +	 +	GF_VALIDATE_OR_GOTO ("bdb-ll", bctx, out); + +	private = bctx->table->this->private; + +	LOCK (&private->ino_lock); +	ino = ++private->next_ino; +	UNLOCK (&private->ino_lock); +out: +	return ino; +} + + +/*********************************************************** + * + *  bdb storage database utilities + * + **********************************************************/ + +/* + * bdb_db_open - opens a storage db. + * + * @ctx: context specific to the directory for which we are supposed to open db + * + * see, if we have empty slots to open a db. 
+ *      if (no-empty-slots), then prune open dbs and close as many as possible + *      if (empty-slot-available), tika muchkonDu db open maaDu + * + * NOTE: illi baro munche lock hiDkobEku + */ +static DB * +bdb_db_open (bctx_t *bctx) +{ +	DB *storage_dbp = NULL; +	int32_t op_ret = -1; +	bctx_table_t *table = NULL; +	 +	GF_VALIDATE_OR_GOTO ("bdb-ll", bctx, out); + +	table = bctx->table; +	GF_VALIDATE_OR_GOTO ("bdb-ll", table, out); + +	/* we have to do the following, we can't deny someone of db_open ;) */ +	op_ret = db_create (&storage_dbp, table->dbenv, 0); +	if (op_ret != 0) { +		gf_log ("bdb-ll", GF_LOG_ERROR, +			"failed to do db_create for directory %s (%s)", +			bctx->directory, db_strerror (op_ret)); +		storage_dbp = NULL; +		goto out; +	}  + +	if (table->page_size) { +		op_ret = storage_dbp->set_pagesize (storage_dbp,  +						    table->page_size); +		if (op_ret != 0) { +			gf_log ("bdb-ll", GF_LOG_ERROR,  +				"failed to set the page_size (%"PRIu64") for directory %s (%s)",  +				table->page_size, bctx->directory, db_strerror (op_ret)); +		} else { +			gf_log ("bdb-ll", GF_LOG_DEBUG, +				"page-size (%"PRIu64") set on DB",  +				table->page_size); +		} +	} +      +	op_ret = storage_dbp->open (storage_dbp, +				    NULL, +				    bctx->db_path, +				    NULL, +				    table->access_mode, +				    table->dbflags, +				    0); +	if (op_ret != 0 ) { +		gf_log ("bdb-ll", +			GF_LOG_ERROR, +			"failed to open storage-db for directory %s (%s)",  +			bctx->db_path, db_strerror (op_ret)); +		storage_dbp = NULL; +	}  + +out: +	return storage_dbp; +} + + + +int32_t +bdb_cursor_close (bctx_t *bctx, +		  DBC *cursorp) +{ +	int32_t ret = -1; +	 +	GF_VALIDATE_OR_GOTO ("bdb-ll", bctx, out); +	GF_VALIDATE_OR_GOTO ("bdb-ll", cursorp, out); + +	LOCK (&bctx->lock); +	{ +#ifdef HAVE_BDB_CURSOR_GET +		ret = cursorp->close (cursorp); +#else +		ret = cursorp->c_close (cursorp); +#endif +		if ((ret != 0)) { +			gf_log ("bdb-ll", +				GF_LOG_ERROR, +				"failed to close db cursor 
for directory %s (%s)", +				bctx->directory, db_strerror (ret)); +		} +	} +	UNLOCK (&bctx->lock); + +out:  +	return ret; +} + + +int32_t +bdb_cursor_open (bctx_t *bctx, +		 DBC **cursorpp) +{ +	int32_t ret = -1; +	 +	GF_VALIDATE_OR_GOTO ("bdb-ll", bctx, out); +	GF_VALIDATE_OR_GOTO ("bdb-ll", cursorpp, out); + +	LOCK (&bctx->lock); +	{ +		if (bctx->dbp) { +			/* do nothing, just continue */ +			ret = 0; +		} else { +			bctx->dbp = bdb_db_open (bctx); +			if (!bctx->dbp) { +				gf_log ("bdb-ll", +					GF_LOG_ERROR, +					"failed to open storage db for %s",  +					bctx->directory); +				ret = -1; +			} else { +				ret = 0; +			} +		} +   +		if (ret == 0) { +			/* all set, lets open cursor */ +			ret = bctx->dbp->cursor (bctx->dbp, NULL, cursorpp, 0); +			if (ret != 0) { +				gf_log ("bdb-ll", +					GF_LOG_ERROR, +					"failed to create a cursor for %s (%s)",  +					bctx->directory, db_strerror (ret)); +			} +		} +	} +	UNLOCK (&bctx->lock); + +out: +	return ret; +} + + +/* cache related */ +static bdb_cache_t * +bdb_cache_lookup (bctx_t *bctx, +                  char *path) +{ +	bdb_cache_t *bcache = NULL;  +	bdb_cache_t *trav   = NULL; +	char        *key    = NULL; +	 +	GF_VALIDATE_OR_GOTO ("bdb-ll", bctx, out); +	GF_VALIDATE_OR_GOTO ("bdb-ll", path, out); + +	MAKE_KEY_FROM_PATH (key, path); + +	LOCK (&bctx->lock); +	{ +		list_for_each_entry (trav, &bctx->c_list, c_list) { +			if (!strcmp (trav->key, key)){ +				bcache = trav; +				break; +			} +		} +	} +	UNLOCK (&bctx->lock); + +out: +	return bcache; +} + +static int32_t +bdb_cache_insert (bctx_t *bctx,  +		  DBT *key,  +		  DBT *data) +{ +	bdb_cache_t *bcache = NULL; +	int32_t ret = -1; + +	GF_VALIDATE_OR_GOTO ("bdb-ll", bctx, out); +	GF_VALIDATE_OR_GOTO ("bdb-ll", key, out); +	GF_VALIDATE_OR_GOTO ("bdb-ll", data, out); + +	LOCK (&bctx->lock); +	{ +		if (bctx->c_count > 5) { +			/* most of the times, we enter here */ +			/* FIXME: ugly, not supposed to disect any of the +			 * 'struct list_head' directly */ +			if 
(!list_empty (&bctx->c_list)) { +				bcache = list_entry (bctx->c_list.prev, bdb_cache_t, c_list); +				list_del_init (&bcache->c_list); +			} +			if (bcache->key) { +				free (bcache->key); +				bcache->key = strdup ((char *)key->data); +				GF_VALIDATE_OR_GOTO ("bdb-ll", bcache->key, unlock); +			} else { +				/* should never come here */ +				gf_log ("bdb-ll", +					GF_LOG_CRITICAL, +					"bcache->key (null)"); +			} /* if(bcache->key)...else */ +			if (bcache->data) { +				free (bcache->data); +				bcache->data = memdup (data->data, data->size); +				GF_VALIDATE_OR_GOTO ("bdb-ll", bcache->data, unlock); +				bcache->size = data->size; +			} else { +				/* should never come here */ +				gf_log ("bdb-ll", +					GF_LOG_CRITICAL, +					"bcache->data (null)"); +			} /* if(bcache->data)...else */ +			list_add (&bcache->c_list, &bctx->c_list); +			ret = 0; +		} else { +			/* we will be entering here very rarely */ +			bcache = CALLOC (1, sizeof (*bcache));     +			GF_VALIDATE_OR_GOTO ("bdb-ll", bcache, unlock); +			bcache->key = strdup ((char *)(key->data)); +			GF_VALIDATE_OR_GOTO ("bdb-ll", bcache->key, unlock); +			bcache->data = memdup (data->data, data->size); +			GF_VALIDATE_OR_GOTO ("bdb-ll", bcache->data, unlock); +			bcache->size = data->size; +			list_add (&bcache->c_list, &bctx->c_list); +			bctx->c_count++; +			ret = 0; +		} /* if(private->c_count < 5)...else */ +	} +unlock: +	UNLOCK (&bctx->lock); +out: +	return ret; +} + +static int32_t +bdb_cache_delete (bctx_t *bctx, +		  char *key) +{ +	bdb_cache_t *bcache = NULL;  +	bdb_cache_t *trav   = NULL; +	 +	GF_VALIDATE_OR_GOTO ("bdb-ll", bctx, out); +	GF_VALIDATE_OR_GOTO ("bdb-ll", key, out); + +	LOCK (&bctx->lock); +	{ +		list_for_each_entry (trav, &bctx->c_list, c_list) { +			if (!strcmp (trav->key, key)){ +				bctx->c_count--; +				bcache = trav; +				break; +			} +		} +   +		if (bcache) { +			list_del_init (&bcache->c_list); +			free (bcache->key); +			free (bcache->data); +			free (bcache); +		} +	} +	
UNLOCK (&bctx->lock); + +out: +	return 0; +} + +void * +bdb_db_stat (bctx_t *bctx,  +	     DB_TXN *txnid, +	     uint32_t flags) +{ +	DB     *storage = NULL; +	void   *stat    = NULL; +	int32_t ret     = -1; + +	LOCK (&bctx->lock); +	{ +		if (bctx->dbp == NULL) { +			bctx->dbp = bdb_db_open (bctx); +			storage = bctx->dbp; +		} else { +			/* we are just fine, lets continue */ +			storage = bctx->dbp; +		} /* if(bctx->dbp==NULL)...else */ +	} +	UNLOCK (&bctx->lock); + +	GF_VALIDATE_OR_GOTO ("bdb-ll", storage, out); + +	ret = storage->stat (storage, txnid, &stat, flags); +     +	if (ret != 0) { +		gf_log ("bdb-ll", +			GF_LOG_ERROR, +			"failed to do DB->stat() on db file %s: %s",  +			bctx->db_path, db_strerror (ret)); +	} else { +		gf_log ("bdb-ll", +			GF_LOG_DEBUG, +			"successfully called DB->stat() on db file %s",  +			bctx->db_path); +	} +out: +	return stat; +   +} + +/* bdb_storage_get - retrieve a key/value pair corresponding to @path from the corresponding + *                   db file. + * + * @bctx: bctx_t * corresponding to the parent directory of @path. (should always be a valid + *        bctx).  bdb_storage_get should never be called if @bctx = NULL. + * @txnid: NULL if bdb_storage_get is not embedded in an explicit transaction or a valid + *         DB_TXN *, when embedded in an explicit transaction.         + * @path: path of the file to read from (translated to a database key using MAKE_KEY_FROM_PATH) + * @buf: char ** - pointer to a pointer to char. a read buffer is created in this procedure + *       and pointer to the buffer is passed through @buf to the caller. + * @size: size of the file content to be read. + * @offset: offset from which the file content to be read. + * + * NOTE: bdb_storage_get tries to open DB, if @bctx->dbp == NULL (@bctx->dbp == NULL,  + *      nobody has opened DB till now or DB was closed by bdb_table_prune()). 
+ * + * NOTE: if private->cache is set (bdb xlator's internal caching enabled), then bdb_storage_get + *      first looks up the cache for key/value pair. if bdb_lookup_cache fails, then only  + *      DB->get() is called. also,  inserts a newly read key/value pair to cache through + *      bdb_insert_to_cache. + * + * return: 'number of bytes read' on success or -1 on error. + * + * also see: bdb_lookup_cache, bdb_insert_to_cache for details about bdb xlator's internal cache. + */ +int32_t +bdb_db_get (bctx_t *bctx, +	    DB_TXN *txnid, +	    const char *path, +	    char **buf, +	    size_t size, +	    off_t offset) +{ +	DB          *storage    = NULL; +	DBT          key        = {0,}; +	DBT          value      = {0,}; +	int32_t      ret        = -1; +	char        *key_string = NULL; +	bdb_cache_t *bcache     = NULL; +	int32_t      db_flags   = 0; +	uint8_t      need_break = 0; +	int32_t      retries    = 1; +	 +	GF_VALIDATE_OR_GOTO ("bdb-ll", bctx, out); +	GF_VALIDATE_OR_GOTO ("bdb-ll", path, out); + +	MAKE_KEY_FROM_PATH (key_string, path); +	 +	if (bctx->cache &&  +	    ((bcache = bdb_cache_lookup (bctx, key_string)) != NULL)) { +		if (buf) { +			*buf = CALLOC (1, bcache->size); +			GF_VALIDATE_OR_GOTO ("bdb-ll", buf, out); +			memcpy (*buf, (bcache->data + offset), bcache->size); +		} +		ret = bcache->size; +	} else { +		LOCK (&bctx->lock); +		{ +			if (bctx->dbp == NULL) { +				bctx->dbp = bdb_db_open (bctx); +				storage = bctx->dbp; +			} else { +				/* we are just fine, lets continue */ +				storage = bctx->dbp; +			} /* if(bctx->dbp==NULL)...else */ +		} +		UNLOCK (&bctx->lock); +		 +		GF_VALIDATE_OR_GOTO ("bdb-ll", storage, out); + +		key.data = (char *)key_string; +		key.size = strlen (key_string); +		key.flags = DB_DBT_USERMEM; +       +		if (bctx->cache){ +			/* we are called to return the size of the file */ +			value.flags = DB_DBT_MALLOC; +		} else { +			if (size) { +				value.flags = DB_DBT_MALLOC | DB_DBT_PARTIAL; +			} else { +				value.flags = 
DB_DBT_MALLOC; +			} +			value.dlen = size; +			value.doff = offset; +		} +       +		do { +			/* TODO: we prefer to give our own buffer to value.data +			 * and ask bdb to fill in it */ +			ret = storage->get (storage, txnid, &key, &value, db_flags); +         +			if (ret == DB_NOTFOUND) { +				gf_log ("bdb-ll", +					GF_LOG_DEBUG, +					"failed to do DB->get() for key: %s." +					" key not found in storage DB", key_string); +				ret = -1; +				need_break = 1; +			} else if (ret == DB_LOCK_DEADLOCK) { +				retries++; +				gf_log ("bdb-ll", +					GF_LOG_ERROR, +					"deadlock detected in DB->put. retrying DB->put (%d)",  +					retries); +			}else if (ret == 0) { +				/* successfully read data, lets set everything in place +				 * and return */ +				if (buf) { +					*buf = CALLOC (1, value.size); +					ERR_ABORT (*buf); +					memcpy (*buf, value.data, value.size); +				} +				ret = value.size; +				if (bctx->cache) +					bdb_cache_insert (bctx, &key, &value); +				free (value.data); +				need_break = 1; +			} else { +				gf_log ("bdb-ll", +					GF_LOG_ERROR, +					"failed to do DB->get() for key %s: %s",  +					key_string, db_strerror (ret)); +				ret = -1; +				need_break = 1; +			} +		} while (!need_break); +	} +out:   +	return ret; +}/* bdb_db_get */ + +/* bdb_storage_put - insert a key/value specified to the corresponding DB. + * + * @bctx: bctx_t * corresponding to the parent directory of @path.  + *        (should always be a valid bctx). bdb_storage_put should never be called if @bctx = NULL. + * @txnid: NULL if bdb_storage_put is not embedded in an explicit transaction or a valid + *         DB_TXN *, when embedded in an explicit transaction.  + * @key_string: key of the database entry. + * @buf: pointer to the buffer data to be written as data for @key_string. + * @size: size of @buf. + * @offset: offset in the key's data to be modified with provided data. + * @flags: valid flags are BDB_TRUNCATE_RECORD (to reduce the data of @key_string to 0 size). 
+ * + * NOTE: bdb_storage_put tries to open DB, if @bctx->dbp == NULL (@bctx->dbp == NULL,  + *      nobody has opened DB till now or DB was closed by bdb_table_prune()). + * + * NOTE: bdb_storage_put deletes the key/value from bdb xlator's internal cache. + * + * return: 0 on success or -1 on error. + * + * also see: bdb_cache_delete for details on how a cached key/value pair is removed. + */ +int32_t +bdb_db_put (bctx_t *bctx, +	    DB_TXN *txnid, +	    const char *key_string, +	    const char *buf, +	    size_t size, +	    off_t offset, +	    int32_t flags) +{ +	DB     *storage = NULL; +	DBT     key = {0,}, value = {0,}; +	int32_t ret = -1; +	int32_t db_flags = DB_AUTO_COMMIT; +	uint8_t need_break = 0; +	int32_t retries = 1; + +	LOCK (&bctx->lock); +	{ +		if (bctx->dbp == NULL) { +			bctx->dbp = bdb_db_open (bctx); +			storage = bctx->dbp; +		} else { +			/* we are just fine, lets continue */ +			storage = bctx->dbp; +		} +	} +	UNLOCK (&bctx->lock); +	 +	GF_VALIDATE_OR_GOTO ("bdb-ll", storage, out); + +	if (bctx->cache) { +		ret = bdb_cache_delete (bctx, (char *)key_string); +		GF_VALIDATE_OR_GOTO ("bdb-ll", (ret == 0), out); +	} + +	key.data = (void *)key_string; +	key.size = strlen (key_string); +     +	/* NOTE: bdb lets us expand the file, suppose value.size > value.len, then value.len bytes +	 *      from value.doff offset and value.size bytes will be written from value.doff and  +	 *      data from value.doff + value.dlen will be pushed value.doff + value.size +	 */ +	value.data = (void *)buf; + +	if (flags & BDB_TRUNCATE_RECORD) { +		value.size = size; +		value.doff = 0; +		value.dlen = offset; +	} else { +		value.size = size; +		value.dlen = size; +		value.doff = offset; +	} +	value.flags = DB_DBT_PARTIAL; +	if (buf == NULL && size == 0)  +		/* truncate called us */ +		value.flags = 0; +     +	do { +		ret = storage->put (storage, txnid, &key, &value, db_flags); +		if (ret == DB_LOCK_DEADLOCK) { +			retries++; +			gf_log ("bdb-ll", +				GF_LOG_ERROR, +				
"deadlock detected in DB->put. retrying DB->put (%d)",  +				retries); +		} else if (ret) { +			/* write failed */ +			gf_log ("bdb-ll", +				GF_LOG_ERROR, +				"failed to do DB->put() for key %s: %s",  +				key_string, db_strerror (ret)); +			need_break = 1; +		} else { +			/* successfully wrote */ +			ret = 0; +			need_break = 1; +		} +	} while (!need_break); +out: +	return ret; +}/* bdb_db_put */ + + +/* bdb_storage_del - delete a key/value pair corresponding to @path from corresponding db file. + * + * @bctx: bctx_t * corresponding to the parent directory of @path.  + *       (should always be a valid bctx). bdb_storage_del should never be called + *       if @bctx = NULL. + * @txnid: NULL if bdb_storage_del is not embedded in an explicit transaction or a + *         valid DB_TXN *, when embedded in an explicit transaction.  + * @path: path to the file, whose key/value pair has to be deleted. + * + * NOTE: bdb_storage_del tries to open DB, if @bctx->dbp == NULL (@bctx->dbp == NULL,  + *      nobody has opened DB till now or DB was closed by bdb_table_prune()). + * + * return: 0 on success or -1 on error. 
+ */ +int32_t +bdb_db_del (bctx_t *bctx, +	    DB_TXN *txnid, +	    const char *path) +{ +	DB     *storage    = NULL; +	DBT     key        = {0,}; +	char   *key_string = NULL; +	int32_t ret        = -1; +	int32_t db_flags   = 0; +	uint8_t need_break = 0; +	int32_t retries    = 1; + +	MAKE_KEY_FROM_PATH (key_string, path); + +	LOCK (&bctx->lock); +	{ +		if (bctx->dbp == NULL) { +			bctx->dbp = bdb_db_open (bctx); +			storage = bctx->dbp; +		} else { +			/* we are just fine, lets continue */ +			storage = bctx->dbp; +		} +	} +	UNLOCK (&bctx->lock);   +	 +	GF_VALIDATE_OR_GOTO ("bdb-ll", storage, out); + +	ret = bdb_cache_delete (bctx, key_string); +	GF_VALIDATE_OR_GOTO ("bdb-ll", (ret == 0), out); + +	key.data = key_string; +	key.size = strlen (key_string); +	key.flags = DB_DBT_USERMEM; +     +	do { +		ret = storage->del (storage, txnid, &key, db_flags); +       +		if (ret == DB_NOTFOUND) { +			gf_log ("bdb-ll", +				GF_LOG_DEBUG, +				"failed to delete %s from storage db, doesn't exist in storage DB",  +				path); +			need_break = 1; +		} else if (ret == DB_LOCK_DEADLOCK) { +			retries++; +			gf_log ("bdb-ll", +				GF_LOG_ERROR, +				"deadlock detected in DB->put. 
retrying DB->put (%d)",  +				retries); +		}else if (ret == 0) { +			/* successfully deleted the entry */ +			gf_log ("bdb-ll", +				GF_LOG_DEBUG, +				"deleted %s from storage db", path); +			ret = 0; +			need_break = 1; +		} else { +			gf_log ("bdb-ll", +				GF_LOG_ERROR, +				"failed to delete %s from storage db: %s",  +				path, db_strerror (ret)); +			ret = -1; +			need_break = 1;     +		} +	} while (!need_break); +out: +	return ret; +} + +/* NOTE: bdb version compatibility wrapper */ +int32_t +bdb_cursor_get (DBC *cursorp, +                DBT *key, +                DBT *value, +                int32_t flags) +{ +	int32_t ret = -1; +	 +	GF_VALIDATE_OR_GOTO ("bdb-ll", cursorp, out); + +#ifdef HAVE_BDB_CURSOR_GET +	ret = cursorp->get (cursorp, key, value, flags); +#else +	ret = cursorp->c_get (cursorp, key, value, flags); +#endif +	if ((ret != 0)  && (ret != DB_NOTFOUND)) { +		gf_log ("bdb-ll", +			GF_LOG_ERROR, +			"failed to CURSOR->get() for key %s (%s)", +			(char *)key->data, db_strerror (ret)); +	} + +out: +	return ret; +}/* bdb_cursor_get */ + + +int32_t +bdb_dirent_size (DBT *key) +{ +	return ALIGN (24 /* FIX MEEEE!!! */ + key->size); +} + + +/* bdb_extract_bfd - translate a fd_t to a bfd (either a 'struct bdb_bfd' or 'struct bdb_dir') + * + * @fd->ctx is with bdb specific file handle during a successful bdb_open (also bdb_create) + *  or bdb_opendir. + * + * return: 'struct bdb_bfd *' or 'struct bdb_dir *' on success, or NULL on failure. + */ +inline void * +bdb_extract_bfd (fd_t *fd, +                 xlator_t *this) +{ +	uint64_t tmp_bfd = 0; +	void    *bfd     = NULL; + 	 +	GF_VALIDATE_OR_GOTO ("bdb-ll", fd, out); +	GF_VALIDATE_OR_GOTO ("bdb-ll", this, out); + +	fd_ctx_get (fd, this, &tmp_bfd); +	bfd = (void *)(long)bfd; + +out: +	return bfd; +} + +/* bdb_dbenv_init - initialize DB_ENV + * + *  initialization includes: + *   1. opening DB_ENV (db_env_create(), DB_ENV->open()).  + *      NOTE: see private->envflags for flags used. + *   2. 
DB_ENV->set_lg_dir - set log directory to be used for storing log files  + *     (log files are the files in which transaction logs are written by db). + *   3. DB_ENV->set_flags (DB_LOG_AUTOREMOVE) - set DB_ENV to automatically clear + *      the unwanted log files (flushed at each checkpoint). + *   4. DB_ENV->set_errfile - set errfile to be used by db to report detailed error logs.  + *     used only for debbuging purpose. + * + * return: returns a valid DB_ENV * on success or NULL on error. + * + */ +static DB_ENV * +bdb_dbenv_init (xlator_t *this, +		char *directory) +{ +	/* Create a DB environment */ +	DB_ENV        *dbenv       = NULL; +	int32_t        ret         = 0; +	bdb_private_t *private     = NULL; +	int32_t        fatal_flags = 0; +	 +	VALIDATE_OR_GOTO (this, out); +	VALIDATE_OR_GOTO (directory, out); + +	private = this->private; +	VALIDATE_OR_GOTO (private, out); + +	ret = db_env_create (&dbenv, 0); +	VALIDATE_OR_GOTO ((ret == 0), out); +	 +	/* NOTE: set_errpfx returns 'void' */ +	dbenv->set_errpfx(dbenv, this->name); +	 +	ret = dbenv->set_lk_detect (dbenv, DB_LOCK_DEFAULT); +	VALIDATE_OR_GOTO ((ret == 0), out); + +	ret = dbenv->open(dbenv, directory,  +			  private->envflags,  +			  S_IRUSR | S_IWUSR); +	if ((ret != 0) && (ret != DB_RUNRECOVERY)) { +		gf_log (this->name,  +			GF_LOG_CRITICAL,  +			"failed to open DB environment (%s)",  +			db_strerror (ret)); +		dbenv = NULL; +		goto out; +	} else if (ret == DB_RUNRECOVERY) { +		fatal_flags = ((private->envflags & (~DB_RECOVER)) | DB_RECOVER_FATAL); +		ret = dbenv->open(dbenv, directory,  +				  fatal_flags,  +				  S_IRUSR | S_IWUSR); +		if (ret != 0) { +			gf_log (this->name,  +				GF_LOG_ERROR, +				"failed to open DB environment (%s) with DB_REOVER_FATAL", +				db_strerror (ret)); +			dbenv = NULL; +			goto out; +		} else { +			gf_log (this->name,  +				GF_LOG_WARNING, +				"opened DB environment after DB_RECOVER_FATAL: %s",  +				db_strerror (ret)); +		} +	} else { +		gf_log (this->name,  +			
GF_LOG_DEBUG, +			"DB environment successfull opened: %s",  +			db_strerror (ret)); +	} + +         + +#if (DB_VERSION_MAJOR == 4 &&			\ +     DB_VERSION_MINOR == 7) +	if (private->log_auto_remove) { +		ret = dbenv->log_set_config (dbenv, DB_LOG_AUTO_REMOVE, 1); +	} else { +		ret = dbenv->log_set_config (dbenv, DB_LOG_AUTO_REMOVE, 0); +	} +#else +	if (private->log_auto_remove) { +		ret = dbenv->set_flags (dbenv, DB_LOG_AUTOREMOVE, 1); +	} else { +		ret = dbenv->set_flags (dbenv, DB_LOG_AUTOREMOVE, 0); +	}          +#endif +	if (ret != 0) { +		gf_log ("bctx", +			GF_LOG_ERROR, +			"failed to set DB_LOG_AUTOREMOVE on dbenv: %s", db_strerror (ret)); +	} else { +		gf_log ("bctx", +			GF_LOG_DEBUG, +			"DB_LOG_AUTOREMOVE set on dbenv"); +	} + +	if (private->transaction) { +		ret = dbenv->set_flags(dbenv, DB_AUTO_COMMIT, 1); +           +		if (ret != 0) { +			gf_log ("bctx", +				GF_LOG_ERROR, +				"failed to set DB_AUTO_COMMIT on dbenv: %s",  +				db_strerror (ret)); +		} else { +			gf_log ("bctx", +				GF_LOG_DEBUG, +				"DB_AUTO_COMMIT set on dbenv"); +		} +           +		if (private->txn_timeout) { +			ret = dbenv->set_timeout (dbenv,  +						  private->txn_timeout,  +						  DB_SET_TXN_TIMEOUT); +			if (ret != 0) { +				gf_log ("bctx", +					GF_LOG_ERROR, +					"failed to set TXN_TIMEOUT to %d milliseconds " +					"on dbenv: %s", 					 +					private->txn_timeout, db_strerror (ret)); +			} else { +				gf_log ("bctx", +					GF_LOG_DEBUG, +					"TXN_TIMEOUT set to %d milliseconds",  +					private->txn_timeout); +			} +		} + +		if (private->lock_timeout) { +			ret = dbenv->set_timeout(dbenv,  +						 private->txn_timeout,  +						 DB_SET_LOCK_TIMEOUT); +             +			if (ret != 0) { +				gf_log ("bctx", +					GF_LOG_ERROR, +					"failed to set LOCK_TIMEOUT to %d milliseconds " +					"on dbenv: %s",  +					private->lock_timeout, db_strerror (ret)); +			} else { +				gf_log ("bctx", +					GF_LOG_DEBUG, +					"LOCK_TIMEOUT set to %d milliseconds",  +					
private->lock_timeout); +			} +		} + +		ret = dbenv->set_lg_dir (dbenv, private->logdir); +		 +		if (ret != 0) { +			gf_log ("bctx", +				GF_LOG_ERROR, +				"failed to set log directory for dbenv: %s", db_strerror (ret)); +		} else { +			gf_log ("bctx", +				GF_LOG_DEBUG, +				"set dbenv log dir to %s", private->logdir); +		} +		 +	} +         +	if (private->errfile) { +		private->errfp = fopen (private->errfile, "a+"); +		if (private->errfp) { +			dbenv->set_errfile (dbenv, private->errfp); +		} else { +			gf_log ("bctx", +				GF_LOG_ERROR, +				"failed to open errfile: %s", strerror (errno)); +		} +	} + +out: +	return dbenv; +} + +#define BDB_ENV(this) ((((struct bdb_private *)this->private)->b_table)->dbenv) + +/* bdb_checkpoint - during transactional usage, db does not directly write the data to db + *                  files, instead db writes a 'log' (similar to a journal entry) into a + *                  log file. db normally clears the log files during opening of an + *                  environment. since we expect a filesystem server to run for a pretty + *                  long duration and flushing 'log's during dbenv->open would prove very  + *                  costly, if we accumulate the log entries for one complete run of  + *                  glusterfs server. to flush the logs frequently, db provides a mechanism + *                  called 'checkpointing'. when we do a checkpoint, db flushes the logs to + *                  disk (writes changes to db files) and we can also clear the accumulated  + *                  log files after checkpointing. NOTE: removing unwanted log files is not + *                  part of dbenv->txn_checkpoint() call.  + * + * @data: xlator_t of the current instance of bdb xlator. + * + *  bdb_checkpoint is called in a different thread from the main glusterfs thread. bdb  + *  xlator creates the checkpoint thread after successfully opening the db environment.  
+ *  NOTE: bdb_checkpoint thread shares the DB_ENV handle with the filesystem thread. + * + *  db environment checkpointing frequency is controlled by  + *  'option checkpoint-timeout <time-in-seconds>' in volfile. + * + * NOTE: checkpointing thread is started only if 'option transaction on' specified in + *      volfile. checkpointing is not valid for non-transactional environments. + * + */ +static void * +bdb_checkpoint (void *data) +{ +	xlator_t *this = NULL; +	struct bdb_private *private = NULL; +	DB_ENV *dbenv = NULL; +	int32_t ret = 0; +	uint32_t active = 0; + +	this = (xlator_t *) data; +	dbenv = BDB_ENV(this); +	private = this->private; + +	for (;;sleep (private->checkpoint_timeout)) { +		LOCK (&private->active_lock); +		active = private->active; +		UNLOCK (&private->active_lock); +  +		if (active) { +			ret = dbenv->txn_checkpoint (dbenv, 1024, 0, 0); +			if (ret) { +				gf_log ("bctx", +					GF_LOG_ERROR, +					"failed to checkpoint environment: %s", db_strerror (ret)); +			} else { +				gf_log ("bctx", +					GF_LOG_DEBUG, +					"checkpointing successful"); +			}  +		} else { +			ret = dbenv->txn_checkpoint (dbenv, 1024, 0, 0); +			if (ret) { +				gf_log ("bctx", +					GF_LOG_ERROR, +					"failed to do final checkpoint environment: %s",  +					db_strerror (ret)); +			} else { +				gf_log ("bctx", +					GF_LOG_DEBUG, +					"final checkpointing successful"); +			} +			break; +		} +	} + +	return NULL; +} + +static inline void +BDB_CACHE_INIT (xlator_t *this, +		dict_t *options,  +		struct bdb_private *private) +{ +	/* cache is always on */ +	private->cache = ON; +} + +static inline void +BDB_LOG_REMOVE_INIT(xlator_t *this, +		    dict_t *options, +		    struct bdb_private *private) +{ +	private->log_auto_remove = 1; +	gf_log (this->name, +		GF_LOG_DEBUG, +		"DB_ENV will use DB_LOG_AUTO_REMOVE"); +} + +static inline void +BDB_ERRFILE_INIT (xlator_t *this, +		  dict_t *options, +		  struct bdb_private *private) +{ +	data_t *errfile = NULL; + +	errfile = 
dict_get (options, "errfile"); +	if (errfile) { +		private->errfile = strdup (errfile->data); +		gf_log (this->name, +			GF_LOG_DEBUG, +			"using errfile: %s", private->errfile); +	}  +} + +static inline void +BDB_TABLE_INIT (xlator_t *this, +		dict_t *options, +		struct bdb_private *private) +{ +	bctx_table_t *table = NULL; +	int32_t        idx                = 0; +   +	data_t        *lru_limit          = NULL; +	data_t        *page_size          = NULL; +	 +	table = CALLOC (1, sizeof (*table)); +	if (table) { +		INIT_LIST_HEAD(&(table->b_lru)); +		INIT_LIST_HEAD(&(table->active)); +		INIT_LIST_HEAD(&(table->purge)); + +		LOCK_INIT (&table->lock); +		LOCK_INIT (&table->checkpoint_lock); +       +		table->transaction = private->transaction; +		table->access_mode = private->access_mode; +		table->dbflags = private->dbflags; +		table->this    = this; + +		{ +			lru_limit = dict_get (options, "lru-limit"); +         +			/* TODO: set max lockers and max txns to accomodate  +			 * for more than lru_limit */ +			if (lru_limit) { +				table->lru_limit = strtol (lru_limit->data, NULL, 0); +				gf_log ("bdb-ll", +					GF_LOG_DEBUG, +					"setting bctx lru limit to %d", table->lru_limit); +			} else { +				table->lru_limit = BDB_DEFAULT_LRU_LIMIT; +			} +		} + +		{ +			page_size = dict_get (options, "page-size"); +         +			if (page_size) +			{ +				if (gf_string2bytesize (page_size->data,  +							&table->page_size) != 0) { +					gf_log ("bdb-ll",  +						GF_LOG_ERROR,  +						"invalid number format \"%s\"" +						" of \"option page-size\"",  +						page_size->data); +				} +             +				if (!(table->page_size >= BDB_LL_PAGE_SIZE_MIN &&  +				      table->page_size <= BDB_LL_PAGE_SIZE_MAX)) { +					gf_log ("bdb-ll",  +						GF_LOG_ERROR,  +						"pagesize %s is out of range." 
+						"Allowed pagesize is between %d and %d",  +						page_size->data,  +						BDB_LL_PAGE_SIZE_MIN,  +						BDB_LL_PAGE_SIZE_MAX); +				} +			} +			else { +				table->page_size = BDB_LL_PAGE_SIZE_DEFAULT; +			} +			gf_log ("bdb-ll",  +				GF_LOG_DEBUG, "using page-size %"PRIu64,  +				table->page_size); +		} +       +		table->hash_size = BDB_DEFAULT_HASH_SIZE; +		table->b_hash = CALLOC (BDB_DEFAULT_HASH_SIZE, sizeof (struct list_head)); + +		for (idx = 0; idx < table->hash_size; idx++) +			INIT_LIST_HEAD(&(table->b_hash[idx])); + +		private->b_table = table; +	} else { +		gf_log ("bdb-ll", +			GF_LOG_CRITICAL, +			"failed to allocate bctx table: out of memory"); +	} +} + +static inline void  +BDB_DIRECTORY_INIT (xlator_t *this,  +		    dict_t *options,  +		    struct bdb_private *private) +{ +	data_t *directory = NULL; +	data_t *logdir = NULL; +	int32_t op_ret = -1; +	struct stat stbuf = {0}; + +	directory = dict_get (options, "directory"); +     +	if (directory) { +		logdir = dict_get (options, "logdir"); +         +		if (logdir == NULL) { +			gf_log ("bdb-ll", +				GF_LOG_DEBUG, +				"using default logdir as database home"); +			private->logdir = strdup (directory->data); +					 +		} else { +			private->logdir = strdup (logdir->data); +			gf_log ("bdb-ll", +				GF_LOG_DEBUG, +				"using logdir: %s", private->logdir); +			umask (000); +			if (mkdir (private->logdir, 0777) == 0) { +				gf_log ("bdb-ll", GF_LOG_WARNING, +					"logdir specified (%s) not exists, created",  +					private->logdir); +			} +             +			op_ret = stat (private->logdir, &stbuf); +			if ((op_ret != 0) || !S_ISDIR (stbuf.st_mode)) { +				gf_log ("bdb-ll",  +					GF_LOG_ERROR,  +					"specified logdir doesn't exist, " +					"using default (environment home directory: %s)",  +					directory->data); +				private->logdir = strdup (directory->data); +			} +		} + +		private->b_table->dbenv = bdb_dbenv_init (this, directory->data); +       +		if (!private->b_table->dbenv) { +			gf_log 
("bdb-ll", GF_LOG_ERROR, +				"failed to initialize db environment"); +			FREE (private); +			op_ret = -1; +		} else { +			if (private->transaction) { +				/* all well, start the checkpointing thread */ +				LOCK_INIT (&private->active_lock); +				 +				LOCK (&private->active_lock); +				private->active = 1; +				UNLOCK (&private->active_lock); +				pthread_create (&private->checkpoint_thread, NULL, +						bdb_checkpoint, this); +			} +		} +	} +} + +static inline void +BDB_DIR_MODE_INIT (xlator_t *this, +		   dict_t *options,  +		   struct bdb_private *private) +{ +	data_t *dir_mode = NULL; +	char *endptr = NULL; + +	dir_mode = dict_get (options, "dir-mode"); + +	if (dir_mode) { +		private->dir_mode = strtol (dir_mode->data, &endptr, 8); +		if ((*endptr) ||  +		    (!IS_VALID_FILE_MODE(private->dir_mode))) { +			gf_log (this->name, +				GF_LOG_DEBUG, +				"invalid dir-mode %o. setting to default %o",  +				private->dir_mode,  +				DEFAULT_DIR_MODE); +			private->dir_mode = DEFAULT_DIR_MODE; +		} else { +			gf_log (this->name, +				GF_LOG_DEBUG, +				"setting dir-mode to %o", private->dir_mode); +			private->dir_mode = private->dir_mode; +		} +	} else { +		private->dir_mode = DEFAULT_DIR_MODE; +	} +     +	private->dir_mode = private->dir_mode | S_IFDIR; +} + +static inline void +BDB_FILE_MODE_INIT (xlator_t *this, +		    dict_t *options,  +		    struct bdb_private *private) +{ +	data_t *file_mode = NULL; +	char *endptr = NULL; + +	file_mode = dict_get (options, "file-mode"); + +	if (file_mode) { +		private->file_mode = strtol (file_mode->data, &endptr, 8); + +		if ((*endptr) ||  +		    (!IS_VALID_FILE_MODE(private->file_mode))) { +			gf_log (this->name, +				GF_LOG_DEBUG, +				"invalid file-mode %o. 
setting to default %o",  +				private->file_mode,  +				DEFAULT_FILE_MODE); +			private->file_mode = DEFAULT_FILE_MODE; +		} else { +			gf_log (this->name, +				GF_LOG_DEBUG, +				"setting file-mode to %o", private->file_mode); +			private->file_mode = private->file_mode; +		} +	} else { +		private->file_mode = DEFAULT_FILE_MODE; +	} +     +	private->symlink_mode = private->file_mode | S_IFLNK; +	private->file_mode = private->file_mode | S_IFREG; +} + +static inline void +BDB_CHECKPOINT_TIMEOUT_INIT (xlator_t *this, +			     dict_t *options,  +			     struct bdb_private *private) +{ +	data_t        *checkpoint_timeout = NULL; + +	checkpoint_timeout = dict_get (options, "checkpoint-timeout"); +     +	private->checkpoint_timeout = BDB_DEFAULT_CHECKPOINT_TIMEOUT; + +	if (checkpoint_timeout) { +		private->checkpoint_timeout = strtol (checkpoint_timeout->data, NULL, 0); +       +		if (private->checkpoint_timeout < 5 || private->checkpoint_timeout > 60) { +			gf_log (this->name, +				GF_LOG_WARNING, +				"checkpoint-timeout %d seconds too %s",  +				private->checkpoint_timeout,  +				(private->checkpoint_timeout < 5)?"low":"high"); +		} else { +			gf_log (this->name, +				GF_LOG_DEBUG, +				"setting checkpoint-timeout to %d seconds",  +				private->checkpoint_timeout); +		} +	} else { +		gf_log (this->name, +			GF_LOG_DEBUG, +			"setting checkpoint-timeout to default: %d seconds",  +			private->checkpoint_timeout); +	} +} + +static inline void +BDB_LOCK_TIMEOUT_INIT (xlator_t *this, +		       dict_t *options,  +		       struct bdb_private *private) +{ +	data_t        *lock_timeout       = NULL; + +	lock_timeout = dict_get (options, "lock-timeout"); +     +	if (lock_timeout) { +		private->lock_timeout = strtol (lock_timeout->data, NULL, 0); +       +		if (private->lock_timeout > 4260000) { +			/* db allows us to DB_SET_LOCK_TIMEOUT to be set to a +			 * maximum of 71 mins (4260000 milliseconds) */ +			gf_log (this->name, +				GF_LOG_DEBUG, +				"lock-timeout %d, out of 
range", +				private->lock_timeout); +			private->lock_timeout = 0; +		} else { +			gf_log (this->name, +				GF_LOG_DEBUG, +				"setting lock-timeout to %d milliseconds",  +				private->lock_timeout); +		} +	} +} + +static inline void +BDB_TRANSACTION_TIMEOUT_INIT (xlator_t *this, +			      dict_t *options,  +			      struct bdb_private *private) +{ +	data_t *txn_timeout = NULL; +	txn_timeout = dict_get (options, "transaction-timeout"); +     +	if (txn_timeout) { +		private->txn_timeout = strtol (txn_timeout->data, NULL, 0); +       +		if (private->txn_timeout > 4260000) { +			/* db allows us to DB_SET_TXN_TIMEOUT to be set to a maximum +			 * of 71 mins (4260000 milliseconds) */ +			gf_log (this->name, +				GF_LOG_DEBUG, +				"transaction-timeout %d, out of range", +				private->txn_timeout); +			private->txn_timeout = 0; +		} else { +			gf_log (this->name, +				GF_LOG_DEBUG, +				"setting transaction-timeout to %d milliseconds",  +				private->txn_timeout); +		} +	} +} + +static inline void +BDB_TRANSACTION_INIT (xlator_t *this, +		      dict_t *options,  +		      struct bdb_private *private) +{ +	data_t *mode = NULL; + +	mode = dict_get (options, "mode"); +     +	if (mode && !strcmp (mode->data, "off")) { +		gf_log (this->name, +			GF_LOG_DEBUG, +			"cache mode selected"); +		private->envflags = DB_CREATE | DB_INIT_LOG |  +			DB_INIT_MPOOL | DB_THREAD; +		private->dbflags = DB_CREATE | DB_THREAD; +		private->transaction = OFF; +	} else { +		gf_log (this->name, +			GF_LOG_DEBUG, +			"persistant mode selected"); +		private->transaction = ON; +		private->envflags = DB_CREATE | DB_INIT_LOCK | DB_INIT_LOG |  +			DB_INIT_MPOOL | DB_INIT_TXN | DB_RECOVER | DB_THREAD; +		private->dbflags = DB_CREATE | DB_THREAD; +	} +} + +static inline void +BDB_ACCESS_MODE_INIT (xlator_t *this, +		      dict_t *options,  +		      struct bdb_private *private) +{ +	data_t *access_mode = NULL; +	 +	access_mode = dict_get (options, "access-mode"); +     +	if (access_mode && !strcmp 
(access_mode->data, "btree")) { +		gf_log (this->name, +			GF_LOG_DEBUG, +			"using access mode BTREE"); +		private->access_mode = DB_BTREE; +	} else { +		gf_log (this->name, +			GF_LOG_DEBUG, +			"using access mode HASH"); +		private->access_mode = DB_HASH; +	} +} + + +/* bdb_db_init - initialize bdb xlator + *  + * reads the options from @options dictionary and sets appropriate values in @this->private. + * also initializes DB_ENV. + * + * return: 0 on success or -1 on error (with logging the error through gf_log()). + */ +int +bdb_db_init (xlator_t *this, +             dict_t *options) +{ +	/* create a db entry for root */ +	int32_t        op_ret             = 0; +	bdb_private_t *private            = NULL; +   +	private = this->private; + +	BDB_CACHE_INIT (this, options, private); +	 +	BDB_ACCESS_MODE_INIT (this, options, private); + +	BDB_TRANSACTION_INIT (this, options, private); + +	BDB_TRANSACTION_TIMEOUT_INIT (this, options, private); + +	BDB_LOCK_TIMEOUT_INIT (this, options, private); + +	{ +		LOCK_INIT (&private->ino_lock); +		private->next_ino = 2; +	} +	 +	BDB_CHECKPOINT_TIMEOUT_INIT (this, options, private); +	 +	BDB_FILE_MODE_INIT (this, options, private); + +	BDB_DIR_MODE_INIT (this, options, private); + +	BDB_TABLE_INIT (this, options, private); + +	BDB_ERRFILE_INIT (this, options, private); + +	BDB_LOG_REMOVE_INIT (this, options, private); + +	BDB_DIRECTORY_INIT (this, options, private); + +	return op_ret; +} diff --git a/xlators/storage/bdb/src/bdb.c b/xlators/storage/bdb/src/bdb.c new file mode 100644 index 00000000000..e820e867a94 --- /dev/null +++ b/xlators/storage/bdb/src/bdb.c @@ -0,0 +1,3371 @@ +/* +   Copyright (c) 2008, 2009 Z RESEARCH, Inc. <http://www.zresearch.com> +   This file is part of GlusterFS. 
+ +   GlusterFS is free software; you can redistribute it and/or modify +   it under the terms of the GNU General Public License as published +   by the Free Software Foundation; either version 3 of the License, +   or (at your option) any later version. + +   GlusterFS is distributed in the hope that it will be useful, but +   WITHOUT ANY WARRANTY; without even the implied warranty of +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU +   General Public License for more details. + +   You should have received a copy of the GNU General Public License +   along with this program.  If not, see +   <http://www.gnu.org/licenses/>. +*/ + +/* bdb based storage translator - named as 'bdb' translator + *  + *  + * There can be only two modes for files existing on bdb translator: + * 1. DIRECTORY - directories are stored by bdb as regular directories on background  + * file-system. directories also have an entry in the ns_db.db of their parent directory. + * 2. REGULAR FILE - regular files are stored as records in the storage_db.db present in + * the directory. regular files also have an entry in ns_db.db + * + * Internally bdb has a maximum of three different types of logical files associated with + * each directory: + * 1. storage_db.db - storage database, used to store the data corresponding to regular + *                   files in the form of key/value pair. file-name is the 'key' and data + *                   is 'value'. + * 2. directory (all subdirectories) - any subdirectory will have a regular directory entry. 
+ */ +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#define __XOPEN_SOURCE 500 + +#include <stdint.h> +#include <sys/time.h> +#include <errno.h> +#include <ftw.h> +#include <libgen.h> + +#include "glusterfs.h" +#include "dict.h" +#include "logging.h" +#include "bdb.h" +#include "xlator.h" +#include "defaults.h" +#include "common-utils.h" + +/* to be used only by fops, nobody else */ +#define BDB_ENV(this) ((((struct bdb_private *)this->private)->b_table)->dbenv) +#define B_TABLE(this) (((struct bdb_private *)this->private)->b_table) + + +int32_t  +bdb_mknod (call_frame_t *frame, +           xlator_t *this, +           loc_t *loc, +           mode_t mode, +           dev_t dev) +{ +	int32_t     op_ret     = -1; +	int32_t     op_errno   = EINVAL; +	char       *key_string = NULL; /* after translating loc->path to DB key */ +	char       *db_path    = NULL; +	bctx_t     *bctx       = NULL; +	struct stat stbuf      = {0,}; + + +	GF_VALIDATE_OR_GOTO ("bdb", frame, out); +	GF_VALIDATE_OR_GOTO ("bdb", this, out); +	GF_VALIDATE_OR_GOTO (this->name, loc, out); + +	if (!S_ISREG(mode)) { +		gf_log (this->name, +			GF_LOG_DEBUG, +			"mknod for non-regular file"); +		op_ret = -1; +		op_errno = EPERM; +		goto out; +	} /* if(!S_ISREG(mode)) */ +   +	bctx = bctx_parent (B_TABLE(this), loc->path); +   +	if (bctx == NULL) { +		gf_log (this->name, +			GF_LOG_ERROR, +			"failed to get bctx for path: %s", loc->path); +		op_ret = -1; +		op_errno = ENOENT; +		goto out; +	} /* if(bctx == NULL) */ + +	MAKE_REAL_PATH_TO_STORAGE_DB (db_path, this, bctx->directory); +	 +	op_ret = lstat (db_path, &stbuf);				 +	op_errno = errno;				 +	if (op_ret != 0) {				 +		gf_log (this->name, GF_LOG_ERROR,	 +			"failed to lstat on %s (%s)",	 +			db_path, strerror (op_errno));		 +		goto out;				 +	}						 + +	MAKE_KEY_FROM_PATH (key_string, loc->path); +	op_ret = bdb_db_put (bctx, NULL, key_string, NULL, 0, 0, 0); +	if (op_ret > 0) { +		/* create successful */ +		stbuf.st_ino = 
bdb_inode_transform (stbuf.st_ino, bctx); +		stbuf.st_mode  = mode; +		stbuf.st_size = 0; +		stbuf.st_blocks = BDB_COUNT_BLOCKS (stbuf.st_size, stbuf.st_blksize); +	} else { +		gf_log (this->name, +			GF_LOG_ERROR, +			"bdb_db_get() failed for path: %s", loc->path); +		op_ret = -1; +		op_errno = ENOENT; +	}/* if (!op_ret)...else */ + +out: +	if (bctx) { +		/* NOTE: bctx_unref always returns success,  +		 * see description of bctx_unref for more details */ +		bctx_unref (bctx); +	} + +	frame->root->rsp_refs = NULL;   +   +	STACK_UNWIND (frame, op_ret, op_errno, loc->inode, &stbuf); +	return 0; +} + +static inline int32_t +is_dir_empty (xlator_t *this, +              loc_t *loc) +{ +	int32_t        ret       = 1; +	bctx_t        *bctx      = NULL; +	DIR           *dir       = NULL; +	char          *real_path = NULL; +	void          *dbstat    = NULL; +	struct dirent *entry     = NULL; + +	GF_VALIDATE_OR_GOTO ("bdb", this, out); +	GF_VALIDATE_OR_GOTO (this->name, loc, out); +     +	bctx = bctx_lookup (B_TABLE(this), loc->path); +	if (bctx == NULL) { +		gf_log (this->name, +			GF_LOG_DEBUG,  +			"failed to get bctx from inode for dir: %s," +			"assuming empty directory", +			loc->path); +		ret = 1; +		goto out; +	} + +	dbstat = bdb_db_stat (bctx, NULL, 0); +	if (dbstat) { +		switch (bctx->table->access_mode) +		{ +		case DB_HASH: +			ret = (((DB_HASH_STAT *)dbstat)->hash_nkeys == 0); +			break; +		case DB_BTREE: +		case DB_RECNO: +			ret = (((DB_BTREE_STAT *)dbstat)->bt_nkeys == 0); +			break; +		case DB_QUEUE: +			ret = (((DB_QUEUE_STAT *)dbstat)->qs_nkeys == 0); +			break; +		case DB_UNKNOWN: +			gf_log (this->name, +				GF_LOG_CRITICAL, +				"unknown access-mode set for db"); +			ret = 0; +		} +	} else { +		gf_log (this->name, +			GF_LOG_ERROR, +			"failed to get db stat for db at path: %s", loc->path); +		ret = 1; +		goto out; +	} +	 +	MAKE_REAL_PATH (real_path, this, loc->path); +	dir = opendir (real_path); +	if (dir == NULL) { +		gf_log (this->name, +			
/* bdb_rename - rename fop; behaviour depends on the type recorded in
 * oldloc->inode->st_mode:
 *
 *  - regular file: the record is read from the old parent's db, deleted
 *    there and written under the new key into the new parent's db, all
 *    inside one transaction. an existing non-directory entry at the
 *    destination path on the backend is unlink()ed first.
 *  - symlink: a plain rename() on the backend; when the destination does
 *    not exist on the backend, a db record with the new key is deleted
 *    first (failure ignored).
 *  - directory: the directory's db is moved aside under a temporary name,
 *    the directory is rename()d and the db is moved to its new place.
 *
 * unwinds with op_ret, op_errno and &stbuf (the lstat() result of the
 * destination path, where one was taken).
 *
 * NOTE(review): old_stbuf is never filled in within this function, so the
 * `old_stbuf.st_nlink == 2` test below looks like it can never hold and the
 * directory branch appears unreachable - confirm.
 * NOTE(review): several `goto out` paths leave with oldbctx/newbctx still
 * referenced, and buf and the tempnam() results are never released here -
 * confirm ownership against bctx_unref()/bdb_db_get().
 */
int32_t
bdb_rename (call_frame_t *frame,
            xlator_t *this,
            loc_t *oldloc,
            loc_t *newloc)
{
	struct bdb_private *private      = NULL;
	bctx_table_t       *table        = NULL;
	bctx_t             *oldbctx      = NULL;
	bctx_t             *newbctx      = NULL;
	bctx_t             *tmpbctx      = NULL;
	int32_t             op_ret       = -1;
	int32_t             op_errno     = ENOENT;
	int32_t             read_size    = 0;
	struct stat         stbuf        = {0,};
	struct stat         old_stbuf    = {0,};
	DB_TXN             *txnid        = NULL;
	char               *real_newpath = NULL;
	char               *real_oldpath = NULL;
	char               *oldkey       = NULL;
	char               *newkey       = NULL;
	char               *buf          = NULL; /* pointer to temporary buffer, where
						  * the contents of a file are read, if
						  * file being renamed is a regular file */
	char               *real_db_newpath = NULL;
	char               *tmp_db_newpath  = NULL;

	GF_VALIDATE_OR_GOTO ("bdb", frame, out);
	GF_VALIDATE_OR_GOTO ("bdb", this, out);
	GF_VALIDATE_OR_GOTO (this->name, newloc, out);
	GF_VALIDATE_OR_GOTO (this->name, oldloc, out);

	private = this->private;
	table = private->b_table;

	MAKE_REAL_PATH (real_oldpath, this, oldloc->path);

	if (S_ISREG (oldloc->inode->st_mode)) {
		/* NOTE(review): oldbctx is used below without a NULL check */
		oldbctx = bctx_parent (B_TABLE(this), oldloc->path);
		MAKE_REAL_PATH (real_newpath, this, newloc->path);

		op_ret = lstat (real_newpath, &stbuf);

		if ((op_ret == 0) && (S_ISDIR (stbuf.st_mode))) {
			op_ret = -1;
			op_errno = EISDIR;
			goto out;
		}
		if (op_ret == 0) {
			/* destination is a symlink */
			MAKE_KEY_FROM_PATH (oldkey, oldloc->path);
			MAKE_KEY_FROM_PATH (newkey, newloc->path);

			op_ret = unlink (real_newpath);
			op_errno = errno;
			if (op_ret != 0) {
				gf_log (this->name,
					GF_LOG_ERROR,
					"failed to unlink %s (%s)",
					newloc->path, strerror (op_errno));
				goto out;
			}
			newbctx = bctx_parent (B_TABLE (this), newloc->path);
			GF_VALIDATE_OR_GOTO (this->name, newbctx, out);

			op_ret = bdb_txn_begin (BDB_ENV(this), &txnid);

			/* get -> del -> put; the first failing step aborts
			 * the transaction.
			 * NOTE(review): when bdb_db_get() fails, op_ret still
			 * holds the bdb_txn_begin() result - confirm this is
			 * not unwound as success */
			if ((read_size =
			     bdb_db_get (oldbctx, txnid, oldkey, &buf, 0, 0)) < 0) {
				bdb_txn_abort (txnid);
			} else if ((op_ret =
				    bdb_db_del (oldbctx, txnid, oldkey)) != 0) {
				bdb_txn_abort (txnid);
			} else if ((op_ret = bdb_db_put (newbctx, txnid,
							 newkey, buf,
							 read_size, 0, 0)) != 0) {
				bdb_txn_abort (txnid);
			} else {
				bdb_txn_commit (txnid);
			}

			/* NOTE: bctx_unref always returns success,
			 * see description of bctx_unref for more details */
			bctx_unref (newbctx);
		} else {
			/* destination doesn't exist or a regular file */
			MAKE_KEY_FROM_PATH (oldkey, oldloc->path);
			MAKE_KEY_FROM_PATH (newkey, newloc->path);

			newbctx = bctx_parent (B_TABLE (this), newloc->path);
			GF_VALIDATE_OR_GOTO (this->name, newbctx, out);

			op_ret = bdb_txn_begin (BDB_ENV(this), &txnid);

			/* same get -> del -> put sequence as above */
			if ((read_size = bdb_db_get (oldbctx, txnid,
						     oldkey, &buf,
						     0, 0)) < 0) {
				bdb_txn_abort (txnid);
			} else if ((op_ret = bdb_db_del (oldbctx,
							 txnid, oldkey)) != 0) {
				bdb_txn_abort (txnid);
			} else if ((op_ret = bdb_db_put (newbctx, txnid,
							 newkey, buf,
							 read_size, 0, 0)) != 0) {
				bdb_txn_abort (txnid);
			} else {
				bdb_txn_commit (txnid);
			}

			/* NOTE: bctx_unref always returns success,
			 * see description of bctx_unref for more details */
			bctx_unref (newbctx);
		}
		/* NOTE: bctx_unref always returns success,
		 * see description of bctx_unref for more details */
		bctx_unref (oldbctx);
	} else if (S_ISLNK (oldloc->inode->st_mode)) {
		MAKE_REAL_PATH (real_newpath, this, newloc->path);
		op_ret = lstat (real_newpath, &stbuf);
		if ((op_ret == 0) && (S_ISDIR (stbuf.st_mode))) {
			op_ret = -1;
			op_errno = EISDIR;
			goto out;
		}

		if (op_ret == 0){
			/* destination exists and is also a symlink */
			MAKE_REAL_PATH (real_oldpath, this, oldloc->path);
			op_ret = rename (real_oldpath, real_newpath);
			op_errno = errno;

			if (op_ret != 0) {
				gf_log (this->name,
					GF_LOG_ERROR,
					"failed to rename symlink %s (%s)",
					oldloc->path, strerror (op_errno));
			}
			goto out;
		}

		/* destination doesn't exist */
		MAKE_REAL_PATH (real_oldpath, this, oldloc->path);
		MAKE_KEY_FROM_PATH (newkey, newloc->path);
		newbctx = bctx_parent (B_TABLE (this), newloc->path);
		GF_VALIDATE_OR_GOTO (this->name, newbctx, out);

		/* txnid is still NULL here: the delete runs outside any
		 * explicit transaction */
		op_ret = bdb_db_del (newbctx, txnid, newkey);
		if (op_ret != 0) {
			/* no problem */
		}
		op_ret = rename (real_oldpath, real_newpath);
		op_errno = errno;
		if (op_ret != 0) {
			gf_log (this->name,
				GF_LOG_ERROR,
				"failed to rename %s to %s (%s)",
				oldloc->path, newloc->path, strerror (op_errno));
			goto out;
		}
		/* NOTE: bctx_unref always returns success,
		 * see description of bctx_unref for more details */
		bctx_unref (newbctx);
	} else if (S_ISDIR (oldloc->inode->st_mode) &&
		   (old_stbuf.st_nlink == 2)) {

		tmp_db_newpath = tempnam (private->export_path, "rename_temp");
		GF_VALIDATE_OR_GOTO (this->name, tmp_db_newpath, out);

		MAKE_REAL_PATH (real_newpath, this, newloc->path);

		MAKE_REAL_PATH_TO_STORAGE_DB (real_db_newpath, this, newloc->path);

		oldbctx = bctx_lookup (B_TABLE(this), oldloc->path);
		op_ret = -1;
		op_errno = EINVAL;
		GF_VALIDATE_OR_GOTO (this->name, oldbctx, out);

		op_ret = lstat (real_newpath, &stbuf);
		if ((op_ret == 0) &&
		    S_ISDIR (stbuf.st_mode) &&
		    is_dir_empty (this, newloc)) {

			/* destination is an existing empty directory */
			tmpbctx = bctx_rename (oldbctx, tmp_db_newpath);
			op_ret = -1;
			op_errno = ENOENT;
			GF_VALIDATE_OR_GOTO (this->name, tmpbctx, out);

			op_ret = rename (real_oldpath, real_newpath);
			op_errno = errno;
			if (op_ret != 0) {
				gf_log (this->name,
					GF_LOG_ERROR,
					"rename directory %s to %s failed: %s",
					oldloc->path, newloc->path,
					strerror (errno));
				/* roll the db back to its old name */
				op_ret = bdb_db_rename (table,
							tmp_db_newpath,
							oldbctx->db_path);
				if (op_ret != 0) {
					gf_log (this->name,
						GF_LOG_ERROR,
						"renaming temp database back to old db failed"
						" for directory %s", oldloc->path);
					goto out;
				} else {
					/* this is a error case, set op_errno & op_ret */
					op_ret = -1;
					op_errno = ENOENT; /* TODO: errno */
				}
			}
			/* NOTE(review): this also runs after the rollback
			 * above has succeeded - confirm that is intended */
			op_ret = bdb_db_rename (table, tmp_db_newpath, real_db_newpath);
			if (op_ret != 0) {
				gf_log (this->name,
					GF_LOG_ERROR,
					"renaming temp database to new db failed"
					" for directory %s", oldloc->path);
				goto out;
			}
		} else if ((op_ret != 0) && (errno == ENOENT)) {
			/* destination does not exist.
			 * NOTE(review): tmp_db_newpath is assigned a second
			 * time here; the first tempnam() result is dropped */
			tmp_db_newpath = tempnam (private->export_path, "rename_temp");
			GF_VALIDATE_OR_GOTO (this->name, tmp_db_newpath, out);

			tmpbctx = bctx_rename (oldbctx, tmp_db_newpath);
			op_ret = -1;
			op_errno = ENOENT;
			GF_VALIDATE_OR_GOTO (this->name, tmpbctx, out);

			op_ret = rename (real_oldpath, real_newpath);
			op_errno = errno;
			if (op_ret != 0) {
				gf_log (this->name,
					GF_LOG_ERROR,
					"rename directory %s to %s failed: %s",
					oldloc->path, newloc->path,
					strerror (errno));
				/* roll the db back to its old name */
				op_ret = bdb_db_rename (table,
							tmp_db_newpath,
							oldbctx->db_path);
				if (op_ret != 0) {
					gf_log (this->name,
						GF_LOG_ERROR,
						"renaming temp database back to old db failed"
						" for directory %s", oldloc->path);
					goto out;
				} else {
					/* this is a error case, set op_errno & op_ret */
					op_ret = -1;
					op_errno = ENOENT; /* TODO: errno */
				}
			} else {
				op_ret = bdb_db_rename (table,
							tmp_db_newpath,
							real_db_newpath);
				if (op_ret != 0) {
					gf_log (this->name,
						GF_LOG_ERROR,
						"renaming temp database to new db failed"
						" for directory %s", oldloc->path);
					goto out;
				} else {
					/* NOTE(review): both renames succeeded
					 * on this path, yet it is reported as
					 * a failure - confirm */
					/* this is a error case, set op_errno & op_ret */
					op_ret = -1;
					op_errno = ENOENT; /* TODO: errno */
				}
			}
		}
	} else {
		gf_log (this->name,
			GF_LOG_CRITICAL,
			"rename called on non-existent file type");
		op_ret = -1;
		op_errno = EPERM;
	}

out:
	frame->root->rsp_refs = NULL;
	STACK_UNWIND (frame, op_ret, op_errno, &stbuf);
	return 0;
}

/* bdb_link - hard links are not supported; always unwinds with -1/EPERM. */
int32_t
bdb_link (call_frame_t *frame,
          xlator_t *this,
          loc_t *oldloc,
          loc_t *newloc)
{
	frame->root->rsp_refs = NULL;
	STACK_UNWIND (frame, -1, EPERM, NULL, NULL);
	return 0;
}
%lu\nfrag size: %lu", +			size, stbuf.f_bfree, stbuf.f_bsize, stbuf.f_frsize); +		 +		if (req_blocks < usable_blocks) +			return 1; +		else  +			return 0; +	} +} + +int32_t  +bdb_create (call_frame_t *frame, +            xlator_t *this, +            loc_t *loc, +            int32_t flags, +            mode_t mode, +            fd_t *fd) +{ +	int32_t             op_ret     = -1; +	int32_t             op_errno   = EPERM; +	char               *db_path    = NULL; +	struct stat         stbuf      = {0,}; +	bctx_t             *bctx       = NULL; +	struct bdb_private *private    = NULL;  +	char               *key_string = NULL; +	struct bdb_fd      *bfd        = NULL; +	 +	GF_VALIDATE_OR_GOTO ("bdb", frame, out); +	GF_VALIDATE_OR_GOTO ("bdb", this, out); +	GF_VALIDATE_OR_GOTO (this->name, loc, out); +	GF_VALIDATE_OR_GOTO (this->name, fd, out); + +	private = this->private; + +	bctx = bctx_parent (B_TABLE(this), loc->path); +	op_errno = ENOENT; +	GF_VALIDATE_OR_GOTO (this->name, bctx, out); + +	MAKE_REAL_PATH_TO_STORAGE_DB (db_path, this, bctx->directory); +	op_ret = lstat (db_path, &stbuf);				 +	op_errno = errno;				 +	if (op_ret != 0) {				 +		gf_log (this->name, GF_LOG_ERROR,	 +			"failed to lstat on %s (%s)",	 +			db_path, strerror (op_errno));		 +		goto out;				 +	}						 + +	MAKE_KEY_FROM_PATH (key_string, loc->path); +	op_ret = bdb_db_put (bctx, NULL, key_string, NULL, 0, 0, 0); +	op_errno = EINVAL; +	GF_VALIDATE_OR_GOTO (this->name, (op_ret == 0), out); +	 +        /* create successful */ +	bfd = CALLOC (1, sizeof (*bfd)); +	op_ret = -1; +	op_errno = ENOMEM; +	GF_VALIDATE_OR_GOTO (this->name, bfd, out); +		 +	/* NOTE: bdb_get_bctx_from () returns bctx with a ref */ +	bfd->ctx = bctx;  +	bfd->key = strdup (key_string); +	op_ret = -1; +	op_errno = ENOMEM; +	GF_VALIDATE_OR_GOTO (this->name, bfd->key, out); +	 +	BDB_SET_BFD (this, fd, bfd); +		 +	stbuf.st_ino = bdb_inode_transform (stbuf.st_ino, bctx); +	stbuf.st_mode = private->file_mode; +	stbuf.st_size = 0; +	
stbuf.st_nlink = 1; +	stbuf.st_blocks = BDB_COUNT_BLOCKS (stbuf.st_size, stbuf.st_blksize); +	op_ret = 0; +	op_errno = 0; +out: +	frame->root->rsp_refs = NULL; +	STACK_UNWIND (frame, op_ret, op_errno, fd, loc->inode, &stbuf); + +	return 0; +} + + +/* bdb_open + * + * as input parameters bdb_open gets the file name, i.e key. bdb_open should effectively  + * do: store key, open storage db, store storage-db pointer. + * + */ +int32_t  +bdb_open (call_frame_t *frame, +          xlator_t *this, +          loc_t *loc, +          int32_t flags, +          fd_t *fd) +{ +	int32_t         op_ret     = -1; +	int32_t         op_errno   = EINVAL; +	bctx_t         *bctx       = NULL; +	char           *key_string = NULL; +	struct bdb_fd  *bfd        = NULL; + +	GF_VALIDATE_OR_GOTO ("bdb", frame, out); +	GF_VALIDATE_OR_GOTO ("bdb", this, out); +	GF_VALIDATE_OR_GOTO (this->name, loc, out); +	GF_VALIDATE_OR_GOTO (this->name, fd, out); + +	bctx = bctx_parent (B_TABLE(this), loc->path); +	op_errno = EBADFD; +	GF_VALIDATE_OR_GOTO (this->name, bctx, out); + +	bfd = CALLOC (1, sizeof (*bfd)); +	op_errno = ENOMEM; +	GF_VALIDATE_OR_GOTO (this->name, bfd, out); + +	/* NOTE: bctx_parent () returns bctx with a ref */ +	bfd->ctx = bctx; +       +	MAKE_KEY_FROM_PATH (key_string, loc->path); +	bfd->key = strdup (key_string); +	op_ret = -1; +	op_errno = ENOMEM; +	GF_VALIDATE_OR_GOTO (this->name, bfd->key, out); + +	BDB_SET_BFD (this, fd, bfd); +	op_ret = 0; +out: +	frame->root->rsp_refs = NULL; +	STACK_UNWIND (frame, op_ret, op_errno, fd); + +	return 0; +} + +int32_t  +bdb_readv (call_frame_t *frame, +           xlator_t *this, +           fd_t *fd, +           size_t size, +           off_t offset) +{ +	int32_t        op_ret     = -1; +	int32_t        op_errno   = EINVAL; +	struct iovec   vec        = {0,}; +	struct stat    stbuf      = {0,}; +	struct bdb_fd *bfd        = NULL;   +	dict_t        *reply_dict = NULL; +	char          *buf        = NULL; +	data_t        *buf_data   = NULL; +	char    
      *db_path    = NULL; +	int32_t        read_size  = 0; + +	GF_VALIDATE_OR_GOTO ("bdb", frame, out); +	GF_VALIDATE_OR_GOTO ("bdb", this, out); +	GF_VALIDATE_OR_GOTO (this->name, fd, out); + +	bfd = bdb_extract_bfd (fd, this); +	op_errno = EBADFD; +	GF_VALIDATE_OR_GOTO (this->name, bfd, out); + +	MAKE_REAL_PATH_TO_STORAGE_DB (db_path, this, bfd->ctx->directory); +	op_ret = lstat (db_path, &stbuf);				 +	op_errno = errno;				 +	if (op_ret != 0) {				 +		gf_log (this->name, GF_LOG_ERROR,	 +			"failed to lstat on %s (%s)",	 +			db_path, strerror (op_errno));		 +		goto out;				 +	}						 + +	/* we are ready to go */ +	op_ret = bdb_db_get (bfd->ctx, NULL,  +			     bfd->key, &buf,  +			     size, offset); +	read_size = op_ret; +	if (op_ret == -1) { +		gf_log (this->name, +			GF_LOG_ERROR, +			"failed to do db_storage_get()"); +		op_ret = -1; +		op_errno = ENOENT; +		goto out; +	} else if (op_ret == 0) { +		goto out; +	} + +	buf_data = get_new_data (); +	op_ret = -1; +	op_errno = ENOMEM; +	GF_VALIDATE_OR_GOTO (this->name, buf_data, out); + +	reply_dict = get_new_dict (); +	op_ret = -1; +	op_errno = ENOMEM; +	GF_VALIDATE_OR_GOTO (this->name, reply_dict, out); + +	buf_data->data      = buf; + +	if (size < read_size) { +		op_ret = size; +		read_size = size; +	} + +	buf_data->len       = op_ret; +       +	dict_set (reply_dict, NULL, buf_data); +       +	frame->root->rsp_refs = dict_ref (reply_dict); + +	vec.iov_base = buf; +	vec.iov_len = read_size; +       +	stbuf.st_ino = fd->inode->ino; +	stbuf.st_size = op_ret ;  +	stbuf.st_blocks = BDB_COUNT_BLOCKS (stbuf.st_size, stbuf.st_blksize); +	op_ret = size; +out:   +	STACK_UNWIND (frame, op_ret, op_errno, &vec, 1, &stbuf); + +	if (reply_dict) +		dict_unref (reply_dict); + +	return 0; +} + + +int32_t  +bdb_writev (call_frame_t *frame, +            xlator_t *this, +            fd_t *fd, +            struct iovec *vector, +            int32_t count, +            off_t offset) +{ +	int32_t        op_ret   = -1; +	int32_t        
op_errno = EINVAL; +	struct stat    stbuf    = {0,}; +	struct bdb_fd *bfd      = NULL; +	int32_t        idx      = 0; +	off_t          c_off    = offset; +	int32_t        c_ret    = -1; +	char          *db_path  = NULL; +	size_t         total_size = 0; + +	GF_VALIDATE_OR_GOTO ("bdb", frame, out); +	GF_VALIDATE_OR_GOTO ("bdb", this, out); +	GF_VALIDATE_OR_GOTO (this->name, fd, out); +	GF_VALIDATE_OR_GOTO (this->name, vector, out); + +	bfd = bdb_extract_bfd (fd, this); +	op_errno = EBADFD; +	GF_VALIDATE_OR_GOTO (this->name, bfd, out); + +	MAKE_REAL_PATH_TO_STORAGE_DB (db_path, this, bfd->ctx->directory); +	op_ret = lstat (db_path, &stbuf);				 +	op_errno = errno;				 +	if (op_ret != 0) {				 +		gf_log (this->name, GF_LOG_ERROR,	 +			"failed to lstat on %s (%s)",	 +			db_path, strerror (op_errno));		 +		goto out;				 +	}						 + +  +	for (idx = 0; idx < count; idx++) +		total_size += vector[idx].iov_len; +      +	if (!is_space_left (this, total_size)) { +		gf_log (this->name, +			GF_LOG_ERROR, +			"requested storage for %"GF_PRI_SIZET", ENOSPC", total_size); +		op_ret = -1; +		op_errno = ENOSPC; +		goto out; +	} +  + +	/* we are ready to go */ +	for (idx = 0; idx < count; idx++) { +		c_ret = bdb_db_put (bfd->ctx, NULL,  +				    bfd->key, vector[idx].iov_base,  +				    vector[idx].iov_len, c_off, 0); +		if (c_ret != 0) { +			gf_log (this->name, +				GF_LOG_ERROR, +				"failed to do bdb_db_put at offset: %"PRIu64" for file: %s",  +				c_off, bfd->key); +			break; +		} else { +			c_off += vector[idx].iov_len; +		} +		op_ret += vector[idx].iov_len; +	} /* for(idx=0;...)... 
*/ +     +	if (c_ret) { +		/* write failed */ +		gf_log (this->name, +			GF_LOG_ERROR, +			"failed to do bdb_db_put(): %s",  +			db_strerror (op_ret)); +		op_ret = -1; +		op_errno = EBADFD; /* TODO: search for a more meaningful errno */ +		goto out; +	}  +	/* NOTE: we want to increment stbuf->st_size, as stored in db */ +	stbuf.st_size = op_ret; +	stbuf.st_blocks = BDB_COUNT_BLOCKS (stbuf.st_size, stbuf.st_blksize); +	op_errno = 0; + +out: +	frame->root->rsp_refs = NULL; +	STACK_UNWIND (frame, op_ret, op_errno, &stbuf); +	return 0; +} + +int32_t  +bdb_flush (call_frame_t *frame, +           xlator_t *this, +           fd_t *fd) +{ +	int32_t        op_ret   = -1; +	int32_t        op_errno = EPERM; +	struct bdb_fd *bfd      = NULL; +	 +	GF_VALIDATE_OR_GOTO ("bdb", frame, out); +	GF_VALIDATE_OR_GOTO ("bdb", this, out); +	GF_VALIDATE_OR_GOTO (this->name, fd, out); + +	bfd = bdb_extract_bfd (fd, this); +	op_errno = EBADFD; +	GF_VALIDATE_OR_GOTO (this->name, bfd, out); +	 +        /* do nothing */ +	op_ret = 0; +	op_errno = 0; + +out: +	frame->root->rsp_refs = NULL; +	STACK_UNWIND (frame, op_ret, op_errno); +	return 0; +} + +int32_t  +bdb_release (xlator_t *this, +	     fd_t *fd) +{ +  int32_t op_ret = -1; +  int32_t op_errno = EBADFD; +  struct bdb_fd *bfd = NULL; +   +  if ((bfd = bdb_extract_bfd (fd, this)) == NULL){ +    gf_log (this->name, +	    GF_LOG_ERROR, +	    "failed to extract %s specific information from fd:%p", this->name, fd); +    op_ret = -1; +    op_errno = EBADFD; +  } else { +    bctx_unref (bfd->ctx); +    bfd->ctx = NULL;  +     +    if (bfd->key) +      free (bfd->key); /* we did strdup() in bdb_open() */ +    free (bfd); +    op_ret = 0; +    op_errno = 0; +  } /* if((fd->ctx == NULL)...)...else */ + +  return 0; +}/* bdb_release */ + + +int32_t  +bdb_fsync (call_frame_t *frame, +           xlator_t *this, +           fd_t *fd, +           int32_t datasync) +{ +	frame->root->rsp_refs = NULL; +	STACK_UNWIND (frame, 0, 0); +	return 0; +}/* bdb_fsync 
*/ + +static int gf_bdb_lk_log; + +int32_t  +bdb_lk (call_frame_t *frame, +        xlator_t *this, +        fd_t *fd, +        int32_t cmd, +        struct flock *lock) +{ +	struct flock nullock = {0, }; + +	gf_bdb_lk_log++; +	if (!(gf_bdb_lk_log % GF_UNIVERSAL_ANSWER)) { +		gf_log (this->name, GF_LOG_ERROR,  +			"\"features/posix-locks\" translator is not loaded, you need to use it"); +	} + +	frame->root->rsp_refs = NULL; +	STACK_UNWIND (frame, -1, ENOSYS, &nullock); +	return 0; +}/* bdb_lk */ + +/* bdb_lookup + * + * there are four possibilities for a file being looked up: + *  1. file exists and is a directory. + *  2. file exists and is a symlink. + *  3. file exists and is a regular file. + *  4. file does not exist. + * case 1 and 2 are handled by doing lstat() on the @loc. if the file is a directory or symlink,  + * lstat() succeeds. lookup continues to check if the @loc belongs to case-3 only if lstat() fails. + * to check for case 3, bdb_lookup does a bdb_db_get() for the given @loc. (see description of  + * bdb_db_get() for more details on how @loc is transformed into db handle and key). if check  + * for case 1, 2 and 3 fail, we proceed to conclude that file doesn't exist (case 4). + * + * @frame:      call frame. + * @this:       xlator_t of this instance of bdb xlator. + * @loc:        loc_t specifying the file to operate upon. + * @need_xattr: if need_xattr != 0, we are asked to return all the extended attributed of @loc,  + *             if any exist, in a dictionary. if @loc is a regular file and need_xattr is set, then  + *             we look for value of need_xattr. if need_xattr > sizo-of-the-file @loc, then the file + *             content of @loc is returned in dictionary of xattr with 'glusterfs.content' as + *             dictionary key. + * + * NOTE: bdb currently supports only directories, symlinks and regular files.  
+ * + * NOTE: bdb_lookup returns the 'struct stat' of underlying file itself, in case of directory and  + *      symlink (st_ino is modified as bdb allocates its own set of inodes of all files). for  + *      regular files, bdb uses 'struct stat' of the database file in which the @loc is stored  + *      as templete and modifies st_ino (see bdb_inode_transform for more details), st_mode (can  + *      be set in volfile 'option file-mode <mode>'), st_size (exact size of the @loc + *      contents), st_blocks (block count on the underlying filesystem to accomodate st_size,  + *      see BDB_COUNT_BLOCKS in bdb.h for more details). + */ +int32_t +bdb_lookup (call_frame_t *frame, +            xlator_t *this, +            loc_t *loc, +            dict_t *xattr_req) +{ +	struct stat stbuf           = {0, }; +	int32_t op_ret              = -1; +	int32_t op_errno            = ENOENT; +	dict_t *xattr               = NULL; +	char *pathname              = NULL; +	char *directory             = NULL; +	char *real_path             = NULL; +	bctx_t *bctx                = NULL; +	char *db_path               = NULL; +	struct bdb_private *private = NULL; +	char *key_string            = NULL; +	int32_t entry_size          = 0; +	char *file_content          = NULL; +	data_t *file_content_data   = NULL; +	uint64_t   need_xattr       = 0; + +	GF_VALIDATE_OR_GOTO ("bdb", frame, out); +	GF_VALIDATE_OR_GOTO ("bdb", this, out); +	GF_VALIDATE_OR_GOTO (this->name, loc, out); + +	private = this->private; + +	MAKE_REAL_PATH (real_path, this, loc->path); + +	pathname = strdup (loc->path); +	GF_VALIDATE_OR_GOTO (this->name, pathname, out); + +	directory = dirname (pathname); +	GF_VALIDATE_OR_GOTO (this->name, directory, out); + +	if (!strcmp (directory, loc->path)) { +		/* SPECIAL CASE: looking up root */ +		op_ret = lstat (real_path, &stbuf);				 +		op_errno = errno;				 +		if (op_ret != 0) {				 +			gf_log (this->name, GF_LOG_ERROR,	 +				"failed to lstat on %s (%s)",	 +				real_path, strerror 
(op_errno));		 +			goto out;				 +		}						 + +		/* bctx_lookup() returns NULL only when its time to wind up,  +		 * we should shutdown functioning */ +		bctx = bctx_lookup (B_TABLE(this), (char *)loc->path); +		op_ret = -1; +		op_errno = EINVAL; +		GF_VALIDATE_OR_GOTO (this->name, bctx, out); +		 +		stbuf.st_ino = 1; +		stbuf.st_mode = private->dir_mode; +	} else { +		MAKE_KEY_FROM_PATH (key_string, loc->path); +		op_ret = lstat (real_path, &stbuf); +		if ((op_ret == 0) && (S_ISDIR (stbuf.st_mode))){ +			bctx = bctx_lookup (B_TABLE(this), (char *)loc->path); +			op_ret = -1; +			op_errno = ENOMEM; +			GF_VALIDATE_OR_GOTO (this->name, bctx, out); + +			if (loc->ino) { +				/* revalidating directory inode */ +				gf_log (this->name, +					GF_LOG_DEBUG, +					"revalidating directory %s", (char *)loc->path); +				stbuf.st_ino = loc->ino; +			} else { +				stbuf.st_ino = bdb_inode_transform (stbuf.st_ino, bctx); +			} +			stbuf.st_mode = private->dir_mode; +			op_ret = 0; +			op_errno = 0; +			goto out; +		} else if (op_ret == 0) { +			/* a symlink */ +			gf_log (this->name, +				GF_LOG_DEBUG, +				"lookup called for symlink: %s", loc->path); +			bctx = bctx_parent (B_TABLE(this), loc->path); +			op_ret = -1; +			op_errno = ENOMEM; +			GF_VALIDATE_OR_GOTO (this->name, bctx, out); + +			if (loc->ino) { +				stbuf.st_ino = loc->ino; +			} else { +				stbuf.st_ino = bdb_inode_transform (stbuf.st_ino, bctx); +			} +			stbuf.st_mode = private->symlink_mode; +			op_ret = 0; +			op_errno = 0; +			goto out; +		}  +		 +		/* for regular files */ +		bctx = bctx_parent (B_TABLE(this), loc->path); +		op_ret = -1; +		op_errno = ENOENT; +		GF_VALIDATE_OR_GOTO (this->name, bctx, out); +		 +		if (GF_FILE_CONTENT_REQUESTED(xattr_req, &need_xattr)) { +			entry_size = bdb_db_get (bctx,  +						 NULL,  +						 loc->path,  +						 &file_content,  +						 0, 0); +		} else { +			entry_size = bdb_db_get (bctx,  +						 NULL,  +						 loc->path,  +						 NULL,  +						 0, 0); +		} +		 +		op_ret 
= entry_size; +		op_errno = ENOENT; +		if (op_ret == -1) { +			gf_log (this->name, +				GF_LOG_DEBUG, +				"returning ENOENT for %s", loc->path); +			goto out; +		} + +		MAKE_REAL_PATH_TO_STORAGE_DB (db_path, this, bctx->directory); +		op_ret = lstat (db_path, &stbuf);				 +		op_errno = errno;				 +		if (op_ret != 0) {				 +			gf_log (this->name, GF_LOG_ERROR,	 +				"failed to lstat on %s (%s)",	 +				db_path, strerror (op_errno));		 +			goto out;				 +		}						 +		 +		if ((need_xattr >= entry_size) +		    && (entry_size) && (file_content)) { +			file_content_data = data_from_dynptr (file_content,  +							      entry_size); +			xattr = get_new_dict (); +			dict_set (xattr, "glusterfs.content",  +				  file_content_data); +		} else { +			if (file_content) +				free (file_content); +		} + +		if (loc->ino) { +			/* revalidate */ +			stbuf.st_ino = loc->ino; +			stbuf.st_size = entry_size; +			stbuf.st_blocks = BDB_COUNT_BLOCKS (stbuf.st_size, stbuf.st_blksize); +		} else { +			/* fresh lookup, create an inode number */ +			stbuf.st_ino = bdb_inode_transform (stbuf.st_ino, bctx); +			stbuf.st_size = entry_size; +			stbuf.st_blocks = BDB_COUNT_BLOCKS (stbuf.st_size, stbuf.st_blksize); +		}/* if(inode->ino)...else */ +		stbuf.st_nlink = 1; +		stbuf.st_mode = private->file_mode; +	} +	op_ret = 0; +out:   +	if (bctx) { +		/* NOTE: bctx_unref always returns success,  +		 * see description of bctx_unref for more details */ +		bctx_unref (bctx); +	} + +	if (pathname) +		free (pathname); +   +	if (xattr) +		dict_ref (xattr); + +	frame->root->rsp_refs = NULL; +	STACK_UNWIND (frame, op_ret, op_errno, loc->inode, &stbuf, xattr); +   +	if (xattr) +		dict_unref (xattr); + +	return 0; +   +}/* bdb_lookup */ + +int32_t +bdb_stat (call_frame_t *frame, +          xlator_t *this, +          loc_t *loc) +{ +  +	struct stat stbuf           = {0,}; +	char *real_path             = NULL; +	int32_t op_ret              = -1; +	int32_t op_errno            = EINVAL; +	struct bdb_private 
*private = NULL; +	char *db_path               = NULL; +	bctx_t *bctx                = NULL; +	 +	GF_VALIDATE_OR_GOTO ("bdb", frame, out); +	GF_VALIDATE_OR_GOTO ("bdb", this, out); +	GF_VALIDATE_OR_GOTO (this->name, loc, out); + +	private = this->private; +	GF_VALIDATE_OR_GOTO (this->name, private, out); + +	MAKE_REAL_PATH (real_path, this, loc->path); + +	op_ret = lstat (real_path, &stbuf); +	op_errno = errno; +  	if (op_ret == 0) { +		/* directory or symlink */ +		stbuf.st_ino = loc->inode->ino; +		if (S_ISDIR(stbuf.st_mode)) +			stbuf.st_mode = private->dir_mode; +		else +			stbuf.st_mode = private->symlink_mode; +		/* we are done, lets unwind the stack */ +		goto out; +	}  + +	bctx = bctx_parent (B_TABLE(this), loc->path); +	op_ret = -1; +	op_errno = ENOENT; +	GF_VALIDATE_OR_GOTO (this->name, bctx, out); +   +	MAKE_REAL_PATH_TO_STORAGE_DB (db_path, this, bctx->directory); +	op_ret = lstat (db_path, &stbuf);				 +	op_errno = errno;				 +	if (op_ret != 0) {				 +		gf_log (this->name, GF_LOG_ERROR,	 +			"failed to lstat on %s (%s)",	 +			db_path, strerror (op_errno));		 +		goto out;				 +	}						 + +	stbuf.st_size = bdb_db_get (bctx, NULL, loc->path, NULL, 0, 0); +	stbuf.st_blocks = BDB_COUNT_BLOCKS (stbuf.st_size, stbuf.st_blksize); +	stbuf.st_ino = loc->inode->ino; +	 +out: +	if (bctx) { +		/* NOTE: bctx_unref always returns success,  +		 * see description of bctx_unref for more details */ +		bctx_unref (bctx); +	} + +	frame->root->rsp_refs = NULL; +	STACK_UNWIND (frame, op_ret, op_errno, &stbuf); + +	return 0; +}/* bdb_stat */ + + + +/* bdb_opendir - in the world of bdb, open/opendir is all about opening correspondind databases. + *               opendir in particular, opens the database for the directory which is + *               to be opened. after opening the database, a cursor to the database is also created. 
+ *               cursor helps us get the dentries one after the other, and cursor maintains the state + *               about current positions in directory. pack 'pointer to db', 'pointer to the + *               cursor' into struct bdb_dir and store it in fd->ctx, we get from our parent xlator. + * + * @frame: call frame + * @this:  our information, as we filled during init() + * @loc:   location information + * @fd:    file descriptor structure (glusterfs internal) + * + * return value - immaterial, async call. + * + */ +int32_t  +bdb_opendir (call_frame_t *frame, +             xlator_t *this, +             loc_t *loc,  +             fd_t *fd) +{ +	char           *real_path = NULL; +	int32_t         op_ret    = -1; +	int32_t         op_errno  = EINVAL; +	bctx_t         *bctx      = NULL; +	struct bdb_dir *bfd       = NULL; +   +	GF_VALIDATE_OR_GOTO ("bdb", frame, out); +	GF_VALIDATE_OR_GOTO ("bdb", this, out); +	GF_VALIDATE_OR_GOTO (this->name, loc, out); +	GF_VALIDATE_OR_GOTO (this->name, fd, out); + +	MAKE_REAL_PATH (real_path, this, loc->path); + +	bctx = bctx_lookup (B_TABLE(this), (char *)loc->path); +	op_errno = EBADFD; +	GF_VALIDATE_OR_GOTO (this->name, bctx, out); + +	bfd = CALLOC (1, sizeof (*bfd)); +	op_errno = ENOMEM; +	GF_VALIDATE_OR_GOTO (this->name, bfd, out); + +	bfd->dir = opendir (real_path); +	op_errno = errno; +	GF_VALIDATE_OR_GOTO (this->name, bfd->dir, out); + +	/* NOTE: bctx_lookup() return bctx with ref */ +	bfd->ctx = bctx;  + +	bfd->path = strdup (real_path); +	op_errno = ENOMEM; +	GF_VALIDATE_OR_GOTO (this->name, bfd->path, out); + +	BDB_SET_BFD (this, fd, bfd); +	op_ret = 0; +out:   +	frame->root->rsp_refs = NULL; +	STACK_UNWIND (frame, op_ret, op_errno, fd); + +	return 0; +}/* bdb_opendir */ + + +int32_t +bdb_getdents (call_frame_t *frame, +              xlator_t     *this, +              fd_t         *fd, +              size_t        size, +              off_t         off, +              int32_t       flag) +{ +	int32_t         
op_ret         = -1; +	int32_t         op_errno       = EINVAL; +	int32_t         ret            = -1; +	int32_t         real_path_len  = 0; +	int32_t         entry_path_len = 0; +	int32_t         count          = 0; +	char           *real_path      = NULL; +	char           *entry_path     = NULL; +	char           *db_path        = NULL; +	dir_entry_t     entries        = {0, }; +	dir_entry_t    *tmp            = NULL; +	DIR            *dir            = NULL; +	struct dirent  *dirent         = NULL; +	struct bdb_dir *bfd            = NULL; +	struct stat     db_stbuf       = {0,}; +	struct stat     buf            = {0,}; +	DBC            *cursorp        = NULL; +	size_t          tmp_name_len   = 0; + +	GF_VALIDATE_OR_GOTO ("bdb", frame, out); +	GF_VALIDATE_OR_GOTO ("bdb", this, out); +	GF_VALIDATE_OR_GOTO (this->name, fd, out); + +	bfd = bdb_extract_bfd (fd, this); +	op_errno = EBADFD; +	GF_VALIDATE_OR_GOTO (this->name, bfd, out); + +	MAKE_REAL_PATH (real_path, this, bfd->path); +	dir = bfd->dir; + +	while ((dirent = readdir (dir))) { +		if (!dirent) +			break; +     +		if (IS_BDB_PRIVATE_FILE(dirent->d_name)) { +			continue; +		} + +		tmp_name_len = strlen (dirent->d_name); +		if (entry_path_len < (real_path_len + 1 + (tmp_name_len) + 1)) { +			entry_path_len = real_path_len + tmp_name_len + 1024; +			entry_path = realloc (entry_path, entry_path_len); +			op_errno = ENOMEM; +			GF_VALIDATE_OR_GOTO (this->name, entry_path, out); +		} +		 +		strncpy (&entry_path[real_path_len+1], dirent->d_name, tmp_name_len); +		op_ret = stat (entry_path, &buf);				 +		op_errno = errno;				 +		if (op_ret != 0) {				 +			gf_log (this->name, GF_LOG_ERROR,	 +				"failed to lstat on %s (%s)",	 +				entry_path, strerror (op_errno));		 +			goto out;				 +		}						 + +		if ((flag == GF_GET_DIR_ONLY) &&  +		    (ret != -1 && !S_ISDIR(buf.st_mode))) { +			continue; +		} + +		tmp = CALLOC (1, sizeof (*tmp)); +		op_errno = ENOMEM; +		GF_VALIDATE_OR_GOTO (this->name, tmp, out); + +		tmp->name = 
strdup (dirent->d_name);		        +		op_errno = ENOMEM; +		GF_VALIDATE_OR_GOTO (this->name, dirent->d_name, out); +		 +		memcpy (&tmp->buf, &buf, sizeof  (buf)); + +		tmp->buf.st_ino = -1; +		if (S_ISLNK(tmp->buf.st_mode)) { +			char linkpath[ZR_PATH_MAX] = {0,}; +			ret = readlink (entry_path, linkpath, ZR_PATH_MAX); +			if (ret != -1) { +				linkpath[ret] = '\0'; +				tmp->link = strdup (linkpath); +			} +		} else { +			tmp->link = ""; +		} + +		count++; +         +		tmp->next = entries.next; +		entries.next = tmp; +		/* if size is 0, count can never be = size, so entire dir is read */ + +		if (count == size) +			break; +	} +     +	if ((flag != GF_GET_DIR_ONLY) && (count < size)) { +		/* read from db */ +		op_ret = bdb_cursor_open (bfd->ctx, &cursorp); +		op_errno = EINVAL; +		GF_VALIDATE_OR_GOTO (this->name, (op_ret == 0), out); +         +		MAKE_REAL_PATH_TO_STORAGE_DB (db_path, this, bfd->ctx->directory); +		op_ret = lstat (db_path, &db_stbuf);				 +		op_errno = errno;				 +		if (op_ret != 0) {				 +			gf_log (this->name, GF_LOG_ERROR,	 +				"failed to lstat on %s (%s)",	 +				db_path, strerror (op_errno));		 +			goto out;				 +		}						 + +		/* read all the entries in database, one after the other and put into dictionary */ +		while (1) { +			DBT key = {0,}, value = {0,}; +           +			key.flags = DB_DBT_MALLOC; +			value.flags = DB_DBT_MALLOC; +			op_ret = bdb_cursor_get (cursorp, &key, &value, DB_NEXT); +           +			if (op_ret == DB_NOTFOUND) { +				gf_log (this->name, +					GF_LOG_DEBUG, +					"end of list of key/value pair in db for directory: %s",  +					bfd->ctx->directory); +				op_ret = 0; +				op_errno = 0; +				break; +			} else if (op_ret != 0){ +				gf_log (this->name, +					GF_LOG_ERROR, +					"failed to do cursor get for directory %s: %s",  +					bfd->ctx->directory, db_strerror (op_ret)); +				op_ret = -1; +				op_errno = ENOENT; +				break; +			} +			/* successfully read */ +			tmp = CALLOC (1, sizeof (*tmp)); +			op_errno = ENOMEM; +			
GF_VALIDATE_OR_GOTO (this->name, tmp, out); + +			tmp->name = CALLOC (1, key.size + 1); +			op_errno = ENOMEM; +			GF_VALIDATE_OR_GOTO (this->name, tmp->name, out); + +			memcpy (tmp->name, key.data, key.size); +			tmp->buf = db_stbuf; +			tmp->buf.st_size = bdb_db_get (bfd->ctx, NULL,  +						       tmp->name, NULL,  +						       0, 0); +			tmp->buf.st_blocks = BDB_COUNT_BLOCKS (tmp->buf.st_size, \ +							       tmp->buf.st_blksize); +			/* FIXME: wat will be the effect of this? */ +			tmp->buf.st_ino = -1; +			count++; +         +			tmp->next = entries.next; +			tmp->link = ""; +			entries.next = tmp; +			/* if size is 0, count can never be = size, so entire dir is read */ +			if (count == size) +				break; + +			free (key.data); +		} /* while(1){ } */ +		bdb_cursor_close (bfd->ctx, cursorp); +	} else { +		/* do nothing */ +	} +	FREE (entry_path); +	op_ret = 0; + +out: +	frame->root->rsp_refs = NULL; +	STACK_UNWIND (frame, op_ret, op_errno, &entries, count); + +	while (entries.next) { +		tmp = entries.next; +		entries.next = entries.next->next; +		FREE (tmp->name); +		FREE (tmp); +	} +	return 0; +}/* bdb_getdents */ + + +int32_t  +bdb_releasedir (xlator_t *this, +		fd_t *fd) +{ +  int32_t op_ret = 0; +  int32_t op_errno = 0; +  struct bdb_dir *bfd = NULL; + +  if ((bfd = bdb_extract_bfd (fd, this)) == NULL) { +    gf_log (this->name,  +	    GF_LOG_ERROR,  +	    "failed to extract fd data from fd=%p", fd); +    op_ret = -1; +    op_errno = EBADF; +  } else { +    if (bfd->path) { +      free (bfd->path); +    } else { +      gf_log (this->name, GF_LOG_ERROR, "bfd->path was NULL. 
fd=%p bfd=%p", +	      fd, bfd); +    } +     +    if (bfd->dir) { +      closedir (bfd->dir); +    } else { +      gf_log (this->name, +	      GF_LOG_ERROR, +	      "bfd->dir is NULL."); +    } +    if (bfd->ctx) { +      bctx_unref (bfd->ctx); +    } else { +      gf_log (this->name, +	      GF_LOG_ERROR, +	      "bfd->ctx is NULL"); +    } +    free (bfd); +  } + +  return 0; +}/* bdb_releasedir */ + + +int32_t  +bdb_readlink (call_frame_t *frame, +              xlator_t *this, +              loc_t *loc, +              size_t size) +{ +	char   *dest      = NULL; +	int32_t op_ret    = -1; +	int32_t op_errno  = EPERM; +	char   *real_path = NULL; + +	GF_VALIDATE_OR_GOTO ("bdb", frame, out); +	GF_VALIDATE_OR_GOTO ("bdb", this, out); +	GF_VALIDATE_OR_GOTO (this->name, loc, out); + +	dest = alloca (size + 1); +	GF_VALIDATE_OR_GOTO (this->name, dest, out); + +	MAKE_REAL_PATH (real_path, this, loc->path); +   +	op_ret = readlink (real_path, dest, size); +   +	if (op_ret > 0) +		dest[op_ret] = 0; + +	op_errno = errno; +   +	if (op_ret == -1) { +		gf_log (this->name, +			GF_LOG_DEBUG, +			"readlink failed on %s: %s",  +			loc->path, strerror (op_errno)); +	} +out: +	frame->root->rsp_refs = NULL; +	STACK_UNWIND (frame, op_ret, op_errno, dest); + +	return 0; +}/* bdb_readlink */ + + +int32_t  +bdb_mkdir (call_frame_t *frame, +           xlator_t *this, +           loc_t *loc, +           mode_t mode) +{ +	int32_t op_ret = -1; +	int32_t ret = -1; +	int32_t op_errno = EINVAL; +	char *real_path = NULL; +	struct stat stbuf = {0, }; +	bctx_t *bctx = NULL; +	 +	GF_VALIDATE_OR_GOTO ("bdb", frame, out); +	GF_VALIDATE_OR_GOTO ("bdb", this, out); +	GF_VALIDATE_OR_GOTO (this->name, loc, out); + +	MAKE_REAL_PATH (real_path, this, loc->path); +	 +	op_ret = mkdir (real_path, mode);				 +	op_errno = errno;				 +	if (op_ret != 0) {				 +		gf_log (this->name, GF_LOG_ERROR,	 +			"failed to mkdir %s (%s)",	 +			real_path, strerror (op_errno));		 +		goto out;				 +	}						 +	 +	op_ret = chown 
(real_path, frame->root->uid, frame->root->gid);				 +	op_errno = errno;				 +	if (op_ret != 0) {				 +		gf_log (this->name, GF_LOG_ERROR,	 +			"failed to chmod on %s (%s)",	 +			real_path, strerror (op_errno));		 +		goto err;				 +	}						 + +	op_ret = lstat (real_path, &stbuf);				 +	op_errno = errno;				 +	if (op_ret != 0) {				 +		gf_log (this->name, GF_LOG_ERROR,	 +			"failed to lstat on %s (%s)",	 +			real_path, strerror (op_errno));		 +		goto err;				 +	}						 + +	bctx = bctx_lookup (B_TABLE(this), (char *)loc->path); +	op_errno = ENOMEM; +	GF_VALIDATE_OR_GOTO (this->name, bctx, err); + +	stbuf.st_ino = bdb_inode_transform (stbuf.st_ino, bctx); +	 +	goto out; + +err: +	ret = rmdir (real_path); +	if (ret != 0) {			        +		gf_log (this->name, +			GF_LOG_ERROR, +			"failed to rmdir the directory created (%s)", +			strerror (errno)); +	} +	 + +out:   +	if (bctx) { +		/* NOTE: bctx_unref always returns success,  +		 * see description of bctx_unref for more details */ +		bctx_unref (bctx); +	} + +	frame->root->rsp_refs = NULL; +	STACK_UNWIND (frame, op_ret, op_errno, loc->inode, &stbuf); + +	return 0; +}/* bdb_mkdir */ + + +int32_t  +bdb_unlink (call_frame_t *frame, +            xlator_t *this, +            loc_t *loc) +{ +	int32_t op_ret    = -1; +	int32_t op_errno  = EINVAL; +	bctx_t *bctx      = NULL; +	char   *real_path = NULL; + +  	GF_VALIDATE_OR_GOTO ("bdb", frame, out); +	GF_VALIDATE_OR_GOTO ("bdb", this, out); +	GF_VALIDATE_OR_GOTO (this->name, loc, out); + +	bctx = bctx_parent (B_TABLE(this), loc->path); +	op_errno = ENOENT; +	GF_VALIDATE_OR_GOTO (this->name, bctx, out); + +	op_ret = bdb_db_del (bctx, NULL, loc->path); +	if (op_ret == DB_NOTFOUND) { +		MAKE_REAL_PATH (real_path, this, loc->path); +		op_ret = unlink (real_path);				 +		op_errno = errno;				 +		if (op_ret != 0) {				 +			gf_log (this->name, GF_LOG_ERROR,	 +				"failed to unlink on %s (%s)",	 +				real_path, strerror (op_errno));		 +			goto out;				 +		}						 + +	} else if (op_ret == 
0) { +		op_errno = 0; +	} +out: +	if (bctx) { +		/* NOTE: bctx_unref always returns success,  +		 * see description of bctx_unref for more details */ +		bctx_unref (bctx); +	} + +	frame->root->rsp_refs = NULL; +	STACK_UNWIND (frame, op_ret, op_errno); + +	return 0; +}/* bdb_unlink */ + + + +int32_t +bdb_do_rmdir (xlator_t *this, +              loc_t *loc) +{ +	char   *real_path = NULL; +	int32_t ret       = -1; +	bctx_t *bctx      = NULL; +	DB_ENV *dbenv     = NULL; +	 +	GF_VALIDATE_OR_GOTO ("bdb", this, out); +	GF_VALIDATE_OR_GOTO (this->name, loc, out); + +	dbenv = BDB_ENV(this); +	GF_VALIDATE_OR_GOTO (this->name, dbenv, out); + +	MAKE_REAL_PATH (real_path, this, loc->path); + +	bctx = bctx_lookup (B_TABLE(this), loc->path); +	GF_VALIDATE_OR_GOTO (this->name, bctx, out); +	 +	LOCK(&bctx->lock); +	{ +		if (bctx->dbp == NULL) { +			goto unlock; +		} +	 +		ret = bctx->dbp->close (bctx->dbp, 0); +		GF_VALIDATE_OR_GOTO (this->name, (ret == 0), unlock); + +		bctx->dbp = NULL; + +		ret = dbenv->dbremove (dbenv, NULL, bctx->db_path, NULL, 0); +		if (ret != 0) { +			gf_log (this->name, +				GF_LOG_ERROR, +				"failed to DB_ENV->dbremove() on path %s: %s",  +				loc->path, db_strerror (ret)); +		} +	} +unlock: +	UNLOCK(&bctx->lock); +     +	if (ret) { +		gf_log (this->name, +			GF_LOG_ERROR, +			"failed to remove db %s: %s", bctx->db_path, db_strerror (ret)); +		ret = -1; +		goto out; +	}  +	gf_log (this->name, +		GF_LOG_DEBUG, +		"removed db %s", bctx->db_path); +	ret = rmdir (real_path); + +out: +	if (bctx) { +		/* NOTE: bctx_unref always returns success,  +		 * see description of bctx_unref for more details */ +		bctx_unref (bctx); +	} + +	return ret; +} + +int32_t  +bdb_rmdir (call_frame_t *frame, +           xlator_t *this, +           loc_t *loc) +{ +	int32_t op_ret   = -1;  +	int32_t op_errno = ENOTEMPTY; + +	if (!is_dir_empty (this, loc)) { +		gf_log (this->name, +			GF_LOG_DEBUG, +			"rmdir: directory %s not empty", loc->path); +		op_errno = ENOTEMPTY; +		op_ret = 
-1; +		goto out; +	} + +	op_ret = bdb_do_rmdir (this, loc);				 +	if (op_ret != 0) {				 +		gf_log (this->name, GF_LOG_ERROR,	 +			"failed to bdb_do_rmdir on %s",	 +			loc->path);		 +		goto out;				 +	}						 + +out: +	frame->root->rsp_refs = NULL; +	STACK_UNWIND (frame, op_ret, op_errno); + +	return 0; +} /* bdb_rmdir */ + +int32_t  +bdb_symlink (call_frame_t *frame, +             xlator_t *this, +             const char *linkname, +             loc_t *loc) +{ +	int32_t             op_ret    = -1; +	int32_t             op_errno  = EINVAL; +	char               *real_path = NULL; +	struct stat         stbuf     = {0,}; +	struct bdb_private *private   = NULL;  +	bctx_t             *bctx      = NULL; +	 +	GF_VALIDATE_OR_GOTO ("bdb", frame, out); +	GF_VALIDATE_OR_GOTO ("bdb", this, out); +	GF_VALIDATE_OR_GOTO (this->name, loc, out); +	GF_VALIDATE_OR_GOTO (this->name, linkname, out); + +	private = this->private; +	GF_VALIDATE_OR_GOTO (this->name, private, out); + +	MAKE_REAL_PATH (real_path, this, loc->path); +	op_ret = symlink (linkname, real_path); +	op_errno = errno; +	if (op_ret == 0) { +		op_ret = lstat (real_path, &stbuf);				 +		op_errno = errno;				 +		if (op_ret != 0) {				 +			gf_log (this->name, GF_LOG_ERROR,	 +				"failed to lstat on %s (%s)",	 +				real_path, strerror (op_errno));		 +			goto err;				 +		}						 + +		bctx = bctx_parent (B_TABLE(this), loc->path); +		GF_VALIDATE_OR_GOTO (this->name, bctx, err); + +		stbuf.st_ino = bdb_inode_transform (stbuf.st_ino, bctx); +		stbuf.st_mode = private->symlink_mode; + +		goto out; +	} +err: +	op_ret = unlink (real_path); +	op_errno = errno; +	if (op_ret != 0) { +		gf_log (this->name,  +			GF_LOG_ERROR, +			"failed to unlink the previously created symlink (%s)", +			strerror (op_errno)); +	} +	op_ret = -1; +	op_errno = ENOENT; +out: +	if (bctx) { +		/* NOTE: bctx_unref always returns success,  +		 * see description of bctx_unref for more details */ +		bctx_unref (bctx); +	} + +	frame->root->rsp_refs = NULL; +	
STACK_UNWIND (frame, op_ret, op_errno, loc->inode, &stbuf); + +	return 0; +} /* bdb_symlink */ + +int32_t  +bdb_chmod (call_frame_t *frame, +           xlator_t *this, +           loc_t *loc, +           mode_t mode) +{ +	int32_t     op_ret    = -1; +	int32_t     op_errno  = EINVAL; +	char       *real_path = NULL; +	struct stat stbuf     = {0,}; +	 +	GF_VALIDATE_OR_GOTO ("bdb", frame, out); +	GF_VALIDATE_OR_GOTO ("bdb", this, out); +	GF_VALIDATE_OR_GOTO (this->name, loc, out); + +	MAKE_REAL_PATH (real_path, this, loc->path); +	op_ret = lstat (real_path, &stbuf);				 +	op_errno = errno;				 +	if (op_ret != 0) {				 +		gf_log (this->name, GF_LOG_ERROR,	 +			"failed to lstat on %s (%s)",	 +			real_path, strerror (op_errno));		 +		goto out;				 +	}						 + +	/* directory or symlink */ +	op_ret = chmod (real_path, mode); +	op_errno = errno; + +out:     +	frame->root->rsp_refs = NULL; +	STACK_UNWIND (frame, op_ret, op_errno, &stbuf); + +	return 0; +}/* bdb_chmod */ + + +int32_t  +bdb_chown (call_frame_t *frame, +           xlator_t *this, +           loc_t *loc, +           uid_t uid, +           gid_t gid) +{ +	int32_t     op_ret    = -1; +	int32_t     op_errno  = EINVAL; +	char       *real_path = NULL; +	struct stat stbuf     = {0,}; + +	GF_VALIDATE_OR_GOTO ("bdb", frame, out); +	GF_VALIDATE_OR_GOTO ("bdb", this, out); +	GF_VALIDATE_OR_GOTO (this->name, loc, out); + +	MAKE_REAL_PATH (real_path, this, loc->path); +	op_ret = lstat (real_path, &stbuf);				 +	op_errno = errno;				 +	if (op_ret != 0) {				 +		gf_log (this->name, GF_LOG_ERROR,	 +			"failed to lstat on %s (%s)",	 +			real_path, strerror (op_errno));		 +		goto out;				 +	}						 + +	/* directory or symlink */ +	op_ret = lchown (real_path, uid, gid); +	op_errno = errno;  +out:     +	frame->root->rsp_refs = NULL;   +	STACK_UNWIND (frame, op_ret, op_errno, &stbuf); + +	return 0; +}/* bdb_chown */ + + +int32_t  +bdb_truncate (call_frame_t *frame, +              xlator_t *this, +              loc_t *loc, +          
    off_t offset) +{ +	int32_t     op_ret     = -1; +	int32_t     op_errno   = EINVAL; +	char       *real_path  = NULL; +	struct stat stbuf      = {0,}; +	char       *db_path    = NULL; +	bctx_t     *bctx       = NULL; +	char       *key_string = NULL; + +	GF_VALIDATE_OR_GOTO ("bdb", frame, out); +	GF_VALIDATE_OR_GOTO ("bdb", this, out); +	GF_VALIDATE_OR_GOTO (this->name, loc, out); + +	bctx = bctx_parent (B_TABLE(this), loc->path); +	op_errno = ENOENT; +	GF_VALIDATE_OR_GOTO (this->name, bctx, out); + +	MAKE_REAL_PATH (real_path, this, loc->path); +	MAKE_KEY_FROM_PATH (key_string, loc->path); +     +	/* now truncate */ +	MAKE_REAL_PATH_TO_STORAGE_DB (db_path, this, bctx->directory); +	op_ret = lstat (db_path, &stbuf);				 +	op_errno = errno;				 +	if (op_ret != 0) {				 +		gf_log (this->name, GF_LOG_ERROR,	 +			"failed to lstat on %s (%s)",	 +			db_path, strerror (op_errno));		 +		goto out;				 +	}						 + +	if (loc->inode->ino) { +		stbuf.st_ino = loc->inode->ino; +	}else { +		stbuf.st_ino = bdb_inode_transform (stbuf.st_ino, bctx); +	} +     +	op_ret = bdb_db_put (bctx, NULL, key_string, NULL, 0, 1, 0); +	if (op_ret == -1) { +		gf_log (this->name, +			GF_LOG_DEBUG, +			"failed to do bdb_db_put: %s",  +			db_strerror (op_ret)); +		op_ret = -1; +		op_errno = EINVAL; /* TODO: better errno */ +	}  + +out: +	if (bctx) { +		/* NOTE: bctx_unref always returns success,  +		 * see description of bctx_unref for more details */ +		bctx_unref (bctx); +	} + +	frame->root->rsp_refs = NULL; +	STACK_UNWIND (frame, op_ret, op_errno, &stbuf); +   +	return 0; +}/* bdb_truncate */ + + +int32_t  +bdb_utimens (call_frame_t *frame, +             xlator_t *this, +             loc_t *loc, +             struct timespec ts[2]) +{ +	int32_t     op_ret    = -1; +	int32_t     op_errno  = EPERM; +	char       *real_path = NULL; +	struct stat stbuf     = {0,}; +	struct timeval tv[2] = {{0,},}; +   +	GF_VALIDATE_OR_GOTO ("bdb", frame, out); +	GF_VALIDATE_OR_GOTO ("bdb", this, out); +	
GF_VALIDATE_OR_GOTO (this->name, loc, out); + +	MAKE_REAL_PATH (real_path, this, loc->path); +	op_ret = lstat (real_path, &stbuf);				 +	op_errno = errno;				 +	if (op_ret != 0) {				 +		op_errno = EPERM; +		gf_log (this->name, GF_LOG_ERROR,	 +			"failed to lstat on %s (%s)",	 +			real_path, strerror (op_errno));		 +		goto out;				 +	}						 + +	/* directory or symlink */ +	tv[0].tv_sec = ts[0].tv_sec; +	tv[0].tv_usec = ts[0].tv_nsec / 1000; +	tv[1].tv_sec = ts[1].tv_sec; +	tv[1].tv_usec = ts[1].tv_nsec / 1000; +     +	op_ret = lutimes (real_path, tv); +	if (op_ret == -1 && errno == ENOSYS) { +		op_ret = utimes (real_path, tv); +	} +	op_errno = errno; +	if (op_ret == -1) { +		gf_log (this->name,  +			GF_LOG_WARNING,  +			"utimes on %s failed: %s",  +			loc->path, strerror (op_errno)); +		goto out; +	} + +	op_ret = lstat (real_path, &stbuf);				 +	op_errno = errno;				 +	if (op_ret != 0) {				 +		gf_log (this->name, GF_LOG_ERROR,	 +			"failed to lstat on %s (%s)",	 +			real_path, strerror (op_errno));		 +		goto out;				 +	}						 + +	stbuf.st_ino = loc->inode->ino; +     +out:   +	frame->root->rsp_refs = NULL; +	STACK_UNWIND (frame, op_ret, op_errno, &stbuf); +   +	return 0; +}/* bdb_utimens */ + +int32_t  +bdb_statfs (call_frame_t *frame, +            xlator_t *this, +            loc_t *loc) + +{ +	int32_t        op_ret    = -1; +	int32_t        op_errno  = EINVAL; +	char          *real_path = NULL; +	struct statvfs buf       = {0, }; + +	GF_VALIDATE_OR_GOTO ("bdb", frame, out); +	GF_VALIDATE_OR_GOTO ("bdb", this, out); +	GF_VALIDATE_OR_GOTO (this->name, loc, out); + +	MAKE_REAL_PATH (real_path, this, loc->path); + +	op_ret = statvfs (real_path, &buf); +	op_errno = errno; +out: +	frame->root->rsp_refs = NULL; +	STACK_UNWIND (frame, op_ret, op_errno, &buf); +	return 0; +}/* bdb_statfs */ + +static int gf_bdb_xattr_log; + +/* bdb_setxattr - set extended attributes. + * + * bdb allows setxattr operation only on directories.  
+ *    bdb reservers 'glusterfs.file.<attribute-name>' to operate on the content of the files  + * under the specified directory. 'glusterfs.file.<attribute-name>' transforms to contents of  + * file of name '<attribute-name>' under specified directory. + * + * @frame: call frame. + * @this:  xlator_t of this instance of bdb xlator. + * @loc:   loc_t specifying the file to operate upon. + * @dict:  list of extended attributes to set on @loc. + * @flags: can be XATTR_REPLACE (replace an existing extended attribute only if it exists) or + *         XATTR_CREATE (create an extended attribute only if it doesn't already exist). + * + * + */ +int32_t  +bdb_setxattr (call_frame_t *frame, +              xlator_t *this, +              loc_t *loc, +              dict_t *dict, +              int flags) +{ +	int32_t      op_ret = -1; +	int32_t      op_errno = EINVAL; +	data_pair_t *trav = dict->members_list; +	bctx_t      *bctx = NULL; +	char        *real_path = NULL; +	char        *key = NULL; + +	GF_VALIDATE_OR_GOTO ("bdb", frame, out); +	GF_VALIDATE_OR_GOTO ("bdb", this, out); +	GF_VALIDATE_OR_GOTO (this->name, loc, out); +	GF_VALIDATE_OR_GOTO (this->name, dict, out); + +	MAKE_REAL_PATH (real_path, this, loc->path); +	if (!S_ISDIR (loc->inode->st_mode)) { +		op_ret   = -1; +		op_errno = EPERM; +		goto out; +	} + +	while (trav) { +		if (ZR_FILE_CONTENT_REQUEST(trav->key) ) { +			bctx = bctx_lookup (B_TABLE(this), loc->path); +			op_errno = EINVAL; +			GF_VALIDATE_OR_GOTO (this->name, bctx, out); + +			key = &(trav->key[15]); + +			if (flags & XATTR_REPLACE) { +				/* replace only if previously exists, otherwise error out */ +				op_ret = bdb_db_get (bctx, NULL, key, +						     NULL, 0, 0); +				if (op_ret == -1) { +					/* key doesn't exist in database */ +					gf_log (this->name, +						GF_LOG_DEBUG, +						"cannot XATTR_REPLACE, xattr %s doesn't exist " +						"on path %s", key, loc->path); +					op_ret = -1; +					op_errno = ENOENT; +					break; +				}  +				op_ret = 
bdb_db_put (bctx, NULL,  +						     key, trav->value->data,  +						     trav->value->len,  +						     op_ret, BDB_TRUNCATE_RECORD); +				if (op_ret != 0) { +					op_ret   = -1; +					op_errno = EINVAL; +					break; +				}  +			} else { +				/* fresh create */ +				op_ret = bdb_db_put (bctx, NULL, key,  +						     trav->value->data,  +						     trav->value->len,  +						     0, 0); +				if (op_ret != 0) { +					op_ret   = -1; +					op_errno = EINVAL; +					break; +				} else { +					op_ret = 0; +					op_errno = 0; +				} /* if(op_ret!=0)...else */ +			} /* if(flags&XATTR_REPLACE)...else */ +			if (bctx) { +				/* NOTE: bctx_unref always returns success,  +				 * see description of bctx_unref for more details */ +				bctx_unref (bctx); +			} +		} else { +			/* do plain setxattr */ +			op_ret = lsetxattr (real_path,  +					    trav->key,  +					    trav->value->data,  +					    trav->value->len,  +					    flags); +			op_errno = errno; +			if ((op_ret == -1) && (op_errno != ENOENT)) { +				if (op_errno == ENOTSUP) { +					gf_bdb_xattr_log++; +					if (!(gf_bdb_xattr_log % GF_UNIVERSAL_ANSWER)) { +						gf_log (this->name, GF_LOG_WARNING,  +							"Extended Attributes support not present."\ +							"Please check"); +					} +				} else { +					gf_log (this->name, GF_LOG_DEBUG,  +						"setxattr failed on %s (%s)",  +						loc->path, strerror (op_errno)); +				} +				break; +			} +		} /* if(ZR_FILE_CONTENT_REQUEST())...else */ +		trav = trav->next; +	}/* while(trav) */ +out: +	frame->root->rsp_refs = NULL; +	 +	STACK_UNWIND (frame, op_ret, op_errno); +	return 0;   +}/* bdb_setxattr */ + + +/* bdb_gettxattr - get extended attributes. + * + * bdb allows getxattr operation only on directories.  + * bdb_getxattr retrieves the whole content of the file, when glusterfs.file.<attribute-name>  + * is specified.  + * + * @frame: call frame. + * @this:  xlator_t of this instance of bdb xlator. + * @loc:   loc_t specifying the file to operate upon. 
+ * @name:  name of extended attributes to get for @loc. + * + * NOTE: see description of bdb_setxattr for details on how + *     'glusterfs.file.<attribute-name>' is handles by bdb. + */ +int32_t  +bdb_getxattr (call_frame_t *frame, +              xlator_t *this, +              loc_t *loc, +              const char *name) +{ +	int32_t op_ret         = 0;  +	int32_t op_errno       = 0; +	dict_t *dict           = NULL; +	bctx_t *bctx           = NULL;  +	char   *buf            = NULL; +	char   *key_string     = NULL; +	int32_t list_offset    = 0; +	size_t  size           = 0; +	size_t  remaining_size = 0; +	char   *real_path      = NULL; +	char    key[1024]      = {0,}; +	char   *value          = NULL; +	char   *list           = NULL; +	 +	GF_VALIDATE_OR_GOTO ("bdb", frame, out); +	GF_VALIDATE_OR_GOTO ("bdb", this, out); +	GF_VALIDATE_OR_GOTO (this->name, loc, out); +	GF_VALIDATE_OR_GOTO (this->name, name, out); + +	dict = get_new_dict (); +	GF_VALIDATE_OR_GOTO (this->name, dict, out); + +	if (!S_ISDIR (loc->inode->st_mode)) { +		gf_log (this->name, +			GF_LOG_DEBUG, +			"operation not permitted on a non-directory file: %s", loc->path); +		op_ret   = -1; +		op_errno = ENODATA; +		goto out; +	} + +	if (name && ZR_FILE_CONTENT_REQUEST(name)) { +		bctx = bctx_lookup (B_TABLE(this), loc->path); +		op_errno = EINVAL; +		GF_VALIDATE_OR_GOTO (this->name, bctx, out); + +		key_string = (char *)&(name[15]); + +		op_ret = bdb_db_get (bctx, NULL, key_string, &buf, 0, 0); +		if (op_ret == -1) { +			gf_log (this->name, +				GF_LOG_DEBUG, +				"failed to db get on directory: %s for key: %s",  +				bctx->directory, name); +			op_ret   = -1; +			op_errno = ENODATA; +			goto out; +		}  +		 +		dict_set (dict, (char *)name, data_from_dynptr (buf, op_ret)); +	} else { +		MAKE_REAL_PATH (real_path, this, loc->path); +		size = llistxattr (real_path, NULL, 0); +		op_errno = errno; +		if (size <= 0) { +			/* There are no extended attributes, send an empty dictionary */ +			if (size == -1 && 
op_errno != ENODATA) { +				if (op_errno == ENOTSUP) { +					gf_bdb_xattr_log++; +					if (!(gf_bdb_xattr_log % GF_UNIVERSAL_ANSWER))  +						gf_log (this->name,  +							GF_LOG_WARNING,  +							"Extended Attributes support not present."\ +							"Please check"); +				} else { +					gf_log (this->name,  +						GF_LOG_WARNING,  +						"llistxattr failed on %s (%s)",  +						loc->path, strerror (op_errno)); +				} +			} +			op_ret = -1; +			op_errno = ENODATA; +		} else { +			list = alloca (size + 1); +			op_errno = ENOMEM; +			GF_VALIDATE_OR_GOTO (this->name, list, out); + +			size = llistxattr (real_path, list, size); +			op_ret = size; +			op_errno = errno; +			if (size == -1) { +				gf_log (this->name, +					GF_LOG_ERROR, +					"llistxattr failed on %s (%s)", +					loc->path, strerror (errno)); +				goto out; +			} +			remaining_size = size; +			list_offset = 0; +			while (remaining_size > 0) { +				if(*(list+list_offset) == '\0') +					break; +				strcpy (key, list + list_offset); +				op_ret = lgetxattr (real_path, key, NULL, 0); +				if (op_ret == -1) +					break; +				value = CALLOC (op_ret + 1, sizeof(char)); +				GF_VALIDATE_OR_GOTO (this->name, value, out); + +				op_ret = lgetxattr (real_path, key, value, op_ret); +				if (op_ret == -1) +					break; +				value [op_ret] = '\0'; +				dict_set (dict, key, data_from_dynptr (value, op_ret)); +				remaining_size -= strlen (key) + 1; +				list_offset += strlen (key) + 1; +			} /* while(remaining_size>0) */ +		} /* if(size <= 0)...else */ +	} /* if(name...)...else */ + +out: +	if(bctx) { +		/* NOTE: bctx_unref always returns success,  +		 * see description of bctx_unref for more details */ +		bctx_unref (bctx); +	} + +	if (dict) +		dict_ref (dict); + +	STACK_UNWIND (frame, op_ret, op_errno, dict); + +	if (dict) +		dict_unref (dict); +   +	return 0; +}/* bdb_getxattr */ + + +int32_t  +bdb_removexattr (call_frame_t *frame, +                 xlator_t *this, +                 loc_t *loc, +                 const char 
*name) +{ +	int32_t op_ret    = -1;  +	int32_t op_errno  = EINVAL; +	bctx_t *bctx      = NULL; +	char   *real_path = NULL; +	 +	GF_VALIDATE_OR_GOTO ("bdb", frame, out); +	GF_VALIDATE_OR_GOTO ("bdb", this, out); +	GF_VALIDATE_OR_GOTO (this->name, loc, out); +	GF_VALIDATE_OR_GOTO (this->name, name, out); + +	if (!S_ISDIR(loc->inode->st_mode)) {	 +		gf_log (this->name, +			GF_LOG_WARNING, +			"operation not permitted on non-directory files"); +		op_ret = -1; +		op_errno = EPERM; +		goto out; +	}  + +	if (ZR_FILE_CONTENT_REQUEST(name)) { +		bctx = bctx_lookup (B_TABLE(this), loc->path); +		op_errno = EINVAL; +		GF_VALIDATE_OR_GOTO (this->name, bctx, out); + +		op_ret = bdb_db_del (bctx, NULL, name); +      		if (op_ret == -1) { +			gf_log (this->name, +				GF_LOG_ERROR, +				"failed to delete %s from db of %s directory",  +				name, loc->path); +			op_errno = EINVAL; /* TODO: errno */ +			goto out; +		}  +	} else { +		MAKE_REAL_PATH(real_path, this, loc->path); +		op_ret = lremovexattr (real_path, name); +		op_errno = errno; +		if (op_ret == -1) { +			if (op_errno == ENOTSUP) { +				gf_bdb_xattr_log++; +				if (!(gf_bdb_xattr_log % GF_UNIVERSAL_ANSWER))  +					gf_log (this->name, GF_LOG_WARNING,  +						"Extended Attributes support not present." 
+						"Please check"); +			} else { +				gf_log (this->name,  +					GF_LOG_WARNING,  +					"%s: %s",  +					loc->path, strerror (op_errno)); +			} +		} /* if(op_ret == -1) */ +	} /* if (ZR_FILE_CONTENT_REQUEST(name))...else */ + +out: +	if (bctx) { +		/* NOTE: bctx_unref always returns success,  +		 * see description of bctx_unref for more details */ +		bctx_unref (bctx); +	} + +	frame->root->rsp_refs = NULL;   +	STACK_UNWIND (frame, op_ret, op_errno); +	return 0; +}/* bdb_removexattr */ + + +int32_t  +bdb_fsyncdir (call_frame_t *frame, +	      xlator_t *this, +	      fd_t *fd, +	      int datasync) +{ +	int32_t op_ret = -1; +	int32_t op_errno = EINVAL; +	struct bdb_fd *bfd = NULL; +	 +	GF_VALIDATE_OR_GOTO ("bdb", frame, out); +	GF_VALIDATE_OR_GOTO ("bdb", this, out); +	GF_VALIDATE_OR_GOTO (this->name, fd, out); +	 +	frame->root->rsp_refs = NULL; + +	bfd = bdb_extract_bfd (fd, this); +	op_errno = EBADFD; +	GF_VALIDATE_OR_GOTO (this->name, bfd, out); + +out: +	STACK_UNWIND (frame, op_ret, op_errno); + +	return 0; +}/* bdb_fsycndir */ + + +int32_t  +bdb_access (call_frame_t *frame, +	    xlator_t *this, +	    loc_t *loc, +	    int32_t mask) +{ +	int32_t op_ret = -1; +	int32_t op_errno = EINVAL; +	char *real_path = NULL; + +	GF_VALIDATE_OR_GOTO ("bdb", frame, out); +	GF_VALIDATE_OR_GOTO ("bdb", this, out); +	GF_VALIDATE_OR_GOTO (this->name, loc, out); +	 +	MAKE_REAL_PATH (real_path, this, loc->path); + +	op_ret = access (real_path, mask); +	op_errno = errno; +	/* TODO: implement for db entries */ +out: +	frame->root->rsp_refs = NULL;   +	STACK_UNWIND (frame, op_ret, op_errno); +	return 0; +}/* bdb_access */ + + +int32_t  +bdb_ftruncate (call_frame_t *frame, +	       xlator_t *this, +	       fd_t *fd, +	       off_t offset) +{ +	int32_t op_ret = -1; +	int32_t op_errno = EPERM; +	struct stat buf = {0,}; +	 +	GF_VALIDATE_OR_GOTO ("bdb", frame, out); +	GF_VALIDATE_OR_GOTO ("bdb", this, out); +	GF_VALIDATE_OR_GOTO (this->name, fd, out); +	/* TODO: impelement */ +out:	 +	
frame->root->rsp_refs = NULL; +	STACK_UNWIND (frame, op_ret, op_errno, &buf); + +	return 0; +} + +int32_t  +bdb_fchown (call_frame_t *frame, +            xlator_t *this, +            fd_t *fd, +            uid_t uid, +            gid_t gid) +{ +	int32_t op_ret = -1; +	int32_t op_errno = EPERM; +	struct stat buf = {0,}; + +	GF_VALIDATE_OR_GOTO ("bdb", frame, out); +	GF_VALIDATE_OR_GOTO ("bdb", this, out); +	GF_VALIDATE_OR_GOTO (this->name, fd, out); +	 +	/* TODO: implement */ +out:	 +	STACK_UNWIND (frame, op_ret, op_errno, &buf); + +	return 0; +} + + +int32_t  +bdb_fchmod (call_frame_t *frame, +            xlator_t *this, +            fd_t *fd, +            mode_t mode) +{ +	int32_t op_ret = -1; +	int32_t op_errno = EPERM; +	struct stat buf = {0,}; + +	GF_VALIDATE_OR_GOTO ("bdb", frame, out); +	GF_VALIDATE_OR_GOTO ("bdb", this, out); +	GF_VALIDATE_OR_GOTO (this->name, fd, out); +	 +	/* TODO: impelement */ +out:	 +	frame->root->rsp_refs = NULL;   +	STACK_UNWIND (frame, op_ret, op_errno, &buf); + +	return 0; +} + +int32_t  +bdb_setdents (call_frame_t *frame, +              xlator_t *this, +              fd_t *fd, +              int32_t flags, +              dir_entry_t *entries, +              int32_t count) +{ +	int32_t op_ret = -1, op_errno = EINVAL; +	char *entry_path = NULL; +	int32_t real_path_len = 0; +	int32_t entry_path_len = 0; +	int32_t ret = 0; +	struct bdb_dir *bfd = NULL; +	dir_entry_t *trav = NULL; + +	GF_VALIDATE_OR_GOTO ("bdb", frame, out); +	GF_VALIDATE_OR_GOTO ("bdb", this, out); +	GF_VALIDATE_OR_GOTO (this->name, fd, out); +	GF_VALIDATE_OR_GOTO (this->name, entries, out); + +	frame->root->rsp_refs = NULL; +	 +	bfd = bdb_extract_bfd (fd, this); +	op_errno = EBADFD; +	GF_VALIDATE_OR_GOTO (this->name, bfd, out); + +	real_path_len = strlen (bfd->path); +	entry_path_len = real_path_len + 256; +	entry_path = CALLOC (1, entry_path_len); +	GF_VALIDATE_OR_GOTO (this->name, entry_path, out); + +	strcpy (entry_path, bfd->path); +	entry_path[real_path_len] = 
'/'; +       +	trav = entries->next; +	while (trav) { +		char pathname[ZR_PATH_MAX] = {0,}; +		strcpy (pathname, entry_path); +		strcat (pathname, trav->name); +         +		if (S_ISDIR(trav->buf.st_mode)) { +			/* If the entry is directory, create it by calling 'mkdir'. If  +			 * directory is not present, it will be created, if its present,  +			 * no worries even if it fails. +			 */ +			ret = mkdir (pathname, trav->buf.st_mode); +			if ((ret == -1) && (errno != EEXIST)) { +				gf_log (this->name, +					GF_LOG_ERROR, +					"failed to created directory %s: %s",  +					pathname, strerror(errno)); +				goto loop; +			} + +			gf_log (this->name,  +				GF_LOG_DEBUG,  +				"Creating directory %s with mode (0%o)",  +				pathname, +				trav->buf.st_mode); +			/* Change the mode  +			 * NOTE: setdents tries its best to restore the state +			 *       of storage. if chmod and chown fail, they can be +			 *       ignored now */ +			ret = chmod (pathname, trav->buf.st_mode); +			if (ret != 0) { +				op_ret = -1; +				op_errno = errno; +				gf_log (this->name, +					GF_LOG_ERROR, +					"chmod failed on %s (%s)", +					pathname, strerror (errno)); +				goto loop; +			} +			/* change the ownership */ +			ret = chown (pathname, trav->buf.st_uid, trav->buf.st_gid); +			if (ret != 0) { +				op_ret = -1; +				op_errno = errno; +				gf_log (this->name, +					GF_LOG_ERROR, +					"chown failed on %s (%s)", +					pathname, strerror (errno)); +				goto loop; +			} +		} else if ((flags == GF_SET_IF_NOT_PRESENT) ||  +			   (flags != GF_SET_DIR_ONLY)) { +			/* Create a 0 byte file here */ +			if (S_ISREG (trav->buf.st_mode)) { +				op_ret = bdb_db_put (bfd->ctx, NULL,  +						     trav->name, NULL, 0, 0, 0); +				if (op_ret != 0) { +					/* create successful */ +					gf_log (this->name, +						GF_LOG_ERROR, +						"failed to create file %s", +						pathname); +				} /* if (!op_ret)...else */ +			} else if (S_ISLNK (trav->buf.st_mode)) { +				/* TODO: impelement */; +			} else { +				gf_log 
(this->name, +					GF_LOG_ERROR, +					"storage/bdb allows to create regular files only" +					"file %s (mode = %d) cannot be created", +					pathname, trav->buf.st_mode); +			} /* if(S_ISREG())...else */ +		} /* if(S_ISDIR())...else if */ +	loop: +		/* consider the next entry */ +		trav = trav->next; +	} /* while(trav) */ + +out: +	frame->root->rsp_refs = NULL; +	STACK_UNWIND (frame, op_ret, op_errno); +   +	FREE (entry_path); +	return 0; +} + +int32_t  +bdb_fstat (call_frame_t *frame, +           xlator_t *this, +           fd_t *fd) +{ +	int32_t        op_ret   = -1; +	int32_t        op_errno = EINVAL; +	struct stat    stbuf    = {0,}; +	struct bdb_fd *bfd      = NULL; +	bctx_t        *bctx     = NULL; +	char          *db_path  = NULL; + +	GF_VALIDATE_OR_GOTO ("bdb", frame, out); +	GF_VALIDATE_OR_GOTO ("bdb", this, out); +	GF_VALIDATE_OR_GOTO (this->name, fd, out); +	 +	bfd      = bdb_extract_bfd (fd, this); +	op_errno = EBADFD; +	GF_VALIDATE_OR_GOTO (this->name, bfd, out); + +	bctx = bfd->ctx; + +	MAKE_REAL_PATH_TO_STORAGE_DB (db_path, this, bctx->directory); +	op_ret = lstat (db_path, &stbuf);				 +	op_errno = errno;				 +	if (op_ret != 0) {				 +		gf_log (this->name, GF_LOG_ERROR,	 +			"failed to lstat on %s (%s)",	 +			db_path, strerror (op_errno));		 +		goto out;				 +	}						 + +	stbuf.st_ino = fd->inode->ino; +	stbuf.st_size = bdb_db_get (bctx, NULL, bfd->key, NULL, 0, 0); +	stbuf.st_blocks = BDB_COUNT_BLOCKS (stbuf.st_size, stbuf.st_blksize); + +out: +	frame->root->rsp_refs = NULL; + +	STACK_UNWIND (frame, op_ret, op_errno, &stbuf); +	return 0; +} + + +int32_t +bdb_readdir (call_frame_t *frame, +             xlator_t *this, +             fd_t *fd, +             size_t size, +             off_t off) +{ +	struct bdb_dir *bfd        = NULL; +	int32_t         op_ret     = -1;  +	int32_t         op_errno   = EINVAL; +	size_t          filled     = 0; +	gf_dirent_t    *this_entry = NULL; +	gf_dirent_t     entries; +	struct dirent  *entry      = NULL; +	off_t    
       in_case    = 0; +	int32_t         this_size  = 0; +	DBC            *cursorp    = NULL; +	int32_t count = 0; + +	GF_VALIDATE_OR_GOTO ("bdb", frame, out); +	GF_VALIDATE_OR_GOTO ("bdb", this, out); +	GF_VALIDATE_OR_GOTO (this->name, fd, out); + +	INIT_LIST_HEAD (&entries.list); +	 +	bfd = bdb_extract_bfd (fd, this); +	op_errno = EBADFD; +	GF_VALIDATE_OR_GOTO (this->name, bfd, out); + +	op_errno = ENOMEM; + +	while (filled <= size) { +		this_entry = NULL; +		entry      = NULL; +		in_case    = 0; +		this_size  = 0; +         +		in_case = telldir (bfd->dir); +		entry = readdir (bfd->dir); +		if (!entry) +			break; + +		if (IS_BDB_PRIVATE_FILE(entry->d_name)) +			continue; +		 +		this_size = dirent_size (entry); +         +		if (this_size + filled > size) { +			seekdir (bfd->dir, in_case); +			break; +		} +		 +		count++; + +		this_entry = gf_dirent_for_name (entry->d_name); +		this_entry->d_ino = entry->d_ino; +           +		this_entry->d_off = -1; +           +		this_entry->d_type = entry->d_type; +		this_entry->d_len = entry->d_reclen; + + +		list_add (&this_entry->list, &entries.list); +           +		filled += this_size; +	} +	op_ret = filled; +	op_errno = 0; +	if (filled >= size) { +		goto out; +	} + +	/* hungry kyaa? */ +	op_ret = bdb_cursor_open (bfd->ctx, &cursorp); +	op_errno = EBADFD; +	GF_VALIDATE_OR_GOTO (this->name, (op_ret == 0), out); + +	/* TODO: fix d_off, don't use bfd->offset. 
wrong method */ +	if (strlen (bfd->offset)) { +		DBT key = {0,}, value = {0,}; +		key.data = bfd->offset; +		key.size = strlen (bfd->offset); +		key.flags = DB_DBT_USERMEM; +		value.dlen = 0; +		value.doff = 0; +		value.flags = DB_DBT_PARTIAL; + +		op_ret = bdb_cursor_get (cursorp, &key, &value, DB_SET); +		op_errno = EBADFD; +		GF_VALIDATE_OR_GOTO (this->name, (op_ret == 0), out); + +	} else { +		/* first time or last time, do nothing */ +	} + +	while (filled <= size) { +		DBT key = {0,}, value = {0,}; +            	this_entry = NULL; + +		key.flags = DB_DBT_MALLOC; +		value.dlen = 0; +		value.doff = 0;  +		value.flags = DB_DBT_PARTIAL; +		op_ret = bdb_cursor_get (cursorp, &key, &value, DB_NEXT); +             +		if (op_ret == DB_NOTFOUND) { +			/* we reached end of the directory */ +			op_ret = 0; +			op_errno = 0; +			break; +		} else if (op_ret != 0) { +			gf_log (this->name, +				GF_LOG_DEBUG, +				"database error during readdir"); +			op_ret = -1; +			op_errno = ENOENT; +			break; +		} /* if (op_ret == DB_NOTFOUND)...else if...else */ + +		if (key.data == NULL) { +			/* NOTE: currently ignore when we get key.data == NULL. 
+			 * TODO: we should not get key.data = NULL */ +			gf_log (this->name, +				GF_LOG_DEBUG, +				"null key read from db"); +			continue; +		}/* if(key.data)...else */ +		count++; +		this_size = bdb_dirent_size (&key); +		if (this_size + filled > size) +			break; +		/* TODO - consider endianness here */ +		this_entry = gf_dirent_for_name ((const char *)key.data); +		/* FIXME: bug, if someone is going to use ->d_ino */ +		this_entry->d_ino = -1; +		this_entry->d_off = 0; +		this_entry->d_type = 0; +		this_entry->d_len = key.size; +                 +		if (key.data) { +			strncpy (bfd->offset, key.data, key.size); +			bfd->offset [key.size] = '\0'; +			free (key.data); +		} + +		list_add (&this_entry->list, &entries.list); + +		filled += this_size; +	}/* while */ +	bdb_cursor_close (bfd->ctx, cursorp); +	op_ret = filled; +	op_errno = 0; +out: +	frame->root->rsp_refs = NULL; +	gf_log (this->name, +		GF_LOG_DEBUG, +		"read %"GF_PRI_SIZET" bytes for %d entries", filled, count); +	STACK_UNWIND (frame, count, op_errno, &entries); + +	gf_dirent_free (&entries); +     +	return 0; +} + + +int32_t  +bdb_stats (call_frame_t *frame, +           xlator_t *this, +           int32_t flags) + +{ +	int32_t op_ret = 0; +	int32_t op_errno = 0; + +	struct xlator_stats xlstats = {0, }, *stats = NULL;  +	struct statvfs buf; +	struct timeval tv; +	struct bdb_private *private = NULL; +	int64_t avg_read = 0; +	int64_t avg_write = 0; +	int64_t _time_ms = 0;  +	 +	GF_VALIDATE_OR_GOTO ("bdb", frame, out); +	GF_VALIDATE_OR_GOTO ("bdb", this, out); + +	private = (struct bdb_private *)(this->private); +	stats = &xlstats; +	 +  	op_ret = statvfs (private->export_path, &buf);				 +	op_errno = errno;				 +	if (op_ret != 0) {				 +		gf_log (this->name, GF_LOG_ERROR,	 +			"failed to statvfs on %s (%s)",	 +			private->export_path, strerror (op_errno));		 +		goto out;				 +	}						 + +	stats->nr_files = private->stats.nr_files; +	stats->nr_clients = private->stats.nr_clients; /* client info is maintained 
at FSd */ +	stats->free_disk = buf.f_bfree * buf.f_bsize; /* Number of Free block in the filesystem. */ +	stats->total_disk_size = buf.f_blocks * buf.f_bsize; /* */ +	stats->disk_usage = (buf.f_blocks - buf.f_bavail) * buf.f_bsize; + +	/* Calculate read and write usage */ +	gettimeofday (&tv, NULL); +   +	/* Read */ +	_time_ms = (tv.tv_sec - private->init_time.tv_sec) * 1000 + +		((tv.tv_usec - private->init_time.tv_usec) / 1000); + +	avg_read = (_time_ms) ? (private->read_value / _time_ms) : 0; /* KBps */ +	avg_write = (_time_ms) ? (private->write_value / _time_ms) : 0; /* KBps */ +   +	_time_ms = (tv.tv_sec - private->prev_fetch_time.tv_sec) * 1000 + +		((tv.tv_usec - private->prev_fetch_time.tv_usec) / 1000); +	if (_time_ms && ((private->interval_read / _time_ms) > private->max_read)) { +		private->max_read = (private->interval_read / _time_ms); +	} +	if (_time_ms && ((private->interval_write / _time_ms) > private->max_write)) { +		private->max_write = private->interval_write / _time_ms; +	} + +	stats->read_usage = avg_read / private->max_read; +	stats->write_usage = avg_write / private->max_write; + +	gettimeofday (&(private->prev_fetch_time), NULL); +	private->interval_read = 0; +	private->interval_write = 0; + +out: +	frame->root->rsp_refs = NULL; +	STACK_UNWIND (frame, op_ret, op_errno, stats); +	return 0; +} + + +int32_t  +bdb_inodelk (call_frame_t *frame, xlator_t *this, +	     loc_t *loc, int32_t cmd, struct flock *lock) +{ +        frame->root->rsp_refs = NULL; + +	gf_log (this->name, GF_LOG_CRITICAL, +		"\"features/posix-locks\" translator is not loaded. You need to use it for proper functioning of GlusterFS"); + +        STACK_UNWIND (frame, -1, ENOSYS); +        return 0; +} + + +int32_t  +bdb_finodelk (call_frame_t *frame, xlator_t *this, +	      fd_t *fd, int32_t cmd, struct flock *lock) +{ +        frame->root->rsp_refs = NULL; + +	gf_log (this->name, GF_LOG_CRITICAL, +		"\"features/posix-locks\" translator is not loaded. 
You need to use it for proper functioning of GlusterFS"); + +        STACK_UNWIND (frame, -1, ENOSYS); +        return 0; +} + + +int32_t  +bdb_entrylk (call_frame_t *frame, xlator_t *this, +	     loc_t *loc, const char *basename, entrylk_cmd cmd,  +	     entrylk_type type) +{ +        frame->root->rsp_refs = NULL; + +	gf_log (this->name, GF_LOG_CRITICAL, +		"\"features/posix-locks\" translator is not loaded. You need to use it for proper functioning of GlusterFS"); + +        STACK_UNWIND (frame, -1, ENOSYS); +        return 0; +} + + +int32_t  +bdb_fentrylk (call_frame_t *frame, xlator_t *this, +	      fd_t *fd, const char *basename, entrylk_cmd cmd,  +	      entrylk_type type) +{ +        frame->root->rsp_refs = NULL; + +	gf_log (this->name, GF_LOG_CRITICAL, +		"\"features/posix-locks\" translator is not loaded. You need to use it for proper functioning of GlusterFS"); + +        STACK_UNWIND (frame, -1, ENOSYS); +        return 0; +} + + +int32_t  +bdb_checksum (call_frame_t *frame, +              xlator_t *this, +              loc_t *loc, +              int32_t flag) +{ +	char          *real_path = NULL; +	DIR           *dir       = NULL; +	struct dirent *dirent    = NULL; +	uint8_t        file_checksum[ZR_FILENAME_MAX] = {0,}; +	uint8_t        dir_checksum[ZR_FILENAME_MAX]  = {0,}; +	int32_t        op_ret   = -1; +	int32_t        op_errno = EINVAL; +	int32_t        i = 0, length = 0; +	bctx_t        *bctx    = NULL; +	DBC           *cursorp = NULL; +	char          *data    = NULL; + +	GF_VALIDATE_OR_GOTO ("bdb", frame, out); +	GF_VALIDATE_OR_GOTO ("bdb", this, out); +	GF_VALIDATE_OR_GOTO (this->name, loc, out); + +	MAKE_REAL_PATH (real_path, this, loc->path); + +	{ +		dir = opendir (real_path); +		op_errno = errno; +		GF_VALIDATE_OR_GOTO (this->name, dir, out); +		while ((dirent = readdir (dir))) { +			if (!dirent) +				break; +         +			if (IS_BDB_PRIVATE_FILE(dirent->d_name)) +				continue; + +			length = strlen (dirent->d_name); +			for (i = 0; i < 
length; i++) +				dir_checksum[i] ^= dirent->d_name[i]; +		} /* while((dirent...)) */ +		closedir (dir); +	} + +	{ +		bctx = bctx_lookup (B_TABLE(this), (char *)loc->path); +		op_errno = EINVAL; +		GF_VALIDATE_OR_GOTO (this->name, bctx, out); + +		op_ret = bdb_cursor_open (bctx, &cursorp); +		op_errno = EINVAL; +		GF_VALIDATE_OR_GOTO (this->name, (op_ret == 0), out); + +		while (1) { +			DBT key = {0,}, value = {0,}; +           +			key.flags = DB_DBT_MALLOC; +			value.doff = 0; +			value.dlen = 0; +			op_ret = bdb_cursor_get (cursorp, &key, &value, DB_NEXT); +           +			if (op_ret == DB_NOTFOUND) { +				gf_log (this->name, +					GF_LOG_DEBUG, +					"end of list of key/value pair in db for " +					"directory: %s", bctx->directory); +				op_ret = 0; +				op_errno = 0; +				break; +			} else if (op_ret == 0){ +				/* successfully read */ +				data = key.data; +				length = key.size; +				for (i = 0; i < length; i++) +					file_checksum[i] ^= data[i]; +             +				free (key.data); +			} else { +				gf_log (this->name, +					GF_LOG_ERROR, +					"failed to do cursor get for directory %s: %s",  +					bctx->directory, db_strerror (op_ret)); +				op_ret = -1; +				op_errno = ENOENT; +				break; +			}/* if(op_ret == DB_NOTFOUND)...else if...else */ +		} /* while(1) */ +		bdb_cursor_close (bctx, cursorp); +	} +out: +	if (bctx) { +		/* NOTE: bctx_unref always returns success,  +		 * see description of bctx_unref for more details */ +		bctx_unref (bctx); +	} + +	frame->root->rsp_refs = NULL; +	STACK_UNWIND (frame, op_ret, op_errno, file_checksum, dir_checksum); + +	return 0; +} + +/** + * notify - when parent sends PARENT_UP, send CHILD_UP event from here + */ +int32_t +notify (xlator_t *this, +        int32_t event, +        void *data, +        ...) 
+{ +	switch (event) +	{ +	case GF_EVENT_PARENT_UP: +	{ +		/* Tell the parent that bdb xlator is up */ +		assert ((this->private != NULL) &&  +			(BDB_ENV(this) != NULL)); +		default_notify (this, GF_EVENT_CHILD_UP, data); +	} +	break; +	default: +		/* */ +		break; +	} +	return 0; +} + + + +/** + * init -  + */ +int32_t  +init (xlator_t *this) +{ +	int32_t             ret = -1; +	struct stat         buf = {0,}; +	struct bdb_private *_private = NULL; +	data_t             *directory = NULL; +	bctx_t             *bctx = NULL; + +	GF_VALIDATE_OR_GOTO ("bdb", this, out); + +	_private = CALLOC (1, sizeof (*_private)); +	GF_VALIDATE_OR_GOTO (this->name, _private, out); + +	if (this->children) { +		gf_log (this->name, +			GF_LOG_ERROR, +			"FATAL: storage/bdb cannot have subvolumes"); +		FREE (_private); +		goto out;; +	} + +	if (!this->parents) { +		gf_log (this->name, GF_LOG_WARNING, +			"dangling volume. check volfile "); +	} + +	directory = dict_get (this->options, "directory"); +	if (!directory) { +		gf_log (this->name, GF_LOG_ERROR, +			"export directory not specified in volfile"); +		FREE (_private); +		goto out; +	}  +	umask (000); // umask `masking' is done at the client side +	/* // * No need to create directory, sys admin should do it himself  +	if (mkdir (directory->data, 0777) == 0) { +		gf_log (this->name, GF_LOG_WARNING, +			"directory specified not exists, created"); +	} +	*/ +   +	/* Check whether the specified directory exists, if not create it. 
*/ +	ret = stat (directory->data, &buf); +	if ((ret != 0) || !S_ISDIR (buf.st_mode)) { +		gf_log (this->name, GF_LOG_ERROR,  +			"specified directory '%s' doesn't exists, Exiting", directory->data); +		FREE (_private); +		goto out; +	} else { +		ret = 0; +	} + + +	_private->export_path = strdup (directory->data); +	_private->export_path_length = strlen (_private->export_path); + +	{ +		/* Stats related variables */ +		gettimeofday (&_private->init_time, NULL); +		gettimeofday (&_private->prev_fetch_time, NULL); +		_private->max_read = 1; +		_private->max_write = 1; +	} + +	this->private = (void *)_private; +	{ +		ret = bdb_db_init (this, this->options); +     +		if (ret == -1){ +			gf_log (this->name, +				GF_LOG_DEBUG, +				"failed to initialize database"); +			goto out; +		} else { +			bctx = bctx_lookup (_private->b_table, "/"); +			/* NOTE: we are not doing bctx_unref() for root bctx,  +			 *      let it remain in active list forever */ +			if (!bctx) { +				gf_log (this->name, +					GF_LOG_ERROR, +					"failed to allocate memory for root (/) bctx: out of memory"); +				goto out; +			} else { +				ret = 0; +			} +		} +	} +out: +	return ret; +} + +void  +bctx_cleanup (struct list_head *head) +{ +	bctx_t *trav    = NULL; +	bctx_t *tmp     = NULL; +	DB     *storage = NULL; + +	list_for_each_entry_safe (trav, tmp, head, list) { +		LOCK (&trav->lock); +		storage = trav->dbp; +		trav->dbp = NULL; +		list_del_init (&trav->list); +		UNLOCK (&trav->lock); +     +		if (storage) { +			storage->close (storage, 0); +			storage = NULL; +		} +	} +  	return; +} + +void +fini (xlator_t *this) +{ +	struct bdb_private *private = NULL;  +	int32_t             idx     = 0; +	int32_t             ret     = 0; +	private = this->private; + +	if (B_TABLE(this)) { +		/* close all the dbs from lru list */ +		bctx_cleanup (&(B_TABLE(this)->b_lru)); +		for (idx = 0; idx < B_TABLE(this)->hash_size; idx++) +			bctx_cleanup (&(B_TABLE(this)->b_hash[idx])); +     +		if (BDB_ENV(this)) { +			LOCK 
(&private->active_lock); +			private->active = 0; +			UNLOCK (&private->active_lock); +        +			ret = pthread_join (private->checkpoint_thread, NULL); +			if (ret != 0) { +				gf_log (this->name, +					GF_LOG_CRITICAL, +					"failed to join checkpoint thread"); +			} + +			/* TODO: pick each of the 'struct bctx' from private->b_hash +			 * and close all the databases that are open */ +			BDB_ENV(this)->close (BDB_ENV(this), 0); +		} else { +			/* impossible to reach here */ +		} + +		FREE (B_TABLE(this)); +	} +	FREE (private); +	return; +} + +struct xlator_mops mops = { +	.stats    = bdb_stats, +}; + +struct xlator_fops fops = { +	.lookup      = bdb_lookup, +	.stat        = bdb_stat, +	.opendir     = bdb_opendir, +	.readdir     = bdb_readdir, +	.readlink    = bdb_readlink, +	.mknod       = bdb_mknod, +	.mkdir       = bdb_mkdir, +	.unlink      = bdb_unlink, +	.rmdir       = bdb_rmdir, +	.symlink     = bdb_symlink, +	.rename      = bdb_rename, +	.link        = bdb_link, +	.chmod       = bdb_chmod, +	.chown       = bdb_chown, +	.truncate    = bdb_truncate, +	.utimens     = bdb_utimens, +	.create      = bdb_create, +	.open        = bdb_open, +	.readv       = bdb_readv, +	.writev      = bdb_writev, +	.statfs      = bdb_statfs, +	.flush       = bdb_flush, +	.fsync       = bdb_fsync, +	.setxattr    = bdb_setxattr, +	.getxattr    = bdb_getxattr, +	.removexattr = bdb_removexattr, +	.fsyncdir    = bdb_fsyncdir, +	.access      = bdb_access, +	.ftruncate   = bdb_ftruncate, +	.fstat       = bdb_fstat, +	.lk          = bdb_lk, +	.inodelk     = bdb_inodelk, +	.finodelk    = bdb_finodelk, +	.entrylk     = bdb_entrylk, +	.fentrylk    = bdb_fentrylk, +	.fchown      = bdb_fchown, +	.fchmod      = bdb_fchmod, +	.setdents    = bdb_setdents, +	.getdents    = bdb_getdents, +	.checksum    = bdb_checksum, +}; + +struct xlator_cbks cbks = { +	.release    = bdb_release, +	.releasedir = bdb_releasedir +}; + +#if 0 +struct volume_options options[] = { +	{ "directory", GF_OPTION_TYPE_PATH, 
0, }, +	{ "logdir", GF_OPTION_TYPE_PATH, 0, }, +	{ "errfile", GF_OPTION_TYPE_PATH, 0, }, +	{ "dir-mode", GF_OPTION_TYPE_ANY, 0, },  // base 8 number  +	{ "file-mode", GF_OPTION_TYPE_ANY, 0, }, // base 8 number +	{ "page-size", GF_OPTION_TYPE_SIZET, -1, }, +	{ "lru-limit", GF_OPTION_TYPE_INT, -1, }, +	{ "lock-timeout", GF_OPTION_TYPE_TIME, 0, }, +	{ "checkpoint-timeout", GF_OPTION_TYPE_TIME, 0, }, +	{ "transaction-timeout", GF_OPTION_TYPE_TIME, 0, }, +	{ "mode", GF_OPTION_TYPE_BOOL, 0, }, // Should be 'cache' ??  +	{ "access-mode", GF_OPTION_TYPE_STR, 0, 0, 0, "btree"}, +	{ NULL, 0, } +}; + +#endif /* #if 0 */ diff --git a/xlators/storage/bdb/src/bdb.h b/xlators/storage/bdb/src/bdb.h new file mode 100644 index 00000000000..f2d962680dd --- /dev/null +++ b/xlators/storage/bdb/src/bdb.h @@ -0,0 +1,439 @@ +/* +  Copyright (c) 2008, 2009 Z RESEARCH, Inc. <http://www.zresearch.com> +  This file is part of GlusterFS. + +  GlusterFS is free software; you can redistribute it and/or modify +  it under the terms of the GNU General Public License as published +  by the Free Software Foundation; either version 3 of the License, +  or (at your option) any later version. + +  GlusterFS is distributed in the hope that it will be useful, but +  WITHOUT ANY WARRANTY; without even the implied warranty of +  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU +  General Public License for more details. + +  You should have received a copy of the GNU General Public License +  along with this program.  If not, see +  <http://www.gnu.org/licenses/>. 
+*/ + +#ifndef _BDB_H +#define _BDB_H + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include <stdio.h> +#include <dirent.h> +#include <unistd.h> +#include <sys/types.h> +#include <dirent.h> + +#include <db.h> + +#ifdef linux +#ifdef __GLIBC__ +#include <sys/fsuid.h> +#else +#include <unistd.h> +#endif +#endif + +#ifdef HAVE_SYS_XATTR_H +#include <sys/xattr.h> +#endif + +#ifdef HAVE_SYS_EXTATTR_H +#include <sys/extattr.h> +#endif + +#include <pthread.h> +#include "xlator.h" +#include "inode.h" +#include "compat.h" +#include "compat-errno.h" + +#define GLFS_BDB_STORAGE    "/glusterfs_storage.db" + +/* numbers are not so reader-friendly, so lets have ON and OFF macros */ +#define ON  1 +#define OFF 0 + +#define BDB_DEFAULT_LRU_LIMIT 100 +#define BDB_DEFAULT_HASH_SIZE 100 + +#define BDB_ENOSPC_THRESHOLD 25600 + +#define BDB_DEFAULT_CHECKPOINT_TIMEOUT 30 + +#define BCTX_ENV(bctx) (bctx->table->dbenv) +/* MAKE_REAL_PATH(var,this,path) + * make the real path on the underlying file-system + * + * @var:  destination to hold the real path + * @this: pointer to xlator_t corresponding to bdb xlator  + * @path: path, as seen from mount-point  + */ +#define MAKE_REAL_PATH(var, this, path) do {				\ +		int base_len = ((struct bdb_private *)this->private)->export_path_length; \ +		var = alloca (strlen (path) + base_len + 2);		\ +		strcpy (var, ((struct bdb_private *)this->private)->export_path); \ +		strcpy (&var[base_len], path);				\ +	} while (0) + +/* MAKE_REAL_PATH_TO_STORAGE_DB(var,this,path) + * make the real path to the storage-database file on file-system + * + * @var:  destination to hold the real path + * @this: pointer to xlator_t corresponding to bdb xlator  + * @path: path of the directory, as seen from mount-point  + * + * the alloca()'d buffer holds export_path + path + GLFS_BDB_STORAGE plus + * the terminating NUL written by strcat(), hence the '+ 1'. + */ +#define MAKE_REAL_PATH_TO_STORAGE_DB(var, this, path) do {		\ +		int base_len = ((struct bdb_private *)this->private)->export_path_length; \ +		var = alloca (strlen (path) + base_len + strlen (GLFS_BDB_STORAGE) + 1); \ +		
strcpy (var, ((struct bdb_private *)this->private)->export_path); \ +		strcpy (&var[base_len], path);				\ +		strcat (var, GLFS_BDB_STORAGE);				\ +	} while (0) + +/* MAKE_KEY_FROM_PATH(key,path) + * make a 'key', which we use as key in the underlying database by using the path + * + * @key:  destination to hold the key + * @path: path to file as seen from mount-point  + * + * basename() is run on an alloca()'d scratch copy of @path; the copy is + * sized with one extra byte for the terminating NUL that strcpy() writes. + */ +#define MAKE_KEY_FROM_PATH(key, path) do {		\ +		char *tmp = alloca (strlen (path) + 1);	\ +		strcpy (tmp, path);			\ +		key = basename (tmp);			\ +	}while (0); + +/* BDB_DO_LSTAT(path,stbuf,dirent) + * construct real-path to a dirent and do lstat on the real-path + * + * @path:   path to the directory whose readdir is currently in progress + * @stbuf:  a 'struct stat *' + * @dirent: a 'struct dirent *' + */ +#define BDB_DO_LSTAT(path, stbuf, dirent) do {		\ +		char tmp_real_path[GF_PATH_MAX];	\ +		strcpy(tmp_real_path, path);		\ +		strcat (tmp_real_path, "/");		\ +		strcat(tmp_real_path, dirent->d_name);	\ +		ret = lstat (tmp_real_path, stbuf);	\ +	} while(0); + +/* IS_BDB_PRIVATE_FILE(name) + * check if a given 'name' is bdb xlator's internal file name  + * + * @name: basename of a file. + * + * bdb xlator reserves file names 'glusterfs_storage.db',  + * 'glusterfs_ns.db'(used by bdb xlator itself), 'log.*', '__db.*' (used by libdb) + */ +#define IS_BDB_PRIVATE_FILE(name) ((!strncmp(name, "__db.", 5)) ||	      \ +                                   (!strcmp(name, "glusterfs_storage.db")) || \ +                                   (!strcmp(name, "glusterfs_ns.db")) ||      \ +                                   (!strncmp(name, "log.0000", 8))) + +/* check if 'name' is exactly the '.' or '..' entry; a whole-string compare + * is used so that other names beginning with a dot (e.g. '.foo') do not match */ +#define IS_DOT_DOTDOT(name) ((!strcmp(name,".")) || (!strcmp(name,".."))) + +/* BDB_SET_BCTX(this,inode,bctx) + * put a stamp on inode. d00d, you are using bdb.. huhaha. + * pointer to 'struct bdb_ctx' is stored in inode's ctx of all directories.  + * this will happen either in lookup() or mkdir(). 
+ * + * @this:  pointer xlator_t of bdb xlator. + * @inode: inode where 'struct bdb_ctx *' has to be stored. + * @bctx:  a 'struct bdb_ctx *' + */ +#define BDB_SET_BCTX(this,inode,bctx) do{                         \ +		inode_ctx_put(inode, this, (uint64_t)(long)bctx); \ +	}while (0); + +/* MAKE_BCTX_FROM_INODE(this,bctx,inode) + * extract bdb xlator's 'struct bdb_ctx *' from an inode's ctx. + * @bctx is assigned only when inode_ctx_get() returns 0; otherwise it is + * left untouched, so callers should initialise it to NULL beforehand. + * + * @this:  pointer xlator_t of bdb xlator. + * @bctx:  a 'struct bdb_ctx *' + * @inode: inode from where 'struct bdb_ctx *' has to be extracted.  + */ +#define MAKE_BCTX_FROM_INODE(this,bctx,inode) do{        \ +                uint64_t tmp_bctx = 0;                   \ +                int tmp_bctx_ret = inode_ctx_get (inode, this, &tmp_bctx);  \ +                if (tmp_bctx_ret == 0)                   \ +		        bctx = (void *)(long)tmp_bctx;   \ +	}while (0); + +#define BDB_SET_BFD(this,fd,bfd) do{		            \ +		fd_ctx_set (fd, this, (uint64_t)(long)bfd); \ +	}while (0); + +/* maximum number of open dbs that bdb xlator will ever have */ +#define BDB_MAX_OPEN_DBS 100 + +/* convert file size to block-count; arguments are parenthesised so that + * expression arguments expand with the intended precedence */ +#define BDB_COUNT_BLOCKS(size,blksize) ((((size) + (blksize) - 1)/(blksize)) - 1) + +/* file permissions, again macros are more readable */ +#define RWXRWXRWX         0777 +#define DEFAULT_FILE_MODE 0644 +#define DEFAULT_DIR_MODE  0755 + +/* see, if have a valid file permissions specification in @mode: + * true when @mode has no bits set outside RWXRWXRWX */ +#define IS_VALID_FILE_MODE(mode) (!((mode) & (~RWXRWXRWX))) +#define IS_VALID_DIR_MODE(mode)  (!((mode) & (~(RWXRWXRWX)))) + +/* maximum retries for a failed transactional operation */		   +#define BDB_MAX_RETRIES 10 + +typedef struct bctx_table bctx_table_t; +typedef struct bdb_ctx    bctx_t; +typedef struct bdb_cache  bdb_cache_t; +typedef struct bdb_private bdb_private_t; +			  +struct bctx_table { +	uint64_t            dbflags;         /* flags to be used for opening each database */ +	uint64_t            cache;       
    /* cache: can be either ON or OFF */ +	gf_lock_t           lock;            /* used to lock the 'struct bctx_table *' */ +	gf_lock_t           checkpoint_lock; /* lock for checkpointing */ +	struct list_head   *b_hash;          /* hash table of 'struct bdb_ctx' */ +	struct list_head    active;          /* list of active 'struct bdb_ctx' */ +	struct list_head    b_lru;           /* lru list of inactive 'struct bdb_ctx' */ +	struct list_head    purge; +	uint32_t            lru_limit; +	uint32_t            lru_size; +	uint32_t            hash_size; +	DBTYPE              access_mode;     /* access mode for accessing the databases,  +					      * can be DB_HASH, DB_BTREE */ +	DB_ENV             *dbenv;           /* DB_ENV under which every db operation +					      * is carried over */ +	int32_t             transaction; +	xlator_t           *this; + +	uint64_t            page_size;       /* page-size of DB,  +					      * DB->set_pagesize(), should be set before DB->open */ +}; + +struct bdb_ctx { +	/* controller members */ +	struct list_head   list;        /* lru list of 'struct bdb_ctx's,  +					 * a bdb_ctx can exist in one of b_hash or lru lists */ +	struct list_head   b_hash;      /* directory 'name' hashed list of 'struct bdb_ctx's */ + +	struct bctx_table *table; +	int32_t            ref;         /* reference count */ +	gf_lock_t          lock;        /* used to lock this 'struct bdb_ctx' */ + +	char              *directory;   /* directory path */ +	DB                *dbp;         /* pointer to open database, that resides inside this directory */ +	uint32_t           cache;       /* cache ON or OFF */ + +	/* per directory cache, bdb xlator's internal cache */ +	struct list_head   c_list;      /* linked list of cached records */ +	int32_t            c_count;     /* number of cached records */ + +	int32_t            key_hash;    /* index to hash table list, to which this ctx belongs */ +	char              *db_path;     /* absolute path to db file */ +}; + 
+struct bdb_fd { +	struct bdb_ctx *ctx;            /* pointer to bdb_ctx of the parent directory */ +	char           *key;            /* name of the file. NOTE: basename, not the complete path */ +	int32_t         flags;          /* open flags */ +}; + +struct bdb_dir { +	struct bdb_ctx *ctx;              /* pointer to bdb_ctx of this directory */ +	DIR            *dir;              /* open directory pointer, as returned by opendir() */ +	char            offset[NAME_MAX]; /* FIXME: readdir offset, too crude. must go  */ +	char           *path;             /* path to this directory */ +}; + +/* cache */ +struct bdb_cache { +	struct list_head c_list;          /* list of 'struct bdb_cache' under a 'struct bdb_ctx' */ +	char            *key;             /* name of the file this cache holds. NOTE: basename of file */ +	char            *data;            /* file content */ +	size_t           size;            /* size of the file content that this cache holds */ +}; + + +struct bdb_private { +	inode_table_t      *itable;             /* pointer to inode table that we use */ +	int32_t             temp;               /**/ +	char                is_stateless;       /**/ +	char               *export_path;        /* path to the export directory +						 * (option directory <export-path>) */ +	int32_t             export_path_length; /* length of 'export_path' string */ + +	/* statistics */ +	struct xlator_stats stats;              /* Statistics, provides activity of the server */ +   +	struct timeval      prev_fetch_time; +	struct timeval      init_time; +	int32_t             max_read;           /* */ +	int32_t             max_write;          /* */ +	int64_t             interval_read;      /* Used to calculate the max_read value */ +	int64_t             interval_write;     /* Used to calculate the max_write value */ +	int64_t             read_value;         /* Total read, from init */ +	int64_t             write_value;        /* Total write, from init */ +   +	/* bdb xlator specific 
private data */ +	uint64_t            envflags;              /* flags used for opening DB_ENV for this xlator */  +	uint64_t            dbflags;               /* flags to be used for opening each database */ +	uint64_t            cache;                 /* cache: can be either ON or OFF */ +	uint32_t            transaction;           /* transaction: can be either ON or OFF */ +	uint32_t active; +	gf_lock_t active_lock; +	struct bctx_table  *b_table; +	DBTYPE              access_mode;           /* access mode for accessing the databases,  +						    * can be DB_HASH, DB_BTREE  +						    * (option access-mode <mode>) */ +	mode_t              file_mode;             /* mode for each and every file stored on bdb  +						    * (option file-mode <mode>) */ +	mode_t              dir_mode;              /* mode for each and every directory stored on bdb  +						    * (option dir-mode <mode>) */ +	mode_t              symlink_mode;          /* mode for each and every symlink stored on bdb */ +	pthread_t           checkpoint_thread;     /* pthread_t object used for creating checkpoint +						    * thread */ +	int32_t             checkpoint_timeout;    /* time duration between two consecutive checkpoint +						    * operations. +						    * (option checkpoint-timeout <time-in-seconds>) */ +	ino_t               next_ino;              /* inode number allocation counter */ +	gf_lock_t           ino_lock;              /* lock to protect 'next_ino' */ +	char               *logdir;                /* environment log directory +						    * (option logdir <directory>) */ +	char               *errfile;               /* errfile path, used by environment to +						    * print detailed error log. 
+						    * (option errfile <errfile-path>) */ +	FILE               *errfp;                 /* DB_ENV->set_errfile() expects us to fopen +						    * the errfile before doing DB_ENV->set_errfile() */ +	uint32_t            txn_timeout;           /* used by DB_ENV->set_timeout to set the timeout for +						    * a transactionally encapsulated DB->operation() to +						    * timeout before waiting for locks to be released. +						    * (option transaction-timeout <time-in-milliseconds>)  +						    */ +	uint32_t            lock_timeout; +	uint32_t            log_auto_remove;        /* DB_AUTO_LOG_REMOVE flag for DB_ENV*/ +	uint32_t            log_region_max; +}; + + +static inline int32_t  +bdb_txn_begin (DB_ENV *dbenv, +	       DB_TXN **ptxnid) +{ +	return dbenv->txn_begin (dbenv, NULL, ptxnid, 0); +} + +static inline int32_t +bdb_txn_abort (DB_TXN *txnid) +{ +	return txnid->abort (txnid); +} + +static inline int32_t +bdb_txn_commit (DB_TXN *txnid) +{ +	return txnid->commit (txnid, 0); +} + +inline void * +bdb_extract_bfd (fd_t *fd, xlator_t *this); + + +void * +bdb_db_stat (bctx_t *bctx,  +	     DB_TXN *txnid, +	     uint32_t flags); + +int32_t +bdb_db_get(struct bdb_ctx *bctx, +	   DB_TXN *txnid, +	   const char *key_string, +	   char **buf, +	   size_t size, +	   off_t offset); + +#define BDB_TRUNCATE_RECORD 0xcafebabe + +int32_t +bdb_db_put (struct bdb_ctx *bctx, +	    DB_TXN *txnid, +	    const char *key_string, +	    const char *buf, +	    size_t size, +	    off_t offset, +	    int32_t flags); + +int32_t +bdb_db_del (struct bdb_ctx *bctx, +	    DB_TXN *txnid, +	    const char *path); + +ino_t +bdb_inode_transform (ino_t parent, +		     struct bdb_ctx *bctx); + + +int32_t +bdb_cursor_open (struct bdb_ctx *bctx, +		 DBC **cursorp); + +int32_t +bdb_cursor_get (DBC *cursorp, +		DBT *key, +		DBT *value, +		int32_t flags); + + +int32_t +bdb_cursor_close (struct bdb_ctx *ctx, +		  DBC *cursorp); + + +int32_t +bdb_dirent_size (DBT *key); + +int32_t +dirent_size 
(struct dirent *entry); + +int +bdb_db_init (xlator_t *this, +	     dict_t *options); + +void +bdb_dbs_from_dict_close (dict_t *this, +			 char *key, +			 data_t *value, +			 void *data); + +bctx_t * +bctx_lookup (struct bctx_table *table, +	     const char *path); + +bctx_t * +bctx_parent +(struct bctx_table *table, + const char *path); + +bctx_t * +bctx_unref (bctx_t *ctx); + +bctx_t * +bctx_ref (bctx_t *ctx); + +bctx_t * +bctx_rename (bctx_t *bctx,  +	     const char *db_newpath); + +int32_t +bdb_db_rename (bctx_table_t *table,  +	       const char *tmp_db_newpath,  +	       const char *real_db_newpath); +#endif /* _BDB_H */ diff --git a/xlators/storage/posix/Makefile.am b/xlators/storage/posix/Makefile.am new file mode 100644 index 00000000000..d471a3f9243 --- /dev/null +++ b/xlators/storage/posix/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = src + +CLEANFILES =  diff --git a/xlators/storage/posix/src/Makefile.am b/xlators/storage/posix/src/Makefile.am new file mode 100644 index 00000000000..2859e09aa49 --- /dev/null +++ b/xlators/storage/posix/src/Makefile.am @@ -0,0 +1,17 @@ + +xlator_LTLIBRARIES = posix.la +xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/storage + +posix_la_LDFLAGS = -module -avoidversion + +posix_la_SOURCES = posix.c xattr-cache.c +posix_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la  + +noinst_HEADERS = posix.h xattr-cache.h + +AM_CFLAGS = -fPIC -fno-strict-aliasing -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -D$(GF_HOST_OS) -Wall \ +	-I$(top_srcdir)/libglusterfs/src -shared -nostartfiles \ +	$(GF_CFLAGS) + +CLEANFILES =  + diff --git a/xlators/storage/posix/src/posix.c b/xlators/storage/posix/src/posix.c new file mode 100644 index 00000000000..159f02ddeb4 --- /dev/null +++ b/xlators/storage/posix/src/posix.c @@ -0,0 +1,3715 @@ +/* +  Copyright (c) 2006, 2007, 2008, 2009 Z RESEARCH, Inc. <http://www.zresearch.com> +  This file is part of GlusterFS. 
+ +  GlusterFS is free software; you can redistribute it and/or modify +  it under the terms of the GNU General Public License as published +  by the Free Software Foundation; either version 3 of the License, +  or (at your option) any later version. + +  GlusterFS is distributed in the hope that it will be useful, but +  WITHOUT ANY WARRANTY; without even the implied warranty of +  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU +  General Public License for more details. + +  You should have received a copy of the GNU General Public License +  along with this program.  If not, see +  <http://www.gnu.org/licenses/>. +*/ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#define __XOPEN_SOURCE 500 + +#include <stdint.h> +#include <sys/time.h> +#include <sys/resource.h> +#include <errno.h> +#include <ftw.h> + +#ifndef GF_BSD_HOST_OS +#include <alloca.h> +#endif /* GF_BSD_HOST_OS */ + +#include "glusterfs.h" +#include "dict.h" +#include "logging.h" +#include "posix.h" +#include "xlator.h" +#include "defaults.h" +#include "common-utils.h" +#include "compat-errno.h" +#include "compat.h" +#include "byte-order.h" + +#undef HAVE_SET_FSID +#ifdef HAVE_SET_FSID + +#define DECLARE_OLD_FS_ID_VAR uid_t old_fsuid; gid_t old_fsgid; + +#define SET_FS_ID(uid, gid) do {		\ +                old_fsuid = setfsuid (uid);     \ +                old_fsgid = setfsgid (gid);     \ +        } while (0) + +#define SET_TO_OLD_FS_ID() do {			\ +                setfsuid (old_fsuid);           \ +                setfsgid (old_fsgid);           \ +        } while (0) + +#else + +#define DECLARE_OLD_FS_ID_VAR +#define SET_FS_ID(uid, gid) +#define SET_TO_OLD_FS_ID() + +#endif + +typedef struct { +  	xlator_t    *this; +  	const char  *real_path; +  	dict_t      *xattr; +  	struct stat *stbuf; +	loc_t       *loc; +} posix_xattr_filler_t; + +int +posix_forget (xlator_t *this, inode_t *inode) +{ +	uint64_t tmp_cache = 0; +	if (!inode_ctx_del (inode, this, 
&tmp_cache)) +		dict_destroy ((dict_t *)(long)tmp_cache); + +	return 0; +} + +static void +_posix_xattr_get_set (dict_t *xattr_req, +    		      char *key, +    		      data_t *data, +    		      void *xattrargs) +{ +    	posix_xattr_filler_t *filler = xattrargs; +    	char     *value      = NULL; +    	ssize_t   xattr_size = -1; +    	int       ret      = -1; +  	char     *databuf  = NULL; +  	int       _fd      = -1; +	loc_t    *loc      = NULL; +	ssize_t  req_size  = 0; + + +    	/* should size be put into the data_t ? */ +	if (!strcmp (key, "glusterfs.content")) { +    		/* file content request */ +		req_size = data_to_uint64 (data); +		if (req_size >= filler->stbuf->st_size) { +			_fd = open (filler->real_path, O_RDONLY); + +			if (_fd == -1) { +				gf_log (filler->this->name, GF_LOG_ERROR, +					"opening file %s failed: %s", +					filler->real_path, strerror (errno)); +				goto err; +			} + +			databuf = calloc (1, filler->stbuf->st_size); +			 +			if (!databuf) { +				gf_log (filler->this->name, GF_LOG_ERROR, +					"out of memory :("); +				goto err; +			} + +			ret = read (_fd, databuf, filler->stbuf->st_size); +			if (ret == -1) { +				gf_log (filler->this->name, GF_LOG_ERROR, +					"read on file %s failed: %s", +					filler->real_path, strerror (errno)); +				goto err; +			} + +			ret = close (_fd); +			_fd = -1; +			if (ret == -1) { +				gf_log (filler->this->name, GF_LOG_ERROR, +					"close on file %s failed: %s", +					filler->real_path, strerror (errno)); +				goto err; +			} + +			ret = dict_set_bin (filler->xattr, key, +					    databuf, filler->stbuf->st_size); +			if (ret < 0) { +				goto err; +			} + +			/* To avoid double free in cleanup below */ +			databuf = NULL; +		err: +			if (_fd != -1) +				close (_fd); +			if (databuf) +				FREE (databuf); +		} +    	} else if (!strcmp (key, GLUSTERFS_OPEN_FD_COUNT)) { +		loc = filler->loc; +		if (!list_empty (&loc->inode->fd_list)) { +			ret = dict_set_uint32 (filler->xattr, key, 1); +		} else { +			ret 
= dict_set_uint32 (filler->xattr, key, 0); +		} +	} else { +		/* first call only probes the size of the attribute value */ +		xattr_size = lgetxattr (filler->real_path, key, NULL, 0); + +		if (xattr_size > 0) { +			value = calloc (1, xattr_size + 1); +			if (!value) { +				gf_log (filler->this->name, GF_LOG_ERROR, +					"out of memory :("); +				return; +			} + +			/* the attribute may have changed or vanished since the +			 * size probe, so use the length actually read */ +			xattr_size = lgetxattr (filler->real_path, key, value, xattr_size); +			if (xattr_size < 0) { +				gf_log (filler->this->name, GF_LOG_ERROR, +					"lgetxattr on %s failed for key %s: %s", +					filler->real_path, key, strerror (errno)); +				FREE (value); +				return; +			} + +			value[xattr_size] = '\0'; +			ret = dict_set_bin (filler->xattr, key, +					    value, xattr_size); +			if (ret < 0) +				gf_log (filler->this->name, GF_LOG_ERROR, +					"dict set failed. path: %s, key: %s", +					filler->real_path, key); +		} +	} +} + + +dict_t * +posix_lookup_xattr_fill (xlator_t *this, const char *real_path, loc_t *loc, +    			 dict_t *xattr_req, struct stat *buf) +{ +    	dict_t     *xattr             = NULL; +    	posix_xattr_filler_t filler   = {0, }; + +    	xattr = get_new_dict(); +    	if (!xattr) { +    		gf_log (this->name, GF_LOG_ERROR, +    			"memory allocation failed :("); +    		goto out; +    	} + +    	filler.this      = this; +    	filler.real_path = real_path; +    	filler.xattr     = xattr; +    	filler.stbuf     = buf; +	filler.loc       = loc; + +    	dict_foreach (xattr_req, _posix_xattr_get_set, &filler); +out: +    	return xattr; +} + + +int32_t +posix_lookup (call_frame_t *frame, xlator_t *this, +              loc_t *loc, dict_t *xattr_req) +{ +        struct stat buf                = {0, }; +        char *      real_path          = NULL; +        int32_t     op_ret             = -1; +        int32_t     op_errno           = 0; +        dict_t *    xattr              = NULL; + +        struct posix_private  *priv    = NULL; + +        VALIDATE_OR_GOTO (frame, out); +        VALIDATE_OR_GOTO (this, out); +        VALIDATE_OR_GOTO (loc, out); +	VALIDATE_OR_GOTO (loc->path, out); + +        MAKE_REAL_PATH (real_path, this, loc->path); + +        priv = this->private; + +        op_ret   = lstat (real_path, &buf); +        op_errno = errno; + +        if (op_ret == -1) { +		if (op_errno != ENOENT) { +			gf_log (this->name, GF_LOG_WARNING, +				"lstat on %s failed: %s", +				
loc->path, strerror (op_errno)); +		} +                goto out; +        } + +	/* Make sure we don't access another mountpoint inside export dir. +	 * It may cause inode number to repeat from single export point, +	 * which leads to severe problems.. +	 */ +	if (priv->base_stdev != buf.st_dev) { +		op_errno = ENOENT; +		gf_log (this->name, GF_LOG_WARNING, +			"%s: different mountpoint/device, returning " +			"ENOENT", loc->path); +		goto out; +	} + +        if (xattr_req && (op_ret == 0)) { +		xattr = posix_lookup_xattr_fill (this, real_path, loc, +						 xattr_req, &buf); +        } + +	op_ret = 0; +out: +        frame->root->rsp_refs = NULL; + +        if (xattr) +                dict_ref (xattr); + +        STACK_UNWIND (frame, op_ret, op_errno, loc->inode, &buf, xattr); + +        if (xattr) +                dict_unref (xattr); + +        return 0; +} + + +int32_t +posix_stat (call_frame_t *frame, +            xlator_t *this, +            loc_t *loc) +{ +        struct stat buf       = {0,}; +        char *      real_path = NULL; +        int32_t     op_ret    = -1; +        int32_t     op_errno  = 0; + +        DECLARE_OLD_FS_ID_VAR; + +        VALIDATE_OR_GOTO (frame, out); +        VALIDATE_OR_GOTO (this, out); +        VALIDATE_OR_GOTO (loc, out); + +        SET_FS_ID (frame->root->uid, frame->root->gid); +        MAKE_REAL_PATH (real_path, this, loc->path); + +        op_ret = lstat (real_path, &buf); + +        if (op_ret == -1) { +                op_errno = errno; +                gf_log (this->name, GF_LOG_WARNING, +                        "lstat on %s: %s", loc->path, strerror (op_errno)); +                goto out; +        } + +        op_ret = 0; + + out: +        SET_TO_OLD_FS_ID(); +        frame->root->rsp_refs = NULL; +        STACK_UNWIND (frame, op_ret, op_errno, &buf); + +        return 0; +} + +int32_t +posix_opendir (call_frame_t *frame, xlator_t *this, +               loc_t *loc, fd_t *fd) +{ +        char *            real_path = NULL; + 
       int32_t           op_ret    = -1; +        int32_t           op_errno  = 0; +        DIR *             dir       = NULL; +        struct posix_fd * pfd       = NULL; + +        DECLARE_OLD_FS_ID_VAR; + +        VALIDATE_OR_GOTO (frame, out); +        VALIDATE_OR_GOTO (this, out); +        VALIDATE_OR_GOTO (loc, out); +        VALIDATE_OR_GOTO (loc->path, out); +        VALIDATE_OR_GOTO (fd, out); + +        SET_FS_ID (frame->root->uid, frame->root->gid); +        MAKE_REAL_PATH (real_path, this, loc->path); + +        dir = opendir (real_path); + +        if (dir == NULL) { +                op_errno = errno; +                gf_log (this->name, GF_LOG_ERROR, +                        "opendir failed on %s (%s)", +			loc->path, strerror (op_errno)); +                goto out; +        } + +        op_ret = dirfd (dir); +	if (op_ret < 0) { +                op_errno = errno; +                gf_log (this->name, GF_LOG_ERROR, +                        "dirfd() failed on %s (%s)", +			loc->path, strerror (op_errno)); +		goto out; +	} + +        pfd = CALLOC (1, sizeof (*fd)); +        if (!pfd) { +                op_errno = errno; +                gf_log (this->name, GF_LOG_ERROR, +                        "out of memory :("); +                goto out; +        } + +        pfd->dir = dir; +        pfd->fd = dirfd (dir); +        pfd->path = strdup (real_path); +        if (!pfd->path) { +                gf_log (this->name, GF_LOG_ERROR, +                        "out of memory :("); +                goto out; +        } + +	fd_ctx_set (fd, this, (uint64_t)(long)pfd); + +        frame->root->rsp_refs = NULL; + +        op_ret = 0; + + out: +        if (op_ret == -1) { +                if (dir) { +                        closedir (dir); +                        dir = NULL; +                } +                if (pfd) { +                        if (pfd->path) +                                FREE (pfd->path); +                        FREE (pfd); +                      
  pfd = NULL; +                } +        } + +        SET_TO_OLD_FS_ID (); +        STACK_UNWIND (frame, op_ret, op_errno, fd); +        return 0; +} + + +int32_t +posix_getdents (call_frame_t *frame, xlator_t *this, +                fd_t *fd, size_t size, off_t off, int32_t flag) +{ +        int32_t           op_ret         = -1; +        int32_t           op_errno       = 0; +        char *            real_path      = NULL; +        dir_entry_t       entries        = {0, }; +        dir_entry_t *     tmp            = NULL; +        DIR *             dir            = NULL; +        struct dirent *   dirent         = NULL; +        int               real_path_len  = -1; +        int               entry_path_len = -1; +        char *            entry_path     = NULL; +        int               count          = 0; +        struct posix_fd * pfd            = NULL; +	uint64_t          tmp_pfd        = 0; +        struct stat       buf            = {0,}; +        int               ret            = -1; +        char              tmp_real_path[ZR_PATH_MAX]; +        char              linkpath[ZR_PATH_MAX]; + +        DECLARE_OLD_FS_ID_VAR ; + +        VALIDATE_OR_GOTO (frame, out); +        VALIDATE_OR_GOTO (this, out); +        VALIDATE_OR_GOTO (fd, out); + +        SET_FS_ID (frame->root->uid, frame->root->gid); + +        ret = fd_ctx_get (fd, this, &tmp_pfd); +        if (ret < 0) { +                op_errno = -ret; +                gf_log (this->name, GF_LOG_ERROR, +                        "fd %p does not have context in %s", +                        fd, this->name); +                goto out; +        } +	pfd = (struct posix_fd *)(long)tmp_pfd; +        if (!pfd->path) { +                op_errno = EBADFD; +                gf_log (this->name, GF_LOG_ERROR, +                        "pfd does not have path set (possibly file " +			"fd, fd=%p)", fd); +                goto out; +        } + +        real_path     = pfd->path; +        real_path_len = strlen 
(real_path); + +        entry_path_len = real_path_len + NAME_MAX; +        entry_path     = CALLOC (1, entry_path_len); + +        if (!entry_path) { +                op_errno = errno; +                gf_log (this->name, GF_LOG_ERROR, +                        "out of memory :("); +                goto out; +        } + +        strncpy (entry_path, real_path, entry_path_len); +        entry_path[real_path_len] = '/'; + +        dir = pfd->dir; + +        if (!dir) { +                op_errno = EBADFD; +                gf_log (this->name, GF_LOG_ERROR, +                        "pfd does not have dir set (possibly file fd, " +			"fd=%p, path=`%s'", +                        fd, real_path); +                goto out; +        } + +        /* TODO: check for all the type of flag, and behave appropriately */ + +        while ((dirent = readdir (dir))) { +                if (!dirent) +                        break; + +                /* This helps in self-heal, when only directories +                   needs to be replicated */ + +                /* This is to reduce the network traffic, in case only +                   directory is needed from posix */ + +                strncpy (tmp_real_path, real_path, ZR_PATH_MAX); +                strncat (tmp_real_path, "/", +			 ZR_PATH_MAX - strlen (tmp_real_path)); + +                strncat (tmp_real_path, dirent->d_name, +                         ZR_PATH_MAX - strlen (tmp_real_path)); +                ret = lstat (tmp_real_path, &buf); + +                if ((flag == GF_GET_DIR_ONLY) +                    && (ret != -1 && !S_ISDIR(buf.st_mode))) { +                        continue; +                } + +                tmp = CALLOC (1, sizeof (*tmp)); + +                if (!tmp) { +                        op_errno = errno; +                        gf_log (this->name, GF_LOG_ERROR, +                                "out of memory :("); +                        goto out; +                } + +                tmp->name = strdup 
(dirent->d_name); +                if (!tmp->name) { +                        op_errno = errno; +                        gf_log (this->name, GF_LOG_ERROR, +                                "out of memory :("); +                        goto out; +                } + +                if (entry_path_len < +		    (real_path_len + 1 + strlen (tmp->name) + 1)) { +                        entry_path_len = (real_path_len + +					  strlen (tmp->name) + 1024); + +                        entry_path = realloc (entry_path, entry_path_len); +                } + +                strcpy (&entry_path[real_path_len+1], tmp->name); + +                ret = lstat (entry_path, &tmp->buf); + +                if (ret == -1) { +                        op_errno = errno; +                        gf_log (this->name, GF_LOG_ERROR, +				"lstat on %s failed: %s", +                                entry_path, strerror (op_errno)); +                        goto out; +                } + +                if (S_ISLNK(tmp->buf.st_mode)) { + +                        ret = readlink (entry_path, linkpath, ZR_PATH_MAX); +                        if (ret != -1) { +                                linkpath[ret] = '\0'; +                                tmp->link = strdup (linkpath); +                        } +                } else { +                        tmp->link = ""; +                } + +                count++; + +                tmp->next = entries.next; +                entries.next = tmp; + +                /* if size is 0, count can never be = size, so entire +		   dir is read */ +                if (count == size) +                        break; +        } + +        FREE (entry_path); + +        op_ret = 0; + + out: +        SET_TO_OLD_FS_ID (); + +        if (op_ret == -1) { +                if (entry_path) +                        FREE (entry_path); +        } + +        frame->root->rsp_refs = NULL; +        STACK_UNWIND (frame, op_ret, op_errno, &entries, count); + +        if (op_ret == 0) 
{ +                while (entries.next) { +                        tmp = entries.next; +                        entries.next = entries.next->next; +                        FREE (tmp->name); +                        FREE (tmp); +                } +        } + +        return 0; +} + + +int32_t +posix_releasedir (xlator_t *this, +		  fd_t *fd) +{ +        int32_t           op_ret   = -1; +        int32_t           op_errno = 0; +        struct posix_fd * pfd      = NULL; +	uint64_t          tmp_pfd  = 0; +        int               ret      = 0; + +        VALIDATE_OR_GOTO (this, out); +        VALIDATE_OR_GOTO (fd, out); + +        ret = fd_ctx_del (fd, this, &tmp_pfd); +        if (ret < 0) { +                op_errno = -ret; +                gf_log (this->name, GF_LOG_ERROR, +                        "pfd from fd=%p is NULL", fd); +                goto out; +        } + +	pfd = (struct posix_fd *)(long)tmp_pfd; +        if (!pfd->dir) { +                op_errno = EINVAL; +                gf_log (this->name, GF_LOG_ERROR, +                        "pfd->dir is NULL for fd=%p path=%s", +                        fd, pfd->path ? pfd->path : "<NULL>"); +                goto out; +        } + +        ret = closedir (pfd->dir); +        if (ret == -1) { +                op_errno = errno; +                gf_log (this->name, GF_LOG_ERROR, +                        "closedir on %p failed", pfd->dir); +                goto out; +        } +        pfd->dir = NULL; + +        if (!pfd->path) { +                op_errno = EBADFD; +                gf_log (this->name, GF_LOG_ERROR, +                        "pfd->path was NULL. 
fd=%p pfd=%p", +                        fd, pfd); +                goto out; +        } + +        op_ret = 0; + + out: +        if (pfd) { +                if (pfd->path) +                        FREE (pfd->path); +		FREE (pfd); +        } + +        return 0; +} + + +int32_t +posix_readlink (call_frame_t *frame, xlator_t *this, +                loc_t *loc, size_t size) +{ +        char *  dest      = NULL; +        int32_t op_ret    = -1; +        int32_t op_errno  = 0; +        char *  real_path = NULL; + +        DECLARE_OLD_FS_ID_VAR; + +	VALIDATE_OR_GOTO (frame, out); + +        SET_FS_ID (frame->root->uid, frame->root->gid); + +        dest = alloca (size + 1); + +        MAKE_REAL_PATH (real_path, this, loc->path); + +        op_ret = readlink (real_path, dest, size); +        if (op_ret == -1) { +                op_errno = errno; +                gf_log (this->name, GF_LOG_WARNING, +                        "readlink on %s: %s", loc->path, strerror (op_errno)); +                goto out; +        } + +        dest[op_ret] = 0; + + out: +        SET_TO_OLD_FS_ID (); +        frame->root->rsp_refs = NULL; + +        STACK_UNWIND (frame, op_ret, op_errno, dest); + +        return 0; +} + +int32_t +posix_mknod (call_frame_t *frame, xlator_t *this, +             loc_t *loc, mode_t mode, dev_t dev) +{ +	int         tmp_fd    = 0; +        int32_t     op_ret    = -1; +        int32_t     op_errno  = 0; +        char *      real_path = 0; +        struct stat stbuf     = { 0, }; + +        DECLARE_OLD_FS_ID_VAR; + +        VALIDATE_OR_GOTO (frame, out); +        VALIDATE_OR_GOTO (this, out); +        VALIDATE_OR_GOTO (loc, out); + +        SET_FS_ID (frame->root->uid, frame->root->gid); +        MAKE_REAL_PATH (real_path, this, loc->path); + +        op_ret = mknod (real_path, mode, dev); + +        if (op_ret == -1) { +                op_errno = errno; +		if ((op_errno == EINVAL) && S_ISREG (mode)) { +			/* Over Darwin, mknod with (S_IFREG|mode) +			   doesn't 
work */ +			tmp_fd = creat (real_path, mode); +			if (tmp_fd == -1) +				goto out; +			close (tmp_fd); +		} else { + +			gf_log (this->name, GF_LOG_ERROR, +				"mknod on %s: %s", loc->path, +				strerror (op_errno)); +			goto out; +		} +        } + +#ifndef HAVE_SET_FSID +        op_ret = lchown (real_path, frame->root->uid, frame->root->gid); +        if (op_ret == -1) { +                op_errno = errno; +                gf_log (this->name, GF_LOG_ERROR, +                        "lchown on %s: %s", loc->path, strerror (op_errno)); +                goto out; +        } +#endif + +        op_ret = lstat (real_path, &stbuf); + +        if (op_ret == -1) { +                op_errno = errno; +                gf_log (this->name, GF_LOG_ERROR, +                        "mknod on %s: %s", loc->path, strerror (op_errno)); +                goto out; +        } + +        op_ret = 0; + + out: +        SET_TO_OLD_FS_ID (); + +        frame->root->rsp_refs = NULL; +        STACK_UNWIND (frame, op_ret, op_errno, loc->inode, &stbuf); + +        return 0; +} + +int32_t +posix_mkdir (call_frame_t *frame, xlator_t *this, +             loc_t *loc, mode_t mode) +{ +        int32_t     op_ret    = -1; +        int32_t     op_errno  = 0; +        char *      real_path = NULL; +        struct stat stbuf     = {0, }; + +        DECLARE_OLD_FS_ID_VAR; + +        VALIDATE_OR_GOTO (frame, out); +        VALIDATE_OR_GOTO (this, out); +        VALIDATE_OR_GOTO (loc, out); + +        SET_FS_ID (frame->root->uid, frame->root->gid); +        MAKE_REAL_PATH (real_path, this, loc->path); + +        op_ret = mkdir (real_path, mode); +        if (op_ret == -1) { +                op_errno = errno; +                gf_log (this->name, GF_LOG_WARNING, +                        "mkdir of %s: %s", loc->path, strerror (op_errno)); +                goto out; +        } + +#ifndef HAVE_SET_FSID +        op_ret = chown (real_path, frame->root->uid, frame->root->gid); +        if (op_ret == -1) { +             
   op_errno = errno; +                gf_log (this->name, GF_LOG_ERROR, +                        "chown on %s: %s", loc->path, strerror (op_errno)); +                goto out; +        } +#endif + +        op_ret = lstat (real_path, &stbuf); +        if (op_ret == -1) { +                op_errno = errno; +                gf_log (this->name, GF_LOG_ERROR, +                        "lstat on %s: %s", loc->path, strerror (op_errno)); +                goto out; +        } + +        op_ret = 0; + + out: +        SET_TO_OLD_FS_ID (); + +        frame->root->rsp_refs = NULL; +        STACK_UNWIND (frame, op_ret, op_errno, loc->inode, &stbuf); + +        return 0; +} + + +int32_t +posix_unlink (call_frame_t *frame, xlator_t *this, +              loc_t *loc) +{ +        int32_t op_ret    = -1; +        int32_t op_errno  = 0; +        char *  real_path = NULL; + +	xattr_cache_handle_t handle = {{0,}, 0}; + +        DECLARE_OLD_FS_ID_VAR; + +        VALIDATE_OR_GOTO (frame, out); +        VALIDATE_OR_GOTO (this, out); +        VALIDATE_OR_GOTO (loc, out); + +        SET_FS_ID (frame->root->uid, frame->root->gid); +        MAKE_REAL_PATH (real_path, this, loc->path); + +	loc_copy (&handle.loc, loc); +	{ +		posix_xattr_cache_flush (this, &handle); +	} +	loc_wipe (&handle.loc); + +        op_ret = unlink (real_path); +        if (op_ret == -1) { +                op_errno = errno; +                gf_log (this->name, GF_LOG_WARNING, +                        "unlink of %s: %s", loc->path, strerror (op_errno)); +                goto out; +        } + +        op_ret = 0; + + out: +        SET_TO_OLD_FS_ID (); +        frame->root->rsp_refs = NULL; +        STACK_UNWIND (frame, op_ret, op_errno); + +        return 0; +} + +int32_t +posix_rmdir (call_frame_t *frame, xlator_t *this, +             loc_t *loc) +{ +        int32_t op_ret    = -1; +        int32_t op_errno  = 0; +        char *  real_path = 0; + +	xattr_cache_handle_t handle = {{0,}, 0}; + +        DECLARE_OLD_FS_ID_VAR; 
+ +        VALIDATE_OR_GOTO (frame, out); +        VALIDATE_OR_GOTO (this, out); +        VALIDATE_OR_GOTO (loc, out); + +        SET_FS_ID (frame->root->uid, frame->root->gid); +        MAKE_REAL_PATH (real_path, this, loc->path); + +	loc_copy (&handle.loc, loc); +	{ +		posix_xattr_cache_flush (this, &handle); +	} +	loc_wipe (&handle.loc); + +        op_ret = rmdir (real_path); +        op_errno = errno; + +	if (op_errno == EEXIST) +		/* Solaris sets errno = EEXIST instead of ENOTEMPTY */ +		op_errno = ENOTEMPTY; + +        if (op_ret == -1 && op_errno != ENOTEMPTY) { +                gf_log (this->name, GF_LOG_WARNING, +                        "rmdir of %s: %s", loc->path, strerror (op_errno)); +                goto out; +        } + + out: +        SET_TO_OLD_FS_ID (); +        frame->root->rsp_refs = NULL; +        STACK_UNWIND (frame, op_ret, op_errno); + +        return 0; +} + +int32_t +posix_symlink (call_frame_t *frame, xlator_t *this, +               const char *linkname, loc_t *loc) +{ +        int32_t     op_ret    = -1; +        int32_t     op_errno  = 0; +        char *      real_path = 0; +        struct stat stbuf     = { 0, }; + +        DECLARE_OLD_FS_ID_VAR; + +        VALIDATE_OR_GOTO (frame, out); +        VALIDATE_OR_GOTO (this, out); +        VALIDATE_OR_GOTO (linkname, out); +        VALIDATE_OR_GOTO (loc, out); + +        SET_FS_ID (frame->root->uid, frame->root->gid); +        MAKE_REAL_PATH (real_path, this, loc->path); + +        op_ret = symlink (linkname, real_path); + +        if (op_ret == -1) { +                op_errno = errno; +                gf_log (this->name, GF_LOG_WARNING, +                        "symlink of %s --> %s: %s", +                        loc->path, linkname, strerror (op_errno)); +                goto out; +        } + +#ifndef HAVE_SET_FSID +        op_ret = lchown (real_path, frame->root->uid, frame->root->gid); +        if (op_ret == -1) { +                op_errno = errno; +                gf_log (this->name, 
GF_LOG_WARNING, +                        "lchown failed on %s: %s", +                        loc->path, strerror (op_errno)); +                goto out; +        } +#endif +        op_ret = lstat (real_path, &stbuf); +        if (op_ret == -1) { +                op_errno = errno; +                gf_log (this->name, GF_LOG_WARNING, +                        "lstat failed on %s: %s", +                        loc->path, strerror (op_errno)); +                goto out; +        } + +        op_ret = 0; + + out: +        SET_TO_OLD_FS_ID (); + +        frame->root->rsp_refs = NULL; +        STACK_UNWIND (frame, op_ret, op_errno, loc->inode, &stbuf); + +        return 0; +} + + +int +posix_rename (call_frame_t *frame, xlator_t *this, +              loc_t *oldloc, loc_t *newloc) +{ +        int32_t     op_ret       = -1; +        int32_t     op_errno     = 0; +        char *      real_oldpath = NULL; +        char *      real_newpath = NULL; +        struct stat stbuf        = {0, }; + +	xattr_cache_handle_t handle = {{0,}, 0}; + +        DECLARE_OLD_FS_ID_VAR; + +        VALIDATE_OR_GOTO (frame, out); +        VALIDATE_OR_GOTO (this, out); +        VALIDATE_OR_GOTO (oldloc, out); +        VALIDATE_OR_GOTO (newloc, out); + +        SET_FS_ID (frame->root->uid, frame->root->gid); +        MAKE_REAL_PATH (real_oldpath, this, oldloc->path); +        MAKE_REAL_PATH (real_newpath, this, newloc->path); + +	loc_copy (&handle.loc, oldloc); +	{ +		posix_xattr_cache_flush (this, &handle); +	} +	loc_wipe (&handle.loc); + +        op_ret = rename (real_oldpath, real_newpath); +        if (op_ret == -1) { +                op_errno = errno; +                gf_log (this->name, +			(op_errno == ENOTEMPTY ? 
GF_LOG_DEBUG : GF_LOG_ERROR), +                        "rename of %s to %s failed: %s", +                        oldloc->path, newloc->path, strerror (op_errno)); +                goto out; +        } + +        op_ret = lstat (real_newpath, &stbuf); +        if (op_ret == -1) { +                op_errno = errno; +                gf_log (this->name, GF_LOG_WARNING, +                        "lstat on %s failed: %s", +                        real_newpath, strerror (op_errno)); +                goto out; +        } + +        op_ret = 0; + + out: +        SET_TO_OLD_FS_ID (); + +        frame->root->rsp_refs = NULL; +        STACK_UNWIND (frame, op_ret, op_errno, &stbuf); + +        return 0; +} + + +int +posix_link (call_frame_t *frame, xlator_t *this, +            loc_t *oldloc, loc_t *newloc) +{ +        int32_t     op_ret       = -1; +        int32_t     op_errno     = 0; +        char *      real_oldpath = 0; +        char *      real_newpath = 0; +        struct stat stbuf        = {0, }; + + +        DECLARE_OLD_FS_ID_VAR; + +        VALIDATE_OR_GOTO (frame, out); +        VALIDATE_OR_GOTO (this, out); +        VALIDATE_OR_GOTO (oldloc, out); +        VALIDATE_OR_GOTO (newloc, out); + +        SET_FS_ID (frame->root->uid, frame->root->gid); +        MAKE_REAL_PATH (real_oldpath, this, oldloc->path); +        MAKE_REAL_PATH (real_newpath, this, newloc->path); + +        op_ret = link (real_oldpath, real_newpath); +        if (op_ret == -1) { +                op_errno = errno; +                gf_log (this->name, GF_LOG_WARNING, +                        "link %s to %s failed: %s", +                        oldloc->path, newloc->path, strerror (op_errno)); +                goto out; +        } + +        op_ret = lstat (real_newpath, &stbuf); +        if (op_ret == -1) { +                op_errno = errno; +                gf_log (this->name, GF_LOG_WARNING, +                        "lstat on %s failed: %s", +                        real_newpath, strerror 
(op_errno)); +                goto out; +        } + +        op_ret = 0; + + out: +        SET_TO_OLD_FS_ID (); + +        frame->root->rsp_refs = NULL; +        STACK_UNWIND (frame, op_ret, op_errno, oldloc->inode, &stbuf); + +        return 0; +} + + +int +posix_chmod (call_frame_t *frame, xlator_t *this, +             loc_t *loc, mode_t mode) +{ +        int32_t     op_ret    = -1; +        int32_t     op_errno  = 0; +        char *      real_path = 0; +        struct stat stbuf     = {0,}; + +        DECLARE_OLD_FS_ID_VAR; + +        VALIDATE_OR_GOTO (frame, out); +        VALIDATE_OR_GOTO (this, out); +        VALIDATE_OR_GOTO (loc, out); + +        SET_FS_ID (frame->root->uid, frame->root->gid); +        MAKE_REAL_PATH (real_path, this, loc->path); + +        if (S_ISLNK (loc->inode->st_mode)) { +                /* chmod on a link should always succeed */ +		op_ret = lstat (real_path, &stbuf); +		if (op_ret == -1) { +			op_errno = errno; +			gf_log (this->name, GF_LOG_ERROR, +				"lstat on %s failed: %s", +				real_path, strerror (op_errno)); +			goto out; +		} +		op_ret = 0; +                goto out; +        } + +        op_ret = lchmod (real_path, mode); +        if ((op_ret == -1) && (errno == ENOSYS)) { +                gf_log (this->name, GF_LOG_DEBUG, +                        "lchmod not implemented, falling back to chmod"); +                op_ret = chmod (real_path, mode); +        } + +        if (op_ret == -1) { +                op_errno = errno; +                gf_log (this->name, GF_LOG_ERROR, "chmod on %s failed: %s", +                        loc->path, strerror (op_errno)); +                goto out; +        } + +        op_ret = lstat (real_path, &stbuf); +        if (op_ret == -1) { +                op_errno = errno; +                gf_log (this->name, GF_LOG_ERROR, "lstat on %s failed: %s", +                        real_path, strerror (op_errno)); +                goto out; +        } + +        op_ret = 0; + + out: +        
SET_TO_OLD_FS_ID (); + +        frame->root->rsp_refs = NULL; +        STACK_UNWIND (frame, op_ret, op_errno, &stbuf); + +        return 0; +} + + +int +posix_chown (call_frame_t *frame, xlator_t *this, +             loc_t *loc, uid_t uid, gid_t gid) +{ +        int32_t     op_ret     = -1; +        int32_t     op_errno   = 0; +        char *      real_path  = 0; +        struct stat stbuf      = {0,}; + +        DECLARE_OLD_FS_ID_VAR; + +        VALIDATE_OR_GOTO (frame, out); +        VALIDATE_OR_GOTO (this, out); +        VALIDATE_OR_GOTO (loc, out); + +        SET_FS_ID (frame->root->uid, frame->root->gid); +        MAKE_REAL_PATH (real_path, this, loc->path); + +        op_ret = lchown (real_path, uid, gid); +        if (op_ret == -1) { +                op_errno = errno; +                gf_log (this->name, GF_LOG_WARNING, +			"lchown on %s failed: %s", +                        loc->path, strerror (op_errno)); +                goto out; +        } + +        op_ret = lstat (real_path, &stbuf); +        if (op_ret == -1) { +                op_errno = errno; +                gf_log (this->name, GF_LOG_WARNING, +			"lstat on %s failed: %s", +                        real_path, strerror (op_errno)); +                goto out; +        } + +        op_ret = 0; + + out: +        SET_TO_OLD_FS_ID (); +        frame->root->rsp_refs = NULL; +        STACK_UNWIND (frame, op_ret, op_errno, &stbuf); + +        return 0; +} + + +int32_t +posix_truncate (call_frame_t *frame, +                xlator_t *this, +                loc_t *loc, +                off_t offset) +{ +        int32_t     op_ret    = -1; +        int32_t     op_errno  = 0; +        char *      real_path = 0; +        struct stat stbuf     = {0,}; + +        DECLARE_OLD_FS_ID_VAR; + +        VALIDATE_OR_GOTO (frame, out); +        VALIDATE_OR_GOTO (this, out); +        VALIDATE_OR_GOTO (loc, out); + +        SET_FS_ID (frame->root->uid, frame->root->gid); +        MAKE_REAL_PATH (real_path, this, loc->path); 
+ +        op_ret = truncate (real_path, offset); +        if (op_ret == -1) { +                op_errno = errno; +                gf_log (this->name, GF_LOG_WARNING, +                        "truncate on %s failed: %s", +                        loc->path, strerror (op_errno)); +                goto out; +        } + +        op_ret = lstat (real_path, &stbuf); +        if (op_ret == -1) { +                op_errno = errno; +                gf_log (this->name, GF_LOG_WARNING, "lstat on %s failed: %s", +                        real_path, strerror (op_errno)); +                goto out; +        } + +        op_ret = 0; + + out: +        SET_TO_OLD_FS_ID (); + +        frame->root->rsp_refs = NULL; +        STACK_UNWIND (frame, op_ret, op_errno, &stbuf); + +        return 0; +} + + +int +posix_utimens (call_frame_t *frame, xlator_t *this, +               loc_t *loc, struct timespec ts[2]) +{ +        int32_t        op_ret    = -1; +        int32_t        op_errno  = 0; +        char *         real_path = 0; +        struct stat    stbuf     = {0,}; +        struct timeval tv[2]     = {{0,},{0,}}; + +        DECLARE_OLD_FS_ID_VAR; + +        VALIDATE_OR_GOTO (frame, out); +        VALIDATE_OR_GOTO (this, out); +        VALIDATE_OR_GOTO (loc, out); + +        SET_FS_ID (frame->root->uid, frame->root->gid); +        MAKE_REAL_PATH (real_path, this, loc->path); + +        tv[0].tv_sec  = ts[0].tv_sec; +        tv[0].tv_usec = ts[0].tv_nsec / 1000; +        tv[1].tv_sec  = ts[1].tv_sec; +        tv[1].tv_usec = ts[1].tv_nsec / 1000; + +        op_ret = lutimes (real_path, tv); +        if ((op_ret == -1) && (errno == ENOSYS)) { +                op_ret = utimes (real_path, tv); +        } + +        if (op_ret == -1) { +                op_errno = errno; +                gf_log (this->name, GF_LOG_WARNING, +                        "utimes on %s: %s", real_path, strerror (op_errno)); +                goto out; +        } + +        op_ret = lstat (real_path, &stbuf); +       
 if (op_ret == -1) { +                op_errno = errno; +                gf_log (this->name, GF_LOG_WARNING, +                        "lstat on %s: %s", real_path, strerror (op_errno)); +                goto out; +        } + +        op_ret = 0; + + out: +        SET_TO_OLD_FS_ID (); + +        frame->root->rsp_refs = NULL; +        STACK_UNWIND (frame, op_ret, op_errno, &stbuf); + +        return 0; +} + +int32_t +posix_create (call_frame_t *frame, xlator_t *this, +              loc_t *loc, int32_t flags, mode_t mode, +              fd_t *fd) +{ +        int32_t                op_ret    = -1; +        int32_t                op_errno  = 0; +        int32_t                _fd       = -1; +        int                    _flags    = 0; +        char *                 real_path = NULL; +        struct stat            stbuf     = {0, }; +        struct posix_fd *      pfd       = NULL; +        struct posix_private * priv      = NULL; + +        DECLARE_OLD_FS_ID_VAR; + +        VALIDATE_OR_GOTO (frame, out); +        VALIDATE_OR_GOTO (this, out); +        VALIDATE_OR_GOTO (this->private, out); +        VALIDATE_OR_GOTO (loc, out); +        VALIDATE_OR_GOTO (fd, out); + +        priv = this->private; + +        SET_FS_ID (frame->root->uid, frame->root->gid); +        MAKE_REAL_PATH (real_path, this, loc->path); + +        if (!flags) { +                _flags = O_CREAT | O_RDWR | O_EXCL; +        } +        else { +                _flags = flags | O_CREAT; +        } + +        if (priv->o_direct) +                flags |= O_DIRECT; + +        _fd = open (real_path, _flags, mode); + +        if (_fd == -1) { +                op_errno = errno; +                gf_log (this->name, GF_LOG_WARNING, +                        "open on %s: %s", loc->path, strerror (op_errno)); +                goto out; +        } + +#ifndef HAVE_SET_FSID +        op_ret = chown (real_path, frame->root->uid, frame->root->gid); +        if (op_ret == -1) { +                op_errno = errno; +   
             gf_log (this->name, GF_LOG_WARNING, +                        "chown on %s failed: %s", +			real_path, strerror (op_errno)); +                goto out; +        } +#endif + +        op_ret = fstat (_fd, &stbuf); +        if (op_ret == -1) { +                op_errno = errno; +                gf_log (this->name, GF_LOG_WARNING, +                        "fstat on %d failed: %s", _fd, strerror (op_errno)); +                goto out; +        } + +	op_ret = -1; +        pfd = CALLOC (1, sizeof (*pfd)); + +        if (!pfd) { +                op_errno = errno; +                gf_log (this->name, GF_LOG_ERROR, +                        "out of memory :("); +                close (_fd); +                goto out; +        } + +        pfd->flags = flags; +        pfd->fd    = _fd; + +	fd_ctx_set (fd, this, (uint64_t)(long)pfd); + +        ((struct posix_private *)this->private)->stats.nr_files++; + +        op_ret = 0; + + out: +        SET_TO_OLD_FS_ID (); + +        frame->root->rsp_refs = NULL; +        STACK_UNWIND (frame, op_ret, op_errno, fd, loc->inode, &stbuf); + +        return 0; +} + +int32_t +posix_open (call_frame_t *frame, xlator_t *this, +            loc_t *loc, int32_t flags, fd_t *fd) +{ +        int32_t                op_ret    = -1; +        int32_t                op_errno  = 0; +        char *                 real_path = NULL; +        int32_t                _fd       = -1; +        struct posix_fd *      pfd       = NULL; +        struct posix_private * priv      = NULL; + +        DECLARE_OLD_FS_ID_VAR; + +        VALIDATE_OR_GOTO (frame, out); +        VALIDATE_OR_GOTO (this, out); +        VALIDATE_OR_GOTO (this->private, out); +        VALIDATE_OR_GOTO (loc, out); +        VALIDATE_OR_GOTO (fd, out); + +        priv = this->private; + +        SET_FS_ID (frame->root->uid, frame->root->gid); +        MAKE_REAL_PATH (real_path, this, loc->path); + +        if (priv->o_direct) +                flags |= O_DIRECT; + +        _fd = open 
(real_path, flags, 0); +        if (_fd == -1) { +                op_errno = errno; +                gf_log (this->name, GF_LOG_WARNING, +                        "open on %s: %s", real_path, strerror (op_errno)); +                goto out; +        } + +        pfd = CALLOC (1, sizeof (*pfd)); + +        if (!pfd) { +                op_errno = errno; +                gf_log (this->name, GF_LOG_ERROR, +                        "out of memory :("); +                goto out; +        } + +        pfd->flags = flags; +        pfd->fd    = _fd; + +	fd_ctx_set (fd, this, (uint64_t)(long)pfd); + +        ((struct posix_private *)this->private)->stats.nr_files++; + +#ifndef HAVE_SET_FSID +        if (flags & O_CREAT) { +                op_ret = chown (real_path, frame->root->uid, frame->root->gid); +                if (op_ret == -1) { +                        op_errno = errno; +                        gf_log (this->name, GF_LOG_WARNING, +                                "chown on %s failed: %s", +				real_path, strerror (op_errno)); +                        goto out; +                } +        } +#endif + +        op_ret = 0; + + out: +        if (op_ret == -1) { +                if (_fd != -1) { +                        close (_fd); +                        _fd = -1; +                } +        } + +        SET_TO_OLD_FS_ID (); + +        frame->root->rsp_refs = NULL; +        STACK_UNWIND (frame, op_ret, op_errno, fd); + +        return 0; +} + +#define ALIGN_BUF(ptr,bound) ((void *)((unsigned long)(ptr + bound - 1) & \ +                                       (unsigned long)(~(bound - 1)))) + +int +posix_readv (call_frame_t *frame, xlator_t *this, +             fd_t *fd, size_t size, off_t offset) +{ +	uint64_t               tmp_pfd    = 0; +        int32_t                op_ret     = -1; +        int32_t                op_errno   = 0; +        char *                 buf        = NULL; +        char *                 alloc_buf  = NULL; +        int                    _fd 
       = -1; +        struct posix_private * priv       = NULL; +        dict_t *               reply_dict = NULL; +        struct iovec           vec        = {0,}; +        struct posix_fd *      pfd        = NULL; +        struct stat            stbuf      = {0,}; +        int                    align      = 1; +        int                    ret        = -1; +        int                    dict_ret   = -1; + +        VALIDATE_OR_GOTO (frame, out); +        VALIDATE_OR_GOTO (this, out); +        VALIDATE_OR_GOTO (fd, out); +        VALIDATE_OR_GOTO (this->private, out); + +        priv = this->private; + +        ret = fd_ctx_get (fd, this, &tmp_pfd); +        if (ret < 0) { +                op_errno = -ret; +                gf_log (this->name, GF_LOG_ERROR, +			"pfd is NULL from fd=%p", fd); +                goto out; +        } +	pfd = (struct posix_fd *)(long)tmp_pfd; + +        if (!size) { +                op_errno = EINVAL; +                gf_log (this->name, GF_LOG_ERROR, "size == 0"); +                goto out; +        } + +        if (pfd->flags & O_DIRECT) { +                align = 4096;    /* align to page boundary */ +        } + +        alloc_buf = MALLOC (1 * (size + align)); +        if (!alloc_buf) { +                op_errno = errno; +                gf_log (this->name, GF_LOG_ERROR, +                        "out of memory :("); +                goto out; +        } + +        /* page aligned buffer */ +        buf = ALIGN_BUF (alloc_buf, align); + +        _fd = pfd->fd; + +        op_ret = lseek (_fd, offset, SEEK_SET); +        if (op_ret == -1) { +                op_errno = errno; +                gf_log (this->name, GF_LOG_ERROR, +			"lseek(%"PRId64") failed: %s", +                        offset, strerror (op_errno)); +                goto out; +        } + +        op_ret = read (_fd, buf, size); +        if (op_ret == -1) { +                op_errno = errno; +                gf_log (this->name, GF_LOG_WARNING, +                        
"read failed: %s", strerror (op_errno)); +                goto out; +        } + +        priv->read_value    += size; +        priv->interval_read += size; + +        vec.iov_base = buf; +        vec.iov_len  = op_ret; + +	op_ret = -1; +        reply_dict = get_new_dict (); +        if (!reply_dict) { +                gf_log (this->name, GF_LOG_ERROR, +                        "out of memory :("); +                goto out; +        } +        dict_ref (reply_dict); + +        dict_ret = dict_set_ptr (reply_dict, NULL, alloc_buf); +        if (dict_ret < 0) { +                op_errno = -dict_ret; +                gf_log (this->name, GF_LOG_ERROR, "could not dict_set: (%s)", +                        strerror (op_errno)); +                goto out; +        } + +        /* +         *  readv successful, and we need to get the stat of the file +         *  we read from +         */ + +        op_ret = fstat (_fd, &stbuf); +        if (op_ret == -1) { +                op_errno = errno; +                gf_log (this->name, GF_LOG_WARNING, +                        "fstat failed: %s", strerror (op_errno)); +                goto out; +        } +	 +	op_ret = 0; + out: +        if (op_ret == -1) { +                frame->root->rsp_refs = NULL; + +                if (reply_dict) { +                        dict_unref (reply_dict); +                        reply_dict = NULL; +                } + +                if ((alloc_buf != NULL) && (dict_ret != -1)) +                        FREE (alloc_buf); +        } + +        if (reply_dict) +                frame->root->rsp_refs = reply_dict; + +        STACK_UNWIND (frame, op_ret, op_errno, &vec, 1, &stbuf); + +        if (reply_dict) +                dict_unref (reply_dict); + +        return 0; +} + + +int32_t +posix_writev (call_frame_t *frame, xlator_t *this, +              fd_t *fd, struct iovec *vector, int32_t count, off_t offset) +{ +        int32_t                op_ret   = -1; +        int32_t                op_errno = 
0; +        int                    _fd      = -1; +        struct posix_private * priv     = NULL; +        struct posix_fd *      pfd      = NULL; +        struct stat            stbuf    = {0,}; +        int                      ret      = -1; + +        int    idx          = 0; +        int    align        = 4096; +        int    max_buf_size = 0; +        int    retval       = 0; +        char * buf          = NULL; +        char * alloc_buf    = NULL; +	uint64_t  tmp_pfd   = 0; + +        VALIDATE_OR_GOTO (frame, out); +        VALIDATE_OR_GOTO (this, out); +        VALIDATE_OR_GOTO (fd, out); +        VALIDATE_OR_GOTO (vector, out); +        VALIDATE_OR_GOTO (this->private, out); + +        priv = this->private; + +        VALIDATE_OR_GOTO (priv, out); + +        ret = fd_ctx_get (fd, this, &tmp_pfd); +        if (ret < 0) { +                gf_log (this->name, GF_LOG_ERROR, +			"pfd is NULL from fd=%p", fd); +                op_errno = -ret; +                goto out; +        } +	pfd = (struct posix_fd *)(long)tmp_pfd; + +        _fd = pfd->fd; + +        op_ret = lseek (_fd, offset, SEEK_SET); + +        if (op_ret == -1) { +                op_errno = errno; +                gf_log (this->name, GF_LOG_ERROR, +			"lseek(%"PRId64") failed: %s", +                        offset, strerror (op_errno)); +                goto out; +        } + +        /* Check for the O_DIRECT flag during open() */ +        if (pfd->flags & O_DIRECT) { +                /* This is O_DIRECT'd file */ +		op_ret = -1; +                for (idx = 0; idx < count; idx++) { +                        if (max_buf_size < vector[idx].iov_len) +                                max_buf_size = vector[idx].iov_len; +                } + +                alloc_buf = MALLOC (1 * (max_buf_size + align)); +                if (!alloc_buf) { +                        op_errno = errno; +                        gf_log (this->name, GF_LOG_ERROR, +                                "out of memory :("); +         
               goto out; +                } + +                for (idx = 0; idx < count; idx++) { +                        /* page aligned buffer */ +                        buf = ALIGN_BUF (alloc_buf, align); + +                        memcpy (buf, vector[idx].iov_base, +				vector[idx].iov_len); + +                        /* not sure whether writev works on O_DIRECT'd fd */ +                        retval = write (_fd, buf, vector[idx].iov_len); + +                        if (retval == -1) { +                                if (op_ret == -1) { +                                        op_errno = errno; +                                        gf_log (this->name, GF_LOG_WARNING, +                                                "O_DIRECT enabled: %s", +						strerror (op_errno)); +                                        goto out; +                                } + +                                break; +                        } +			if (op_ret == -1) +				op_ret = 0; +                        op_ret += retval; +                } + +        } else /* if (O_DIRECT) */ { + +                /* This is not O_DIRECT'd fd */ +                op_ret = writev (_fd, vector, count); +                if (op_ret == -1) { +                        op_errno = errno; +                        gf_log (this->name, GF_LOG_WARNING, +				"writev failed: %s", +                                strerror (op_errno)); +                        goto out; +                } +        } + +        priv->write_value    += op_ret; +        priv->interval_write += op_ret; + +        if (op_ret >= 0) { +                /* wiretv successful, we also need to get the stat of +                 * the file we wrote to +                 */ +                ret = fstat (_fd, &stbuf); +                if (ret == -1) { +			op_ret = -1; +                        op_errno = errno; +                        gf_log (this->name, GF_LOG_ERROR, "fstat failed: %s", +                                strerror (op_errno)); 
+                        goto out; +                } +        } + + out: +        if (alloc_buf) { +                FREE (alloc_buf); +        } + +        frame->root->rsp_refs = NULL; +        STACK_UNWIND (frame, op_ret, op_errno, &stbuf); + +        return 0; +} + + +int32_t +posix_statfs (call_frame_t *frame, xlator_t *this, +              loc_t *loc) +{ +        char *                 real_path = NULL; +        int32_t                op_ret    = -1; +        int32_t                op_errno  = 0; +        struct statvfs         buf       = {0, }; +        struct posix_private * priv      = NULL; + +        VALIDATE_OR_GOTO (frame, out); +        VALIDATE_OR_GOTO (this, out); +        VALIDATE_OR_GOTO (loc, out); +        VALIDATE_OR_GOTO (this->private, out); + +        MAKE_REAL_PATH (real_path, this, loc->path); + +        priv = this->private; + +        op_ret = statvfs (real_path, &buf); + +        if (op_ret == -1) { +                op_errno = errno; +                gf_log (this->name, GF_LOG_ERROR, "statvfs failed: %s", +                        strerror (op_errno)); +                goto out; +        } + +        if (!priv->export_statfs) { +                buf.f_blocks = 0; +                buf.f_bfree  = 0; +                buf.f_bavail = 0; +                buf.f_files  = 0; +                buf.f_ffree  = 0; +                buf.f_favail = 0; +        } + +        op_ret = 0; + + out: +        frame->root->rsp_refs = NULL; +        STACK_UNWIND (frame, op_ret, op_errno, &buf); +        return 0; +} + + +int32_t +posix_flush (call_frame_t *frame, xlator_t *this, +             fd_t *fd) +{ +        int32_t           op_ret   = -1; +        int32_t           op_errno = 0; +        int               _fd      = -1; +        struct posix_fd * pfd      = NULL; +        int               ret      = -1; +	uint64_t          tmp_pfd  = 0; + +        VALIDATE_OR_GOTO (frame, out); +        VALIDATE_OR_GOTO (this, out); +        VALIDATE_OR_GOTO (fd, out); + 
+        ret = fd_ctx_get (fd, this, &tmp_pfd); +        if (ret < 0) { +                op_errno = -ret; +                gf_log (this->name, GF_LOG_ERROR, +                        "pfd is NULL on fd=%p", fd); +                goto out; +        } +	pfd = (struct posix_fd *)(long)tmp_pfd; + +        _fd = pfd->fd; + +        /* do nothing */ +	posix_xattr_cache_flush_all (this); + +        op_ret = 0; + + out: +        frame->root->rsp_refs = NULL; +        STACK_UNWIND (frame, op_ret, op_errno); + +        return 0; +} + + +int32_t +posix_release (xlator_t *this, +	       fd_t *fd) +{ +        int32_t                op_ret   = -1; +        int32_t                op_errno = 0; +        int                    _fd      = -1; +        struct posix_private * priv     = NULL; +        struct posix_fd *      pfd      = NULL; +        int                    ret      = -1; +	uint64_t               tmp_pfd  = 0; +	xattr_cache_handle_t   handle   = {{0,},0}; + +        VALIDATE_OR_GOTO (this, out); +        VALIDATE_OR_GOTO (fd, out); + +        priv = this->private; + +        priv->stats.nr_files--; + +        ret = fd_ctx_get (fd, this, &tmp_pfd); +        if (ret < 0) { +                op_errno = -ret; +                gf_log (this->name, GF_LOG_ERROR, +                        "pfd is NULL from fd=%p", fd); +                goto out; +        } +	pfd = (struct posix_fd *)(long)tmp_pfd; + +	handle.fd = fd; +	posix_xattr_cache_flush (this, &handle); + +        _fd = pfd->fd; + +	op_ret = close (_fd); +        if (op_ret == -1) { +                op_errno = errno; +                gf_log (this->name, GF_LOG_WARNING, +                        "close(): %s", strerror (op_errno)); +		goto out; +        } + +        if (pfd->dir) { +		op_ret = -1; +                op_errno = EBADF; +                gf_log (this->name, GF_LOG_ERROR, +                        "pfd->dir is %p (not NULL) for file fd=%p", +                        pfd->dir, fd); +                goto out; +        } 
+ +        op_ret = 0; + + out: +	if (pfd) +		FREE (pfd); + +        return 0; +} + + +int32_t +posix_fsync (call_frame_t *frame, xlator_t *this, +             fd_t *fd, int32_t datasync) +{ +        int32_t           op_ret   = -1; +        int32_t           op_errno = 0; +        int               _fd      = -1; +        struct posix_fd * pfd      = NULL; +        int               ret      = -1; +	uint64_t          tmp_pfd  = 0; + +        DECLARE_OLD_FS_ID_VAR; + +        VALIDATE_OR_GOTO (frame, out); +        VALIDATE_OR_GOTO (this, out); +        VALIDATE_OR_GOTO (fd, out); + +        SET_FS_ID (frame->root->uid, frame->root->gid); + +#ifdef GF_DARWIN_HOST_OS +        /* Always return success in case of fsync in MAC OS X */ +        op_ret = 0; +        goto out; +#endif + +        ret = fd_ctx_get (fd, this, &tmp_pfd); +        if (ret < 0) { +                op_errno = -ret; +                gf_log (this->name, GF_LOG_ERROR, "pfd not found in fd's ctx"); +                goto out; +        } +	pfd = (struct posix_fd *)(long)tmp_pfd; + +        _fd = pfd->fd; + +        if (datasync) { +                ; +#ifdef HAVE_FDATASYNC +                op_ret = fdatasync (_fd); +#endif +        } else { +                op_ret = fsync (_fd); +                if (op_ret == -1) { +                        op_errno = errno; +                        gf_log (this->name, GF_LOG_WARNING, "fsync: %s", +                                strerror (op_errno)); +                } +        } + +        op_ret = 0; + + out: +        SET_TO_OLD_FS_ID (); +        frame->root->rsp_refs = NULL; + +        STACK_UNWIND (frame, op_ret, op_errno); + +        return 0; +} + +static int gf_posix_xattr_enotsup_log; + +int +set_file_contents (xlator_t *this, char *real_path, +                   data_pair_t *trav, int flags) +{ +        char *      key                        = NULL; +        char        real_filepath[ZR_PATH_MAX] = {0,}; +        int32_t     file_fd                    = -1; +  
      int         op_ret                     = 0; +        int         ret                        = -1; + +        key = &(trav->key[15]); +        sprintf (real_filepath, "%s/%s", real_path, key); + +        if (flags & XATTR_REPLACE) { +                /* if file exists, replace it +                 * else, error out */ +                file_fd = open (real_filepath, O_TRUNC|O_WRONLY); + +                if (file_fd == -1) { +                        goto create; +                } + +                if (trav->value->len) { +                        ret = write (file_fd, trav->value->data, +				     trav->value->len); +                        if (ret == -1) { +                                op_ret = -errno; +                                gf_log (this->name, GF_LOG_ERROR, +					"write failed while doing setxattr " +					"for key %s on path %s: %s", +                                        key, real_filepath, strerror (errno)); +                                goto out; +                        } + +                        ret = close (file_fd); +                        if (ret == -1) { +                                op_ret = -errno; +                                gf_log (this->name, GF_LOG_ERROR, +                                        "close failed on %s: %s", +                                        real_filepath, strerror (errno)); +                                goto out; +                        } +                } + +        create: /* we know file doesn't exist, create it */ + +                file_fd = open (real_filepath, O_CREAT|O_WRONLY, 0644); + +                if (file_fd == -1) { +                        op_ret = -errno; +                        gf_log (this->name, GF_LOG_ERROR, +                                "failed to open file %s with O_CREAT: %s", +                                key, strerror (errno)); +                        goto out; +                } + +                ret = write (file_fd, trav->value->data, trav->value->len); + 
               if (ret == -1) { +                        op_ret = -errno; +                        gf_log (this->name, GF_LOG_ERROR, +                                "write failed on %s while setxattr with " +				"key %s: %s", +                                real_filepath, key, strerror (errno)); +                        goto out; +                } + +                ret = close (file_fd); +                if (ret == -1) { +                        op_ret = -errno; +                        gf_log (this->name, GF_LOG_ERROR, +                                "close failed on %s while setxattr with " +				"key %s: %s", +                                real_filepath, key, strerror (errno)); +                        goto out; +                } +        } + + out: +        return op_ret; +} + +int +handle_pair (xlator_t *this, char *real_path, +             data_pair_t *trav, int flags) +{ +        int sys_ret = -1; +        int ret     = 0; + +        if (ZR_FILE_CONTENT_REQUEST(trav->key)) { +                ret = set_file_contents (this, real_path, trav, flags); +        } else { +                sys_ret = lsetxattr (real_path, trav->key, trav->value->data, +                                     trav->value->len, flags); + +                if (sys_ret < 0) { +                        if (errno == ENOTSUP) { +                                GF_LOG_OCCASIONALLY(gf_posix_xattr_enotsup_log, +						    this->name,GF_LOG_WARNING, +						    "Extended attributes not " +						    "supported"); +                        } else if (errno == ENOENT) { +                                gf_log (this->name, GF_LOG_DEBUG, +                                        "setxattr on %s failed: %s", real_path, +                                        strerror (errno)); +                        } else { + +#ifdef GF_DARWIN_HOST_OS +				gf_log (this->name, +					((errno == EINVAL) ? 
+					 GF_LOG_DEBUG : GF_LOG_WARNING), +					"%s: key:%s error:%s", +					real_path, trav->key, +					strerror (errno)); +#else /* ! DARWIN */ +                                gf_log (this->name, GF_LOG_WARNING, +                                        "%s: key:%s error:%s", +                                        real_path, trav->key, +					strerror (errno)); +#endif /* DARWIN */ +                        } + +                        ret = -errno; +                        goto out; +                } +        } + out: +        return ret; +} + +int32_t +posix_setxattr (call_frame_t *frame, xlator_t *this, +                loc_t *loc, dict_t *dict, int flags) +{ +        int32_t       op_ret                  = -1; +        int32_t       op_errno                = 0; +        char *        real_path               = NULL; +        data_pair_t * trav                    = NULL; +        int           ret                     = -1; + +        DECLARE_OLD_FS_ID_VAR; +        SET_FS_ID (frame->root->uid, frame->root->gid); + +        VALIDATE_OR_GOTO (frame, out); +        VALIDATE_OR_GOTO (this, out); +        VALIDATE_OR_GOTO (loc, out); +        VALIDATE_OR_GOTO (dict, out); + +        MAKE_REAL_PATH (real_path, this, loc->path); + +        trav = dict->members_list; + +        while (trav) { +                ret = handle_pair (this, real_path, trav, flags); +                if (ret < 0) { +                        op_errno = -ret; +                        goto out; +                } +                trav = trav->next; +        } + +        op_ret = 0; + + out: +        SET_TO_OLD_FS_ID (); +        frame->root->rsp_refs = NULL; +        STACK_UNWIND (frame, op_ret, op_errno); + +        return 0; +} + +int +get_file_contents (xlator_t *this, char *real_path, +                   const char *name, char **contents) +{ +        char        real_filepath[ZR_PATH_MAX] = {0,}; +        char *      key                        = NULL; +        int32_t     file_fd                  
  = -1; +        struct stat stbuf                      = {0,}; +        int         op_ret                     = 0; +        int         ret                        = -1; + +        key = (char *) &(name[15]); +        sprintf (real_filepath, "%s/%s", real_path, key); + +        op_ret = lstat (real_filepath, &stbuf); +        if (op_ret == -1) { +                op_ret = -errno; +                gf_log (this->name, GF_LOG_ERROR, "lstat failed on %s: %s", +                        real_filepath, strerror (errno)); +                goto out; +        } + +        file_fd = open (real_filepath, O_RDONLY); + +        if (file_fd == -1) { +                op_ret = -errno; +                gf_log (this->name, GF_LOG_ERROR, "open failed on %s: %s", +                        real_filepath, strerror (errno)); +                goto out; +        } + +        *contents = CALLOC (stbuf.st_size + 1, sizeof(char)); + +        if (! *contents) { +                op_ret = -errno; +                gf_log (this->name, GF_LOG_ERROR, "out of memory :("); +                goto out; +        } + +        ret = read (file_fd, *contents, stbuf.st_size); +        if (ret <= 0) { +                op_ret = -1; +                gf_log (this->name, GF_LOG_ERROR, "read on %s failed", +                        real_filepath); +                goto out; +        } + +        *contents[stbuf.st_size] = '\0'; + +        op_ret = close (file_fd); +        file_fd = -1; +        if (op_ret == -1) { +                op_ret = -errno; +                gf_log (this->name, GF_LOG_ERROR, "close on %s failed: %s", +                        real_filepath, strerror (errno)); +                goto out; +        } + + out: +        if (op_ret < 0) { +                if (*contents) +                        FREE (*contents); +                if (file_fd != -1) +                        close (file_fd); +        } + +        return op_ret; +} + +/** + * posix_getxattr - this function returns a dictionary with all the 
+ *                  key:value pair present as xattr. used for + *                  both 'listxattr' and 'getxattr'. + */ +int32_t +posix_getxattr (call_frame_t *frame, xlator_t *this, +                loc_t *loc, const char *name) +{ +        int32_t  op_ret         = -1; +        int32_t  op_errno       = ENOENT; +        int32_t  list_offset    = 0; +        size_t   size           = 0; +        size_t   remaining_size = 0; +        char     key[1024]      = {0,}; +        char *   value          = NULL; +        char *   list           = NULL; +        char *   real_path      = NULL; +        dict_t * dict           = NULL; +        char *   file_contents  = NULL; +        int      ret            = -1; + +        DECLARE_OLD_FS_ID_VAR; + +        VALIDATE_OR_GOTO (frame, out); +        VALIDATE_OR_GOTO (this, out); +        VALIDATE_OR_GOTO (loc, out); + +        SET_FS_ID (frame->root->uid, frame->root->gid); +        MAKE_REAL_PATH (real_path, this, loc->path); + +        if (loc->inode && S_ISDIR(loc->inode->st_mode) && name && +	    ZR_FILE_CONTENT_REQUEST(name)) { +                ret = get_file_contents (this, real_path, name, +					 &file_contents); +                if (ret < 0) { +                        op_errno = -ret; +                        gf_log (this->name, GF_LOG_ERROR, +				"getting file contents failed: %s", +                                strerror (op_errno)); +                        goto out; +                } +        } + +        /* Get the total size */ +        dict = get_new_dict (); +        if (!dict) { +                gf_log (this->name, GF_LOG_ERROR, "out of memory :("); +                goto out; +        } + +        size = llistxattr (real_path, NULL, 0); +        if (size == -1) { +                op_errno = errno; +                if ((errno == ENOTSUP) || (errno == ENOSYS)) { +                        GF_LOG_OCCASIONALLY (gf_posix_xattr_enotsup_log, +                                             this->name, GF_LOG_WARNING, 
+                                             "Extended attributes not " +					     "supported."); +                } +                else { +                        gf_log (this->name, GF_LOG_ERROR, +				"listxattr failed on %s: %s", +                                real_path, strerror (op_errno)); +                } +                goto out; +        } + +        if (size == 0) +                goto done; + +        list = alloca (size + 1); +        if (!list) { +                op_errno = errno; +                gf_log (this->name, GF_LOG_ERROR, "out of memory :("); +                goto out; +        } + +        size = llistxattr (real_path, list, size); + +        remaining_size = size; +        list_offset = 0; +        while (remaining_size > 0) { +                if(*(list + list_offset) == '\0') +                        break; + +                strcpy (key, list + list_offset); +                op_ret = lgetxattr (real_path, key, NULL, 0); +                if (op_ret == -1) +                        break; + +                value = CALLOC (op_ret + 1, sizeof(char)); +                if (!value) { +                        op_errno = errno; +                        gf_log (this->name, GF_LOG_ERROR, "out of memory :("); +                        goto out; +                } + +                op_ret = lgetxattr (real_path, key, value, op_ret); +                if (op_ret == -1) +                        break; + +                value [op_ret] = '\0'; +                dict_set (dict, key, data_from_dynptr (value, op_ret)); +                remaining_size -= strlen (key) + 1; +                list_offset += strlen (key) + 1; + +        } /* while (remaining_size > 0) */ + + done: +        op_ret = size; + +        if (dict) { +                dict_ref (dict); +        } + + out: +        SET_TO_OLD_FS_ID (); +        frame->root->rsp_refs = NULL; +        STACK_UNWIND (frame, op_ret, op_errno, dict); + +        if (dict) +                dict_unref (dict); 
+ +        return 0; +} + +int32_t +posix_removexattr (call_frame_t *frame, xlator_t *this, +                   loc_t *loc, const char *name) +{ +        int32_t op_ret    = -1; +        int32_t op_errno  = 0; +        char *  real_path = NULL; + +        DECLARE_OLD_FS_ID_VAR; + +        MAKE_REAL_PATH (real_path, this, loc->path); + +        SET_FS_ID (frame->root->uid, frame->root->gid); + +        op_ret = lremovexattr (real_path, name); + +        if (op_ret == -1) { +                op_errno = errno; +		if (op_errno != ENOATTR && op_errno != EPERM) +			gf_log (this->name, GF_LOG_WARNING, +				"removexattr on %s: %s", loc->path, +                        strerror (op_errno)); +                goto out; +        } + +        op_ret = 0; + + out: +        SET_TO_OLD_FS_ID (); + +        frame->root->rsp_refs = NULL; +        STACK_UNWIND (frame, op_ret, op_errno); +        return 0; +} + + +int32_t +posix_fsyncdir (call_frame_t *frame, xlator_t *this, +                fd_t *fd, int datasync) +{ +        int32_t           op_ret   = -1; +        int32_t           op_errno = 0; +        struct posix_fd * pfd      = NULL; +        int               _fd      = -1; +        int               ret      = -1; +	uint64_t          tmp_pfd  = 0; + +        VALIDATE_OR_GOTO (frame, out); +        VALIDATE_OR_GOTO (this, out); +        VALIDATE_OR_GOTO (fd, out); + +        ret = fd_ctx_get (fd, this, &tmp_pfd); +        if (ret < 0) { +                op_errno = -ret; +                gf_log (this->name, GF_LOG_ERROR, +                        "pfd is NULL, fd=%p", fd); +                goto out; +        } +	pfd = (struct posix_fd *)(long)tmp_pfd; + +        _fd = pfd->fd; + +        op_ret = 0; + + out: +        frame->root->rsp_refs = NULL; +        STACK_UNWIND (frame, op_ret, op_errno); + +        return 0; +} + + +void +posix_print_xattr (dict_t *this, +		   char *key, +		   data_t *value, +		   void *data) +{ +	gf_log ("posix", GF_LOG_TRACE, +		"(key/val) = (%s/%d)", 
key, data_to_int32 (value)); +} + + +/** + * add_array - add two arrays of 32-bit numbers (stored in network byte order) + * dest = dest + src + * @count: number of 32-bit numbers + * FIXME: handle overflow + */ + +static void +__add_array (int32_t *dest, int32_t *src, int count) +{ +	int i = 0; +	for (i = 0; i < count; i++) { +		dest[i] = hton32 (ntoh32 (dest[i]) + ntoh32 (src[i])); +	} +} + + +/** + * xattrop - xattr operations - for internal use by GlusterFS + * @optype: ADD_ARRAY: + *            dict should contain: + *               "key" ==> array of 32-bit numbers + */ + + +int +posix_xattrop_common (call_frame_t *frame, xlator_t *this, +		      xattr_cache_handle_t *handle, +		      gf_xattrop_flags_t optype, dict_t *xattr) +{ +	int32_t         *array      = NULL; + +	int              ret   = 0; +	int              count = 0; + +	int              op_ret = 0; +	int              op_errno = 0; + +	data_pair_t     *trav = NULL; + +	VALIDATE_OR_GOTO (frame, out); +	VALIDATE_OR_GOTO (xattr, out); +	VALIDATE_OR_GOTO (this, out); + +	trav = xattr->members_list; + +	while (trav) { +		count = trav->value->len / sizeof (int32_t); +		array = CALLOC (count, sizeof (int32_t)); + +		ret = posix_xattr_cache_read (this, handle, trav->key,  +					      array, trav->value->len); + +		switch (optype) { + +		case GF_XATTROP_ADD_ARRAY: +			__add_array (array, (int32_t *) trav->value->data, +				     trav->value->len / 4); +			break; + +		default: +			gf_log (this->name, GF_LOG_ERROR, +				"unknown xattrop type %d", +				optype); + +			op_ret = -1; +			op_errno = EINVAL; +			goto out; +		} + +		ret = posix_xattr_cache_write (this, handle, trav->key, +					       array, trav->value->len); + +		ret = dict_set_bin (xattr, trav->key, array, +				    trav->value->len); + +		if (ret != 0) { +			gf_log (this->name, GF_LOG_ERROR, +				"key=%s (%s)", +				trav->key, strerror (-ret)); +			op_ret = -1; +			op_errno = EINVAL; +			goto out; +		} + +		trav = trav->next; +		array = NULL; +	} + 
+out: +	if (array) +		FREE (array); + +	STACK_UNWIND (frame, op_ret, op_errno, xattr); +	return 0; +} + + +int +posix_xattrop (call_frame_t *frame, xlator_t *this, +	       loc_t *loc, gf_xattrop_flags_t optype, dict_t *xattr) +{ +	xattr_cache_handle_t handle = {{0,}, 0}; +	int ret = -1; + +	loc_copy (&handle.loc, loc); +	{ +		ret = posix_xattrop_common (frame, this, &handle, optype, xattr); +	} +	loc_wipe (&handle.loc); + +	return ret; +} + + +int +posix_fxattrop (call_frame_t *frame, xlator_t *this, +		fd_t *fd, gf_xattrop_flags_t optype, dict_t *xattr) +{ +	int ret = -1; +	xattr_cache_handle_t handle = {{0,}, 0}; +	 +	handle.fd = fd; + +	ret = posix_xattrop_common (frame, this, &handle, optype, xattr); + +	return ret; +} + + +int +posix_access (call_frame_t *frame, xlator_t *this, +              loc_t *loc, int32_t mask) +{ +        int32_t op_ret    = -1; +        int32_t op_errno  = 0; +        char *  real_path = NULL; + +        DECLARE_OLD_FS_ID_VAR; +        SET_FS_ID (frame->root->uid, frame->root->gid); + +        VALIDATE_OR_GOTO (frame, out); +        VALIDATE_OR_GOTO (this, out); +        VALIDATE_OR_GOTO (loc, out); + +        MAKE_REAL_PATH (real_path, this, loc->path); + +        op_ret = access (real_path, mask & 07); +        if (op_ret == -1) { +                op_errno = errno; +                gf_log (this->name, GF_LOG_WARNING, "access failed on %s: %s", +                        loc->path, strerror (op_errno)); +                goto out; +        } + +        op_ret = 0; + + out: +        SET_TO_OLD_FS_ID (); + +        frame->root->rsp_refs = NULL; +        STACK_UNWIND (frame, op_ret, op_errno); +        return 0; +} + + +int32_t +posix_ftruncate (call_frame_t *frame, xlator_t *this, +                 fd_t *fd, off_t offset) +{ +        int32_t           op_ret   = -1; +        int32_t           op_errno = 0; +        int               _fd      = -1; +        struct stat       buf      = {0,}; +        struct posix_fd * pfd      = NULL; +   
     int               ret      = -1; +	uint64_t          tmp_pfd  = 0; + +        DECLARE_OLD_FS_ID_VAR; +        SET_FS_ID (frame->root->uid, frame->root->gid); + +        VALIDATE_OR_GOTO (frame, out); +        VALIDATE_OR_GOTO (this, out); +        VALIDATE_OR_GOTO (fd, out); + +        ret = fd_ctx_get (fd, this, &tmp_pfd); +        if (ret < 0) { +                gf_log (this->name, GF_LOG_ERROR, +                        "pfd is NULL, fd=%p", fd); +                op_errno = -ret; +                goto out; +        } +	pfd = (struct posix_fd *)(long)tmp_pfd; + +        _fd = pfd->fd; + +        op_ret = ftruncate (_fd, offset); + +        if (op_ret == -1) { +                op_errno = errno; +                gf_log (this->name, GF_LOG_WARNING, "ftruncate failed: %s", +                        strerror (errno)); +                goto out; +        } + +        op_ret = fstat (_fd, &buf); +        if (op_ret == -1) { +                op_errno = errno; +                gf_log (this->name, GF_LOG_WARNING, "fstat failed: %s", +                        strerror (errno)); +                goto out; +        } + +        op_ret = 0; + + out: +        SET_TO_OLD_FS_ID (); + +        frame->root->rsp_refs = NULL; +        STACK_UNWIND (frame, op_ret, op_errno, &buf); + +        return 0; +} + +int32_t +posix_fchown (call_frame_t *frame, xlator_t *this, +              fd_t *fd, uid_t uid, gid_t gid) +{ +        int32_t           op_ret   = -1; +        int32_t           op_errno = 0; +        int               _fd      = -1; +        struct stat       buf      = {0,}; +        struct posix_fd * pfd      = NULL; +        int               ret      = -1; +	uint64_t          tmp_pfd  = 0; + +        DECLARE_OLD_FS_ID_VAR; + +        SET_FS_ID (frame->root->uid, frame->root->gid); + +        VALIDATE_OR_GOTO (frame, out); +        VALIDATE_OR_GOTO (this, out); +        VALIDATE_OR_GOTO (fd, out); + +        ret = fd_ctx_get (fd, this, &tmp_pfd); +        if (ret < 0) { +    
            gf_log (this->name, GF_LOG_ERROR, +                        "pfd is NULL, fd=%p", fd); +                op_errno = -ret; +                goto out; +        } +	pfd = (struct posix_fd *)(long)tmp_pfd; + +        _fd = pfd->fd; + +        op_ret = fchown (_fd, uid, gid); +        if (op_ret == -1) { +                op_errno = errno; +                gf_log (this->name, GF_LOG_WARNING, "fchown failed: %s", +                        strerror (op_errno)); +                goto out; +        } + +        op_ret = fstat (_fd, &buf); +        if (op_ret == -1) { +                op_errno = errno; +                gf_log (this->name, GF_LOG_WARNING, "fstat failed: %s", +                        strerror (op_errno)); +                goto out; +        } + +        op_ret = 0; + + out: +        SET_TO_OLD_FS_ID (); +        frame->root->rsp_refs = NULL; +        STACK_UNWIND (frame, op_ret, op_errno, &buf); + +        return 0; +} + + +int32_t +posix_fchmod (call_frame_t *frame, xlator_t *this, +              fd_t *fd, mode_t mode) +{ +        int32_t           op_ret   = -1; +        int32_t           op_errno = 0; +        int               _fd      = -1; +        struct stat       buf      = {0,}; +        struct posix_fd * pfd      = NULL; +        int               ret      = -1; +	uint64_t          tmp_pfd  = 0; + +        DECLARE_OLD_FS_ID_VAR; + +        SET_FS_ID (frame->root->uid, frame->root->gid); + +        VALIDATE_OR_GOTO (frame, out); +        VALIDATE_OR_GOTO (this, out); +        VALIDATE_OR_GOTO (fd, out); + +        ret = fd_ctx_get (fd, this, &tmp_pfd); +        if (ret < 0) { +                gf_log (this->name, GF_LOG_ERROR, +                        "pfd is NULL fd=%p", fd); +                op_errno = -ret; +                goto out; +        } +	pfd = (struct posix_fd *)(long)tmp_pfd; + +        _fd = pfd->fd; + +        op_ret = fchmod (_fd, mode); + +        if (op_ret == -1) { +                op_errno = errno; +                gf_log 
/*
 * same_file_type - compare only the file-type bits (S_IFMT) of two modes.
 *
 * Returns 1 when m1 and m2 describe the same kind of file (directory,
 * regular file, symlink, ...) and 0 otherwise.  Permission bits are ignored.
 */
static int
same_file_type (mode_t m1, mode_t m2)
{
        mode_t type1 = m1 & S_IFMT;
        mode_t type2 = m2 & S_IFMT;

        return (type1 == type2);
}
+                 */ + +                ret = mkdir (pathname, entry->buf.st_mode); +                if (ret == -1) { +                        if (errno == EEXIST) { +                                op_ret = ensure_file_type (this, pathname, +                                                           entry->buf.st_mode); +                        } +                        else { +                                op_ret = -errno; +                                gf_log (this->name, GF_LOG_DEBUG, +                                        "mkdir %s with mode (0%o) failed: %s", +                                        pathname, entry->buf.st_mode, +                                        strerror (errno)); +                                goto out; +                        } +                } + +        } else if ((flags & GF_SET_IF_NOT_PRESENT) +                   || !(flags & GF_SET_DIR_ONLY)) { + +                /* create a 0-byte file here */ + +                if (S_ISREG (entry->buf.st_mode)) { +                        ret = open (pathname, O_CREAT|O_EXCL, +                                    entry->buf.st_mode); + +                        if (ret == -1) { +                                if (errno == EEXIST) { +                                        op_ret = ensure_file_type (this, +								   pathname, +                                                                   entry->buf.st_mode); +                                } +                                else { +                                        op_ret = -errno; +                                        gf_log (this->name, GF_LOG_ERROR, +                                                "Error creating file %s with " +						"mode (0%o): %s", +                                                pathname, entry->buf.st_mode, +                                                strerror (errno)); +                                        goto out; +                                } +                        } + +      
                  close (ret); + +                } else if (S_ISLNK (entry->buf.st_mode)) { +                        ret = symlink (entry->link, pathname); + +                        if (ret == -1) { +                                if (errno == EEXIST) { +                                        op_ret = ensure_file_type (this, +								   pathname, +                                                                   entry->buf.st_mode); +                                } +                                else { +                                        op_ret = -errno; +                                        gf_log (this->name, GF_LOG_ERROR, +                                                "error creating symlink %s: %s" +						, pathname, strerror (errno)); +                                        goto out; +                                } +                        } + +                } else if (S_ISBLK (entry->buf.st_mode) || +                           S_ISCHR (entry->buf.st_mode) || +                           S_ISFIFO (entry->buf.st_mode) || +			   S_ISSOCK (entry->buf.st_mode)) { + +                        ret = mknod (pathname, entry->buf.st_mode, +                                     entry->buf.st_dev); + +                        if (ret == -1) { +                                if (errno == EEXIST) { +                                        op_ret = ensure_file_type (this, +								   pathname, +                                                                   entry->buf.st_mode); +                                } else { +                                        op_ret = -errno; +                                        gf_log (this->name, GF_LOG_ERROR, +                                                "error creating device file " +						"%s: %s", +						pathname, strerror (errno)); +                                        goto out; +                                } +                        } +                } else { +			gf_log (this->name, 
GF_LOG_ERROR, +				"invalid mode 0%o for %s", entry->buf.st_mode, +				pathname); +			op_ret = -EINVAL; +			goto out; +		} +        } + +	/* +	 * Preserve atime and mtime +	 */ + +	if (!S_ISLNK (entry->buf.st_mode)) { +		tv[0].tv_sec = entry->buf.st_atime; +		tv[1].tv_sec = entry->buf.st_mtime; +		ret = utimes (pathname, tv); +		if (ret == -1) { +			op_ret = -errno; +			gf_log (this->name, GF_LOG_ERROR, +				"utimes %s failed: %s", +				pathname, strerror (errno)); +			goto out; +		} +	} + +out: +        return op_ret; + +} + + +int +posix_setdents (call_frame_t *frame, xlator_t *this, +                fd_t *fd, int32_t flags, dir_entry_t *entries, +                int32_t count) +{ +        char *            real_path      = NULL; +        char *            entry_path     = NULL; +        int32_t           real_path_len  = -1; +        int32_t           entry_path_len = -1; +        int32_t           ret            = 0; +        int32_t           op_ret         = -1; +        int32_t           op_errno       = 0; +        struct posix_fd * pfd            = {0, }; +        struct timeval    tv[2]          = {{0, }, {0, }}; +	uint64_t          tmp_pfd        = 0; +        char              pathname[ZR_PATH_MAX] = {0,}; +        dir_entry_t *     trav           = NULL; + +        VALIDATE_OR_GOTO (frame, out); +        VALIDATE_OR_GOTO (this, out); +        VALIDATE_OR_GOTO (fd, out); +        VALIDATE_OR_GOTO (entries, out); + +        tv[0].tv_sec = tv[0].tv_usec = 0; +        tv[1].tv_sec = tv[1].tv_usec = 0; + +        ret = fd_ctx_get (fd, this, &tmp_pfd); +        if (ret < 0) { +                op_errno = -ret; +                gf_log (this->name, GF_LOG_ERROR, +			"fd's ctx not found on fd=%p for %s", +                        fd, this->name); +                goto out; +        } +	pfd = (struct posix_fd *)(long)tmp_pfd; + +        real_path = pfd->path; + +        if (!real_path) { +                op_errno = EINVAL; +                gf_log (this->name, 
GF_LOG_ERROR, +                        "path is NULL on pfd=%p fd=%p", pfd, fd); +                goto out; +        } + +        real_path_len  = strlen (real_path); +        entry_path_len = real_path_len + 256; +        entry_path     = CALLOC (1, entry_path_len); + +        if (!entry_path) { +                op_errno = errno; +                gf_log (this->name, GF_LOG_ERROR, "out of memory :("); +                goto out; +        } + +        strcpy (entry_path, real_path); +        entry_path[real_path_len] = '/'; + +	posix_xattr_cache_flush_all (this); + +        /* fd exists, and everything looks fine */ +        /** +         * create an entry for each one present in '@entries' +         *  - if flag is set (ie, if its namespace), create both directories +	 *    and files +         *  - if not set, create only directories. +         * +         *  after the entry is created, change the mode and ownership of the +	 *  entry according to the stat present in entries->buf. +         */ + +        trav = entries->next; +        while (trav) { +                strcpy (pathname, entry_path); +                strcat (pathname, trav->name); + +                ret = create_entry (this, flags, trav, pathname); +                if (ret < 0) { +                        op_errno = -ret; +                        goto out; +                } + +                /* TODO: handle another flag, GF_SET_OVERWRITE */ + +                /* Change the mode */ +		if (!S_ISLNK (trav->buf.st_mode)) { +			ret = chmod (pathname, trav->buf.st_mode); +			if (ret == -1) { +				op_errno = errno; +				gf_log (this->name, GF_LOG_ERROR, +					"chmod on %s failed: %s", pathname, +					strerror (op_errno)); +				goto out; +			} +		} + +                /* change the ownership */ +                ret = lchown (pathname, trav->buf.st_uid, trav->buf.st_gid); +                if (ret == -1) { +                        op_errno = errno; +                        gf_log (this->name, GF_LOG_ERROR, +      
                          "chmod on %s failed: %s", pathname, +                                strerror (op_errno)); +                        goto out; +                } + +                if (flags & GF_SET_EPOCH_TIME) { +                        ret = utimes (pathname, tv); +                        if (ret == -1) { +                                op_errno = errno; +                                gf_log (this->name, GF_LOG_ERROR, +                                        "utimes on %s failed: %s", pathname, +                                        strerror (op_errno)); +                                goto out; +                        } +                } + +                /* consider the next entry */ +                trav = trav->next; +        } + +        op_ret = 0; + out: +        frame->root->rsp_refs = NULL; +        STACK_UNWIND (frame, op_ret, op_errno); +        if (entry_path) +                FREE (entry_path); + +        return 0; +} + +int32_t +posix_fstat (call_frame_t *frame, xlator_t *this, +             fd_t *fd) +{ +        int               _fd      = -1; +        int32_t           op_ret   = -1; +        int32_t           op_errno = 0; +        struct stat       buf      = {0,}; +        struct posix_fd * pfd      = NULL; +	uint64_t          tmp_pfd  = 0; +        int               ret      = -1; + +        DECLARE_OLD_FS_ID_VAR; +        SET_FS_ID (frame->root->uid, frame->root->gid); + +        VALIDATE_OR_GOTO (frame, out); +        VALIDATE_OR_GOTO (this, out); +        VALIDATE_OR_GOTO (fd, out); + +        ret = fd_ctx_get (fd, this, &tmp_pfd); +        if (ret < 0) { +                gf_log (this->name, GF_LOG_ERROR, +                        "pfd is NULL, fd=%p", fd); +                op_errno = -ret; +                goto out; +        } +	pfd = (struct posix_fd *)(long)tmp_pfd; + +        _fd = pfd->fd; + +        op_ret = fstat (_fd, &buf); + +        if (op_ret == -1) { +                op_errno = errno; +                gf_log 
(this->name, GF_LOG_WARNING, "fstat failed: %s", +                        strerror (op_errno)); +                goto out; +        } + +        op_ret = 0; + + out: +        SET_TO_OLD_FS_ID (); +        frame->root->rsp_refs = NULL; +        STACK_UNWIND (frame, op_ret, op_errno, &buf); +        return 0; +} + +static int gf_posix_lk_log; + +int32_t +posix_lk (call_frame_t *frame, xlator_t *this, +          fd_t *fd, int32_t cmd, struct flock *lock) +{ +        struct flock nullock = {0, }; +        frame->root->rsp_refs = NULL; + +        gf_posix_lk_log++; + +	GF_LOG_OCCASIONALLY (gf_posix_lk_log, this->name, GF_LOG_ERROR, +			     "\"features/posix-locks\" translator is " +			     "not loaded, you need to use it"); + +        STACK_UNWIND (frame, -1, ENOSYS, &nullock); +        return 0; +} + +int32_t +posix_inodelk (call_frame_t *frame, xlator_t *this, +	       loc_t *loc, int32_t cmd, struct flock *lock) +{ +        frame->root->rsp_refs = NULL; + +	gf_log (this->name, GF_LOG_CRITICAL, +		"\"features/posix-locks\" translator is not loaded. " +		"You need to use it for proper functioning of GlusterFS"); + +        STACK_UNWIND (frame, -1, ENOSYS); +        return 0; +} + +int32_t +posix_finodelk (call_frame_t *frame, xlator_t *this, +		fd_t *fd, int32_t cmd, struct flock *lock) +{ +        frame->root->rsp_refs = NULL; + +	gf_log (this->name, GF_LOG_CRITICAL, +		"\"features/posix-locks\" translator is not loaded. " +		"You need to use it for proper functioning of GlusterFS"); + +        STACK_UNWIND (frame, -1, ENOSYS); +        return 0; +} + + +int32_t +posix_entrylk (call_frame_t *frame, xlator_t *this, +	       loc_t *loc, const char *basename, entrylk_cmd cmd, +	       entrylk_type type) +{ +        frame->root->rsp_refs = NULL; + +	gf_log (this->name, GF_LOG_CRITICAL, +		"\"features/posix-locks\" translator is not loaded. 
" +		"You need to use it for proper functioning of GlusterFS"); + +        STACK_UNWIND (frame, -1, ENOSYS); +        return 0; +} + +int32_t +posix_fentrylk (call_frame_t *frame, xlator_t *this, +		fd_t *fd, const char *basename, entrylk_cmd cmd, +		entrylk_type type) +{ +        frame->root->rsp_refs = NULL; + +	gf_log (this->name, GF_LOG_CRITICAL, +		"\"features/posix-locks\" translator is not loaded. " +		" You need to use it for proper functioning of GlusterFS"); + +        STACK_UNWIND (frame, -1, ENOSYS); +        return 0; +} + + +int32_t +posix_readdir (call_frame_t *frame, xlator_t *this, +               fd_t *fd, size_t size, off_t off) +{ +	uint64_t          tmp_pfd = 0; +        struct posix_fd * pfd    = NULL; +        DIR *             dir    = NULL; +        int               ret    = -1; +        size_t            filled = 0; +	int               count = 0; + +        int32_t           op_ret   = -1; +        int32_t           op_errno = 0; + +        gf_dirent_t *     this_entry = NULL; +	gf_dirent_t       entries; +        struct dirent *   entry      = NULL; +        off_t             in_case    = -1; +        int32_t           this_size  = -1; + + +        VALIDATE_OR_GOTO (frame, out); +        VALIDATE_OR_GOTO (this, out); +        VALIDATE_OR_GOTO (fd, out); + +	INIT_LIST_HEAD (&entries.list); + +        ret = fd_ctx_get (fd, this, &tmp_pfd); +        if (ret < 0) { +                gf_log (this->name, GF_LOG_ERROR, +                        "pfd is NULL, fd=%p", fd); +                op_errno = -ret; +                goto out; +        } +	pfd = (struct posix_fd *)(long)tmp_pfd; + +        dir = pfd->dir; + +        if (!dir) { +                gf_log (this->name, GF_LOG_ERROR, +                        "dir is NULL for fd=%p", fd); +                op_errno = EINVAL; +                goto out; +        } + + +        if (!off) { +                rewinddir (dir); +        } else { +                seekdir (dir, off); +        } + +        
while (filled <= size) { +                in_case = telldir (dir); + +                if (in_case == -1) { +                        op_errno = errno; +                        gf_log (this->name, GF_LOG_ERROR, +				"telldir failed: %s", +                                strerror (errno)); +                        goto out; +                } + +                errno = 0; +                entry = readdir (dir); + +                if (!entry) { +                        if (errno == EBADF) { +                                op_errno = errno; +                                gf_log (this->name, GF_LOG_ERROR, +					"readdir failed: %s", +                                        strerror (op_errno)); +                                goto out; +                        } +                        break; +                } + +                this_size = dirent_size (entry); + +                if (this_size + filled > size) { +                        seekdir (dir, in_case); +                        break; +                } + + +		this_entry = gf_dirent_for_name (entry->d_name); + +		if (!this_entry) { +			gf_log (this->name, GF_LOG_ERROR, +				"could not create gf_dirent for entry %s (%s)", +				entry->d_name, strerror (errno)); +			goto out; +		} +		this_entry->d_off = telldir (dir); +		this_entry->d_ino = entry->d_ino; + +		list_add_tail (&this_entry->list, &entries.list); + +                filled += this_size; +		count ++; +        } + +        op_ret = count; + + out: +        frame->root->rsp_refs = NULL; +        STACK_UNWIND (frame, op_ret, op_errno, &entries); + +	gf_dirent_free (&entries); + +        return 0; +} + + +int32_t +posix_stats (call_frame_t *frame, xlator_t *this, +             int32_t flags) + +{ +        int32_t op_ret   = -1; +        int32_t op_errno = 0; + +        struct xlator_stats    xlstats = {0, }; +        struct xlator_stats *  stats   = NULL; +        struct statvfs         buf     = {0,}; +        struct timeval         tv      = {0,}; +     
   struct posix_private * priv = (struct posix_private *)this->private; + +        int64_t avg_read  = 0; +        int64_t avg_write = 0; +        int64_t _time_ms  = 0; + +        DECLARE_OLD_FS_ID_VAR; + +        SET_FS_ID (frame->root->uid, frame->root->gid); + +        VALIDATE_OR_GOTO (frame, out); +        VALIDATE_OR_GOTO (this, out); + +        stats = &xlstats; + +        op_ret = statvfs (priv->base_path, &buf); + +        if (op_ret == -1) { +                op_errno = errno; +                gf_log (this->name, GF_LOG_ERROR, "statvfs failed: %s", +                        strerror (op_errno)); +                goto out; +        } + +	/* client info is maintained at FSd */ +        stats->nr_clients = priv->stats.nr_clients; +        stats->nr_files   = priv->stats.nr_files; + +        /* number of free block in the filesystem. */ +        stats->free_disk  = buf.f_bfree * buf.f_bsize; + +        stats->total_disk_size = buf.f_blocks  * buf.f_bsize; +        stats->disk_usage      = (buf.f_blocks - buf.f_bavail) * buf.f_bsize; + +        /* Calculate read and write usage */ +        op_ret = gettimeofday (&tv, NULL); +        if (op_ret == -1) { +                op_errno = errno; +                gf_log (this->name, GF_LOG_ERROR, +			"gettimeofday failed: %s", strerror (errno)); +                goto out; +        } + +        /* Read */ +        _time_ms  = (tv.tv_sec  - priv->init_time.tv_sec)  * 1000 + +                ((tv.tv_usec - priv->init_time.tv_usec) / 1000); + +        avg_read  = (_time_ms) ? (priv->read_value  / _time_ms) : 0; /* KBps */ +        avg_write = (_time_ms) ? 
(priv->write_value / _time_ms) : 0; /* KBps */ + +        _time_ms  = (tv.tv_sec  - priv->prev_fetch_time.tv_sec)  * 1000 + +                ((tv.tv_usec - priv->prev_fetch_time.tv_usec) / 1000); + +        if (_time_ms && ((priv->interval_read  / _time_ms) > priv->max_read)) { +                priv->max_read  = (priv->interval_read / _time_ms); +        } + +        if (_time_ms && +	    ((priv->interval_write / _time_ms) > priv->max_write)) { +                priv->max_write = priv->interval_write / _time_ms; +        } + +        stats->read_usage  = avg_read  / priv->max_read; +        stats->write_usage = avg_write / priv->max_write; + +        op_ret = gettimeofday (&(priv->prev_fetch_time), NULL); +        if (op_ret == -1) { +                op_errno = errno; +                gf_log (this->name, GF_LOG_ERROR, "gettimeofday failed: %s", +                        strerror (op_errno)); +                goto out; +        } + +        priv->interval_read  = 0; +        priv->interval_write = 0; + +        op_ret = 0; + + out: +        SET_TO_OLD_FS_ID (); + +        frame->root->rsp_refs = NULL; +        STACK_UNWIND (frame, op_ret, op_errno, stats); +        return 0; +} + +int32_t +posix_checksum (call_frame_t *frame, xlator_t *this, +                loc_t *loc, int32_t flag) +{ +        char *          real_path                      = NULL; +        DIR *           dir                            = NULL; +        struct dirent * dirent                         = NULL; +        uint8_t         file_checksum[ZR_FILENAME_MAX] = {0,}; +        uint8_t         dir_checksum[ZR_FILENAME_MAX]  = {0,}; +        int32_t         op_ret                         = -1; +        int32_t         op_errno                       = 0; +        int             i                              = 0; +        int             length                         = 0; + +        struct stat buf                        = {0,}; +        char        tmp_real_path[ZR_PATH_MAX] = {0,}; +        int    
     ret                        = -1; + +        MAKE_REAL_PATH (real_path, this, loc->path); + +        dir = opendir (real_path); + +        if (!dir){ +                op_errno = errno; +                gf_log (this->name, GF_LOG_DEBUG, +			"opendir() failed on `%s': %s", +                        real_path, strerror (op_errno)); +                goto out; +        } + +        while ((dirent = readdir (dir))) { +                errno = 0; +                if (!dirent) { +                        if (errno != 0) { +                                op_errno = errno; +                                gf_log (this->name, GF_LOG_DEBUG, +                                        "readdir() failed: %s", +					strerror (errno)); +                                goto out; +                        } +                        break; +                } + +                length = strlen (dirent->d_name); + +                strcpy (tmp_real_path, real_path); +                strcat (tmp_real_path, "/"); +                strcat (tmp_real_path, dirent->d_name); +                ret = lstat (tmp_real_path, &buf); + +                if (ret == -1) +                        continue; + +                if (S_ISDIR (buf.st_mode)) { +                        for (i = 0; i < length; i++) +                                dir_checksum[i] ^= dirent->d_name[i]; +                } else { +                        for (i = 0; i < length; i++) +                                file_checksum[i] ^= dirent->d_name[i]; +                } +        } +        closedir (dir); + +        op_ret = 0; + + out: +        frame->root->rsp_refs = NULL; +        STACK_UNWIND (frame, op_ret, op_errno, file_checksum, dir_checksum); + +        return 0; +} + +/** + * notify - when parent sends PARENT_UP, send CHILD_UP event from here + */ +int32_t +notify (xlator_t *this, +        int32_t event, +        void *data, +        ...) 
+{ +        switch (event) +                { +                case GF_EVENT_PARENT_UP: +                        { +                                /* Tell the parent that posix xlator is up */ +                                default_notify (this, GF_EVENT_CHILD_UP, data); +                        } +                        break; +                default: +                        /* */ +                        break; +                } +        return 0; +} + +/** + * init - + */ +int +init (xlator_t *this) +{ +        int                    ret      = 0; +        int                    op_ret   = -1; +	gf_boolean_t           tmp_bool = 0; +        struct stat            buf      = {0,}; +        struct posix_private * _private = NULL; +        data_t *               dir_data = NULL; +	data_t *               tmp_data = NULL; + +        dir_data = dict_get (this->options, "directory"); + +        if (this->children) { +                gf_log (this->name, GF_LOG_ERROR, +                        "FATAL: storage/posix cannot have subvolumes"); +                ret = -1; +                goto out; +        } + +	if (!this->parents) { +		gf_log (this->name, GF_LOG_WARNING, +			"dangling volume. check volfile "); +	} + +        if (!dir_data) { +                gf_log (this->name, GF_LOG_ERROR, +                        "export directory not specified in volfile"); +                ret = -1; +                goto out; +        } + +        umask (000); // umask `masking' is done at the client side + +        /* Check whether the specified directory exists, if not create it. 
*/ +        op_ret = lstat (dir_data->data, &buf); +        if ((ret != 0) || !S_ISDIR (buf.st_mode)) { +                gf_log (this->name, GF_LOG_ERROR, +                        "directory '%s' doesn't exists, Exiting", +			dir_data->data); +                ret = -1; +                goto out; +        } + + +        /* Check for Extended attribute support, if not present, log it */ +        op_ret = lsetxattr (dir_data->data, +			    "trusted.glusterfs.test", "working", 8, 0); +        if (op_ret < 0) { +		tmp_data = dict_get (this->options, +				     "mandate-attribute"); +		if (tmp_data) { +			if (gf_string2boolean (tmp_data->data, +					       &tmp_bool) == -1) { +				gf_log (this->name, GF_LOG_ERROR, +					"wrong option provided for key " +					"\"mandate-xattr\""); +				ret = -1; +				goto out; +			} +			if (!tmp_bool) { +				gf_log (this->name, GF_LOG_WARNING, +					"Extended attribute not supported, " +					"starting as per option"); +			} else { +				gf_log (this->name, GF_LOG_CRITICAL, +					"Extended attribute not supported, " +					"exiting"); +				ret = -1; +				goto out; +			} +		} else { +			gf_log (this->name, GF_LOG_CRITICAL, +				"Extended attribute not supported, exiting"); +			ret = -1; +			goto out; +		} +        } + +        _private = CALLOC (1, sizeof (*_private)); +        if (!_private) { +                gf_log (this->name, GF_LOG_ERROR, +                        "out of memory :("); +                ret = -1; +                goto out; +        } + +        _private->base_path = strdup (dir_data->data); +        _private->base_path_length = strlen (_private->base_path); +	_private->base_stdev = buf.st_dev; + +	_private->xattr_cache = posix_xattr_cache_init (16); +	if (!_private->xattr_cache) { +		gf_log (this->name, GF_LOG_ERROR, +			"out of memory :("); +		ret = -1; +		goto out; +	} + +        { +                /* Stats related variables */ +                gettimeofday (&_private->init_time, NULL); +                gettimeofday 
(&_private->prev_fetch_time, NULL); +                _private->max_read = 1; +                _private->max_write = 1; +        } + +        _private->export_statfs = 1; +        tmp_data = dict_get (this->options, "export-statfs-size"); +        if (tmp_data) { +		if (gf_string2boolean (tmp_data->data, +				       &_private->export_statfs) == -1) { +			ret = -1; +			gf_log (this->name, GF_LOG_ERROR, +				"'export-statfs-size' takes only boolean " +				"options"); +			goto out; +		} +                if (!_private->export_statfs) +                        gf_log (this->name, GF_LOG_DEBUG, +				"'statfs()' returns dummy size"); +        } + +        tmp_data = dict_get (this->options, "o-direct"); +        if (tmp_data) { +		if (gf_string2boolean (tmp_data->data, +				       &_private->o_direct) == -1) { +			ret = -1; +			gf_log (this->name, GF_LOG_ERROR, +				"wrong option provided for 'o-direct'"); +			goto out; +		} +		if (_private->o_direct) +                        gf_log (this->name, GF_LOG_DEBUG, +                                "o-direct mode is enabled (O_DIRECT " +				"for every open)"); +        } + +#ifndef GF_DARWIN_HOST_OS +        { +                struct rlimit lim; +                lim.rlim_cur = 1048576; +                lim.rlim_max = 1048576; + +                if (setrlimit (RLIMIT_NOFILE, &lim) == -1) { +                        gf_log (this->name, GF_LOG_WARNING, +				"WARNING: Failed to set 'ulimit -n " +				" 1048576': %s", strerror(errno)); +                        lim.rlim_cur = 65536; +                        lim.rlim_max = 65536; + +                        if (setrlimit (RLIMIT_NOFILE, &lim) == -1) { +                                gf_log (this->name, GF_LOG_ERROR, +					"Failed to set max open fd to " +					"64k: %s", strerror(errno)); +                        } +                        else { +                                gf_log (this->name, GF_LOG_ERROR, +					"max open fd set to 64k"); +                        } +                } 
+        }
+#endif
+
+        this->private = (void *)_private;
+
+ out:
+        return ret;
+}
+
+/**
+ * fini - tear down a posix translator instance
+ *
+ * Removes the "trusted.glusterfs.test" extended attribute from the export
+ * directory (priv->base_path) and releases the private structure.  The
+ * result of lremovexattr() is not checked.
+ *
+ * NOTE(review): only the posix_private structure itself is freed here;
+ * confirm whether base_path and the xattr cache are owned and released
+ * elsewhere.
+ */
+void
+fini (xlator_t *this)
+{
+        struct posix_private *priv = this->private;
+        lremovexattr (priv->base_path, "trusted.glusterfs.test");
+        FREE (priv);
+        return;
+}
+
+/* xlator_mops table: only the stats slot is populated. */
+struct xlator_mops mops = {
+        .stats    = posix_stats,
+};
+
+/* xlator_fops table: binds each file-operation slot to its posix_* handler. */
+struct xlator_fops fops = {
+        .lookup      = posix_lookup,
+        .stat        = posix_stat,
+        .opendir     = posix_opendir,
+        .readdir     = posix_readdir,
+        .readlink    = posix_readlink,
+        .mknod       = posix_mknod,
+        .mkdir       = posix_mkdir,
+        .unlink      = posix_unlink,
+        .rmdir       = posix_rmdir,
+        .symlink     = posix_symlink,
+        .rename      = posix_rename,
+        .link        = posix_link,
+        .chmod       = posix_chmod,
+        .chown       = posix_chown,
+        .truncate    = posix_truncate,
+        .utimens     = posix_utimens,
+        .create      = posix_create,
+        .open        = posix_open,
+        .readv       = posix_readv,
+        .writev      = posix_writev,
+        .statfs      = posix_statfs,
+        .flush       = posix_flush,
+        .fsync       = posix_fsync,
+        .setxattr    = posix_setxattr,
+        .getxattr    = posix_getxattr,
+        .removexattr = posix_removexattr,
+        .fsyncdir    = posix_fsyncdir,
+        .access      = posix_access,
+        .ftruncate   = posix_ftruncate,
+        .fstat       = posix_fstat,
+        .lk          = posix_lk,
+	.inodelk     = posix_inodelk,
+	.finodelk    = posix_finodelk,
+	.entrylk     = posix_entrylk,
+	.fentrylk    = posix_fentrylk,
+        .fchown      = posix_fchown,
+        .fchmod      = posix_fchmod,
+        .setdents    = posix_setdents,
+        .getdents    = posix_getdents,
+        .checksum    = posix_checksum,
+	.xattrop     = posix_xattrop,
+	.fxattrop    = posix_fxattrop,
+};
+
+/* xlator_cbks table: release, releasedir and forget callbacks. */
+struct xlator_cbks cbks = {
+	.release     = posix_release,
+	.releasedir  = posix_releasedir,
+	.forget      = posix_forget
+};
+
+/*
+ * Volume options accepted by this translator (key and value type).
+ * The array is terminated by an entry whose key is NULL.
+ */
+struct volume_options options[] = {
+	{ .key  = {"o-direct"},
+	  .type = GF_OPTION_TYPE_BOOL },
+	{ .key  = {"directory"},
+	  .type = GF_OPTION_TYPE_PATH },
+	{ .key  = {"export-statfs-size"},
+	  .type = GF_OPTION_TYPE_BOOL },
+	{ .key  = {"mandate-attribute"},
+	  .type = GF_OPTION_TYPE_BOOL },
+	{ .key  = {NULL} }
+};
diff --git a/xlators/storage/posix/src/posix.h b/xlators/storage/posix/src/posix.h
new file mode 100644
index 00000000000..b162139c955
--- /dev/null
+++ b/xlators/storage/posix/src/posix.h
@@ -0,0 +1,110 @@
+/*
+   Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com>
+   This file is part of GlusterFS.
+
+   GlusterFS is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published
+   by the Free Software Foundation; either version 3 of the License,
+   or (at your option) any later version.
+
+   GlusterFS is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see
+   <http://www.gnu.org/licenses/>.
+*/ + +#ifndef _POSIX_H +#define _POSIX_H + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include <stdio.h> +#include <unistd.h> +#include <sys/types.h> +#include <dirent.h> + +#ifdef linux +#ifdef __GLIBC__ +#include <sys/fsuid.h> +#else +#include <unistd.h> +#endif +#endif + +#ifdef HAVE_SYS_XATTR_H +#include <sys/xattr.h> +#endif + +#ifdef HAVE_SYS_EXTATTR_H +#include <sys/extattr.h> +#endif + +#include "xlator.h" +#include "inode.h" +#include "compat.h" + +#include "xattr-cache.h" + +/** + * posix_fd - internal structure common to file and directory fd's + */ + +struct posix_fd { +	int     fd;      /* fd returned by the kernel */ +	int32_t flags;   /* flags for open/creat      */ +	char *  path;    /* used by setdents/getdents */ +	DIR *   dir;     /* handle returned by the kernel */ +}; + +struct posix_private { +	char   *base_path; +	int32_t base_path_length; +	dev_t   base_stdev; + +	xattr_cache_t *xattr_cache; + +        /* Statistics, provides activity of the server */ +	struct xlator_stats stats;  +   +	struct timeval prev_fetch_time; +	struct timeval init_time; + +	int32_t max_read;            /* */ +	int32_t max_write;           /* */ +	int64_t interval_read;      /* Used to calculate the max_read value */ +	int64_t interval_write;     /* Used to calculate the max_write value */ +	int64_t read_value;    /* Total read, from init */ +	int64_t write_value;   /* Total write, from init */ + +/* +   In some cases, two exported volumes may reside on the same +   partition on the server. Sending statvfs info for both +   the volumes will lead to erroneous df output at the client, +   since free space on the partition will be counted twice. + +   In such cases, user can disable exporting statvfs info +   on one of the volumes by setting this option. 
+*/ +	gf_boolean_t    export_statfs; + +	gf_boolean_t    o_direct;     /* always open files in O_DIRECT mode */ +}; + +#define POSIX_BASE_PATH(this) (((struct posix_private *)this->private)->base_path) + +#define POSIX_BASE_PATH_LEN(this) (((struct posix_private *)this->private)->base_path_length) + +#define MAKE_REAL_PATH(var, this, path) do {                            \ +		var = alloca (strlen (path) + POSIX_BASE_PATH_LEN(this) + 2); \ +                strcpy (var, POSIX_BASE_PATH(this));			\ +                strcpy (&var[POSIX_BASE_PATH_LEN(this)], path);		\ +        } while (0) + +#endif /* _POSIX_H */ diff --git a/xlators/storage/posix/src/xattr-cache.c b/xlators/storage/posix/src/xattr-cache.c new file mode 100644 index 00000000000..a39c35ae234 --- /dev/null +++ b/xlators/storage/posix/src/xattr-cache.c @@ -0,0 +1,521 @@ +/* +  Copyright (c) 2009 Z RESEARCH, Inc. <http://www.zresearch.com> +  This file is part of GlusterFS. + +  GlusterFS is free software; you can redistribute it and/or modify +  it under the terms of the GNU General Public License as published +  by the Free Software Foundation; either version 3 of the License, +  or (at your option) any later version. + +  GlusterFS is distributed in the hope that it will be useful, but +  WITHOUT ANY WARRANTY; without even the implied warranty of +  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU +  General Public License for more details. + +  You should have received a copy of the GNU General Public License +  along with this program.  If not, see +  <http://www.gnu.org/licenses/>. 
+*/ + +#include "byte-order.h" + +#include "xattr-cache.h" +#include "posix.h" +#include "compat-errno.h" + +static int +__hgetxattr (xattr_cache_handle_t *handle, xlator_t *this,  +	     const char *key, void *value, size_t len) +{ +	char *            real_path = NULL; +	struct posix_fd * pfd = NULL; +	uint64_t          tmp_pfd = 0; +	int op_ret = -1; +	int ret    = -1; +	int _fd    = -1; + +	if (handle->loc.path) { +		MAKE_REAL_PATH (real_path, this, handle->loc.path); +		op_ret = lgetxattr (real_path, key, value, len); + +		if (op_ret == -1) +			op_ret = -errno; +	} else { +		ret = fd_ctx_get (handle->fd, this, &tmp_pfd); +                if (ret < 0) { +                        gf_log (this->name, GF_LOG_ERROR, +                                "failed to get pfd from fd=%p", +                                handle->fd); +                        op_ret = -EBADFD; +			goto out; +                } +		pfd = (struct posix_fd *)(long)tmp_pfd; +                _fd = pfd->fd; + +		op_ret = fgetxattr (_fd, key, value, len); +		if (op_ret == -1) +			op_ret = -errno; +	} + +out: +	return op_ret; +} + + +static int +__hsetxattr (xattr_cache_handle_t *handle, xlator_t *this, +	     const char *key, void *value, size_t len, int flags) +{ +	char *            real_path = NULL; +	struct posix_fd * pfd = NULL; +	uint64_t          tmp_pfd = 0; +	int op_ret = -1; +	int ret    = -1; +	int _fd    = -1; + +	if (handle->loc.path) { +		MAKE_REAL_PATH (real_path, this, handle->loc.path); + +		op_ret = lsetxattr (real_path, key, value, len, flags); +		if (op_ret == -1) +			op_ret = -errno; +	} else { +		ret = fd_ctx_get (handle->fd, this, &tmp_pfd); +                if (ret < 0) { +                        gf_log (this->name, GF_LOG_ERROR, +                                "failed to get pfd from fd=%p", +                                handle->fd); + +			op_ret = -EBADFD; +			goto out; +                } +		pfd = (struct posix_fd *)(long)tmp_pfd; +		 +                _fd = pfd->fd; + +		
op_ret = fsetxattr (_fd, key, value, len, flags); +		if (op_ret == -1) +			op_ret = -errno; +	} + +out: +	return op_ret; +} + + +static xattr_cache_entry_t * +__cache_lookup (xattr_cache_t *cache, inode_t *inode, char *key) +{ +	int i = 0; + +	for (i = 0; i < cache->size; i++) { +		if ((cache->entries[i]->inode == inode) +		    && (!strcmp (cache->entries[i]->key, key))) { +			cache->entries[i]->nraccess++; +			return cache->entries[i]; +		} +	} + +	return NULL; +} + + +static xattr_cache_entry_t * +__cache_least_used_entry (xattr_cache_t *cache) +{ +	xattr_cache_entry_t *lue = cache->entries[0]; +	int i; + +	for (i = 0; i < cache->size; i++) { +		if (cache->entries[i]->nraccess < lue->nraccess) +			lue = cache->entries[i]; +	} + +	lue->nraccess++; +	return lue; +} + + +static inode_t * +__inode_for_handle (xattr_cache_handle_t *handle) +{ +	inode_t *inode = NULL; + +	if (handle->loc.path) +		inode = handle->loc.inode; +	else if (handle->fd) +		inode = handle->fd->inode; + +	return inode; +} + + +static void +__free_handle (xattr_cache_handle_t *handle) +{ +	if (handle->loc.path) +		loc_wipe (&handle->loc); +	 +	FREE (handle); +} + + +static xattr_cache_handle_t * +__copy_handle (xattr_cache_handle_t *handle) +{ +	xattr_cache_handle_t *hnew = calloc (1, sizeof (xattr_cache_handle_t)); +	 +	if (handle->loc.path) +		loc_copy (&hnew->loc, &handle->loc); +	else +		hnew->fd = handle->fd; + +	return hnew; +} + + +static int +__cache_populate_entry (xattr_cache_entry_t *entry, xlator_t *this, +			xattr_cache_handle_t *handle, char *key, size_t len) +{ +	int op_ret = -1; + +	entry->array = calloc (1, len); +	if (!entry->array) { +		op_ret = -ENOMEM; +		goto out; +	} + +	op_ret = __hgetxattr (handle, this, key, entry->array, len); + +	entry->key      = strdup (key); +	entry->inode    = __inode_for_handle (handle); +	entry->handle   = __copy_handle (handle); +	entry->len      = len; +	entry->nraccess = 1; + +out: +	return op_ret; +} + + +static int +__cache_flush_entry 
(xattr_cache_entry_t *entry, xlator_t *this) +{ +	int ret = -1; + +	if (entry->dirty) { +		ret = __hsetxattr (entry->handle, this,  +				   entry->key, entry->array, entry->len, 0); +	} + +	entry->len      = 0; +	entry->nraccess = 0; +	entry->dirty    = 0; +	entry->inode    = NULL; + +	if (entry->key) { +		FREE (entry->key); +		entry->key = NULL; +	} +	 +	if (entry->array) { +		FREE (entry->array); +		entry->array = NULL; +	} + +	if (entry->handle) { +		__free_handle (entry->handle); +		entry->handle = NULL; +	} + +	return 0; +} + + +static void +__print_array (char *str, xlator_t *this, int32_t *array, size_t len) +{ +	char *ptr = NULL; +	char *buf = NULL; + +	int i, count = -1; + +	count = len / sizeof (int32_t); + +	/* 10 digits per entry + 1 space + '[' and ']' */ +	buf = malloc (count * 11 + 8); + +	ptr = buf; +	ptr += sprintf (ptr, "[ "); +	for (i = 0; i < count; i++) +		ptr += sprintf (ptr, "%d ", ntoh32 (array[i])); +	ptr += sprintf (ptr, "]"); + +	gf_log (this->name, GF_LOG_DEBUG, +		"%s%s", str, buf); + +	FREE (buf); +} + + +int  +posix_xattr_cache_read (xlator_t *this, xattr_cache_handle_t *handle,  +			char *key, int32_t *array, size_t len) +{ +	xattr_cache_entry_t *entry  = NULL; +	xattr_cache_entry_t *purgee = NULL; + +	xattr_cache_t *cache = NULL; +	inode_t *inode = NULL; + +	int op_ret = -1; + +	inode = __inode_for_handle (handle); +	 +	if (!inode) { +		gf_log (this->name, GF_LOG_DEBUG, +			"handle has no inode!"); +		goto out; +	} + +	cache = ((struct posix_private *) (this->private))->xattr_cache; + +	pthread_mutex_lock (&cache->lock); +	{ +		entry = __cache_lookup (cache, inode, key); + +		if (entry) { +			if (handle->loc.path) +				gf_log (this->name, GF_LOG_DEBUG, +					"cache hit for %s", handle->loc.path); +			else if (handle->fd) +				gf_log (this->name, GF_LOG_DEBUG, +					"cache hit for fd=%p", handle->fd); +		} + +		if (!entry) { +			purgee = __cache_least_used_entry (cache); + +			if (purgee->handle && purgee->handle->loc.path) +				
gf_log (this->name, GF_LOG_DEBUG, +					"flushing and purging entry for %s", +					purgee->handle->loc.path); +			else if (purgee->handle && purgee->handle->fd) +				gf_log (this->name, GF_LOG_DEBUG, +					"flushing and purging entry for fd=%p",  +					purgee->handle->fd); +			__cache_flush_entry (purgee, this); + +			if (handle->loc.path) +				gf_log (this->name, GF_LOG_DEBUG, +					"populating entry for %s", +					handle->loc.path); +			else if (handle->fd) +				gf_log (this->name, GF_LOG_DEBUG, +					"populating entry for fd=%p",  +					handle->fd); +			__cache_populate_entry (purgee, this, handle, key, len); + +			entry = purgee; +		} + +		memcpy (array, entry->array, len); + +		__print_array ("read array: ", this, array, len); +	} +	pthread_mutex_unlock (&cache->lock); + +	op_ret = 0; +out: +	return op_ret; +} + + +int posix_xattr_cache_write (xlator_t *this, xattr_cache_handle_t *handle,  +			     char *key, int32_t *array, size_t len) +{ +	xattr_cache_t       * cache = NULL; +	xattr_cache_entry_t * entry = NULL; + +	inode_t *inode = NULL; + +	int op_ret = -1; + +	inode = __inode_for_handle (handle); +	 +	if (!inode) { +		gf_log (this->name, GF_LOG_DEBUG, +			"handle has no inode!"); +		goto out; +	} + +	cache = ((struct posix_private *) (this->private))->xattr_cache; +	 +	pthread_mutex_lock (&cache->lock); +	{ +		entry = __cache_lookup (cache, inode, key); + +		if (entry) { +			entry->dirty = 1; +			memcpy (entry->array, array, len); +		} else { +			/* +			 * This case shouldn't usually happen, since the +			 * entry should have been brought into the cache +			 * by the previous read (xattrop always does a read & +			 * write). +			 * +			 * If we've reached here, it means things are happening +			 * very quickly and the entry was flushed after read +			 * but before this write. 
In that case, let's just +			 * write this to disk +			 */ +			  +			op_ret = __hsetxattr (handle, this, key, array, +					      len, 0); +		} + +		__print_array ("wrote array: ", this, array, len); +	} +	pthread_mutex_unlock (&cache->lock); + +	op_ret = 0; +out: +	return op_ret; +} + + +int posix_xattr_cache_flush (xlator_t *this, xattr_cache_handle_t *handle) +{ +	xattr_cache_t       *cache = NULL; +	xattr_cache_entry_t *entry = NULL; + +	int i; +	inode_t *inode = NULL; + +	int op_ret = -1; + +	inode = __inode_for_handle (handle); +	if (!inode) { +		gf_log (this->name, GF_LOG_DEBUG, +			"handle has no inode!"); +		op_ret = -EINVAL; +		goto out; +	} + +	cache = ((struct posix_private *) (this->private))->xattr_cache; + +	pthread_mutex_lock (&cache->lock); +	{ +		for (i = 0; i < cache->size; i++) { +			entry = cache->entries[i]; + +			if (entry->inode == inode) { +				if (entry->handle->loc.path) +					gf_log (this->name, GF_LOG_DEBUG, +						"force flushing entry for %s", +						entry->handle->loc.path); +				 +				else if (cache->entries[i]->handle->fd) +					gf_log (this->name, GF_LOG_DEBUG, +						"force flushing entry for fd=%p",  +						entry->handle->fd); +				 +				__cache_flush_entry (entry, this); +			} +		} +	} +	pthread_mutex_unlock (&cache->lock); + +	op_ret = 0; +out: +	return op_ret; +} + + +int +posix_xattr_cache_flush_all (xlator_t *this) +{ +	xattr_cache_t       *cache = NULL; +	xattr_cache_entry_t *entry = NULL; + +	int i; +	int op_ret = 0; + +	cache = ((struct posix_private *) (this->private))->xattr_cache; + +	pthread_mutex_lock (&cache->lock); +	{ +		gf_log (this->name, GF_LOG_DEBUG, +			"flushing entire xattr cache: "); + +		for (i = 0; i < cache->size; i++) { +			entry = cache->entries[i]; + +			if (!entry || !entry->handle) +				continue; + +			if (entry->handle->loc.path) +				gf_log (this->name, GF_LOG_DEBUG, +					"  force flushing entry for %s", +					entry->handle->loc.path); +			 +			else if (cache->entries[i]->handle->fd) +				gf_log 
(this->name, GF_LOG_DEBUG, +					"  force flushing entry for fd=%p",  +					entry->handle->fd); +			 +			__cache_flush_entry (entry, this); +		} +	} +	pthread_mutex_unlock (&cache->lock); + +	return op_ret; +} + + +xattr_cache_t * +posix_xattr_cache_init (size_t size) +{ +	int i = 0; +	xattr_cache_t * cache = NULL; +	int op_ret = -1; + +	cache = CALLOC (1, sizeof (xattr_cache_t)); +	if (!cache) { +		goto out; +	} + +	cache->entries = CALLOC (size, sizeof (xattr_cache_entry_t *)); +	if (!cache->entries) +		goto out; + +	cache->size = size; + +	for (i = 0; i < size; i++) { +		cache->entries[i] = calloc (1, sizeof (xattr_cache_entry_t)); +		if (!cache->entries[i]) +			goto out; +	} + +	pthread_mutex_init (&cache->lock, NULL); + +	op_ret = 0; +out: +	if (op_ret == -1) { +		if (cache) { +			if (cache->entries) { +				for (i = 0; i < size; i++) +					if (cache->entries[i]) +						FREE (cache->entries[i]); + +				FREE (cache->entries); +			} +			 +			FREE (cache); +		} +	} + +	return cache; +} diff --git a/xlators/storage/posix/src/xattr-cache.h b/xlators/storage/posix/src/xattr-cache.h new file mode 100644 index 00000000000..3e12742a90f --- /dev/null +++ b/xlators/storage/posix/src/xattr-cache.h @@ -0,0 +1,65 @@ +/* +  Copyright (c) 2009 Z RESEARCH, Inc. <http://www.zresearch.com> +  This file is part of GlusterFS. + +  GlusterFS is free software; you can redistribute it and/or modify +  it under the terms of the GNU General Public License as published +  by the Free Software Foundation; either version 3 of the License, +  or (at your option) any later version. + +  GlusterFS is distributed in the hope that it will be useful, but +  WITHOUT ANY WARRANTY; without even the implied warranty of +  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU +  General Public License for more details. + +  You should have received a copy of the GNU General Public License +  along with this program.  If not, see +  <http://www.gnu.org/licenses/>. 
+*/
+
+#ifndef __XATTR_CACHE_H__
+#define __XATTR_CACHE_H__
+
+
+#include "glusterfs.h"
+#include "inode.h"
+
+/*
+ * Identifies the object an xattr operation targets.  The implementation in
+ * xattr-cache.c uses loc when loc.path is set and falls back to fd otherwise.
+ */
+typedef struct __xattr_cache_handle {
+	loc_t loc;
+	fd_t  *fd;
+} xattr_cache_handle_t;
+
+
+/* One cache slot: the value of a single xattr of a single inode. */
+typedef struct __xattr_cache_entry {
+	char *key;               /* name of the xattr */
+	int32_t *array;          /* value */
+	size_t len;              /* length of array in bytes */
+	inode_t *inode;          /* inode for which the entry is for */
+
+	xattr_cache_handle_t *handle; /* private copy of the handle, used to
+					 write the value back to disk */
+	unsigned char dirty;     /* non-zero when array was modified and still
+				    has to be written back */
+	unsigned long nraccess;  /* number of times accessed */
+} xattr_cache_entry_t;
+
+
+/* The cache itself: a fixed number of slots guarded by one mutex. */
+typedef struct __xattr_cache {
+	size_t size;                   /* number of slots in entries */
+	pthread_mutex_t lock;          /* held by the read/write/flush calls
+					  while they touch entries */
+	xattr_cache_entry_t **entries; /* array of size slot pointers */
+} xattr_cache_t;
+
+
+/* Allocate a cache with `size` zero-initialised slots. */
+xattr_cache_t * posix_xattr_cache_init (size_t size);
+
+/*
+ * Copy the value of xattr `key` for the handle's inode into `array`,
+ * loading it into the cache on a miss (the least-accessed slot is flushed
+ * and reused).  Returns 0 on success and a negative value on failure.
+ */
+int posix_xattr_cache_read (xlator_t *this, xattr_cache_handle_t *handle, 
+			    char *key, int32_t *array, size_t len);
+
+/*
+ * Store `len` bytes from `array` as the value of xattr `key`.  A cached
+ * entry is updated in memory and marked dirty; an uncached one is written
+ * to disk directly.  Returns 0 on success and a negative value on failure.
+ */
+int posix_xattr_cache_write (xlator_t *this, xattr_cache_handle_t *handle,
+			     char *key, int32_t *array, size_t len);
+
+/*
+ * Write back (if dirty) and drop every cached entry that belongs to the
+ * handle's inode.  Returns 0 on success and a negative value on failure.
+ */
+int posix_xattr_cache_flush (xlator_t *this, xattr_cache_handle_t *handle);
+
+/* Write back (if dirty) and drop every populated entry of the cache. */
+int posix_xattr_cache_flush_all (xlator_t *this);
+
+
+#endif /* __XATTR_CACHE_H__ */  | 
