summaryrefslogtreecommitdiffstats
path: root/xlators/storage/bdb/src/bdb-ll.c
diff options
context:
space:
mode:
Diffstat (limited to 'xlators/storage/bdb/src/bdb-ll.c')
-rw-r--r--xlators/storage/bdb/src/bdb-ll.c1464
1 files changed, 0 insertions, 1464 deletions
diff --git a/xlators/storage/bdb/src/bdb-ll.c b/xlators/storage/bdb/src/bdb-ll.c
deleted file mode 100644
index f70ec47f494..00000000000
--- a/xlators/storage/bdb/src/bdb-ll.c
+++ /dev/null
@@ -1,1464 +0,0 @@
-/*
- Copyright (c) 2008-2011 Gluster, Inc. <http://www.gluster.com>
- This file is part of GlusterFS.
-
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
-*/
-
-#include <libgen.h>
-#include "bdb.h"
-#include <list.h>
-#include "hashfn.h"
-/*
- * implement the procedures to interact with bdb */
-
-/****************************************************************
- *
- * General wrappers and utility procedures for bdb xlator
- *
- ****************************************************************/
-
-ino_t
-bdb_inode_transform (ino_t parent,
- const char *name,
- size_t namelen)
-{
- ino_t ino = -1;
- uint64_t hash = 0;
-
- hash = gf_dm_hashfn (name, namelen);
-
- ino = (((parent << 32) | 0x00000000ffffffffULL)
- & (hash | 0xffffffff00000000ULL));
-
- return ino;
-}
-
-static int
-bdb_generate_secondary_hash (DB *secondary,
- const DBT *pkey,
- const DBT *data,
- DBT *skey)
-{
- char *primary = NULL;
- uint32_t *hash = NULL;
-
- primary = pkey->data;
-
- hash = GF_CALLOC (1, sizeof (uint32_t), gf_bdb_mt_uint32_t);
-
- *hash = gf_dm_hashfn (primary, pkey->size);
-
- skey->data = hash;
- skey->size = sizeof (hash);
- skey->flags = DB_DBT_APPMALLOC;
-
- return 0;
-}
-
-/***********************************************************
- *
- * bdb storage database utilities
- *
- **********************************************************/
-
-/*
- * bdb_db_open - opens a storage db.
- *
- * @ctx: context specific to the directory for which we are supposed to open db
- *
- * see, if we have empty slots to open a db.
- * if (no-empty-slots), then prune open dbs and close as many as possible
- * if (empty-slot-available), tika muchkonDu db open maaDu
- *
- */
-static int
-bdb_db_open (bctx_t *bctx)
-{
- DB *primary = NULL;
- DB *secondary = NULL;
- int32_t ret = -1;
- bctx_table_t *table = NULL;
-
- GF_VALIDATE_OR_GOTO ("bdb-ll", bctx, out);
-
- table = bctx->table;
- GF_VALIDATE_OR_GOTO ("bdb-ll", table, out);
-
- /* we have to do the following, we can't deny someone of db_open ;) */
- ret = db_create (&primary, table->dbenv, 0);
- if (ret < 0) {
- gf_log ("bdb-ll", GF_LOG_DEBUG,
- "_BDB_DB_OPEN %s: %s (failed to create database object"
- " for primary database)",
- bctx->directory, db_strerror (ret));
- ret = -ENOMEM;
- goto out;
- }
-
- if (table->page_size) {
- ret = primary->set_pagesize (primary,
- table->page_size);
- if (ret < 0) {
- gf_log ("bdb-ll", GF_LOG_DEBUG,
- "_BDB_DB_OPEN %s: %s (failed to set page-size "
- "to %"PRIu64")",
- bctx->directory, db_strerror (ret),
- table->page_size);
- } else {
- gf_log ("bdb-ll", GF_LOG_DEBUG,
- "_BDB_DB_OPEN %s: page-size set to %"PRIu64,
- bctx->directory, table->page_size);
- }
- }
-
- ret = primary->open (primary, NULL, bctx->db_path, "primary",
- table->access_mode, table->dbflags, 0);
- if (ret < 0) {
- gf_log ("bdb-ll", GF_LOG_ERROR,
- "_BDB_DB_OPEN %s: %s "
- "(failed to open primary database)",
- bctx->directory, db_strerror (ret));
- ret = -1;
- goto cleanup;
- }
-
- ret = db_create (&secondary, table->dbenv, 0);
- if (ret < 0) {
- gf_log ("bdb-ll", GF_LOG_DEBUG,
- "_BDB_DB_OPEN %s: %s (failed to create database object"
- " for secondary database)",
- bctx->directory, db_strerror (ret));
- ret = -ENOMEM;
- goto cleanup;
- }
-
- ret = secondary->open (secondary, NULL, bctx->db_path, "secondary",
- table->access_mode, table->dbflags, 0);
- if (ret != 0 ) {
- gf_log ("bdb-ll", GF_LOG_ERROR,
- "_BDB_DB_OPEN %s: %s "
- "(failed to open secondary database)",
- bctx->directory, db_strerror (ret));
- ret = -1;
- goto cleanup;
- }
-
- ret = primary->associate (primary, NULL, secondary,
- bdb_generate_secondary_hash,
-#ifdef DB_IMMUTABLE_KEY
- DB_IMMUTABLE_KEY);
-#else
- 0);
-#endif
- if (ret != 0 ) {
- gf_log ("bdb-ll", GF_LOG_ERROR,
- "_BDB_DB_OPEN %s: %s "
- "(failed to associate primary database with "
- "secondary database)",
- bctx->directory, db_strerror (ret));
- ret = -1;
- goto cleanup;
- }
-
-out:
- bctx->primary = primary;
- bctx->secondary = secondary;
-
- return ret;
-cleanup:
- if (primary)
- primary->close (primary, 0);
- if (secondary)
- secondary->close (secondary, 0);
-
- return ret;
-}
-
-int32_t
-bdb_cursor_close (bctx_t *bctx,
- DBC *cursorp)
-{
- int32_t ret = -1;
-
- GF_VALIDATE_OR_GOTO ("bdb-ll", bctx, out);
- GF_VALIDATE_OR_GOTO ("bdb-ll", cursorp, out);
-
- LOCK (&bctx->lock);
- {
-#ifdef HAVE_BDB_CURSOR_GET
- ret = cursorp->close (cursorp);
-#else
- ret = cursorp->c_close (cursorp);
-#endif
- if (ret < 0) {
- gf_log ("bdb-ll", GF_LOG_DEBUG,
- "_BDB_CURSOR_CLOSE %s: %s "
- "(failed to close database cursor)",
- bctx->directory, db_strerror (ret));
- }
- }
- UNLOCK (&bctx->lock);
-
-out:
- return ret;
-}
-
-
-int32_t
-bdb_cursor_open (bctx_t *bctx,
- DBC **cursorpp)
-{
- int32_t ret = -1;
-
- GF_VALIDATE_OR_GOTO ("bdb-ll", bctx, out);
- GF_VALIDATE_OR_GOTO ("bdb-ll", cursorpp, out);
-
- LOCK (&bctx->lock);
- {
- if (bctx->secondary) {
- /* do nothing, just continue */
- ret = 0;
- } else {
- ret = bdb_db_open (bctx);
- if (ret < 0) {
- gf_log ("bdb-ll", GF_LOG_DEBUG,
- "_BDB_CURSOR_OPEN %s: ENOMEM "
- "(failed to open secondary database)",
- bctx->directory);
- ret = -ENOMEM;
- } else {
- ret = 0;
- }
- }
-
- if (ret == 0) {
- /* all set, open cursor */
- ret = bctx->secondary->cursor (bctx->secondary,
- NULL, cursorpp, 0);
- if (ret < 0) {
- gf_log ("bdb-ll", GF_LOG_DEBUG,
- "_BDB_CURSOR_OPEN %s: %s "
- "(failed to open a cursor to database)",
- bctx->directory, db_strerror (ret));
- }
- }
- }
- UNLOCK (&bctx->lock);
-
-out:
- return ret;
-}
-
-
-/* cache related */
-static bdb_cache_t *
-bdb_cache_lookup (bctx_t *bctx,
- char *path)
-{
- bdb_cache_t *bcache = NULL;
- bdb_cache_t *trav = NULL;
- char *key = NULL;
-
- GF_VALIDATE_OR_GOTO ("bdb-ll", bctx, out);
- GF_VALIDATE_OR_GOTO ("bdb-ll", path, out);
-
- MAKE_KEY_FROM_PATH (key, path);
-
- LOCK (&bctx->lock);
- {
- list_for_each_entry (trav, &bctx->c_list, c_list) {
- if (!strcmp (trav->key, key)){
- bcache = trav;
- break;
- }
- }
- }
- UNLOCK (&bctx->lock);
-
-out:
- return bcache;
-}
-
-static int32_t
-bdb_cache_insert (bctx_t *bctx,
- DBT *key,
- DBT *data)
-{
- bdb_cache_t *bcache = NULL;
- int32_t ret = -1;
-
- GF_VALIDATE_OR_GOTO ("bdb-ll", bctx, out);
- GF_VALIDATE_OR_GOTO ("bdb-ll", key, out);
- GF_VALIDATE_OR_GOTO ("bdb-ll", data, out);
-
- LOCK (&bctx->lock);
- {
- if (bctx->c_count > 5) {
- /* most of the times, we enter here */
- /* FIXME: ugly, not supposed to disect any of the
- * 'struct list_head' directly */
- if (!list_empty (&bctx->c_list)) {
- bcache = list_entry (bctx->c_list.prev,
- bdb_cache_t, c_list);
- list_del_init (&bcache->c_list);
- }
- if (bcache->key) {
- GF_FREE (bcache->key);
- bcache->key = GF_CALLOC (key->size + 1,
- sizeof (char),
- gf_bdb_mt_char);
- GF_VALIDATE_OR_GOTO ("bdb-ll",
- bcache->key, unlock);
- memcpy (bcache->key, (char *)key->data,
- key->size);
- } else {
- /* should never come here */
- gf_log ("bdb-ll", GF_LOG_DEBUG,
- "_BDB_CACHE_INSERT %s (%s) "
- "(found a cache entry with empty key)",
- bctx->directory, (char *)key->data);
- } /* if(bcache->key)...else */
- if (bcache->data) {
- GF_FREE (bcache->data);
- bcache->data = memdup (data->data, data->size);
- GF_VALIDATE_OR_GOTO ("bdb-ll", bcache->data,
- unlock);
- bcache->size = data->size;
- } else {
- /* should never come here */
- gf_log ("bdb-ll", GF_LOG_CRITICAL,
- "_BDB_CACHE_INSERT %s (%s) "
- "(found a cache entry with no data)",
- bctx->directory, (char *)key->data);
- } /* if(bcache->data)...else */
- list_add (&bcache->c_list, &bctx->c_list);
- ret = 0;
- } else {
- /* we will be entering here very rarely */
- bcache = GF_CALLOC (1, sizeof (*bcache),
- gf_bdb_mt_bdb_cache_t);
- GF_VALIDATE_OR_GOTO ("bdb-ll", bcache, unlock);
-
- bcache->key = GF_CALLOC (key->size + 1, sizeof (char),
- gf_bdb_mt_char);
- GF_VALIDATE_OR_GOTO ("bdb-ll", bcache->key, unlock);
- memcpy (bcache->key, key->data, key->size);
-
- bcache->data = memdup (data->data, data->size);
- GF_VALIDATE_OR_GOTO ("bdb-ll", bcache->data, unlock);
-
- bcache->size = data->size;
- list_add (&bcache->c_list, &bctx->c_list);
- bctx->c_count++;
- ret = 0;
- } /* if(private->c_count < 5)...else */
- }
-unlock:
- UNLOCK (&bctx->lock);
-out:
- return ret;
-}
-
-static int32_t
-bdb_cache_delete (bctx_t *bctx,
- const char *key)
-{
- bdb_cache_t *bcache = NULL;
- bdb_cache_t *trav = NULL;
-
- GF_VALIDATE_OR_GOTO ("bdb-ll", bctx, out);
- GF_VALIDATE_OR_GOTO ("bdb-ll", key, out);
-
- LOCK (&bctx->lock);
- {
- list_for_each_entry (trav, &bctx->c_list, c_list) {
- if (!strcmp (trav->key, key)){
- bctx->c_count--;
- bcache = trav;
- break;
- }
- }
-
- if (bcache) {
- list_del_init (&bcache->c_list);
- GF_FREE (bcache->key);
- GF_FREE (bcache->data);
- GF_FREE (bcache);
- }
- }
- UNLOCK (&bctx->lock);
-
-out:
- return 0;
-}
-
-void *
-bdb_db_stat (bctx_t *bctx,
- DB_TXN *txnid,
- uint32_t flags)
-{
- DB *storage = NULL;
- void *stat = NULL;
- int32_t ret = -1;
-
- LOCK (&bctx->lock);
- {
- if (bctx->primary == NULL) {
- ret = bdb_db_open (bctx);
- storage = bctx->primary;
- } else {
- /* we are just fine, lets continue */
- storage = bctx->primary;
- } /* if(bctx->dbp==NULL)...else */
- }
- UNLOCK (&bctx->lock);
-
- GF_VALIDATE_OR_GOTO ("bdb-ll", storage, out);
-
- ret = storage->stat (storage, txnid, &stat, flags);
-
- if (ret < 0) {
- gf_log ("bdb-ll", GF_LOG_DEBUG,
- "_BDB_DB_STAT %s: %s "
- "(failed to do stat database)",
- bctx->directory, db_strerror (ret));
- }
-out:
- return stat;
-
-}
-
-/* bdb_storage_get - retrieve a key/value pair corresponding to @path from the
- * corresponding db file.
- *
- * @bctx: bctx_t * corresponding to the parent directory of @path. (should
- * always be a valid bctx). bdb_storage_get should never be called if
- * @bctx = NULL.
- * @txnid: NULL if bdb_storage_get is not embedded in an explicit transaction
- * or a valid DB_TXN *, when embedded in an explicit transaction.
- * @path: path of the file to read from (translated to a database key using
- * MAKE_KEY_FROM_PATH)
- * @buf: char ** - pointer to a pointer to char. a read buffer is created in
- * this procedure and pointer to the buffer is passed through @buf to the
- * caller.
- * @size: size of the file content to be read.
- * @offset: offset from which the file content to be read.
- *
- * NOTE: bdb_storage_get tries to open DB, if @bctx->dbp == NULL
- * (@bctx->dbp == NULL, nobody has opened DB till now or DB was closed by
- * bdb_table_prune()).
- *
- * NOTE: if private->cache is set (bdb xlator's internal caching enabled), then
- * bdb_storage_get first looks up the cache for key/value pair. if
- * bdb_lookup_cache fails, then only DB->get() is called. also, inserts a
- * newly read key/value pair to cache through bdb_insert_to_cache.
- *
- * return: 'number of bytes read' on success or -1 on error.
- *
- * also see: bdb_lookup_cache, bdb_insert_to_cache for details about bdb
- * xlator's internal cache.
- */
-static int32_t
-bdb_db_get (bctx_t *bctx,
- DB_TXN *txnid,
- const char *path,
- char *buf,
- size_t size,
- off_t offset)
-{
- DB *storage = NULL;
- DBT key = {0,};
- DBT value = {0,};
- int32_t ret = -1;
- size_t copy_size = 0;
- char *key_string = NULL;
- bdb_cache_t *bcache = NULL;
- int32_t db_flags = 0;
- uint8_t need_break = 0;
- int32_t retries = 1;
-
- GF_VALIDATE_OR_GOTO ("bdb-ll", bctx, out);
- GF_VALIDATE_OR_GOTO ("bdb-ll", path, out);
-
- MAKE_KEY_FROM_PATH (key_string, path);
-
- if (bctx->cache &&
- ((bcache = bdb_cache_lookup (bctx, key_string)) != NULL)) {
- if (buf) {
- copy_size = ((bcache->size - offset) < size)?
- (bcache->size - offset) : size;
-
- memcpy (buf, (bcache->data + offset), copy_size);
- ret = copy_size;
- } else {
- ret = bcache->size;
- }
-
- goto out;
- }
-
- LOCK (&bctx->lock);
- {
- if (bctx->primary == NULL) {
- ret = bdb_db_open (bctx);
- storage = bctx->primary;
- } else {
- /* we are just fine, lets continue */
- storage = bctx->primary;
- } /* if(bctx->dbp==NULL)...else */
- }
- UNLOCK (&bctx->lock);
-
- GF_VALIDATE_OR_GOTO ("bdb-ll", storage, out);
-
- key.data = (char *)key_string;
- key.size = strlen (key_string);
- key.flags = DB_DBT_USERMEM;
-
- if (bctx->cache){
- value.flags = DB_DBT_MALLOC;
- } else {
- if (size) {
- value.data = buf;
- value.ulen = size;
- value.flags = DB_DBT_USERMEM | DB_DBT_PARTIAL;
- } else {
- value.flags = DB_DBT_MALLOC;
- }
- value.dlen = size;
- value.doff = offset;
- }
-
- do {
- /* TODO: we prefer to give our own buffer to value.data
- * and ask bdb to fill in it */
- ret = storage->get (storage, txnid, &key, &value,
- db_flags);
-
- if (ret == DB_NOTFOUND) {
- gf_log ("bdb-ll", GF_LOG_DEBUG,
- "_BDB_DB_GET %s - %s: ENOENT"
- "(specified key not found in database)",
- bctx->directory, key_string);
- ret = -1;
- need_break = 1;
- } else if (ret == DB_LOCK_DEADLOCK) {
- retries++;
- gf_log ("bdb-ll", GF_LOG_DEBUG,
- "_BDB_DB_GET %s - %s"
- "(deadlock detected, retrying for %d "
- "time)",
- bctx->directory, key_string, retries);
- } else if (ret == 0) {
- /* successfully read data, lets set everything
- * in place and return */
- if (bctx->cache) {
- if (buf) {
- copy_size = ((value.size - offset) < size) ?
- (value.size - offset) : size;
-
- memcpy (buf, (value.data + offset),
- copy_size);
- ret = copy_size;
- }
-
- bdb_cache_insert (bctx, &key, &value);
- } else {
- ret = value.size;
- }
-
- if (size == 0)
- GF_FREE (value.data);
-
- need_break = 1;
- } else {
- gf_log ("bdb-ll", GF_LOG_DEBUG,
- "_BDB_DB_GET %s - %s: %s"
- "(failed to retrieve specified key from"
- " database)",
- bctx->directory, key_string,
- db_strerror (ret));
- ret = -1;
- need_break = 1;
- }
- } while (!need_break);
-
-out:
- return ret;
-}/* bdb_db_get */
-
-/* TODO: handle errors here and log. propogate only the errno to caller */
-int32_t
-bdb_db_fread (struct bdb_fd *bfd, char *buf, size_t size, off_t offset)
-{
- return bdb_db_get (bfd->ctx, NULL, bfd->key, buf, size, offset);
-}
-
-int32_t
-bdb_db_iread (struct bdb_ctx *bctx, const char *key, char **bufp)
-{
- char *buf = NULL;
- size_t size = 0;
- int64_t ret = 0;
-
- ret = bdb_db_get (bctx, NULL, key, NULL, 0, 0);
- size = ret;
-
- if (bufp) {
- buf = GF_CALLOC (size, sizeof (char), gf_bdb_mt_char);
- *bufp = buf;
- ret = bdb_db_get (bctx, NULL, key, buf, size, 0);
- }
-
- return ret;
-}
-
-/* bdb_storage_put - insert a key/value specified to the corresponding DB.
- *
- * @bctx: bctx_t * corresponding to the parent directory of @path.
- * (should always be a valid bctx). bdb_storage_put should never be
- * called if @bctx = NULL.
- * @txnid: NULL if bdb_storage_put is not embedded in an explicit transaction
- * or a valid DB_TXN *, when embedded in an explicit transaction.
- * @key_string: key of the database entry.
- * @buf: pointer to the buffer data to be written as data for @key_string.
- * @size: size of @buf.
- * @offset: offset in the key's data to be modified with provided data.
- * @flags: valid flags are BDB_TRUNCATE_RECORD (to reduce the data of
- * @key_string to 0 size).
- *
- * NOTE: bdb_storage_put tries to open DB, if @bctx->dbp == NULL
- * (@bctx->dbp == NULL, nobody has opened DB till now or DB was closed by
- * bdb_table_prune()).
- *
- * NOTE: bdb_storage_put deletes the key/value from bdb xlator's internal cache.
- *
- * return: 0 on success or -1 on error.
- *
- * also see: bdb_cache_delete for details on how a cached key/value pair is
- * removed.
- */
-static int32_t
-bdb_db_put (bctx_t *bctx,
- DB_TXN *txnid,
- const char *key_string,
- const char *buf,
- size_t size,
- off_t offset,
- int32_t flags)
-{
- DB *storage = NULL;
- DBT key = {0,}, value = {0,};
- int32_t ret = -1;
- int32_t db_flags = DB_AUTO_COMMIT;
- uint8_t need_break = 0;
- int32_t retries = 1;
-
- LOCK (&bctx->lock);
- {
- if (bctx->primary == NULL) {
- ret = bdb_db_open (bctx);
- storage = bctx->primary;
- } else {
- /* we are just fine, lets continue */
- storage = bctx->primary;
- }
- }
- UNLOCK (&bctx->lock);
-
- GF_VALIDATE_OR_GOTO ("bdb-ll", storage, out);
-
- if (bctx->cache) {
- ret = bdb_cache_delete (bctx, (char *)key_string);
- GF_VALIDATE_OR_GOTO ("bdb-ll", (ret == 0), out);
- }
-
- key.data = (void *)key_string;
- key.size = strlen (key_string);
-
- /* NOTE: bdb lets us expand the file, suppose value.size > value.len,
- * then value.len bytes from value.doff offset and value.size bytes
- * will be written from value.doff and data from
- * value.doff + value.dlen will be pushed value.doff + value.size
- */
- value.data = (void *)buf;
-
- if (flags & BDB_TRUNCATE_RECORD) {
- value.size = size;
- value.doff = 0;
- value.dlen = offset;
- } else {
- value.size = size;
- value.dlen = size;
- value.doff = offset;
- }
- value.flags = DB_DBT_PARTIAL;
- if (buf == NULL && size == 0)
- /* truncate called us */
- value.flags = 0;
-
- do {
- ret = storage->put (storage, txnid, &key, &value, db_flags);
- if (ret == DB_LOCK_DEADLOCK) {
- retries++;
- gf_log ("bdb-ll", GF_LOG_DEBUG,
- "_BDB_DB_PUT %s - %s"
- "(deadlock detected, retying for %d time)",
- bctx->directory, key_string, retries);
- } else if (ret) {
- /* write failed */
- gf_log ("bdb-ll", GF_LOG_DEBUG,
- "_BDB_DB_PUT %s - %s: %s"
- "(failed to put specified entry into database)",
- bctx->directory, key_string, db_strerror (ret));
- need_break = 1;
- } else {
- /* successfully wrote */
- ret = 0;
- need_break = 1;
- }
- } while (!need_break);
-out:
- return ret;
-}/* bdb_db_put */
-
-int32_t
-bdb_db_icreate (struct bdb_ctx *bctx, const char *key)
-{
- return bdb_db_put (bctx, NULL, key, NULL, 0, 0, 0);
-}
-
-/* TODO: handle errors here and log. propogate only the errno to caller */
-int32_t
-bdb_db_fwrite (struct bdb_fd *bfd, char *buf, size_t size, off_t offset)
-{
- return bdb_db_put (bfd->ctx, NULL, bfd->key, buf, size, offset, 0);
-}
-
-/* TODO: handle errors here and log. propogate only the errno to caller */
-int32_t
-bdb_db_iwrite (struct bdb_ctx *bctx, const char *key, char *buf, size_t size)
-{
- return bdb_db_put (bctx, NULL, key, buf, size, 0, 0);
-}
-
-int32_t
-bdb_db_itruncate (struct bdb_ctx *bctx, const char *key)
-{
- return bdb_db_put (bctx, NULL, key, NULL, 0, 1, 0);
-}
-
-/* bdb_storage_del - delete a key/value pair corresponding to @path from
- * corresponding db file.
- *
- * @bctx: bctx_t * corresponding to the parent directory of @path.
- * (should always be a valid bctx). bdb_storage_del should never be called
- * if @bctx = NULL.
- * @txnid: NULL if bdb_storage_del is not embedded in an explicit transaction
- * or a valid DB_TXN *, when embedded in an explicit transaction.
- * @path: path to the file, whose key/value pair has to be deleted.
- *
- * NOTE: bdb_storage_del tries to open DB, if @bctx->dbp == NULL
- * (@bctx->dbp == NULL, nobody has opened DB till now or DB was closed by
- * bdb_table_prune()).
- *
- * return: 0 on success or -1 on error.
- */
-static int32_t
-bdb_db_del (bctx_t *bctx,
- DB_TXN *txnid,
- const char *key_string)
-{
- DB *storage = NULL;
- DBT key = {0,};
- int32_t ret = -1;
- int32_t db_flags = 0;
- uint8_t need_break = 0;
- int32_t retries = 1;
-
- LOCK (&bctx->lock);
- {
- if (bctx->primary == NULL) {
- ret = bdb_db_open (bctx);
- storage = bctx->primary;
- } else {
- /* we are just fine, lets continue */
- storage = bctx->primary;
- }
- }
- UNLOCK (&bctx->lock);
-
- GF_VALIDATE_OR_GOTO ("bdb-ll", storage, out);
-
- ret = bdb_cache_delete (bctx, key_string);
- GF_VALIDATE_OR_GOTO ("bdb-ll", (ret == 0), out);
-
- key.data = (char *)key_string;
- key.size = strlen (key_string);
- key.flags = DB_DBT_USERMEM;
-
- do {
- ret = storage->del (storage, txnid, &key, db_flags);
-
- if (ret == DB_NOTFOUND) {
- gf_log ("bdb-ll", GF_LOG_DEBUG,
- "_BDB_DB_DEL %s - %s: ENOENT"
- "(failed to delete entry, could not be "
- "found in the database)",
- bctx->directory, key_string);
- need_break = 1;
- } else if (ret == DB_LOCK_DEADLOCK) {
- retries++;
- gf_log ("bdb-ll", GF_LOG_DEBUG,
- "_BDB_DB_DEL %s - %s"
- "(deadlock detected, retying for %d time)",
- bctx->directory, key_string, retries);
- } else if (ret == 0) {
- /* successfully deleted the entry */
- gf_log ("bdb-ll", GF_LOG_DEBUG,
- "_BDB_DB_DEL %s - %s"
- "(successfully deleted entry from database)",
- bctx->directory, key_string);
- ret = 0;
- need_break = 1;
- } else {
- gf_log ("bdb-ll", GF_LOG_DEBUG,
- "_BDB_DB_DEL %s - %s: %s"
- "(failed to delete entry from database)",
- bctx->directory, key_string, db_strerror (ret));
- ret = -1;
- need_break = 1;
- }
- } while (!need_break);
-out:
- return ret;
-}
-
-int32_t
-bdb_db_iremove (bctx_t *bctx,
- const char *key)
-{
- return bdb_db_del (bctx, NULL, key);
-}
-
-/* NOTE: bdb version compatibility wrapper */
-int32_t
-bdb_cursor_get (DBC *cursorp,
- DBT *sec, DBT *pri,
- DBT *val,
- int32_t flags)
-{
- int32_t ret = -1;
-
- GF_VALIDATE_OR_GOTO ("bdb-ll", cursorp, out);
-
-#ifdef HAVE_BDB_CURSOR_GET
- ret = cursorp->pget (cursorp, sec, pri, val, flags);
-#else
- ret = cursorp->c_pget (cursorp, sec, pri, val, flags);
-#endif
- if ((ret != 0) && (ret != DB_NOTFOUND)) {
- gf_log ("bdb-ll", GF_LOG_DEBUG,
- "_BDB_CURSOR_GET: %s"
- "(failed to retrieve entry from database cursor)",
- db_strerror (ret));
- }
-
-out:
- return ret;
-}/* bdb_cursor_get */
-
-int32_t
-bdb_dirent_size (DBT *key)
-{
- return GF_DIR_ALIGN (24 /* FIX MEEEE!!! */ + key->size);
-}
-
-
-
-/* bdb_dbenv_init - initialize DB_ENV
- *
- * initialization includes:
- * 1. opening DB_ENV (db_env_create(), DB_ENV->open()).
- * NOTE: see private->envflags for flags used.
- * 2. DB_ENV->set_lg_dir - set log directory to be used for storing log files
- * (log files are the files in which transaction logs are written by db).
- * 3. DB_ENV->set_flags (DB_LOG_AUTOREMOVE) - set DB_ENV to automatically
- * clear the unwanted log files (flushed at each checkpoint).
- * 4. DB_ENV->set_errfile - set errfile to be used by db to report detailed
- * error logs. used only for debbuging purpose.
- *
- * return: returns a valid DB_ENV * on success or NULL on error.
- *
- */
-static DB_ENV *
-bdb_dbenv_init (xlator_t *this,
- char *directory)
-{
- /* Create a DB environment */
- DB_ENV *dbenv = NULL;
- int32_t ret = 0;
- bdb_private_t *private = NULL;
- int32_t fatal_flags = 0;
-
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (directory, err);
-
- private = this->private;
- VALIDATE_OR_GOTO (private, err);
-
- ret = db_env_create (&dbenv, 0);
- VALIDATE_OR_GOTO ((ret == 0), err);
-
- /* NOTE: set_errpfx returns 'void' */
- dbenv->set_errpfx(dbenv, this->name);
-
- ret = dbenv->set_lk_detect (dbenv, DB_LOCK_DEFAULT);
- VALIDATE_OR_GOTO ((ret == 0), err);
-
- ret = dbenv->open(dbenv, directory,
- private->envflags,
- S_IRUSR | S_IWUSR);
- if ((ret != 0) && (ret != DB_RUNRECOVERY)) {
- gf_log (this->name, GF_LOG_CRITICAL,
- "failed to join Berkeley DB environment at %s: %s."
- "please run manual recovery and retry running "
- "glusterfs",
- directory, db_strerror (ret));
- dbenv = NULL;
- goto err;
- } else if (ret == DB_RUNRECOVERY) {
- fatal_flags = ((private->envflags & (~DB_RECOVER))
- | DB_RECOVER_FATAL);
- ret = dbenv->open(dbenv, directory, fatal_flags,
- S_IRUSR | S_IWUSR);
- if (ret != 0) {
- gf_log (this->name, GF_LOG_CRITICAL,
- "failed to join Berkeley DB environment in "
- "recovery mode at %s: %s. please run manual "
- "recovery and retry running glusterfs",
- directory, db_strerror (ret));
- dbenv = NULL;
- goto err;
- }
- }
-
- ret = 0;
-#if (DB_VERSION_MAJOR == 4 && \
- DB_VERSION_MINOR == 7)
- if (private->log_auto_remove) {
- ret = dbenv->log_set_config (dbenv, DB_LOG_AUTO_REMOVE, 1);
- } else {
- ret = dbenv->log_set_config (dbenv, DB_LOG_AUTO_REMOVE, 0);
- }
-#else
- if (private->log_auto_remove) {
- ret = dbenv->set_flags (dbenv, DB_LOG_AUTOREMOVE, 1);
- } else {
- ret = dbenv->set_flags (dbenv, DB_LOG_AUTOREMOVE, 0);
- }
-#endif
- if (ret < 0) {
- gf_log ("bdb-ll", GF_LOG_ERROR,
- "autoremoval of transactional log files could not be "
- "configured (%s). you may have to do a manual "
- "monitoring of transactional log files and remove "
- "periodically.",
- db_strerror (ret));
- goto err;
- }
-
- if (private->transaction) {
- ret = dbenv->set_flags(dbenv, DB_AUTO_COMMIT, 1);
-
- if (ret != 0) {
- gf_log ("bdb-ll", GF_LOG_DEBUG,
- "configuration of auto-commit failed for "
- "database environment at %s. none of the "
- "operations will be embedded in transaction "
- "unless explicitly done so.",
- db_strerror (ret));
- goto err;
- }
-
- if (private->txn_timeout) {
- ret = dbenv->set_timeout (dbenv, private->txn_timeout,
- DB_SET_TXN_TIMEOUT);
- if (ret != 0) {
- gf_log ("bdb-ll", GF_LOG_ERROR,
- "could not configure Berkeley DB "
- "transaction timeout to %d (%s). please"
- " review 'option transaction-timeout %d"
- "' option.",
- private->txn_timeout,
- db_strerror (ret),
- private->txn_timeout);
- goto err;
- }
- }
-
- if (private->lock_timeout) {
- ret = dbenv->set_timeout(dbenv,
- private->txn_timeout,
- DB_SET_LOCK_TIMEOUT);
- if (ret < 0) {
- gf_log ("bdb-ll", GF_LOG_ERROR,
- "could not configure Berkeley DB "
- "lock timeout to %d (%s). please"
- " review 'option lock-timeout %d"
- "' option.",
- private->lock_timeout,
- db_strerror (ret),
- private->lock_timeout);
- goto err;
- }
- }
-
- ret = dbenv->set_lg_dir (dbenv, private->logdir);
- if (ret < 0) {
- gf_log ("bdb-ll", GF_LOG_ERROR,
- "failed to configure libdb transaction log "
- "directory at %s. please review the "
- "'option logdir %s' option.",
- db_strerror (ret), private->logdir);
- goto err;
- }
- }
-
- if (private->errfile) {
- private->errfp = fopen (private->errfile, "a+");
- if (private->errfp) {
- dbenv->set_errfile (dbenv, private->errfp);
- } else {
- gf_log ("bdb-ll", GF_LOG_ERROR,
- "failed to open error logging file for "
- "libdb (Berkeley DB) internal logging (%s)."
- "please review the 'option errfile %s' option.",
- strerror (errno), private->errfile);
- goto err;
- }
- }
-
- return dbenv;
-err:
- if (dbenv) {
- dbenv->close (dbenv, 0);
- }
-
- return NULL;
-}
-
-#define BDB_ENV(this) ((((struct bdb_private *)this->private)->b_table)->dbenv)
-
-/* bdb_checkpoint - during transactional usage, db does not directly write the
- * data to db files, instead db writes a 'log' (similar to a journal entry)
- * into a log file. db normally clears the log files during opening of an
- * environment. since we expect a filesystem server to run for a pretty long
- * duration and flushing 'log's during dbenv->open would prove very costly, if
- * we accumulate the log entries for one complete run of glusterfs server. to
- * flush the logs frequently, db provides a mechanism called 'checkpointing'.
- * when we do a checkpoint, db flushes the logs to disk (writes changes to db
- * files) and we can also clear the accumulated log files after checkpointing.
- * NOTE: removing unwanted log files is not part of dbenv->txn_checkpoint()
- * call.
- *
- * @data: xlator_t of the current instance of bdb xlator.
- *
- * bdb_checkpoint is called in a different thread from the main glusterfs
- * thread. bdb xlator creates the checkpoint thread after successfully opening
- * the db environment.
- * NOTE: bdb_checkpoint thread shares the DB_ENV handle with the filesystem
- * thread.
- *
- * db environment checkpointing frequency is controlled by
- * 'option checkpoint-timeout <time-in-seconds>' in volfile.
- *
- * NOTE: checkpointing thread is started only if 'option transaction on'
- * specified in volfile. checkpointing is not valid for non-transactional
- * environments.
- *
- */
-static void *
-bdb_checkpoint (void *data)
-{
- xlator_t *this = NULL;
- struct bdb_private *private = NULL;
- DB_ENV *dbenv = NULL;
- int32_t ret = 0;
- uint32_t active = 0;
-
- this = (xlator_t *) data;
- dbenv = BDB_ENV(this);
- private = this->private;
-
- for (;;sleep (private->checkpoint_interval)) {
- LOCK (&private->active_lock);
- active = private->active;
- UNLOCK (&private->active_lock);
-
- if (active) {
- ret = dbenv->txn_checkpoint (dbenv, 1024, 0, 0);
- if (ret) {
- gf_log ("bdb-ll", GF_LOG_DEBUG,
- "_BDB_CHECKPOINT: %s"
- "(failed to checkpoint environment)",
- db_strerror (ret));
- } else {
- gf_log ("bdb-ll", GF_LOG_DEBUG,
- "_BDB_CHECKPOINT: successfully "
- "checkpointed");
- }
- } else {
- ret = dbenv->txn_checkpoint (dbenv, 1024, 0, 0);
- if (ret) {
- gf_log ("bdb-ll", GF_LOG_ERROR,
- "_BDB_CHECKPOINT: %s"
- "(final checkpointing failed. might "
- "need to run recovery tool manually on "
- "next usage of this database "
- "environment)",
- db_strerror (ret));
- } else {
- gf_log ("bdb-ll", GF_LOG_DEBUG,
- "_BDB_CHECKPOINT: final successfully "
- "checkpointed");
- }
- break;
- }
- }
-
- return NULL;
-}
-
-
-/* bdb_db_init - initialize bdb xlator
- *
- * reads the options from @options dictionary and sets appropriate values in
- * @this->private. also initializes DB_ENV.
- *
- * return: 0 on success or -1 on error
- * (with logging the error through gf_log()).
- */
-int
-bdb_db_init (xlator_t *this,
- dict_t *options)
-{
- /* create a db entry for root */
- int32_t op_ret = 0;
- bdb_private_t *private = NULL;
- bctx_table_t *table = NULL;
-
- char *checkpoint_interval_str = NULL;
- char *page_size_str = NULL;
- char *lru_limit_str = NULL;
- char *timeout_str = NULL;
- char *access_mode = NULL;
- char *endptr = NULL;
- char *errfile = NULL;
- char *directory = NULL;
- char *logdir = NULL;
- char *mode = NULL;
- char *mode_str = NULL;
- int ret = -1;
- int idx = 0;
- struct stat stbuf = {0,};
-
- private = this->private;
-
- /* cache is always on */
- private->cache = ON;
-
- ret = dict_get_str (options, "access-mode", &access_mode);
- if ((ret == 0)
- && (!strcmp (access_mode, "btree"))) {
- gf_log (this->name, GF_LOG_DEBUG,
- "using BTREE access mode to access libdb "
- "(Berkeley DB)");
- private->access_mode = DB_BTREE;
- } else {
- gf_log (this->name, GF_LOG_DEBUG,
- "using HASH access mode to access libdb (Berkeley DB)");
- private->access_mode = DB_HASH;
- }
-
- ret = dict_get_str (options, "mode", &mode);
- if ((ret == 0)
- && (!strcmp (mode, "cache"))) {
- gf_log (this->name, GF_LOG_DEBUG,
- "cache data mode selected for 'storage/bdb'. filesystem"
- " operations are not transactionally protected and "
- "system crash does not guarantee recoverability of "
- "data");
- private->envflags = DB_CREATE | DB_INIT_LOG |
- DB_INIT_MPOOL | DB_THREAD;
- private->dbflags = DB_CREATE | DB_THREAD;
- private->transaction = OFF;
- } else {
- gf_log (this->name, GF_LOG_DEBUG,
- "persistent data mode selected for 'storage/bdb'. each"
- "filesystem operation is guaranteed to be Berkeley DB "
- "transaction protected.");
- private->transaction = ON;
- private->envflags = DB_CREATE | DB_INIT_LOCK | DB_INIT_LOG |
- DB_INIT_MPOOL | DB_INIT_TXN | DB_RECOVER | DB_THREAD;
- private->dbflags = DB_CREATE | DB_THREAD;
-
-
- ret = dict_get_str (options, "lock-timeout", &timeout_str);
-
- if (ret == 0) {
- ret = gf_string2time (timeout_str,
- &private->lock_timeout);
-
- if (private->lock_timeout > 4260000) {
- /* db allows us to DB_SET_LOCK_TIMEOUT to be
- * set to a maximum of 71 mins
- * (4260000 milliseconds) */
- gf_log (this->name, GF_LOG_DEBUG,
- "Berkeley DB lock-timeout parameter "
- "(%d) is out of range. please specify"
- " a valid timeout value for "
- "lock-timeout and retry.",
- private->lock_timeout);
- goto err;
- }
- }
- ret = dict_get_str (options, "transaction-timeout",
- &timeout_str);
- if (ret == 0) {
- ret = gf_string2time (timeout_str,
- &private->txn_timeout);
-
- if (private->txn_timeout > 4260000) {
- /* db allows us to DB_SET_TXN_TIMEOUT to be set
- * to a maximum of 71 mins
- * (4260000 milliseconds) */
- gf_log (this->name, GF_LOG_DEBUG,
- "Berkeley DB lock-timeout parameter "
- "(%d) is out of range. please specify"
- " a valid timeout value for "
- "lock-timeout and retry.",
- private->lock_timeout);
- goto err;
- }
- }
-
- private->checkpoint_interval = BDB_DEFAULT_CHECKPOINT_INTERVAL;
- ret = dict_get_str (options, "checkpoint-interval",
- &checkpoint_interval_str);
- if (ret == 0) {
- ret = gf_string2time (checkpoint_interval_str,
- &private->checkpoint_interval);
-
- if (ret < 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "'%"PRIu32"' is not a valid parameter "
- "for checkpoint-interval option. "
- "please specify a valid "
- "checkpoint-interval and retry",
- private->checkpoint_interval);
- goto err;
- }
- }
- }
-
- ret = dict_get_str (options, "file-mode", &mode_str);
- if (ret == 0) {
- private->file_mode = strtol (mode_str, &endptr, 8);
-
- if ((*endptr) ||
- (!IS_VALID_FILE_MODE(private->file_mode))) {
- gf_log (this->name, GF_LOG_DEBUG,
- "'%o' is not a valid parameter for file-mode "
- "option. please specify a valid parameter for "
- "file-mode and retry.",
- private->file_mode);
- goto err;
- }
- } else {
- private->file_mode = DEFAULT_FILE_MODE;
- }
- private->symlink_mode = private->file_mode | S_IFLNK;
- private->file_mode = private->file_mode | S_IFREG;
-
- ret = dict_get_str (options, "dir-mode", &mode_str);
- if (ret == 0) {
- private->dir_mode = strtol (mode_str, &endptr, 8);
- if ((*endptr) ||
- (!IS_VALID_FILE_MODE(private->dir_mode))) {
- gf_log (this->name, GF_LOG_DEBUG,
- "'%o' is not a valid parameter for dir-mode "
- "option. please specify a valid parameter for "
- "dir-mode and retry.",
- private->dir_mode);
- goto err;
- }
- } else {
- private->dir_mode = DEFAULT_DIR_MODE;
- }
-
- private->dir_mode = private->dir_mode | S_IFDIR;
-
- table = GF_CALLOC (1, sizeof (*table), gf_bdb_mt_bctx_table_t);
- if (table == NULL) {
- gf_log ("bdb-ll", GF_LOG_CRITICAL,
- "memory allocation for 'storage/bdb' internal "
- "context table failed.");
- goto err;
- }
-
- INIT_LIST_HEAD(&(table->b_lru));
- INIT_LIST_HEAD(&(table->active));
- INIT_LIST_HEAD(&(table->purge));
-
- LOCK_INIT (&table->lock);
- LOCK_INIT (&table->checkpoint_lock);
-
- table->transaction = private->transaction;
- table->access_mode = private->access_mode;
- table->dbflags = private->dbflags;
- table->this = this;
-
- ret = dict_get_str (options, "lru-limit",
- &lru_limit_str);
-
- /* TODO: set max lockers and max txns to accomodate
- * for more than lru_limit */
- if (ret == 0) {
- ret = gf_string2uint32 (lru_limit_str,
- &table->lru_limit);
- gf_log ("bdb-ll", GF_LOG_DEBUG,
- "setting lru limit of 'storage/bdb' internal context"
- "table to %d. maximum of %d unused databases can be "
- "open at any given point of time.",
- table->lru_limit, table->lru_limit);
- } else {
- table->lru_limit = BDB_DEFAULT_LRU_LIMIT;
- }
-
- ret = dict_get_str (options, "page-size",
- &page_size_str);
-
- if (ret == 0) {
- ret = gf_string2bytesize (page_size_str,
- &table->page_size);
- if (ret < 0) {
- gf_log ("bdb-ll", GF_LOG_ERROR,
- "\"%s\" is an invalid parameter to "
- "\"option page-size\". please specify a valid "
- "size and retry.",
- page_size_str);
- goto err;
- }
-
- if (!PAGE_SIZE_IN_RANGE(table->page_size)) {
- gf_log ("bdb-ll", GF_LOG_ERROR,
- "\"%s\" is out of range for Berkeley DB "
- "page-size. allowed page-size range is %d to "
- "%d. please specify a page-size value in the "
- "range and retry.",
- page_size_str, BDB_LL_PAGE_SIZE_MIN,
- BDB_LL_PAGE_SIZE_MAX);
- goto err;
- }
- } else {
- table->page_size = BDB_LL_PAGE_SIZE_DEFAULT;
- }
-
- table->hash_size = BDB_DEFAULT_HASH_SIZE;
- table->b_hash = GF_CALLOC (BDB_DEFAULT_HASH_SIZE,
- sizeof (struct list_head),
- gf_bdb_mt_list_head);
-
- for (idx = 0; idx < table->hash_size; idx++)
- INIT_LIST_HEAD(&(table->b_hash[idx]));
-
- private->b_table = table;
-
- ret = dict_get_str (options, "errfile", &errfile);
- if (ret == 0) {
- private->errfile = gf_strdup (errfile);
- gf_log (this->name, GF_LOG_DEBUG,
- "using %s as error logging file for libdb (Berkeley DB "
- "library) internal logging.", private->errfile);
- }
-
- ret = dict_get_str (options, "directory", &directory);
-
- if (ret == 0) {
- ret = dict_get_str (options, "logdir", &logdir);
-
- if (ret < 0) {
- gf_log ("bdb-ll", GF_LOG_DEBUG,
- "using the database environment home "
- "directory (%s) itself as transaction log "
- "directory", directory);
- private->logdir = gf_strdup (directory);
-
- } else {
- private->logdir = gf_strdup (logdir);
-
- op_ret = stat (private->logdir, &stbuf);
- if ((op_ret != 0)
- || (!S_ISDIR (stbuf.st_mode))) {
- gf_log ("bdb-ll", GF_LOG_ERROR,
- "specified logdir %s does not exist. "
- "please provide a valid existing "
- "directory as parameter to 'option "
- "logdir'",
- private->logdir);
- goto err;
- }
- }
-
- private->b_table->dbenv = bdb_dbenv_init (this, directory);
- if (private->b_table->dbenv == NULL) {
- gf_log ("bdb-ll", GF_LOG_ERROR,
- "initialization of database environment "
- "failed");
- goto err;
- } else {
- if (private->transaction) {
- /* all well, start the checkpointing thread */
- LOCK_INIT (&private->active_lock);
-
- LOCK (&private->active_lock);
- {
- private->active = 1;
- }
- UNLOCK (&private->active_lock);
- pthread_create (&private->checkpoint_thread,
- NULL, bdb_checkpoint, this);
- }
- }
- }
-
- return op_ret;
-err:
- if (table) {
- GF_FREE (table->b_hash);
- GF_FREE (table);
- }
- if (private) {
- if (private->errfile)
- GF_FREE (private->errfile);
-
- if (private->logdir)
- GF_FREE (private->logdir);
- }
-
- return -1;
-}