diff options
Diffstat (limited to 'xlators/storage/bdb/src/bdb-ll.c')
| -rw-r--r-- | xlators/storage/bdb/src/bdb-ll.c | 1295 | 
1 files changed, 644 insertions, 651 deletions
diff --git a/xlators/storage/bdb/src/bdb-ll.c b/xlators/storage/bdb/src/bdb-ll.c index cd2d1ac4916..59d431d8257 100644 --- a/xlators/storage/bdb/src/bdb-ll.c +++ b/xlators/storage/bdb/src/bdb-ll.c @@ -20,6 +20,7 @@  #include <libgen.h>  #include "bdb.h"  #include <list.h> +#include "hashfn.h"  /*   * implement the procedures to interact with bdb */ @@ -31,22 +32,41 @@  ino_t  bdb_inode_transform (ino_t parent, -                     bctx_t *bctx) +                     const char *name, +                     size_t namelen)  { -        struct bdb_private *private = NULL;          ino_t               ino = -1; +        uint64_t            hash = 0; -        GF_VALIDATE_OR_GOTO ("bdb-ll", bctx, out); +        hash = gf_dm_hashfn (name, namelen); -        private = bctx->table->this->private; +        ino = (((parent << 32) | 0x00000000ffffffff) +               & (hash | 0xffffffff00000000)); -        LOCK (&private->ino_lock); -        ino = ++private->next_ino; -        UNLOCK (&private->ino_lock); -out:          return ino;  } +static int +bdb_generate_secondary_hash (DB *secondary, +                             const DBT *pkey, +                             const DBT *data, +                             DBT *skey) +{ +        char *primary = NULL; +        uint32_t *hash = NULL; + +        primary = pkey->data; + +        hash = calloc (1, sizeof (uint32_t)); + +        *hash = gf_dm_hashfn (primary, pkey->size); + +        skey->data = hash; +        skey->size = sizeof (hash); +        skey->flags = DB_DBT_APPMALLOC; + +        return 0; +}  /***********************************************************   * @@ -63,13 +83,13 @@ out:   *      if (no-empty-slots), then prune open dbs and close as many as possible   *      if (empty-slot-available), tika muchkonDu db open maaDu   * - * NOTE: illi baro munche lock hiDkobEku   */ -static DB * +static int  bdb_db_open (bctx_t *bctx)  { -        DB *storage_dbp = NULL; -        int32_t op_ret = -1; +        DB *primary   = NULL; +        DB *secondary = NULL; +        int32_t ret = -1;          bctx_table_t *table = NULL;          GF_VALIDATE_OR_GOTO ("bdb-ll", bctx, out); @@ -78,51 +98,94 @@ bdb_db_open (bctx_t *bctx)          GF_VALIDATE_OR_GOTO ("bdb-ll", table, out);          /* we have to do the following, we can't deny someone of db_open ;) */ -        op_ret = db_create (&storage_dbp, table->dbenv, 0); -        if (op_ret != 0) { -                gf_log ("bdb-ll", GF_LOG_ERROR, -                        "failed to do db_create for directory %s (%s)", -                        bctx->directory, db_strerror (op_ret)); -                storage_dbp = NULL; +        ret = db_create (&primary, table->dbenv, 0); +        if (ret < 0) { +                gf_log ("bdb-ll", GF_LOG_DEBUG, +                        "_BDB_DB_OPEN %s: %s (failed to create database object" +                        " for primary database)", +                        bctx->directory, db_strerror (ret)); +                ret = -ENOMEM;                  goto out;          }          if (table->page_size) { -                op_ret = storage_dbp->set_pagesize (storage_dbp, -                                                    table->page_size); -                if (op_ret != 0) { -                        gf_log ("bdb-ll", GF_LOG_ERROR, -                                "failed to set the page_size (%"PRIu64") for " -                                "directory %s (%s)", -                                table->page_size, bctx->directory, -                                db_strerror (op_ret)); -                } else { +                ret = primary->set_pagesize (primary, +                                             table->page_size); +                if (ret < 0) {                          gf_log ("bdb-ll", GF_LOG_DEBUG, -                                "page-size (%"PRIu64") set on DB", +                                "_BDB_DB_OPEN %s: %s (failed to set page-size " +                                "to %"PRIu64")", +                                bctx->directory, db_strerror (ret),                                  table->page_size); +                } else { +                        gf_log ("bdb-ll", GF_LOG_DEBUG, +                                "_BDB_DB_OPEN %s: page-size set to %"PRIu64, +                                bctx->directory, table->page_size);                  }          } -        op_ret = storage_dbp->open (storage_dbp, -                                    NULL, -                                    bctx->db_path, -                                    NULL, -                                    table->access_mode, -                                    table->dbflags, -                                    0); -        if (op_ret != 0 ) { -                gf_log ("bdb-ll", -                        GF_LOG_ERROR, -                        "failed to open storage-db for directory %s (%s)", -                        bctx->db_path, db_strerror (op_ret)); -                storage_dbp = NULL; +        ret = primary->open (primary, NULL, bctx->db_path, "primary", +                             table->access_mode, table->dbflags, 0); +        if (ret < 0) { +                gf_log ("bdb-ll", GF_LOG_ERROR, +                        "_BDB_DB_OPEN %s: %s " +                        "(failed to open primary database)", +                        bctx->directory, db_strerror (ret)); +                ret = -1; +                goto cleanup; +        } + +        ret = db_create (&secondary, table->dbenv, 0); +        if (ret < 0) { +                gf_log ("bdb-ll", GF_LOG_DEBUG, +                        "_BDB_DB_OPEN %s: %s (failed to create database object" +                        " for secondary database)", +                        bctx->directory, db_strerror (ret)); +                ret = -ENOMEM; +                goto cleanup; +        } + +        ret = secondary->open (secondary, NULL, bctx->db_path, "secondary", +                               table->access_mode, table->dbflags, 0); +        if (ret != 0 ) { +                gf_log ("bdb-ll", GF_LOG_ERROR, +                        "_BDB_DB_OPEN %s: %s " +                        "(failed to open secondary database)", +                        bctx->directory, db_strerror (ret)); +                ret = -1; +                goto cleanup; +        } + +        ret = primary->associate (primary, NULL, secondary, +                                  bdb_generate_secondary_hash, +#ifdef DB_IMMUTABLE_KEY +                                  DB_IMMUTABLE_KEY); +#else +                                  0); +#endif +        if (ret != 0 ) { +                gf_log ("bdb-ll", GF_LOG_ERROR, +                        "_BDB_DB_OPEN %s: %s " +                        "(failed to associate primary database with " +                        "secondary database)", +                        bctx->directory, db_strerror (ret)); +                ret = -1; +                goto cleanup;          }  out: -        return storage_dbp; -} +        bctx->primary = primary; +        bctx->secondary = secondary; +        return ret; +cleanup: +        if (primary) +                primary->close (primary, 0); +        if (secondary) +                secondary->close (secondary, 0); +        return ret; +}  int32_t  bdb_cursor_close (bctx_t *bctx, @@ -140,10 +203,10 @@ bdb_cursor_close (bctx_t *bctx,  #else                  ret = cursorp->c_close (cursorp);  #endif -                if ((ret != 0)) { -                        gf_log ("bdb-ll", GF_LOG_ERROR, -                                "failed to close db cursor for directory " -                                "%s (%s)", +                if (ret < 0) { +                        gf_log ("bdb-ll", GF_LOG_DEBUG, +                                "_BDB_CURSOR_CLOSE %s: %s " +                                "(failed to close database cursor)",                                  bctx->directory, db_strerror (ret));                  }          } @@ -165,27 +228,30 @@ bdb_cursor_open (bctx_t *bctx,          LOCK (&bctx->lock);          { -                if (bctx->dbp) { +                if (bctx->secondary) {                          /* do nothing, just continue */                          ret = 0;                  } else { -                        bctx->dbp = bdb_db_open (bctx); -                        if (!bctx->dbp) { -                                gf_log ("bdb-ll", GF_LOG_ERROR, -                                        "failed to open storage db for %s", +                        ret = bdb_db_open (bctx); +                        if (ret < 0) { +                                gf_log ("bdb-ll", GF_LOG_DEBUG, +                                        "_BDB_CURSOR_OPEN %s: ENOMEM " +                                        "(failed to open secondary database)",                                          bctx->directory); -                                ret = -1; +                                ret = -ENOMEM;                          } else {                                  ret = 0;                          }                  }                  if (ret == 0) { -                        /* all set, lets open cursor */ -                        ret = bctx->dbp->cursor (bctx->dbp, NULL, cursorpp, 0); -                        if (ret != 0) { -                                gf_log ("bdb-ll", GF_LOG_ERROR, -                                        "failed to create a cursor for %s (%s)", +                        /* all set, open cursor */ +                        ret = bctx->secondary->cursor (bctx->secondary, +                                                       NULL, cursorpp, 0); +                        if (ret < 0) { +                                gf_log ("bdb-ll", GF_LOG_DEBUG, +                                        "_BDB_CURSOR_OPEN %s: %s " +                                        "(failed to open a cursor to database)",                                          bctx->directory, db_strerror (ret));                          }                  } @@ -245,27 +311,37 @@ bdb_cache_insert (bctx_t *bctx,                          /* FIXME: ugly, not supposed to disect any of the                           * 'struct list_head' directly */                          if (!list_empty (&bctx->c_list)) { -                                bcache = list_entry (bctx->c_list.prev, bdb_cache_t, c_list); +                                bcache = list_entry (bctx->c_list.prev, +                                                     bdb_cache_t, c_list);                                  list_del_init (&bcache->c_list);                          }                          if (bcache->key) {                                  free (bcache->key); -                                bcache->key = strdup ((char *)key->data); -                                GF_VALIDATE_OR_GOTO ("bdb-ll", bcache->key, unlock); +                                bcache->key = calloc (key->size + 1, +                                                      sizeof (char)); +                                GF_VALIDATE_OR_GOTO ("bdb-ll", +                                                     bcache->key, unlock); +                                memcpy (bcache->key, (char *)key->data, +                                        key->size);                          } else {                                  /* should never come here */ -                                gf_log ("bdb-ll", GF_LOG_CRITICAL, -                                        "bcache->key (null)"); +                                gf_log ("bdb-ll", GF_LOG_DEBUG, +                                        "_BDB_CACHE_INSERT %s (%s) " +                                        "(found a cache entry with empty key)", +                                        bctx->directory, (char *)key->data);                          } /* if(bcache->key)...else */                          if (bcache->data) {                                  free (bcache->data);                                  bcache->data = memdup (data->data, data->size); -                                GF_VALIDATE_OR_GOTO ("bdb-ll", bcache->data, unlock); +                                GF_VALIDATE_OR_GOTO ("bdb-ll", bcache->data, +                                                     unlock);                                  bcache->size = data->size;                          } else {                                  /* should never come here */                                  gf_log ("bdb-ll", GF_LOG_CRITICAL, -                                        "bcache->data (null)"); +                                        "_BDB_CACHE_INSERT %s (%s) " +                                        "(found a cache entry with no data)", +                                        bctx->directory, (char *)key->data);                          } /* if(bcache->data)...else */                          list_add (&bcache->c_list, &bctx->c_list);                          ret = 0; @@ -273,10 +349,14 @@ bdb_cache_insert (bctx_t *bctx,                          /* we will be entering here very rarely */                          bcache = CALLOC (1, sizeof (*bcache));                          GF_VALIDATE_OR_GOTO ("bdb-ll", bcache, unlock); -                        bcache->key = strdup ((char *)(key->data)); + +                        bcache->key = calloc (key->size + 1, sizeof (char));                          GF_VALIDATE_OR_GOTO ("bdb-ll", bcache->key, unlock); +                        memcpy (bcache->key, key->data, key->size); +                          bcache->data = memdup (data->data, data->size);                          GF_VALIDATE_OR_GOTO ("bdb-ll", bcache->data, unlock); +                          bcache->size = data->size;                          list_add (&bcache->c_list, &bctx->c_list);                          bctx->c_count++; @@ -291,7 +371,7 @@ out:  static int32_t  bdb_cache_delete (bctx_t *bctx, -                  char *key) +                  const char *key)  {          bdb_cache_t *bcache = NULL;          bdb_cache_t *trav   = NULL; @@ -333,12 +413,12 @@ bdb_db_stat (bctx_t *bctx,          LOCK (&bctx->lock);          { -                if (bctx->dbp == NULL) { -                        bctx->dbp = bdb_db_open (bctx); -                        storage = bctx->dbp; +                if (bctx->primary == NULL) { +                        ret = bdb_db_open (bctx); +                        storage = bctx->primary;                  } else {                          /* we are just fine, lets continue */ -                        storage = bctx->dbp; +                        storage = bctx->primary;                  } /* if(bctx->dbp==NULL)...else */          }          UNLOCK (&bctx->lock); @@ -347,46 +427,48 @@ bdb_db_stat (bctx_t *bctx,          ret = storage->stat (storage, txnid, &stat, flags); -        if (ret != 0) { -                gf_log ("bdb-ll", GF_LOG_ERROR, -                        "failed to do DB->stat() on db file %s: %s", -                        bctx->db_path, db_strerror (ret)); -        } else { +        if (ret < 0) {                  gf_log ("bdb-ll", GF_LOG_DEBUG, -                        "successfully called DB->stat() on db file %s", -                        bctx->db_path); +                        "_BDB_DB_STAT %s: %s " +                        "(failed to do stat database)", +                        bctx->directory, db_strerror (ret));          }  out:          return stat;  } -/* bdb_storage_get - retrieve a key/value pair corresponding to @path from the corresponding - *                   db file. +/* bdb_storage_get - retrieve a key/value pair corresponding to @path from the + *  corresponding db file.   * - * @bctx: bctx_t * corresponding to the parent directory of @path. (should always be a valid - *        bctx).  bdb_storage_get should never be called if @bctx = NULL. - * @txnid: NULL if bdb_storage_get is not embedded in an explicit transaction or a valid - *         DB_TXN *, when embedded in an explicit transaction. - * @path: path of the file to read from (translated to a database key using MAKE_KEY_FROM_PATH) - * @buf: char ** - pointer to a pointer to char. a read buffer is created in this procedure - *       and pointer to the buffer is passed through @buf to the caller. + * @bctx: bctx_t * corresponding to the parent directory of @path. (should + *  always be a valid bctx).  bdb_storage_get should never be called if + *  @bctx = NULL. + * @txnid: NULL if bdb_storage_get is not embedded in an explicit transaction + *  or a valid DB_TXN *, when embedded in an explicit transaction. + * @path: path of the file to read from (translated to a database key using + *  MAKE_KEY_FROM_PATH) + * @buf: char ** - pointer to a pointer to char. a read buffer is created in + *  this procedure and pointer to the buffer is passed through @buf to the + *  caller.   * @size: size of the file content to be read.   * @offset: offset from which the file content to be read.   * - * NOTE: bdb_storage_get tries to open DB, if @bctx->dbp == NULL (@bctx->dbp == NULL, - *      nobody has opened DB till now or DB was closed by bdb_table_prune()). + * NOTE: bdb_storage_get tries to open DB, if @bctx->dbp == NULL + *  (@bctx->dbp == NULL, nobody has opened DB till now or DB was closed by + *  bdb_table_prune()).   * - * NOTE: if private->cache is set (bdb xlator's internal caching enabled), then bdb_storage_get - *      first looks up the cache for key/value pair. if bdb_lookup_cache fails, then only - *      DB->get() is called. also,  inserts a newly read key/value pair to cache through - *      bdb_insert_to_cache. + * NOTE: if private->cache is set (bdb xlator's internal caching enabled), then + *  bdb_storage_get first looks up the cache for key/value pair. if + *  bdb_lookup_cache fails, then only DB->get() is called. also,  inserts a + *  newly read key/value pair to cache through bdb_insert_to_cache.   *   * return: 'number of bytes read' on success or -1 on error.   * - * also see: bdb_lookup_cache, bdb_insert_to_cache for details about bdb xlator's internal cache. + * also see: bdb_lookup_cache, bdb_insert_to_cache for details about bdb + *  xlator's internal cache.   */ -int32_t +static int32_t  bdb_db_get (bctx_t *bctx,              DB_TXN *txnid,              const char *path, @@ -420,12 +502,12 @@ bdb_db_get (bctx_t *bctx,          } else {                  LOCK (&bctx->lock);                  { -                        if (bctx->dbp == NULL) { -                                bctx->dbp = bdb_db_open (bctx); -                                storage = bctx->dbp; +                        if (bctx->primary == NULL) { +                                ret = bdb_db_open (bctx); +                                storage = bctx->primary;                          } else {                                  /* we are just fine, lets continue */ -                                storage = bctx->dbp; +                                storage = bctx->primary;                          } /* if(bctx->dbp==NULL)...else */                  }                  UNLOCK (&bctx->lock); @@ -457,22 +539,25 @@ bdb_db_get (bctx_t *bctx,                          if (ret == DB_NOTFOUND) {                                  gf_log ("bdb-ll", GF_LOG_DEBUG, -                                        "failed to do DB->get() for key: %s." -                                        " key not found in storage DB", -                                        key_string); +                                        "_BDB_DB_GET %s - %s: ENOENT" +                                        "(specified key not found in database)", +                                        bctx->directory, key_string);                                  ret = -1;                                  need_break = 1;                          } else if (ret == DB_LOCK_DEADLOCK) {                                  retries++; -                                gf_log ("bdb-ll", GF_LOG_ERROR, -                                        "deadlock detected in DB->put. retrying" -                                        " DB->put (%d)", retries); -                        }else if (ret == 0) { +                                gf_log ("bdb-ll", GF_LOG_DEBUG, +                                        "_BDB_DB_GET %s - %s" +                                        "(deadlock detected, retrying for %d " +                                        "time)", +                                        bctx->directory, key_string, retries); +                        } else if (ret == 0) {                                  /* successfully read data, lets set everything                                   * in place and return */                                  if (buf) {                                          *buf = CALLOC (1, value.size); -                                        ERR_ABORT (*buf); +                                        GF_VALIDATE_OR_GOTO ("bdb-ll", +                                                             *buf, out);                                          memcpy (*buf, value.data, value.size);                                  }                                  ret = value.size; @@ -481,10 +566,12 @@ bdb_db_get (bctx_t *bctx,                                  free (value.data);                                  need_break = 1;                          } else { -                                gf_log ("bdb-ll", -                                        GF_LOG_ERROR, -                                        "failed to do DB->get() for key %s: %s", -                                        key_string, db_strerror (ret)); +                                gf_log ("bdb-ll", GF_LOG_DEBUG, +                                        "_BDB_DB_GET %s - %s: %s" +                                        "(failed to retrieve specified key from" +                                        " database)", +                                        bctx->directory, key_string, +                                        db_strerror (ret));                                  ret = -1;                                  need_break = 1;                          } @@ -494,6 +581,19 @@ out:          return ret;  }/* bdb_db_get */ +/* TODO: handle errors here and log. propogate only the errno to caller */ +int32_t +bdb_db_fread (struct bdb_fd *bfd, char **buf, size_t size, off_t offset) +{ +        return bdb_db_get (bfd->ctx, NULL, bfd->key, buf, size, offset); +} + +int32_t +bdb_db_iread (struct bdb_ctx *bctx, const char *key, char **buf) +{ +        return bdb_db_get (bctx, NULL, key, buf, 0, 0); +} +  /* bdb_storage_put - insert a key/value specified to the corresponding DB.   *   * @bctx: bctx_t * corresponding to the parent directory of @path. @@ -519,7 +619,7 @@ out:   * also see: bdb_cache_delete for details on how a cached key/value pair is   * removed.   */ -int32_t +static int32_t  bdb_db_put (bctx_t *bctx,              DB_TXN *txnid,              const char *key_string, @@ -537,12 +637,12 @@ bdb_db_put (bctx_t *bctx,          LOCK (&bctx->lock);          { -                if (bctx->dbp == NULL) { -                        bctx->dbp = bdb_db_open (bctx); -                        storage = bctx->dbp; +                if (bctx->primary == NULL) { +                        ret = bdb_db_open (bctx); +                        storage = bctx->primary;                  } else {                          /* we are just fine, lets continue */ -                        storage = bctx->dbp; +                        storage = bctx->primary;                  }          }          UNLOCK (&bctx->lock); @@ -582,15 +682,16 @@ bdb_db_put (bctx_t *bctx,                  ret = storage->put (storage, txnid, &key, &value, db_flags);                  if (ret == DB_LOCK_DEADLOCK) {                          retries++; -                        gf_log ("bdb-ll", GF_LOG_ERROR, -                                "deadlock detected in DB->put. " -                                "retrying DB->put (%d)", -                                retries); +                        gf_log ("bdb-ll", GF_LOG_DEBUG, +                                "_BDB_DB_PUT %s - %s" +                                "(deadlock detected, retying for %d time)", +                                bctx->directory, key_string, retries);                  } else if (ret) {                          /* write failed */ -                        gf_log ("bdb-ll", GF_LOG_ERROR, -                                "failed to do DB->put() for key %s: %s", -                                key_string, db_strerror (ret)); +                        gf_log ("bdb-ll", GF_LOG_DEBUG, +                                "_BDB_DB_PUT %s - %s: %s" +                                "(failed to put specified entry into database)", +                                bctx->directory, key_string, db_strerror (ret));                          need_break = 1;                  } else {                          /* successfully wrote */ @@ -602,44 +703,68 @@ out:          return ret;  }/* bdb_db_put */ +int32_t +bdb_db_icreate (struct bdb_ctx *bctx, const char *key) +{ +        return bdb_db_put (bctx, NULL, key, NULL, 0, 0, 0); +} + +/* TODO: handle errors here and log. propogate only the errno to caller */ +int32_t +bdb_db_fwrite (struct bdb_fd *bfd, char *buf, size_t size, off_t offset) +{ +        return bdb_db_put (bfd->ctx, NULL, bfd->key, buf, size, offset, 0); +} + +/* TODO: handle errors here and log. propogate only the errno to caller */ +int32_t +bdb_db_iwrite (struct bdb_ctx *bctx, const char *key, char *buf, size_t size) +{ +        return bdb_db_put (bctx, NULL, key, buf, size, 0, 0); +} + +int32_t +bdb_db_itruncate (struct bdb_ctx *bctx, const char *key) +{ +        return bdb_db_put (bctx, NULL, key, NULL, 0, 1, 0); +} -/* bdb_storage_del - delete a key/value pair corresponding to @path from corresponding db file. +/* bdb_storage_del - delete a key/value pair corresponding to @path from + *  corresponding db file.   *   * @bctx: bctx_t * corresponding to the parent directory of @path.   *       (should always be a valid bctx). bdb_storage_del should never be called   *       if @bctx = NULL. - * @txnid: NULL if bdb_storage_del is not embedded in an explicit transaction or a - *         valid DB_TXN *, when embedded in an explicit transaction. + * @txnid: NULL if bdb_storage_del is not embedded in an explicit transaction + *   or a valid DB_TXN *, when embedded in an explicit transaction.   * @path: path to the file, whose key/value pair has to be deleted.   * - * NOTE: bdb_storage_del tries to open DB, if @bctx->dbp == NULL (@bctx->dbp == NULL, - *      nobody has opened DB till now or DB was closed by bdb_table_prune()). + * NOTE: bdb_storage_del tries to open DB, if @bctx->dbp == NULL + *  (@bctx->dbp == NULL, nobody has opened DB till now or DB was closed by + *  bdb_table_prune()).   *   * return: 0 on success or -1 on error.   */ -int32_t +static int32_t  bdb_db_del (bctx_t *bctx,              DB_TXN *txnid, -            const char *path) +            const char *key_string)  {          DB     *storage    = NULL;          DBT     key        = {0,}; -        char   *key_string = NULL;          int32_t ret        = -1;          int32_t db_flags   = 0;          uint8_t need_break = 0;          int32_t retries    = 1; -        MAKE_KEY_FROM_PATH (key_string, path); -          LOCK (&bctx->lock);          { -                if (bctx->dbp == NULL) { -                        bctx->dbp = bdb_db_open (bctx); -                        storage = bctx->dbp; +                if (bctx->primary == NULL) { +                        ret = bdb_db_open (bctx); +                        storage = bctx->primary;                  } else {                          /* we are just fine, lets continue */ -                        storage = bctx->dbp; +                        storage = bctx->primary;                  }          }          UNLOCK (&bctx->lock); @@ -649,7 +774,7 @@ bdb_db_del (bctx_t *bctx,          ret = bdb_cache_delete (bctx, key_string);          GF_VALIDATE_OR_GOTO ("bdb-ll", (ret == 0), out); -        key.data = key_string; +        key.data = (char *)key_string;          key.size = strlen (key_string);          key.flags = DB_DBT_USERMEM; @@ -658,26 +783,30 @@ bdb_db_del (bctx_t *bctx,                  if (ret == DB_NOTFOUND) {                          gf_log ("bdb-ll", GF_LOG_DEBUG, -                                "failed to delete %s from storage db, " -                                "doesn't exist in storage DB", -                                path); +                                "_BDB_DB_DEL %s - %s: ENOENT" +                                "(failed to delete entry, could not be " +                                "found in the database)", +                                bctx->directory, key_string);                          need_break = 1;                  } else if (ret == DB_LOCK_DEADLOCK) {                          retries++; -                        gf_log ("bdb-ll", GF_LOG_ERROR, -                                "deadlock detected in DB->put. " -                                "retrying DB->put (%d)", -                                retries); -                }else if (ret == 0) { +                        gf_log ("bdb-ll", GF_LOG_DEBUG, +                                "_BDB_DB_DEL %s - %s" +                                "(deadlock detected, retying for %d time)", +                                bctx->directory, key_string, retries); +                } else if (ret == 0) {                          /* successfully deleted the entry */                          gf_log ("bdb-ll", GF_LOG_DEBUG, -                                "deleted %s from storage db", path); +                                "_BDB_DB_DEL %s - %s" +                                "(successfully deleted entry from database)", +                                bctx->directory, key_string);                          ret = 0;                          need_break = 1;                  } else { -                        gf_log ("bdb-ll", GF_LOG_ERROR, -                                "failed to delete %s from storage db: %s", -                                path, db_strerror (ret)); +                        gf_log ("bdb-ll", GF_LOG_DEBUG, +                                "_BDB_DB_DEL %s - %s: %s" +                                "(failed to delete entry from database)", +                                bctx->directory, key_string, db_strerror (ret));                          ret = -1;                          need_break = 1;                  } @@ -686,11 +815,18 @@ out:          return ret;  } +int32_t +bdb_db_iremove (bctx_t *bctx, +                const char *key) +{ +        return bdb_db_del (bctx, NULL, key); +} +  /* NOTE: bdb version compatibility wrapper */  int32_t  bdb_cursor_get (DBC *cursorp, -                DBT *key, -                DBT *value, +                DBT *sec, DBT *pri, +                DBT *val,                  int32_t flags)  {          int32_t ret = -1; @@ -698,21 +834,21 @@ bdb_cursor_get (DBC *cursorp,          GF_VALIDATE_OR_GOTO ("bdb-ll", cursorp, out);  #ifdef HAVE_BDB_CURSOR_GET -        ret = cursorp->get (cursorp, key, value, flags); +        ret = cursorp->pget (cursorp, sec, pri, val, flags);  #else -        ret = cursorp->c_get (cursorp, key, value, flags); +        ret = cursorp->c_pget (cursorp, sec, pri, val, flags);  #endif          if ((ret != 0)  && (ret != DB_NOTFOUND)) { -                gf_log ("bdb-ll", GF_LOG_ERROR, -                        "failed to CURSOR->get() for key %s (%s)", -                        (char *)key->data, db_strerror (ret)); +                gf_log ("bdb-ll", GF_LOG_DEBUG, +                        "_BDB_CURSOR_GET: %s" +                        "(failed to retrieve entry from database cursor)", +                        db_strerror (ret));          }  out:          return ret;  }/* bdb_cursor_get */ -  int32_t  bdb_dirent_size (DBT *key)  { @@ -720,29 +856,6 @@ bdb_dirent_size (DBT *key)  } -/* bdb_extract_bfd - translate a fd_t to a bfd (either a 'struct bdb_bfd' or 'struct bdb_dir') - * - * @fd->ctx is with bdb specific file handle during a successful bdb_open (also bdb_create) - *  or bdb_opendir. - * - * return: 'struct bdb_bfd *' or 'struct bdb_dir *' on success, or NULL on failure. - */ -inline void * -bdb_extract_bfd (fd_t *fd, -                 xlator_t *this) -{ -        uint64_t tmp_bfd = 0; -        void    *bfd     = NULL; - -        GF_VALIDATE_OR_GOTO ("bdb-ll", fd, out); -        GF_VALIDATE_OR_GOTO ("bdb-ll", this, out); - -        fd_ctx_get (fd, this, &tmp_bfd); -        bfd = (void *)(long)bfd; - -out: -        return bfd; -}  /* bdb_dbenv_init - initialize DB_ENV   * @@ -751,10 +864,10 @@ out:   *      NOTE: see private->envflags for flags used.   *   2. DB_ENV->set_lg_dir - set log directory to be used for storing log files   *     (log files are the files in which transaction logs are written by db). - *   3. DB_ENV->set_flags (DB_LOG_AUTOREMOVE) - set DB_ENV to automatically clear - *      the unwanted log files (flushed at each checkpoint). - *   4. DB_ENV->set_errfile - set errfile to be used by db to report detailed error logs. - *     used only for debbuging purpose. + *   3. DB_ENV->set_flags (DB_LOG_AUTOREMOVE) - set DB_ENV to automatically + *      clear the unwanted log files (flushed at each checkpoint). + *   4. DB_ENV->set_errfile - set errfile to be used by db to report detailed + *      error logs. used only for debbuging purpose.   *   * return: returns a valid DB_ENV * on success or NULL on error.   * @@ -769,55 +882,49 @@ bdb_dbenv_init (xlator_t *this,          bdb_private_t *private     = NULL;          int32_t        fatal_flags = 0; -        VALIDATE_OR_GOTO (this, out); -        VALIDATE_OR_GOTO (directory, out); +        VALIDATE_OR_GOTO (this, err); +        VALIDATE_OR_GOTO (directory, err);          private = this->private; -        VALIDATE_OR_GOTO (private, out); +        VALIDATE_OR_GOTO (private, err);          ret = db_env_create (&dbenv, 0); -        VALIDATE_OR_GOTO ((ret == 0), out); +        VALIDATE_OR_GOTO ((ret == 0), err);          /* NOTE: set_errpfx returns 'void' */          dbenv->set_errpfx(dbenv, this->name);          ret = dbenv->set_lk_detect (dbenv, DB_LOCK_DEFAULT); -        VALIDATE_OR_GOTO ((ret == 0), out); +        VALIDATE_OR_GOTO ((ret == 0), err);          ret = dbenv->open(dbenv, directory,                            private->envflags,                            S_IRUSR | S_IWUSR);          if ((ret != 0) && (ret != DB_RUNRECOVERY)) {                  gf_log (this->name, GF_LOG_CRITICAL, -                        "failed to open DB environment (%s)", -                        db_strerror (ret)); +                        "failed to join Berkeley DB environment at %s: %s." +                        "please run manual recovery and retry running " +                        "glusterfs", +                        directory, db_strerror (ret));                  dbenv = NULL; -                goto out; +                goto err;          } else if (ret == DB_RUNRECOVERY) {                  fatal_flags = ((private->envflags & (~DB_RECOVER))                                 | DB_RECOVER_FATAL);                  ret = dbenv->open(dbenv, directory, fatal_flags,                                    S_IRUSR | S_IWUSR);                  if (ret != 0) { -                        gf_log (this->name, GF_LOG_ERROR, -                                "failed to open DB environment (%s) with " -                                "DB_REOVER_FATAL", -                                db_strerror (ret)); +                        gf_log (this->name, GF_LOG_CRITICAL, +                                "failed to join Berkeley DB environment in " +                                "recovery mode at %s: %s. please run manual " +                                "recovery and retry running glusterfs", +                                directory, db_strerror (ret));                          dbenv = NULL; -                        goto out; -                } else { -                        gf_log (this->name, GF_LOG_WARNING, -                                "opened DB environment after DB_RECOVER_FATAL:" -                                " %s", db_strerror (ret)); +                        goto err;                  } -        } else { -                gf_log (this->name, GF_LOG_DEBUG, -                        "DB environment successfull opened: %s", -                        db_strerror (ret));          } - - +        ret = 0;  #if (DB_VERSION_MAJOR == 4 &&                   \       DB_VERSION_MINOR == 7)          if (private->log_auto_remove) { @@ -832,41 +939,42 @@ bdb_dbenv_init (xlator_t *this,                  ret = dbenv->set_flags (dbenv, DB_LOG_AUTOREMOVE, 0);          }  #endif -        if (ret != 0) { -                gf_log ("bctx", GF_LOG_ERROR, -                        "failed to set DB_LOG_AUTOREMOVE on dbenv: %s", +        if (ret < 0) { +                gf_log ("bdb-ll", GF_LOG_ERROR, +                        "autoremoval of transactional log files could not be " +                        "configured (%s). you may have to do a manual " +                        "monitoring of transactional log files and remove " +                        "periodically.",                          db_strerror (ret)); -        } else { -                gf_log ("bctx", GF_LOG_DEBUG, -                        "DB_LOG_AUTOREMOVE set on dbenv"); +                goto err;          }          if (private->transaction) {                  ret = dbenv->set_flags(dbenv, DB_AUTO_COMMIT, 1);                  if (ret != 0) { -                        gf_log ("bctx", GF_LOG_ERROR, -                                "failed to set DB_AUTO_COMMIT on dbenv: %s", +                        gf_log ("bdb-ll", GF_LOG_DEBUG, +                                "configuration of auto-commit failed for " +                                "database environment at %s. none of the " +                                "operations will be embedded in transaction " +                                "unless explicitly done so.",                                  db_strerror (ret)); -                } else { -                        gf_log ("bctx", GF_LOG_DEBUG, -                                "DB_AUTO_COMMIT set on dbenv"); +                        goto err;                  }                  if (private->txn_timeout) { -                        ret = dbenv->set_timeout (dbenv, -                                                  private->txn_timeout, +                        ret = dbenv->set_timeout (dbenv, private->txn_timeout,                                                    DB_SET_TXN_TIMEOUT);                          if (ret != 0) { -                                gf_log ("bctx", GF_LOG_ERROR, -                                        "failed to set TXN_TIMEOUT to %d " -                                        "milliseconds on dbenv: %s", +                                gf_log ("bdb-ll", GF_LOG_ERROR, +                                        "could not configure Berkeley DB " +                                        "transaction timeout to %d (%s). please" +                                        " review 'option transaction-timeout %d" +                                        "' option.",                                          private->txn_timeout, -                                        db_strerror (ret)); -                        } else { -                                gf_log ("bctx", GF_LOG_DEBUG, -                                        "TXN_TIMEOUT set to %d milliseconds", +                                        db_strerror (ret),                                          private->txn_timeout); +                                goto err;                          }                  } @@ -874,32 +982,28 @@ bdb_dbenv_init (xlator_t *this,                          ret = dbenv->set_timeout(dbenv,                                                   private->txn_timeout,                                                   DB_SET_LOCK_TIMEOUT); - -                        if (ret != 0) { -                                gf_log ("bctx", GF_LOG_ERROR, -                                        "failed to set LOCK_TIMEOUT to %d " -                                        "milliseconds on dbenv: %s", +                        if (ret < 0) { +                                gf_log ("bdb-ll", GF_LOG_ERROR, +                                        "could not configure Berkeley DB " +                                        "lock timeout to %d (%s). please" +                                        " review 'option lock-timeout %d" +                                        "' option.",                                          private->lock_timeout, -                                        db_strerror (ret)); -                        } else { -                                gf_log ("bctx", GF_LOG_DEBUG, -                                        "LOCK_TIMEOUT set to %d milliseconds", +                                        db_strerror (ret),                                          private->lock_timeout); +                                goto err;                          }                  }                  ret = dbenv->set_lg_dir (dbenv, private->logdir); - -                if (ret != 0) { -                        gf_log ("bctx", GF_LOG_ERROR, -                                "failed to set log directory for dbenv: %s", -                                db_strerror (ret)); -                } else { -                        gf_log ("bctx", GF_LOG_DEBUG, -                                "set dbenv log dir to %s", -                                private->logdir); +                if (ret < 0) { +                        gf_log ("bdb-ll", GF_LOG_ERROR, +                                "failed to configure libdb transaction log " +                                "directory at %s. please review the " +                                "'option logdir %s' option.", +                                db_strerror (ret), private->logdir); +                        goto err;                  } -          }          if (private->errfile) { @@ -907,41 +1011,52 @@ bdb_dbenv_init (xlator_t *this,                  if (private->errfp) {                          dbenv->set_errfile (dbenv, private->errfp);                  } else { -                        gf_log ("bctx", GF_LOG_ERROR, -                                "failed to open errfile: %s", -                                strerror (errno)); +                        gf_log ("bdb-ll", GF_LOG_ERROR, +                                "failed to open error logging file for " +                                "libdb (Berkeley DB) internal logging (%s)." +                                "please review the 'option errfile %s' option.", +                                strerror (errno), private->errfile); +                        goto err;                  }          } -out:          return dbenv; +err: +        if (dbenv) { +                dbenv->close (dbenv, 0); +        } + +        return NULL;  }  #define BDB_ENV(this) ((((struct bdb_private *)this->private)->b_table)->dbenv) -/* bdb_checkpoint - during transactional usage, db does not directly write the data to db - *                  files, instead db writes a 'log' (similar to a journal entry) into a - *                  log file. db normally clears the log files during opening of an - *                  environment. since we expect a filesystem server to run for a pretty - *                  long duration and flushing 'log's during dbenv->open would prove very - *                  costly, if we accumulate the log entries for one complete run of - *                  glusterfs server. to flush the logs frequently, db provides a mechanism - *                  called 'checkpointing'. when we do a checkpoint, db flushes the logs to - *                  disk (writes changes to db files) and we can also clear the accumulated - *                  log files after checkpointing. NOTE: removing unwanted log files is not - *                  part of dbenv->txn_checkpoint() call. +/* bdb_checkpoint - during transactional usage, db does not directly write the + *  data to db files, instead db writes a 'log' (similar to a journal entry) + *  into a log file. db normally clears the log files during opening of an + *  environment. since we expect a filesystem server to run for a pretty long + *  duration and flushing 'log's during dbenv->open would prove very costly, if + *  we accumulate the log entries for one complete run of glusterfs server. to + *  flush the logs frequently, db provides a mechanism called 'checkpointing'. + *  when we do a checkpoint, db flushes the logs to disk (writes changes to db + *  files) and we can also clear the accumulated log files after checkpointing. + *  NOTE: removing unwanted log files is not part of dbenv->txn_checkpoint() + *  call.   *   * @data: xlator_t of the current instance of bdb xlator.   * - *  bdb_checkpoint is called in a different thread from the main glusterfs thread. bdb - *  xlator creates the checkpoint thread after successfully opening the db environment. - *  NOTE: bdb_checkpoint thread shares the DB_ENV handle with the filesystem thread. + *  bdb_checkpoint is called in a different thread from the main glusterfs + *  thread. bdb xlator creates the checkpoint thread after successfully opening + *  the db environment. + *  NOTE: bdb_checkpoint thread shares the DB_ENV handle with the filesystem + *  thread.   *   *  db environment checkpointing frequency is controlled by   *  'option checkpoint-timeout <time-in-seconds>' in volfile.   * - * NOTE: checkpointing thread is started only if 'option transaction on' specified in - *      volfile. checkpointing is not valid for non-transactional environments. + * NOTE: checkpointing thread is started only if 'option transaction on' + *      specified in volfile. checkpointing is not valid for non-transactional + *      environments.   *   */  static void * @@ -965,23 +1080,29 @@ bdb_checkpoint (void *data)                  if (active) {                          ret = dbenv->txn_checkpoint (dbenv, 1024, 0, 0);                          if (ret) { -                                gf_log ("bctx", GF_LOG_ERROR, -                                        "failed to checkpoint environment: %s", +                                gf_log ("bdb-ll", GF_LOG_DEBUG, +                                        "_BDB_CHECKPOINT: %s" +                                        "(failed to checkpoint environment)",                                          db_strerror (ret));                          } else { -                                gf_log ("bctx", GF_LOG_DEBUG, -                                        "checkpointing successful"); +                                gf_log ("bdb-ll", GF_LOG_DEBUG, +                                        "_BDB_CHECKPOINT: successfully " +                                        "checkpointed");                          }                  } else {                          ret = dbenv->txn_checkpoint (dbenv, 1024, 0, 0);                          if (ret) { -                                gf_log ("bctx", GF_LOG_ERROR, -                                        "failed to do final checkpoint " -                                        "environment: %s", +                                gf_log ("bdb-ll", GF_LOG_ERROR, +                                        "_BDB_CHECKPOINT: %s" +                                        "(final checkpointing failed. might " +                                        "need to run recovery tool manually on " +                                        "next usage of this database " +                                        "environment)",                                          db_strerror (ret));                          } else { -                                gf_log ("bctx", GF_LOG_DEBUG, -                                        "final checkpointing successful"); +                                gf_log ("bdb-ll", GF_LOG_DEBUG, +                                        "_BDB_CHECKPOINT: final successfully " +                                        "checkpointed");                          }                          break;                  } @@ -990,449 +1111,321 @@ bdb_checkpoint (void *data)          return NULL;  } -static inline void -bdb_cache_init (xlator_t *this, -                dict_t *options, -                struct bdb_private *private) -{ -        /* cache is always on */ -        private->cache = ON; -} - -static inline void -bdb_log_remove_init (xlator_t *this, -                     dict_t *options, -                     struct bdb_private *private) -{ -        private->log_auto_remove = 1; -        gf_log (this->name, GF_LOG_DEBUG, -                "DB_ENV will use DB_LOG_AUTO_REMOVE"); -} -static inline void -bdb_errfile_init (xlator_t *this, -                  dict_t *options, -                  struct bdb_private *private) -{ -        int ret = -1; -        char *errfile = NULL; - -        ret = dict_get_str (options, "errfile", &errfile); -        if (ret == 0) { -                private->errfile = strdup (errfile); -                gf_log (this->name, GF_LOG_DEBUG, -                        "using errfile: %s", private->errfile); -        } -} - -static inline void -bdb_table_init (xlator_t *this, -                dict_t *options, -                struct bdb_private *private) +/* bdb_db_init - initialize bdb xlator + * + * reads the options from @options dictionary and sets appropriate values in + * @this->private. also initializes DB_ENV. + * + * return: 0 on success or -1 on error + * (with logging the error through gf_log()). + */ +int +bdb_db_init (xlator_t *this, +             dict_t *options)  { -        bctx_table_t *table = NULL; -        int32_t       idx   = 0; - -        int ret = -1; -        char *lru_limit_str = NULL; -        char *page_size_str = NULL; - -        table = CALLOC (1, sizeof (*table)); -        if (table) { -                INIT_LIST_HEAD(&(table->b_lru)); -                INIT_LIST_HEAD(&(table->active)); -                INIT_LIST_HEAD(&(table->purge)); - -                LOCK_INIT (&table->lock); -                LOCK_INIT (&table->checkpoint_lock); - -                table->transaction = private->transaction; -                table->access_mode = private->access_mode; -                table->dbflags = private->dbflags; -                table->this    = this; - -                { -                        ret = dict_get_str (options, "lru-limit", -                                            &lru_limit_str); - -                        /* TODO: set max lockers and max txns to accomodate -                         * for more than lru_limit */ -                        if (ret == 0) { -                                ret = gf_string2uint32 (lru_limit_str, -                                                        &table->lru_limit); -                                gf_log ("bdb-ll", GF_LOG_DEBUG, -                                        "setting bctx lru limit to %d", -                                        table->lru_limit); -                        } else { -                                table->lru_limit = BDB_DEFAULT_LRU_LIMIT; -                        } -                } - -                { -                        ret = dict_get_str (options, "page-size", -                                            &page_size_str); - -                        if (ret == 0) { -                                ret = gf_string2bytesize (page_size_str, -                                                          &table->page_size); -                                if (ret != 0) { -                                        gf_log ("bdb-ll", GF_LOG_ERROR, -                                                "invalid number format \"%s\"" -                                                " of \"option page-size\"", -                                                page_size_str); -                                } +        /* create a db entry for root */ +        int32_t        op_ret  = 0; +        bdb_private_t *private = NULL; +        bctx_table_t  *table = NULL; -                                if (!PAGE_SIZE_IN_RANGE(table->page_size)) { -                                        gf_log ("bdb-ll", GF_LOG_ERROR, -                                                "pagesize %s is out of range." -                                                "Allowed pagesize is between " -                                                "%d and %d", -                                                page_size_str, -                                                BDB_LL_PAGE_SIZE_MIN, -                                                BDB_LL_PAGE_SIZE_MAX); -                                } -                        } -                        else { -                                table->page_size = BDB_LL_PAGE_SIZE_DEFAULT; -                        } -                        gf_log ("bdb-ll", -                                GF_LOG_DEBUG, "using page-size %"PRIu64, -                                table->page_size); -                } +        char *checkpoint_interval_str = NULL; +        char *page_size_str           = NULL; +        char *lru_limit_str           = NULL; +        char *timeout_str             = NULL; +        char *access_mode             = NULL; +        char *endptr    = NULL; +        char *errfile   = NULL; +        char *directory = NULL; +        char *logdir    = NULL; +        char *mode      = NULL; +        char *mode_str  = NULL; +        int   ret = -1; +        int   idx = 0; +        struct stat stbuf = {0,}; -                table->hash_size = BDB_DEFAULT_HASH_SIZE; -                table->b_hash = CALLOC (BDB_DEFAULT_HASH_SIZE, -                                        sizeof (struct list_head)); +        private = this->private; -                for (idx = 0; idx < table->hash_size; idx++) -                        INIT_LIST_HEAD(&(table->b_hash[idx])); +        /* cache is always on */ +        private->cache = ON; -                private->b_table = table; +        ret = dict_get_str (options, "access-mode", &access_mode); +        if ((ret == 0) +            && (!strcmp (access_mode, "btree"))) { +                gf_log (this->name, GF_LOG_DEBUG, +                        "using BTREE access mode to access libdb " +                        "(Berkeley DB)"); +                private->access_mode = DB_BTREE;          } else { -                gf_log ("bdb-ll", GF_LOG_CRITICAL, -                        "failed to allocate bctx table: out of memory"); +                gf_log (this->name, GF_LOG_DEBUG, +                        "using HASH access mode to access libdb (Berkeley DB)"); +                private->access_mode = DB_HASH;          } -} - -static inline void -bdb_directory_init (xlator_t *this, -                    dict_t *options, -                    struct bdb_private *private) -{ -        int ret = -1; -        char *directory = NULL; -        char *logdir = NULL; -        int32_t op_ret = -1; -        struct stat stbuf = {0}; -        ret = dict_get_str (options, "directory", &directory); +        ret = dict_get_str (options, "mode", &mode); +        if ((ret == 0) +            && (!strcmp (mode, "cache"))) { +                gf_log (this->name, GF_LOG_DEBUG, +                        "cache data mode selected for 'storage/bdb'. filesystem" +                        " operations are not transactionally protected and " +                        "system crash does not guarantee recoverability of " +                        "data"); +                private->envflags = DB_CREATE | DB_INIT_LOG | +                        DB_INIT_MPOOL | DB_THREAD; +                private->dbflags = DB_CREATE | DB_THREAD; +                private->transaction = OFF; +        } else { +                gf_log (this->name, GF_LOG_DEBUG, +                        "persistent data mode selected for 'storage/bdb'. each" +                        "filesystem operation is guaranteed to be Berkeley DB " +                        "transaction protected."); +                private->transaction = ON; +                private->envflags = DB_CREATE | DB_INIT_LOCK | DB_INIT_LOG | +                        DB_INIT_MPOOL | DB_INIT_TXN | DB_RECOVER | DB_THREAD; +                private->dbflags = DB_CREATE | DB_THREAD; -        if (ret == 0) { -                ret = dict_get_str (options, "logdir", &logdir); -                if (ret != 0) { -                        gf_log ("bdb-ll", GF_LOG_DEBUG, -                                "using default logdir as database home"); -                        private->logdir = strdup (directory); +                ret = dict_get_str (options, "lock-timeout", &timeout_str); -                } else { -                        private->logdir = strdup (logdir); -                        gf_log ("bdb-ll", GF_LOG_DEBUG, -                                "using logdir: %s", -                                private->logdir); -                        umask (000); -                        if (mkdir (private->logdir, 0777) == 0) { -                                gf_log ("bdb-ll", GF_LOG_WARNING, -                                        "logdir specified (%s) not exists, " -                                        "created", -                                        private->logdir); -                        } - -                        op_ret = stat (private->logdir, &stbuf); -                        if ((op_ret != 0) -                            || (!S_ISDIR (stbuf.st_mode))) { -                                gf_log ("bdb-ll", GF_LOG_ERROR, -                                        "specified logdir doesn't exist, " -                                        "using default " -                                        "(environment home directory: %s)", -                                        directory); -                                private->logdir = strdup (directory); +                if (ret == 0) { +                        ret = gf_string2time (timeout_str, +                                              &private->lock_timeout); + +                        if (private->lock_timeout > 4260000) { +                                /* db allows us to DB_SET_LOCK_TIMEOUT to be +                                 * set to a maximum of 71 mins +                                 * (4260000 milliseconds) */ +                                gf_log (this->name, GF_LOG_DEBUG, +                                        "Berkeley DB lock-timeout parameter " +                                        "(%d) is out of range. please specify" +                                        " a valid timeout value for " +                                        "lock-timeout and retry.", +                                        private->lock_timeout); +                                goto err;                          }                  } - -                private->b_table->dbenv = bdb_dbenv_init (this, directory); - -                if (!private->b_table->dbenv) { -                        gf_log ("bdb-ll", GF_LOG_ERROR, -                                "failed to initialize db environment"); -                        FREE (private); -                        op_ret = -1; -                } else { -                        if (private->transaction) { -                                /* all well, start the checkpointing thread */ -                                LOCK_INIT (&private->active_lock); - -                                LOCK (&private->active_lock); -                                { -                                        private->active = 1; -                                } -                                UNLOCK (&private->active_lock); -                                pthread_create (&private->checkpoint_thread, -                                                NULL, bdb_checkpoint, this); +                ret = dict_get_str (options, "transaction-timeout", +                                    &timeout_str); +                if (ret == 0) { +                        ret = gf_string2time (timeout_str, +                                              &private->txn_timeout); + +                        if (private->txn_timeout > 4260000) { +                                /* db allows us to DB_SET_TXN_TIMEOUT to be set +                                 * to a maximum of 71 mins +                                 * (4260000 milliseconds) */ +                                gf_log (this->name, GF_LOG_DEBUG, +                                        "Berkeley DB lock-timeout parameter " +                                        "(%d) is out of range. please specify" +                                        " a valid timeout value for " +                                        "lock-timeout and retry.", +                                        private->lock_timeout); +                                goto err;                          }                  } -        } -} - -static inline void -bdb_dir_mode_init (xlator_t *this, -                   dict_t *options, -                   struct bdb_private *private) -{ -        int ret = -1; -        char *mode_str = NULL; -        char *endptr = NULL; -        ret = dict_get_str (options, "dir-mode", &mode_str); - -        if (ret == 0) { -                private->dir_mode = strtol (mode_str, &endptr, 8); -                if ((*endptr) || -                    (!IS_VALID_FILE_MODE(private->dir_mode))) { -                        gf_log (this->name, GF_LOG_DEBUG, -                                "invalid dir-mode %o. setting to default %o", -                                private->dir_mode, -                                DEFAULT_DIR_MODE); -                        private->dir_mode = DEFAULT_DIR_MODE; -                } else { -                        gf_log (this->name, GF_LOG_DEBUG, -                                "setting dir-mode to %o", -                                private->dir_mode); +                private->checkpoint_interval = BDB_DEFAULT_CHECKPOINT_INTERVAL; +                ret = dict_get_str (options, "checkpoint-interval", +                                    &checkpoint_interval_str); +                if (ret == 0) { +                        ret = gf_string2time (checkpoint_interval_str, +                                              &private->checkpoint_interval); + +                        if (ret < 0) { +                                gf_log (this->name, GF_LOG_DEBUG, +                                        "'%"PRIu32"' is not a valid parameter " +                                        "for checkpoint-interval option. " +                                        "please specify a valid " +                                        "checkpoint-interval and retry", +                                        private->checkpoint_interval); +                                goto err; +                        }                  } -        } else { -                private->dir_mode = DEFAULT_DIR_MODE;          } -        private->dir_mode = private->dir_mode | S_IFDIR; -} - -static inline void -bdb_file_mode_init (xlator_t *this, -                    dict_t *options, -                    struct bdb_private *private) -{ -        int ret = -1; -        char *mode_str = NULL; -        char *endptr = NULL; -          ret = dict_get_str (options, "file-mode", &mode_str); -          if (ret == 0) {                  private->file_mode = strtol (mode_str, &endptr, 8);                  if ((*endptr) ||                      (!IS_VALID_FILE_MODE(private->file_mode))) {                          gf_log (this->name, GF_LOG_DEBUG, -                                "invalid file-mode %o. setting to default %o", -                                private->file_mode, DEFAULT_FILE_MODE); -                        private->file_mode = DEFAULT_FILE_MODE; -                } else { -                        gf_log (this->name, GF_LOG_DEBUG, -                                "setting file-mode to %o", +                                "'%o' is not a valid parameter for file-mode " +                                "option. please specify a valid parameter for " +                                "file-mode and retry.",                                  private->file_mode); -                        private->file_mode = private->file_mode; +                        goto err;                  }          } else {                  private->file_mode = DEFAULT_FILE_MODE;          } -          private->symlink_mode = private->file_mode | S_IFLNK;          private->file_mode = private->file_mode | S_IFREG; -} - -static inline void -bdb_checkpoint_interval_init (xlator_t *this, -                              dict_t *options, -                              struct bdb_private *private) -{ -        int   ret = -1; -        char *checkpoint_interval_str = NULL; - -        private->checkpoint_interval = BDB_DEFAULT_CHECKPOINT_INTERVAL; - -        ret = dict_get_str (options, "checkpoint-interval", -                            &checkpoint_interval_str); +        ret = dict_get_str (options, "dir-mode", &mode_str);          if (ret == 0) { -                ret = gf_string2time (checkpoint_interval_str, -                                      &private->checkpoint_interval); - -                if (ret == 0) { +                private->dir_mode = strtol (mode_str, &endptr, 8); +                if ((*endptr) || +                    (!IS_VALID_FILE_MODE(private->dir_mode))) {                          gf_log (this->name, GF_LOG_DEBUG, -                                "setting checkpoint-interval to %"PRIu32" seconds", -                                private->checkpoint_interval); +                                "'%o' is not a valid parameter for dir-mode " +                                "option. please specify a valid parameter for " +                                "dir-mode and retry.", +                                private->dir_mode); +                        goto err;                  }          } else { -                gf_log (this->name, GF_LOG_DEBUG, -                        "setting checkpoint-interval to default: %"PRIu32" seconds", -                        private->checkpoint_interval); +                private->dir_mode = DEFAULT_DIR_MODE;          } -} -static inline void -bdb_lock_timeout_init (xlator_t *this, -                       dict_t *options, -                       struct bdb_private *private) -{ -        int   ret = -1; -        char *timeout_str = NULL; +        private->dir_mode = private->dir_mode | S_IFDIR; -        ret = dict_get_str (options, "lock-timeout", &timeout_str); +        table = CALLOC (1, sizeof (*table)); +        if (table == NULL) { +                gf_log ("bdb-ll", GF_LOG_CRITICAL, +                        "memory allocation for 'storage/bdb' internal " +                        "context table failed."); +                goto err; +        } -        if (ret == 0) { -                ret = gf_string2time (timeout_str, &private->lock_timeout); +        INIT_LIST_HEAD(&(table->b_lru)); +        INIT_LIST_HEAD(&(table->active)); +        INIT_LIST_HEAD(&(table->purge)); -                if (private->lock_timeout > 4260000) { -                        /* db allows us to DB_SET_LOCK_TIMEOUT to be set to a -                         * maximum of 71 mins (4260000 milliseconds) */ -                        gf_log (this->name, GF_LOG_DEBUG, -                                "lock-timeout %d, out of range", -                                private->lock_timeout); -                        private->lock_timeout = 0; -                } else { -                        gf_log (this->name, GF_LOG_DEBUG, -                                "setting lock-timeout to %d milliseconds", -                                private->lock_timeout); -                } -        } -} +        LOCK_INIT (&table->lock); +        LOCK_INIT (&table->checkpoint_lock); -static inline void -bdb_transaction_timeout_init (xlator_t *this, -                              dict_t *options, -                              struct bdb_private *private) -{ -        int   ret = -1; -        char *timeout_str = NULL; +        table->transaction = private->transaction; +        table->access_mode = private->access_mode; +        table->dbflags = private->dbflags; +        table->this    = this; -        ret = dict_get_str (options, "transaction-timeout", &timeout_str); +        ret = dict_get_str (options, "lru-limit", +                            &lru_limit_str); +        /* TODO: set max lockers and max txns to accomodate +         * for more than lru_limit */          if (ret == 0) { -                ret = gf_string2time (timeout_str, &private->txn_timeout); - -                if (private->txn_timeout > 4260000) { -                        /* db allows us to DB_SET_TXN_TIMEOUT to be set to -                         * a maximum of 71 mins (4260000 milliseconds) */ -                        gf_log (this->name, GF_LOG_DEBUG, -                                "transaction-timeout %d, out of range", -                                private->txn_timeout); -                        private->txn_timeout = 0; -                } else { -                        gf_log (this->name, GF_LOG_DEBUG, -                                "setting transaction-timeout to %d " -                                "milliseconds", -                                private->txn_timeout); -                } +                ret = gf_string2uint32 (lru_limit_str, +                                        &table->lru_limit); +                gf_log ("bdb-ll", GF_LOG_DEBUG, +                        "setting lru limit of 'storage/bdb' internal context" +                        "table to %d. maximum of %d unused databases can be " +                        "open at any given point of time.", +                        table->lru_limit, table->lru_limit); +        } else { +                table->lru_limit = BDB_DEFAULT_LRU_LIMIT;          } -} -static inline void -bdb_transaction_init (xlator_t *this, -                      dict_t *options, -                      struct bdb_private *private) -{ -        int   ret = -1; -        char *mode = NULL; +        ret = dict_get_str (options, "page-size", +                            &page_size_str); -        ret = dict_get_str (options, "mode", &mode); +        if (ret == 0) { +                ret = gf_string2bytesize (page_size_str, +                                          &table->page_size); +                if (ret < 0) { +                        gf_log ("bdb-ll", GF_LOG_ERROR, +                                "\"%s\" is an invalid parameter to " +                                "\"option page-size\". please specify a valid " +                                "size and retry.", +                                page_size_str); +                        goto err; +                } -        if ((ret == 0) -            && (!strcmp (mode, "cache"))) { -                gf_log (this->name, GF_LOG_DEBUG, -                        "cache mode selected"); -                private->envflags = DB_CREATE | DB_INIT_LOG | -                        DB_INIT_MPOOL | DB_THREAD; -                private->dbflags = DB_CREATE | DB_THREAD; -                private->transaction = OFF; +                if (!PAGE_SIZE_IN_RANGE(table->page_size)) { +                        gf_log ("bdb-ll", GF_LOG_ERROR, +                                "\"%s\" is out of range for Berkeley DB " +                                "page-size. allowed page-size range is %d to " +                                "%d. please specify a page-size value in the " +                                "range and retry.", +                                page_size_str, BDB_LL_PAGE_SIZE_MIN, +                                BDB_LL_PAGE_SIZE_MAX); +                        goto err; +                }          } else { -                gf_log (this->name, GF_LOG_DEBUG, -                        "persistant mode selected"); -                private->transaction = ON; -                private->envflags = DB_CREATE | DB_INIT_LOCK | DB_INIT_LOG | -                        DB_INIT_MPOOL | DB_INIT_TXN | DB_RECOVER | DB_THREAD; -                private->dbflags = DB_CREATE | DB_THREAD; - -                bdb_lock_timeout_init (this, options, private); - -                bdb_transaction_timeout_init (this, options, private); - -                bdb_log_remove_init (this, options, private); - -                bdb_checkpoint_interval_init (this, options, private); +                table->page_size = BDB_LL_PAGE_SIZE_DEFAULT;          } -} -static inline void -bdb_access_mode_init (xlator_t *this, -                      dict_t *options, -                      struct bdb_private *private) -{ -        int   ret = -1; -        char *access_mode = NULL; +        table->hash_size = BDB_DEFAULT_HASH_SIZE; +        table->b_hash = CALLOC (BDB_DEFAULT_HASH_SIZE, +                                sizeof (struct list_head)); -        ret = dict_get_str (options, "access-mode", &access_mode); +        for (idx = 0; idx < table->hash_size; idx++) +                INIT_LIST_HEAD(&(table->b_hash[idx])); -        if ((ret == 0) -            && (!strcmp (access_mode, "btree"))) { -                gf_log (this->name, GF_LOG_DEBUG, -                        "using access mode BTREE"); -                private->access_mode = DB_BTREE; -        } else { +        private->b_table = table; + +        ret = dict_get_str (options, "errfile", &errfile); +        if (ret == 0) { +                private->errfile = strdup (errfile);                  gf_log (this->name, GF_LOG_DEBUG, -                        "using access mode HASH"); -                private->access_mode = DB_HASH; +                        "using %s as error logging file for libdb (Berkeley DB " +                        "library) internal logging.", private->errfile);          } -} +        ret = dict_get_str (options, "directory", &directory); -/* bdb_db_init - initialize bdb xlator - * - * reads the options from @options dictionary and sets appropriate values in - * @this->private. also initializes DB_ENV. - * - * return: 0 on success or -1 on error - * (with logging the error through gf_log()). - */ -int -bdb_db_init (xlator_t *this, -             dict_t *options) -{ -        /* create a db entry for root */ -        int32_t        op_ret             = 0; -        bdb_private_t *private            = NULL; +        if (ret == 0) { +                ret = dict_get_str (options, "logdir", &logdir); -        private = this->private; +                if (ret < 0) { +                        gf_log ("bdb-ll", GF_LOG_DEBUG, +                                "using the database environment home " +                                "directory (%s) itself as transaction log " +                                "directory", directory); +                        private->logdir = strdup (directory); -        bdb_cache_init (this, options, private); +                } else { +                        private->logdir = strdup (logdir); -        bdb_access_mode_init (this, options, private); +                        op_ret = stat (private->logdir, &stbuf); +                        if ((op_ret != 0) +                            || (!S_ISDIR (stbuf.st_mode))) { +                                gf_log ("bdb-ll", GF_LOG_ERROR, +                                        "specified logdir %s does not exist. " +                                        "please provide a valid existing " +                                        "directory as parameter to 'option " +                                        "logdir'", +                                        private->logdir); +                                goto err; +                        } +                } -        bdb_transaction_init (this, options, private); +                private->b_table->dbenv = bdb_dbenv_init (this, directory); +                if (private->b_table->dbenv == NULL) { +                        gf_log ("bdb-ll", GF_LOG_ERROR, +                                "initialization of database environment " +                                "failed"); +                        goto err; +                } else { +                        if (private->transaction) { +                                /* all well, start the checkpointing thread */ +                                LOCK_INIT (&private->active_lock); -        { -                LOCK_INIT (&private->ino_lock); -                private->next_ino = 2; +                                LOCK (&private->active_lock); +                                { +                                        private->active = 1; +                                } +                                UNLOCK (&private->active_lock); +                                pthread_create (&private->checkpoint_thread, +                                                NULL, bdb_checkpoint, this); +                        } +                }          } -        bdb_file_mode_init (this, options, private); - -        bdb_dir_mode_init (this, options, private); - -        bdb_table_init (this, options, private); - -        bdb_errfile_init (this, options, private); +        return op_ret; +err: +        if (table) { +                FREE (table->b_hash); +                FREE (table); +        } +        if (private) { +                if (private->errfile) +                        FREE (private->errfile); -        bdb_directory_init (this, options, private); +                if (private->logdir) +                        FREE (private->logdir); +        } -        return op_ret; +        return -1;  }  | 
