5 files changed, 216 insertions, 8 deletions
diff --git a/xlators/nfs/server/src/nfs-fops.c b/xlators/nfs/server/src/nfs-fops.c
index 6e2b334842b..87c511d546f 100644
--- a/xlators/nfs/server/src/nfs-fops.c
+++ b/xlators/nfs/server/src/nfs-fops.c
@@ -22,6 +22,9 @@
 #include "config.h"
 #endif
 
+#include <grp.h>
+#include <pwd.h>
+
 #include "dict.h"
 #include "xlator.h"
 #include "iobuf.h"
@@ -32,9 +35,143 @@
 #include "inode.h"
 #include "nfs-common.h"
 #include "nfs3-helpers.h"
+#include "nfs-mem-types.h"
 #include <libgen.h>
 #include <semaphore.h>
 
+/*
+ * We treat this as a very simple set-associative LRU cache, with entries aged
+ * out after a configurable interval.  Hardly rocket science, but lots of
+ * details to worry about.
+ */
+#define BUCKET_START(p,n)       ((p) + ((n) * AUX_GID_CACHE_ASSOC))
+
+/*
+ * Overwrite root's auxiliary groups with the server's own list for root->uid
+ * (getpwuid_r + getgrouplist), cached per UID for aux_gid_max_age seconds.
+ * No-op if server_aux_gids is off; root is left untouched if a lookup fails.
+ */
+void
+nfs_fix_groups (xlator_t *this, call_stack_t *root)
+{
+        struct passwd    mypw;
+        char             mystrs[1024];
+        struct passwd    *result;
+        gid_t            mygroups[GF_MAX_AUX_GROUPS];
+        int              ngroups;
+        int              i;
+        struct nfs_state *priv = this->private;
+        aux_gid_list_t   *agl = NULL;
+        aux_gid_list_t   *last = NULL;
+        int              bucket = 0;
+        time_t           now = 0;
+
+        if (!priv->server_aux_gids) {
+                return;
+        }
+
+        LOCK(&priv->aux_gid_lock);
+        now = time(NULL);
+        bucket = root->uid % priv->aux_gid_nbuckets;
+        agl = BUCKET_START(priv->aux_gid_cache,bucket);
+        for (i = 0; i < AUX_GID_CACHE_ASSOC; ++i, ++agl) {
+                if (!agl->gid_list || (agl->uid != root->uid)) {
+                        continue;
+                }
+                /*
+                 * We don't put new entries in the cache when expiration=0, but
+                 * there might be entries still in there if expiration was
+                 * changed very recently.  Writing the check this way ensures
+                 * that they're not used.
+                 */
+                if (now < agl->deadline) {
+                        for (ngroups = 0; ngroups < agl->gid_count; ++ngroups) {
+                                root->groups[ngroups] = agl->gid_list[ngroups];
+                        }
+                        UNLOCK(&priv->aux_gid_lock);
+                        root->ngrps = ngroups;
+                        return;
+                }
+                /*
+                 * Expired.  A UID has at most one entry per bucket, and this
+                 * one is replaced below once the fresh list is available.
+                 */
+                break;
+        }
+        UNLOCK(&priv->aux_gid_lock);
+
+        if (getpwuid_r(root->uid,&mypw,mystrs,sizeof(mystrs),&result) != 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "getpwuid_r(%u) failed", root->uid);
+                return;
+        }
+
+        if (!result) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "getpwuid_r(%u) found nothing", root->uid);
+                return;
+        }
+
+        gf_log (this->name, GF_LOG_TRACE, "mapped %u => %s",
+                root->uid, result->pw_name);
+
+        ngroups = GF_MAX_AUX_GROUPS;
+        if (getgrouplist(result->pw_name,root->gid,mygroups,&ngroups) == -1) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "could not map %s to group list", result->pw_name);
+                return;
+        }
+
+        if (priv->aux_gid_max_age) {
+                LOCK(&priv->aux_gid_lock);
+                /* Bucket should still be valid from before. */
+                agl = BUCKET_START(priv->aux_gid_cache,bucket);
+                last = agl + AUX_GID_CACHE_ASSOC - 1;
+                /*
+                 * Reuse the first slot that is free or already holds this UID
+                 * (expired, or stored by a racing request); a stale entry left
+                 * in place would shadow the new one on every lookup.
+                 */
+                for (i = 0; i < AUX_GID_CACHE_ASSOC; ++i, ++agl) {
+                        if (!agl->gid_list || (agl->uid == root->uid)) {
+                                break;
+                        }
+                }
+                if (i >= AUX_GID_CACHE_ASSOC) {
+                        /* Full of other UIDs: evict the oldest (first). */
+                        agl = BUCKET_START(priv->aux_gid_cache,bucket);
+                }
+                /*
+                 * Entries are kept oldest-first, so after dropping one we
+                 * slide the newer ones down and store the new list behind
+                 * them; overwriting in place would thrash a single slot.
+                 */
+                if (agl->gid_list) {
+                        GF_FREE(agl->gid_list);
+                        for (; (agl < last) && agl[1].gid_list; ++agl) {
+                                agl[0] = agl[1];
+                        }
+                }
+                agl->gid_list = GF_CALLOC(ngroups,sizeof(gid_t),
+                                          gf_nfs_mt_aux_gids);
+                if (agl->gid_list) {
+                        /* It's not fatal if the alloc failed. */
+                        agl->uid = root->uid;
+                        agl->gid_count = ngroups;
+                        memcpy(agl->gid_list,mygroups,sizeof(gid_t)*ngroups);
+                        agl->deadline = now + priv->aux_gid_max_age;
+                }
+                UNLOCK(&priv->aux_gid_lock);
+        }
+
+        for (i = 0; i < ngroups; ++i) {
+                gf_log (this->name, GF_LOG_TRACE,
+                        "%s is in group %u", result->pw_name, mygroups[i]);
+                root->groups[i] = mygroups[i];
+        }
+        root->ngrps = ngroups;
+}
+
 struct nfs_fop_local *
 nfs_fop_local_init (xlator_t *nfsx)
 {
@@ -122,18 +259,24 @@ nfs_create_frame (xlator_t *xl, nfs_user_t *nfu)
         frame->root->uid = nfu->uid;
         frame->root->gid = nfu->gids[NFS_PRIMGID_IDX];
         frame->root->lk_owner = nfu->lk_owner;
-        if (nfu->ngrps == 1)
-                goto err;       /* Done, we only got primary gid */
 
-        frame->root->ngrps = nfu->ngrps - 1;
+        if (nfu->ngrps != 1) {
+                frame->root->ngrps = nfu->ngrps - 1;
 
-        gf_log (GF_NFS, GF_LOG_TRACE,"uid: %d, gid %d, gids: %d",
-                frame->root->uid, frame->root->gid, frame->root->ngrps);
-        for(y = 0, x = 1;  y < frame->root->ngrps; x++,y++) {
-                gf_log (GF_NFS, GF_LOG_TRACE, "gid: %d", nfu->gids[x]);
-                frame->root->groups[y] = nfu->gids[x];
+                gf_log (GF_NFS, GF_LOG_TRACE,"uid: %d, gid %d, gids: %d",
+                        frame->root->uid, frame->root->gid, frame->root->ngrps);
+                for(y = 0, x = 1;  y < frame->root->ngrps; x++,y++) {
+                        gf_log (GF_NFS, GF_LOG_TRACE, "gid: %d", nfu->gids[x]);
+                        frame->root->groups[y] = nfu->gids[x];
+                }
         }
 
+        /*
+         * It's tempting to do this *instead* of using nfu above, but we need
+         * to have those values in case nfs_fix_groups doesn't do anything.
+         */
+        nfs_fix_groups(xl,frame->root);
+
 err:
         return frame;
 }
diff --git a/xlators/nfs/server/src/nfs-mem-types.h b/xlators/nfs/server/src/nfs-mem-types.h
index d9edc95b90c..de25b08a826 100644
--- a/xlators/nfs/server/src/nfs-mem-types.h
+++ b/xlators/nfs/server/src/nfs-mem-types.h
@@ -52,6 +52,7 @@ enum gf_nfs_mem_types_ {
         gf_nfs_mt_nlm4_fde,
         gf_nfs_mt_nlm4_nlmclnt,
         gf_nfs_mt_nlm4_share,
+        gf_nfs_mt_aux_gids,
         gf_nfs_mt_end
 };
 #endif
diff --git a/xlators/nfs/server/src/nfs.c b/xlators/nfs/server/src/nfs.c
index 6ed3614296f..ba63bcd7a8c 100644
--- a/xlators/nfs/server/src/nfs.c
+++ b/xlators/nfs/server/src/nfs.c
@@ -41,6 +41,10 @@
 #include "nfs-mem-types.h"
 #include "nfs3-helpers.h"
 #include "nlm4.h"
+#include "options.h"
+
+#define OPT_SERVER_AUX_GIDS             "nfs.server-aux-gids"
+#define OPT_SERVER_GID_CACHE_TIMEOUT    "nfs.server.aux-gid-timeout"
 
 /* Every NFS version must call this function with the init function
  * for its particular version.
@@ -730,6 +734,11 @@ nfs_init_state (xlator_t *this)
                 }
         }
 
+        GF_OPTION_INIT (OPT_SERVER_AUX_GIDS, nfs->server_aux_gids,
+                        bool, free_foppool);
+        GF_OPTION_INIT (OPT_SERVER_GID_CACHE_TIMEOUT,nfs->aux_gid_max_age,
+                        uint32, free_foppool);
+
         if (stat("/sbin/rpc.statd", &stbuf) == -1) {
                 gf_log (GF_NFS, GF_LOG_WARNING, "/sbin/rpc.statd not found. "
                         "Disabling NLM");
@@ -818,6 +827,9 @@ init (xlator_t *this) {
                 goto err;
         }
 
+        LOCK_INIT(&nfs->aux_gid_lock);
+        nfs->aux_gid_nbuckets = AUX_GID_CACHE_BUCKETS;
+
         gf_log (GF_NFS, GF_LOG_INFO, "NFS service started");
 err:
 
@@ -1223,6 +1235,24 @@ struct volume_options options[] = {
                          "Needed by Solaris NFS clients if NLM support is"
                          "needed"
         },
+        { .key = {OPT_SERVER_AUX_GIDS},
+          .type = GF_OPTION_TYPE_BOOL,
+          .default_value = "off",
+          .description = "Let the server look up which groups a user belongs "
+                         "to, overwriting the list passed from the client. "
+                         "This enables support for group lists longer than "
+                         "can be passed through the NFS protocol, but is not "
+                         "secure unless users and groups are well synchronized "
+                         "between clients and servers."
+        },
+        { .key = {OPT_SERVER_GID_CACHE_TIMEOUT},
+          .type = GF_OPTION_TYPE_INT,
+          .min = 0,
+          .max = 3600,
+          .default_value = "5",
+          .description = "Number of seconds to cache auxiliary-GID data, when "
+                         OPT_SERVER_AUX_GIDS " is set."
+        },
 
         { .key  = {NULL} },
 };
diff --git a/xlators/nfs/server/src/nfs.h b/xlators/nfs/server/src/nfs.h
index 4c6d039f8d2..d2a0c134318 100644
--- a/xlators/nfs/server/src/nfs.h
+++ b/xlators/nfs/server/src/nfs.h
@@ -65,6 +65,27 @@ struct nfs_initer_list {
         rpcsvc_program_t        *program;
 };
 
+/*
+ * TBD: make the cache size tunable
+ *
+ * The current size represents a pretty trivial amount of memory, and should
+ * provide good hit rates even for quite busy systems.  If we ever want to
+ * support really large cache sizes, we'll need to do dynamic allocation
+ * instead of just defining an array within nfs_state.  It doesn't make a
+ * whole lot of sense to change the associativity, because it won't improve
+ * hit rates all that much and will increase the maintenance cost as we have
+ * to scan more entries with every lookup/update.
+ */
+#define AUX_GID_CACHE_ASSOC     4
+#define AUX_GID_CACHE_BUCKETS   256
+#define AUX_GID_CACHE_SIZE      (AUX_GID_CACHE_ASSOC * AUX_GID_CACHE_BUCKETS)
+
+typedef struct {                        /* one slot of the aux-GID cache */
+        uid_t                   uid;            /* user the list belongs to */
+        int                     gid_count;      /* entries in gid_list */
+        gid_t                   *gid_list;      /* GF_CALLOC'd; NULL = free slot */
+        time_t                  deadline;       /* usable while time() < this */
+} aux_gid_list_t;
 
 struct nfs_state {
         rpcsvc_t                *rpcsvc;
@@ -88,6 +109,11 @@ struct nfs_state {
         int                     enable_nlm;
         int                     mount_udp;
         struct rpc_clnt         *rpc_clnt;
+        gf_boolean_t            server_aux_gids;
+        gf_lock_t               aux_gid_lock;
+        uint32_t                aux_gid_max_age;
+        unsigned int            aux_gid_nbuckets;
+        aux_gid_list_t          aux_gid_cache[AUX_GID_CACHE_SIZE];
 };
 
 #define gf_nfs_dvm_on(nfsstt)   (((struct nfs_state *)nfsstt)->dynamicvolumes == GF_NFS_DVM_ON)
@@ -126,4 +152,7 @@ nfs_request_primary_user_init (nfs_user_t *nfu, rpcsvc_request_t *req,
                                uid_t uid, gid_t gid);
 extern int
 nfs_subvolume_started (struct nfs_state *nfs, xlator_t *xl);
+
+extern void
+nfs_fix_groups (xlator_t *this, call_stack_t *root);
 #endif
diff --git a/xlators/nfs/server/src/nlm4.c b/xlators/nfs/server/src/nlm4.c
index 5acc1b80978..98012203bb5 100644
--- a/xlators/nfs/server/src/nlm4.c
+++ b/xlators/nfs/server/src/nlm4.c
@@ -646,6 +646,11 @@ nlm4_file_open_and_resume(nfs3_call_state_t *cs, nlm4_resume_fn_t resume)
         frame->root->uid = 0;
         frame->root->gid = 0;
         frame->local = cs;
+        /*
+         * This is the only place that we call STACK_WIND without nfs_fix_groups,
+         * because in this particular case the relevant identity is in lk_owner and
+         * we don't care about the fields that nfs_fix_groups would set up.
+         */
         STACK_WIND_COOKIE (frame, nlm4_file_open_cbk, cs->vol, cs->vol,
                           cs->vol->fops->open, &cs->resolvedloc, O_RDWR,
                           cs->fd, NULL);