1 files changed, 83 insertions, 22 deletions
diff --git a/tools/glusterfind/src/changelog.py b/tools/glusterfind/src/changelog.py
index 721b8d0ca3a..a5e9ea4288f 100644
--- a/tools/glusterfind/src/changelog.py
+++ b/tools/glusterfind/src/changelog.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/python3
 # -*- coding: utf-8 -*-
 
 # Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com/>
@@ -14,9 +14,13 @@ import sys
 import time
 import xattr
 import logging
+from gfind_py2py3 import bytearray_to_str
 from argparse import ArgumentParser, RawDescriptionHelpFormatter
 import hashlib
-import urllib
+try:
+    import urllib.parse as urllib
+except ImportError:
+    import urllib
 import codecs
 
 import libgfchangelog
@@ -40,8 +44,6 @@ history_turn_time = 0
 logger = logging.getLogger()
 
 
-
-
 def pgfid_to_path(brick, changelog_data):
     """
     For all the pgfids in table, converts into path using recursive
@@ -49,7 +51,7 @@ def pgfid_to_path(brick, changelog_data):
     """
     # pgfid1 to path1 in case of CREATE/MKNOD/MKDIR/LINK/SYMLINK
     for row in changelog_data.gfidpath_get_distinct("pgfid1", {"path1": ""}):
-        # In case of Data/Metadata only, pgfid1 will not be their
+        # In case of Data/Metadata only, pgfid1 will not be there
         if row[0] == "":
             continue
 
@@ -94,7 +96,7 @@ def populate_pgfid_and_inodegfid(brick, changelog_data):
                 path = symlink_gfid_to_path(brick, gfid)
                 path = output_path_prepare(path, args)
                 changelog_data.gfidpath_update({"path1": path},
-                                                {"gfid": gfid})
+                                               {"gfid": gfid})
             except (IOError, OSError) as e:
                 logger.warn("Error converting to path: %s" % e)
                 continue
@@ -104,15 +106,55 @@ def populate_pgfid_and_inodegfid(brick, changelog_data):
                 changelog_data.inodegfid_add(os.stat(p).st_ino, gfid)
                 file_xattrs = xattr.list(p)
                 for x in file_xattrs:
-                    if x.startswith("trusted.pgfid."):
+                    x_str = bytearray_to_str(x)
+                    if x_str.startswith("trusted.pgfid."):
                         # PGFID in pgfid table
-                        changelog_data.pgfid_add(x.split(".")[-1])
+                        changelog_data.pgfid_add(x_str.split(".")[-1])
             except (IOError, OSError):
                 # All OS Errors ignored, since failures will be logged
                 # in End. All GFIDs present in gfidpath table
                 continue
 
 
+def enum_hard_links_using_gfid2path(brick, gfid, args):
+    hardlinks = []
+    p = os.path.join(brick, ".glusterfs", gfid[0:2], gfid[2:4], gfid)
+    if not os.path.isdir(p):
+        # we have a symlink or a normal file
+        try:
+            file_xattrs = xattr.list(p)
+            for x in file_xattrs:
+                x_str = bytearray_to_str(x)
+                if x_str.startswith("trusted.gfid2path."):
+                    # get the value for the xattr i.e. <PGFID>/<BN>
+                    v = xattr.getxattr(p, x_str)
+                    v_str = bytearray_to_str(v)
+                    pgfid, bn = v_str.split(os.sep)
+                    try:
+                        path = symlink_gfid_to_path(brick, pgfid)
+                        fullpath = os.path.join(path, bn)
+                        fullpath = output_path_prepare(fullpath, args)
+                        hardlinks.append(fullpath)
+                    except (IOError, OSError) as e:
+                        logger.warn("Error converting to path: %s" % e)
+                        continue
+        except (IOError, OSError):
+            pass
+    return hardlinks
+
+
+def gfid_to_all_paths_using_gfid2path(brick, changelog_data, args):
+    path = ""
+    for row in changelog_data.gfidpath_get({"path1": "", "type": "MODIFY"}):
+        gfid = row[3].strip()
+        logger.debug("Processing gfid %s" % gfid)
+        hardlinks = enum_hard_links_using_gfid2path(brick, gfid, args)
+
+        path = ",".join(hardlinks)
+
+        changelog_data.gfidpath_update({"path1": path}, {"gfid": gfid})
+
+
 def gfid_to_path_using_pgfid(brick, changelog_data, args):
     """
     For all the pgfids collected, Converts to Path and
@@ -160,10 +202,10 @@ def gfid_to_path_using_pgfid(brick, changelog_data, args):
         try:
             path = symlink_gfid_to_path(brick, row[0])
             find(os.path.join(brick, path),
-                callback_func=output_callback,
-                filter_func=inode_filter,
-                ignore_dirs=ignore_dirs,
-                subdirs_crawl=False)
+                 callback_func=output_callback,
+                 filter_func=inode_filter,
+                 ignore_dirs=ignore_dirs,
+                 subdirs_crawl=False)
         except (IOError, OSError) as e:
             logger.warn("Error converting to path: %s" % e)
             continue
@@ -245,7 +287,7 @@ def get_changes(brick, hash_dir, log_file, start, end, args):
     session_dir = os.path.join(conf.get_opt("session_dir"),
                                args.session)
     status_file = os.path.join(session_dir, args.volume,
-                               "%s.status" % urllib.quote_plus(args.brick))
+                     "%s.status" % urllib.quote_plus(args.brick))
 
     # Get previous session
     try:
@@ -272,9 +314,10 @@ def get_changes(brick, hash_dir, log_file, start, end, args):
         actual_end = libgfchangelog.cl_history_changelog(
             cl_path, start, end, CHANGELOGAPI_NUM_WORKERS)
     except libgfchangelog.ChangelogException as e:
-        fail("%s Historical Changelogs not available: %s" % (brick, e),
-             logger=logger)
+        fail("%s: %s Historical Changelogs not available: %s" %
+             (args.node, brick, e), logger=logger)
 
+    logger.info("[1/4] Starting changelog parsing ...")
     try:
         # scan followed by getchanges till scan returns zero.
         # history_scan() is blocking call, till it gets the number
@@ -296,25 +339,34 @@ def get_changes(brick, hash_dir, log_file, start, end, args):
                     libgfchangelog.cl_history_done(change)
                 except IOError as e:
                     logger.warn("Error parsing changelog file %s: %s" %
-                        (change, e))
+                                (change, e))
 
             changelog_data.commit()
     except libgfchangelog.ChangelogException as e:
         fail("%s Error during Changelog Crawl: %s" % (brick, e),
              logger=logger)
 
+    logger.info("[1/4] Finished changelog parsing.")
+
     # Convert all pgfid available from Changelogs
+    logger.info("[2/4] Starting 'pgfid to path' conversions ...")
     pgfid_to_path(brick, changelog_data)
     changelog_data.commit()
+    logger.info("[2/4] Finished 'pgfid to path' conversions.")
 
-    # Convert all GFIDs for which no other additional details available
-    gfid_to_path_using_pgfid(brick, changelog_data, args)
+    # Convert all gfids recorded for data and metadata to all hardlink paths
+    logger.info("[3/4] Starting 'gfid2path' conversions ...")
+    gfid_to_all_paths_using_gfid2path(brick, changelog_data, args)
     changelog_data.commit()
+    logger.info("[3/4] Finished 'gfid2path' conversions.")
 
     # If some GFIDs fail to get converted from previous step,
     # convert using find
+    logger.info("[4/4] Starting 'gfid to path using batchfind' "
+                "conversions ...")
     gfid_to_path_using_batchfind(brick, changelog_data)
     changelog_data.commit()
+    logger.info("[4/4] Finished 'gfid to path using batchfind' conversions.")
 
     return actual_end
 
@@ -328,7 +380,7 @@ def changelog_crawl(brick, start, end, args):
 
     # WORKING_DIR/BRICKHASH/OUTFILE
     working_dir = os.path.dirname(args.outfile)
-    brickhash = hashlib.sha1(brick)
+    brickhash = hashlib.sha1(brick.encode())
     brickhash = str(brickhash.hexdigest())
     working_dir = os.path.join(working_dir, brickhash)
 
@@ -350,9 +402,11 @@ def _get_args():
 
     parser.add_argument("session", help="Session Name")
     parser.add_argument("volume", help="Volume Name")
+    parser.add_argument("node", help="Node Name")
     parser.add_argument("brick", help="Brick Name")
     parser.add_argument("outfile", help="Output File")
     parser.add_argument("start", help="Start Time", type=int)
+    parser.add_argument("end", help="End Time", type=int)
     parser.add_argument("--only-query", help="Query mode only (no session)",
                         action="store_true")
     parser.add_argument("--debug", help="Debug", action="store_true")
@@ -361,6 +415,7 @@ def _get_args():
                         action="store_true")
     parser.add_argument("--output-prefix", help="File prefix in output",
                         default=".")
+    parser.add_argument("--type",default="both")
     parser.add_argument("-N", "--only-namespace-changes",
                         help="List only namespace changes",
                         action="store_true")
@@ -380,13 +435,15 @@ if __name__ == "__main__":
 
     session_dir = os.path.join(conf.get_opt("session_dir"), args.session)
     status_file = os.path.join(session_dir, args.volume,
-                               "%s.status" % urllib.quote_plus(args.brick))
+                     "%s.status" % urllib.quote_plus(args.brick))
     status_file_pre = status_file + ".pre"
     mkdirp(os.path.join(session_dir, args.volume), exit_on_err=True,
            logger=logger)
 
+    end = -1
     if args.only_query:
         start = args.start
+        end = args.end
     else:
         try:
             with open(status_file) as f:
@@ -394,13 +451,17 @@ if __name__ == "__main__":
         except (ValueError, OSError, IOError):
             start = args.start
 
-    end = int(time.time()) - get_changelog_rollover_time(args.volume)
+    # end time is optional; so a -1 may be sent to use the default method of
+    # identifying the end time
+    if end == -1:
+        end = int(time.time()) - get_changelog_rollover_time(args.volume)
+
     logger.info("%s Started Changelog Crawl - Start: %s End: %s" % (args.brick,
                                                                     start,
                                                                     end))
     actual_end = changelog_crawl(args.brick, start, end, args)
     if not args.only_query:
-        with open(status_file_pre, "w", buffering=0) as f:
+        with open(status_file_pre, "w") as f:
             f.write(str(actual_end))
 
     logger.info("%s Finished Changelog Crawl - End: %s" % (args.brick,