diff options
author | Aravinda VK <avishwan@redhat.com> | 2015-02-18 19:07:23 +0530 |
---|---|---|
committer | Vijay Bellur <vbellur@redhat.com> | 2015-03-18 02:54:25 -0700 |
commit | 7e98a0e0b1e346f89047b57e495f66f1b4223997 (patch) | |
tree | 0524dda8ad7066acab8d50537fe4f6e08f19fd29 /tools/glusterfind/src/changelog.py | |
parent | c9b1aea9b3f7087336c75548b59574f251813136 (diff) |
feature/glusterfind: A tool to find incremental changes
Documentation is available in patch:
http://review.gluster.org/#/c/9800/
A tool which helps to get list of modified files or list of all files in
GlusterFS Volume using Changelog or find command.
Usage
=====
glusterfind --help
Create:
-------
glusterfind create --help
The tool creates status file $GLUSTERD_WORKDIR/SESSION/VOLUME/status
and records current timestamp to initiate the session. This timestamp
will be used as start time for next runs.
As part of create also generates ssh key and distributes to all peers.
and enables build.pgfid and changelog using volume set command.
Pre:
----
glusterfind pre --help
This command is used to generate the list of files modified after session
creation time or after last run. To get list of all files/dirs in Volume,
run pre command with `--full` argument.
The tool gets all nodes details using gluster volume info and runs node
agent for each brick in respective nodes via ssh command. Once these node
agents generate the output file, tool copies to local using scp. Merges all
the output files to generate the final output file.
Post:
-----
glusterfind post --help
After consuming the list, this sub command is called to update the session
time based on pre command status file.
List:
-----
glusterfind list --help
To view all the sessions
Delete:
-------
glusterfind delete --help
Delete session.
Known Issues
------------
1. Deleted files will not get listed, since we can't convert GFID to
Path if file/dir is deleted.
2. Only new name will get listed if Renamed.
3. All hardlinks will get listed.
Change-Id: I82991feb0aea85cb6ec035fddbf80a2b276e86b0
BUG: 1193893
Signed-off-by: Aravinda VK <avishwan@redhat.com>
Reviewed-on: http://review.gluster.org/9682
Tested-by: Gluster Build System <jenkins@build.gluster.com>
Reviewed-by: Venky Shankar <vshankar@redhat.com>
Reviewed-by: Prashanth Pai <ppai@redhat.com>
Reviewed-by: Kotresh HR <khiremat@redhat.com>
Reviewed-by: Vijay Bellur <vbellur@redhat.com>
Diffstat (limited to 'tools/glusterfind/src/changelog.py')
-rw-r--r-- | tools/glusterfind/src/changelog.py | 309 |
1 files changed, 309 insertions, 0 deletions
diff --git a/tools/glusterfind/src/changelog.py b/tools/glusterfind/src/changelog.py new file mode 100644 index 00000000000..b7697ea5030 --- /dev/null +++ b/tools/glusterfind/src/changelog.py @@ -0,0 +1,309 @@ +#!/usr/bin/env python + +# Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com/> +# This file is part of GlusterFS. +# +# This file is licensed to you under your choice of the GNU Lesser +# General Public License, version 3 or any later version (LGPLv3 or +# later), or the GNU General Public License, version 2 (GPLv2), in all +# cases as published by the Free Software Foundation. + +import os +import sys +import time +import xattr +from errno import ENOENT +import logging +from argparse import ArgumentParser, RawDescriptionHelpFormatter +import hashlib + +import libgfchangelog +from utils import create_file, mkdirp, execute, symlink_gfid_to_path +from utils import fail, setup_logger, output_write, find +import conf + + +CHANGELOG_LOG_LEVEL = 9 +CHANGELOG_CONN_RETRIES = 5 +CHANGELOGAPI_NUM_WORKERS = 3 +PROG_DESCRIPTION = """ +Changelog Crawler +""" +history_turns = 0 +history_turn_time = 0 + +logger = logging.getLogger() + + +def gfid_to_path_using_batchfind(brick, gfids_file, output_file): + """ + find -samefile gets the inode number and crawls entire namespace + to get the list of files/dirs having same inode number. + Do find without any option, except the ignore directory option, + print the output in <INODE_NUM> <PATH> format, use this output + to look into in-memory dictionary of inode numbers got from the + list of GFIDs + """ + with open(output_file, "a+") as fout: + inode_dict = {} + with open(gfids_file) as f: + for gfid in f: + gfid = gfid.strip() + backend_path = os.path.join(brick, ".glusterfs", + gfid[0:2], gfid[2:4], gfid) + + try: + inode_dict[str(os.stat(backend_path).st_ino)] = 1 + except (IOError, OSError) as e: + if e.errno == ENOENT: + continue + else: + fail("%s Failed to convert to path from " + "GFID %s: %s" % (brick, gfid, e), logger=logger) + + if not inode_dict: + return + + def inode_filter(path): + try: + st = os.lstat(path) + except (OSError, IOError) as e: + if e.errno == ENOENT: + st = None + else: + raise + + if st and inode_dict.get(str(st.st_ino), None): + return True + + return False + + brick_path_len = len(brick) + + def output_callback(path): + path = path.strip() + path = path[brick_path_len+1:] + output_write(fout, path, args.output_prefix) + + # Length of brick path, to remove from output path + find(brick, callback_func=output_callback, + filter_func=inode_filter, + ignore_dirs=[".glusterfs"]) + + fout.flush() + os.fsync(fout.fileno()) + + +def gfid_to_path_using_pgfid(brick, gfids_file, output_file, outfile_failures): + """ + Parent GFID is saved as xattr, collect Parent GFIDs from all + the files from gfids_file. Convert parent GFID to path and Crawl + each directories to get the list of files/dirs having same inode number. + Do find with maxdepth as 1 and print the output in <INODE_NUM> <PATH> + format, use this output to look into in memory dictionary of inode + numbers got from the list of GFIDs + """ + with open(output_file, "a+") as fout: + pgfids = set() + inode_dict = {} + with open(gfids_file) as f: + for gfid in f: + gfid = gfid.strip() + p = os.path.join(brick, + ".glusterfs", + gfid[0:2], + gfid[2:4], + gfid) + if os.path.islink(p): + path = symlink_gfid_to_path(brick, gfid) + output_write(fout, path, args.output_prefix) + else: + try: + inode_dict[str(os.stat(p).st_ino)] = 1 + file_xattrs = xattr.list(p) + num_parent_gfid = 0 + for x in file_xattrs: + if x.startswith("trusted.pgfid."): + num_parent_gfid += 1 + pgfids.add(x.split(".")[-1]) + + if num_parent_gfid == 0: + with open(outfile_failures, "a") as f: + f.write("%s\n" % gfid) + f.flush() + os.fsync(f.fileno()) + + except (IOError, OSError) as e: + if e.errno == ENOENT: + continue + else: + fail("%s Failed to convert to path from " + "GFID %s: %s" % (brick, gfid, e), + logger=logger) + + if not inode_dict: + return + + def inode_filter(path): + try: + st = os.lstat(path) + except (OSError, IOError) as e: + if e.errno == ENOENT: + st = None + else: + raise + + if st and inode_dict.get(str(st.st_ino), None): + return True + + return False + + # Length of brick path, to remove from output path + brick_path_len = len(brick) + + def output_callback(path): + path = path.strip() + path = path[brick_path_len+1:] + output_write(fout, path, args.output_prefix) + + for pgfid in pgfids: + path = symlink_gfid_to_path(brick, pgfid) + find(os.path.join(brick, path), + callback_func=output_callback, + filter_func=inode_filter, + ignore_dirs=[".glusterfs"], + subdirs_crawl=False) + + fout.flush() + os.fsync(fout.fileno()) + + +def sort_unique(filename): + execute(["sort", "-u", "-o", filename, filename], + exit_msg="Sort failed", logger=logger) + + +def get_changes(brick, hash_dir, log_file, end, args): + """ + Makes use of libgfchangelog's history API to get changelogs + containing changes from start and end time. Further collects + the modified gfids from the changelogs and writes the list + of gfid to 'gfid_list' file. + """ + try: + libgfchangelog.cl_register(brick, hash_dir, log_file, + CHANGELOG_LOG_LEVEL, CHANGELOG_CONN_RETRIES) + except libgfchangelog.ChangelogException as e: + fail("%s Changelog register failed: %s" % (brick, e), logger=logger) + + # Output files to record GFIDs and GFID to Path failure GFIDs + gfid_list_path = args.outfile + ".gfids" + gfid_list_failures_file = gfid_list_path + ".failures" + create_file(gfid_list_path, exit_on_err=True, logger=logger) + create_file(gfid_list_failures_file, exit_on_err=True, logger=logger) + + # Changelogs path(Hard coded to BRICK/.glusterfs/changelogs + cl_path = os.path.join(brick, ".glusterfs/changelogs") + + # Fail if History fails for requested Start and End + try: + actual_end = libgfchangelog.cl_history_changelog( + cl_path, args.start, end, CHANGELOGAPI_NUM_WORKERS) + except libgfchangelog.ChangelogException as e: + fail("%s Historical Changelogs not available: %s" % (brick, e), + logger=logger) + + try: + # scan followed by getchanges till scan returns zero. + # history_scan() is blocking call, till it gets the number + # of changelogs to process. Returns zero when no changelogs + # to be processed. returns positive value as number of changelogs + # to be processed, which will be fetched using + # history_getchanges() + changes = [] + while libgfchangelog.cl_history_scan() > 0: + changes += libgfchangelog.cl_history_getchanges() + + if changes: + with open(gfid_list_path, 'a+') as fgfid: + for change in changes: + with open(change) as f: + for line in f: + # Space delimited list, collect GFID + details = line.split() + fgfid.write("%s\n" % details[1]) + + libgfchangelog.cl_history_done(change) + fgfid.flush() + os.fsync(fgfid.fileno()) + except libgfchangelog.ChangelogException as e: + fail("%s Error during Changelog Crawl: %s" % (brick, e), + logger=logger) + + # If TS returned from history_changelog is < end time + # then FS crawl may be required, since history is only available + # till TS returned from history_changelog + if actual_end < end: + fail("Partial History available with Changelog", 2, logger=logger) + + sort_unique(gfid_list_path) + gfid_to_path_using_pgfid(brick, gfid_list_path, + args.outfile, gfid_list_failures_file) + gfid_to_path_using_batchfind(brick, gfid_list_failures_file, args.outfile) + + +def changelog_crawl(brick, end, args): + """ + Init function, prepares working dir and calls Changelog query + """ + if brick.endswith("/"): + brick = brick[0:len(brick)-1] + + # WORKING_DIR/BRICKHASH/OUTFILE + working_dir = os.path.dirname(args.outfile) + brickhash = hashlib.sha1(brick) + brickhash = str(brickhash.hexdigest()) + working_dir = os.path.join(working_dir, brickhash) + + mkdirp(working_dir, exit_on_err=True, logger=logger) + create_file(args.outfile, exit_on_err=True, logger=logger) + create_file(args.outfile + ".gfids", exit_on_err=True, logger=logger) + + log_file = os.path.join(conf.get_opt("log_dir"), + args.session, + args.volume, + "changelog.%s.log" % brickhash) + + logger.info("%s Started Changelog Crawl. Start: %s, End: %s" + % (brick, args.start, end)) + get_changes(brick, working_dir, log_file, end, args) + + +def _get_args(): + parser = ArgumentParser(formatter_class=RawDescriptionHelpFormatter, + description=PROG_DESCRIPTION) + + parser.add_argument("session", help="Session Name") + parser.add_argument("volume", help="Volume Name") + parser.add_argument("brick", help="Brick Name") + parser.add_argument("outfile", help="Output File") + parser.add_argument("start", help="Start Time", type=int) + parser.add_argument("--debug", help="Debug", action="store_true") + parser.add_argument("--output-prefix", help="File prefix in output", + default=".") + + return parser.parse_args() + + +if __name__ == "__main__": + args = _get_args() + mkdirp(os.path.join(conf.get_opt("log_dir"), args.session, args.volume), + exit_on_err=True) + log_file = os.path.join(conf.get_opt("log_dir"), + args.session, + args.volume, + "changelog.log") + setup_logger(logger, log_file, args.debug) + end = int(time.time()) - int(conf.get_opt("changelog_rollover_time")) + changelog_crawl(args.brick, end, args) + sys.exit(0) |