diff options
| author | vamahaja <vamahaja@redhat.com> | 2018-10-04 17:30:11 +0530 |
|---|---|---|
| committer | vamahaja <vamahaja@redhat.com> | 2018-12-10 09:22:38 +0530 |
| commit | 024f0f0bc9f8c969c8f5a3ec494cee0c019f1868 (patch) | |
| tree | 8a000350e82a43e01e90b31d437cf3973d29f9ff | |
| parent | 31492fa754bd77e583564d8356822500078d1e2c (diff) | |
[CNS-1314][CNS-1285] Restart gluster block volumes and validate
Change-Id: Ib7e3125e5120a91fe431816b33be4d4e6f15078e
Signed-off-by: vamahaja <vamahaja@redhat.com>
| -rw-r--r-- | cns-libs/cnslibs/common/gluster_ops.py | 296 | ||||
| -rw-r--r-- | cns-libs/cnslibs/common/heketi_ops.py | 46 | ||||
| -rw-r--r-- | cns-libs/cnslibs/common/openshift_ops.py | 47 | ||||
| -rw-r--r-- | tests/functional/common/gluster_stability/test_gluster_services_restart.py | 137 |
4 files changed, 472 insertions, 54 deletions
diff --git a/cns-libs/cnslibs/common/gluster_ops.py b/cns-libs/cnslibs/common/gluster_ops.py new file mode 100644 index 00000000..76b3bc7d --- /dev/null +++ b/cns-libs/cnslibs/common/gluster_ops.py @@ -0,0 +1,296 @@ +import six +import time +import json +import re + +from glusto.core import Glusto as g +from glustolibs.gluster.heal_libs import is_heal_complete +from glustolibs.gluster.volume_ops import ( + get_volume_status, + get_volume_list, + volume_status, + volume_start, + volume_stop +) +from glustolibs.gluster.block_ops import block_list +from cnslibs.common.openshift_ops import ( + oc_get_pods, + oc_rsh, + wait_for_process_to_kill_on_pod +) +from cnslibs.common.heketi_ops import heketi_blockvolume_info +from cnslibs.common import exceptions, podcmd +from cnslibs.common import waiter + + +def _get_gluster_pod(gluster_pod, hostname=None): + """create glusto.podcmd object if gluster_pod is string and + hostname is given else returns gluster_pod object given + + Args: + gluster_pod (podcmd | str): gluster pod class object has gluster + pod and ocp master node or gluster + pod name + hostname (str): master node on which gluster pod exists + """ + if isinstance(gluster_pod, podcmd.Pod): + return gluster_pod + elif isinstance(gluster_pod, six.string_types): + if hostname: + return podcmd.Pod(hostname, gluster_pod) + else: + raise exceptions.ExecutionError( + "gluster pod is string '%s' but hostname '%s' not valid" % ( + gluster_pod, hostname) + ) + else: + raise exceptions.ExecutionError( + "invalid gluster pod parameter '%s', '%s'" % ( + gluster_pod, type(gluster_pod)) + ) + + +@podcmd.GlustoPod() +def wait_to_heal_complete( + gluster_pod, hostname=None, timeout=300, wait_step=5): + """Monitors heal for volumes on gluster + gluster_pod (podcmd | str): gluster pod class object has gluster + pod and ocp master node or gluster + pod name + hostname (str): master node on which gluster pod exists + """ + gluster_pod = _get_gluster_pod(gluster_pod, hostname) + + 
gluster_vol_list = get_volume_list(gluster_pod) + if not gluster_vol_list: + raise AssertionError("failed to get gluster volume list") + + _waiter = waiter.Waiter(timeout=timeout, interval=wait_step) + for gluster_vol in gluster_vol_list: + for w in _waiter: + if is_heal_complete(gluster_pod, gluster_vol): + break + + if w.expired: + err_msg = ("reached timeout waiting for all the gluster volumes " + "to reach the 'healed' state.") + g.log.error(err_msg) + raise AssertionError(err_msg) + + +@podcmd.GlustoPod() +def get_brick_pids(gluster_pod, block_hosting_vol, hostname=None): + """gets brick pids from gluster pods + + Args: + hostname (str): hostname on which gluster pod exists + gluster_pod (podcmd | str): gluster pod class object has gluster + pod and ocp master node or gluster + pod name + block_hosting_vol (str): Block hosting volume id + """ + gluster_pod = _get_gluster_pod(gluster_pod, hostname) + + gluster_volume_status = get_volume_status(gluster_pod, block_hosting_vol) + if not gluster_volume_status: + raise AssertionError("failed to get volume status for gluster " + "volume '%s' on pod '%s'" % ( + gluster_pod, block_hosting_vol)) + + gluster_volume_status = gluster_volume_status.get(block_hosting_vol) + assert gluster_volume_status, ("gluster volume %s not present" % ( + block_hosting_vol)) + + pids = {} + for parent_key, parent_val in gluster_volume_status.items(): + for child_key, child_val in parent_val.items(): + if not child_key.startswith("/var"): + continue + + pid = child_val["pid"] + # When a brick is down, the pid of the brick is returned as -1. + # This is an unexpected situation, hence raising an error. 
+ if pid == "-1": + raise AssertionError("Something went wrong brick pid is -1") + + pids[parent_key] = pid + + return pids + + +@podcmd.GlustoPod() +def restart_brick_process(hostname, gluster_pod, block_hosting_vol): + """restarts brick process of block hosting volumes + + Args: + hostname (str): hostname on which gluster pod exists + gluster_pod (podcmd | str): gluster pod class object has gluster + pod and ocp master node or gluster + pod name + block_hosting_vol (str): block hosting volume name + """ + pids = get_brick_pids(gluster_pod, block_hosting_vol, hostname) + + # using count variable to limit the max pod process kill to 2 + count = 0 + killed_process = {} + pid_keys = pids.keys() + oc_pods = oc_get_pods(hostname) + for pod in oc_pods.keys(): + if not (oc_pods[pod]["ip"] in pid_keys and count <= 1): + continue + + ret, out, err = oc_rsh( + hostname, pod, "kill -9 %s" % pids[oc_pods[pod]["ip"]] + ) + if ret != 0: + err_msg = "failed to kill process id %s error: %s" % ( + pids[oc_pods[pod]["ip"]], err) + g.log.error(err_msg) + raise AssertionError(err_msg) + + killed_process[pod] = pids[oc_pods[pod]["ip"]] + count += 1 + + for pod, pid in killed_process.items(): + wait_for_process_to_kill_on_pod(pod, pid, hostname) + + ret, out, err = volume_start(gluster_pod, block_hosting_vol, force=True) + if ret != 0: + err_msg = "failed to start gluster volume %s on pod %s error: %s" % ( + block_hosting_vol, gluster_pod, err) + g.log.error(err_msg) + raise AssertionError(err_msg) + + +@podcmd.GlustoPod() +def restart_block_hosting_volume( + gluster_pod, block_hosting_vol, sleep_time=120, hostname=None): + """restars block hosting volume service + + Args: + hostname (str): hostname on which gluster pod exists + gluster_pod (podcmd | str): gluster pod class object has gluster + pod and ocp master node or gluster + pod name + block_hosting_vol (str): name of block hosting volume + """ + gluster_pod = _get_gluster_pod(gluster_pod, hostname) + + gluster_volume_status = 
get_volume_status(gluster_pod, block_hosting_vol) + if not gluster_volume_status: + raise AssertionError("failed to get gluster volume status") + + g.log.info("Gluster volume %s status\n%s : " % ( + block_hosting_vol, gluster_volume_status) + ) + + ret, out, err = volume_stop(gluster_pod, block_hosting_vol) + if ret != 0: + err_msg = "failed to stop gluster volume %s on pod %s error: %s" % ( + block_hosting_vol, gluster_pod, err) + g.log.error(err_msg) + raise AssertionError(err_msg) + + # Explicit wait to stop ios and pvc creation for 2 mins + time.sleep(sleep_time) + ret, out, err = volume_start(gluster_pod, block_hosting_vol, force=True) + if ret != 0: + err_msg = "failed to start gluster volume %s on pod %s error: %s" % ( + block_hosting_vol, gluster_pod, err) + g.log.error(err_msg) + raise AssertionError(err_msg) + + ret, out, err = volume_status(gluster_pod, block_hosting_vol) + if ret != 0: + err_msg = ("failed to get status for gluster volume %s on pod %s " + "error: %s" % (block_hosting_vol, gluster_pod, err)) + g.log.error(err_msg) + raise AssertionError(err_msg) + + +@podcmd.GlustoPod() +def match_heketi_and_gluster_block_volumes_by_prefix( + gluster_pod, heketi_block_volumes, block_vol_prefix, hostname=None): + """Match block volumes from heketi and gluster. 
This function can't + be used for block volumes with custom prefixes + + Args: + gluster_pod (podcmd | str): gluster pod class object has gluster + pod and ocp master node or gluster + pod name + heketi_block_volumes (list): list of heketi block volumes with + which gluster block volumes need to + be matched + block_vol_prefix (str): block volume prefix by which the block + volumes needs to be filtered + hostname (str): ocp master node on which oc command gets executed + + """ + gluster_pod = _get_gluster_pod(gluster_pod, hostname) + + gluster_vol_list = get_volume_list(gluster_pod) + + gluster_vol_block_list = [] + for gluster_vol in gluster_vol_list[1:]: + ret, out, err = block_list(gluster_pod, gluster_vol) + try: + if ret != 0 and json.loads(out)["RESULT"] == "FAIL": + msg = "failed to get block volume list with error: %s" % err + g.log.error(msg) + raise AssertionError(msg) + except Exception as e: + g.log.error(e) + raise + + gluster_vol_block_list.extend([ + block_vol.replace(block_vol_prefix, "") + for block_vol in json.loads(out)["blocks"] + if block_vol.startswith(block_vol_prefix) + ]) + + if cmp(sorted(gluster_vol_block_list), heketi_block_volumes) != 0: + err_msg = "Gluster and Heketi Block volume list match failed" + err_msg += "\nGluster Volumes: %s, " % gluster_vol_block_list + err_msg += "\nBlock volumes %s" % heketi_block_volumes + err_msg += "\nDifference: %s" % (set(gluster_vol_block_list) ^ + set(heketi_block_volumes)) + raise AssertionError(err_msg) + + +@podcmd.GlustoPod() +def get_block_hosting_volume_name(heketi_client_node, heketi_server_url, + block_volume, gluster_pod, hostname=None): + """Returns block hosting volume name of given block volume + + Args: + heketi_client_node (str): Node on which cmd has to be executed. 
+ heketi_server_url (str): Heketi server url + block_volume (str): Block volume of which block hosting volume + returned + gluster_pod (podcmd | str): Gluster pod class object has gluster + pod and ocp master node or gluster + pod name + hostname (str): OCP master node on which ocp commands get executed + + Returns: + str : Name of the block hosting volume for given block volume + """ + gluster_pod = _get_gluster_pod(gluster_pod, hostname) + + block_vol_info = heketi_blockvolume_info( + heketi_client_node, heketi_server_url, block_volume + ) + + for line in block_vol_info.splitlines(): + block_hosting_vol_match = re.search( + "^Block Hosting Volume: (.*)$", line + ) + + if not block_hosting_vol_match: + continue + + gluster_vol_list = get_volume_list(gluster_pod) + for vol in gluster_vol_list: + if block_hosting_vol_match.group(1).strip() in vol: + return vol diff --git a/cns-libs/cnslibs/common/heketi_ops.py b/cns-libs/cnslibs/common/heketi_ops.py index 534017ff..12910492 100644 --- a/cns-libs/cnslibs/common/heketi_ops.py +++ b/cns-libs/cnslibs/common/heketi_ops.py @@ -3,20 +3,18 @@ """ import json -import six from glusto.core import Glusto as g -from glustolibs.gluster.block_ops import block_list -from glustolibs.gluster.volume_ops import get_volume_list from collections import OrderedDict try: from heketi import HeketiClient except ImportError: g.log.error("Please install python-client for heketi and re-run the test") -from cnslibs.common import exceptions, podcmd +from cnslibs.common import exceptions from cnslibs.common.utils import parse_prometheus_data + HEKETI_SSH_KEY = "/etc/heketi/heketi_key" HEKETI_CONFIG_FILE = "/etc/heketi/heketi.json" @@ -2351,46 +2349,6 @@ def rm_arbiter_tag(heketi_client_node, heketi_server_url, source, source_id, source, source_id, 'arbiter', **kwargs) -@podcmd.GlustoPod() -def match_heketi_and_gluster_block_volumes( - gluster_pod, heketi_block_volumes, block_vol_prefix, hostname=None): - """Match block volumes from heketi and 
gluster - - Args: - gluster_pod (podcmd | str): gluster pod class object has gluster - pod and ocp master node or gluster - pod name - heketi_block_volumes (list): list of heketi block volumes with - which gluster block volumes need to - be matched - block_vol_prefix (str): block volume prefix by which the block - volumes needs to be filtered - hostname (str): master node on which gluster pod exists - - """ - if isinstance(gluster_pod, podcmd.Pod): - g.log.info("Recieved gluster pod object using same") - elif isinstance(gluster_pod, six.string_types) and hostname: - g.log.info("Recieved gluster pod name and hostname") - gluster_pod = podcmd.Pod(hostname, gluster_pod) - else: - raise exceptions.ExecutionError("Invalid glsuter pod parameter") - - gluster_vol_list = get_volume_list(gluster_pod) - - gluster_vol_block_list = [] - for gluster_vol in gluster_vol_list[1:]: - ret, out, err = block_list(gluster_pod, gluster_vol) - gluster_vol_block_list.extend([ - block_vol.replace(block_vol_prefix, "") - for block_vol in json.loads(out)["blocks"] - if block_vol.startswith(block_vol_prefix) - ]) - - assert sorted(gluster_vol_block_list) == heketi_block_volumes, ( - "Gluster and Heketi Block volume list match failed") - - def get_heketi_metrics(heketi_client_node, heketi_server_url, prometheus_format=False): ''' Execute curl command to get metrics output diff --git a/cns-libs/cnslibs/common/openshift_ops.py b/cns-libs/cnslibs/common/openshift_ops.py index 7e000bc7..3a6f38b3 100644 --- a/cns-libs/cnslibs/common/openshift_ops.py +++ b/cns-libs/cnslibs/common/openshift_ops.py @@ -1422,7 +1422,12 @@ def match_pvc_and_pv(hostname, prefix): if pv[0].startswith(prefix) ]) - assert pvc_list == pv_list, "PVC and PV list match failed" + if cmp(pvc_list, pv_list) != 0: + err_msg = "PVC and PV list match failed" + err_msg += "\nPVC list: %s, " % pvc_list + err_msg += "\nPV list %s" % pv_list + err_msg += "\nDifference: %s" % (set(pvc_list) ^ set(pv_list)) + raise AssertionError(err_msg) 
def match_pv_and_heketi_block_volumes( @@ -1446,8 +1451,13 @@ def match_pv_and_heketi_block_volumes( if pv[0].startswith(pvc_prefix) and pv[1] == "gluster.org/glusterblock" ]) - assert pv_block_volumes == heketi_block_volumes, ( - "PV and Heketi Block list match failed") + if cmp(pv_block_volumes, heketi_block_volumes) != 0: + err_msg = "PV block volumes and Heketi Block volume list match failed" + err_msg += "\nPV Block Volumes: %s, " % pv_block_volumes + err_msg += "\nHeketi Block volumes %s" % heketi_block_volumes + err_msg += "\nDifference: %s" % (set(pv_block_volumes) ^ + set(heketi_block_volumes)) + raise AssertionError(err_msg) def check_service_status( @@ -1502,3 +1512,34 @@ def restart_service_on_pod(hostname, podname, service): (service, podname)) g.log.error(err_msg) raise AssertionError(err_msg) + + +def wait_for_process_to_kill_on_pod( + pod, pid, hostname, timeout=60, interval=3): + """check for process presence if process is present for more than + timeout sec raise exception + + Args: + pid (int | str): process id to be killed on pod + pod (str): pod name on which process id to be killed + hostname (str): hostname on which pod is present + """ + killed_pid_cmd = "ps -eaf | grep %s | grep -v grep | awk '{print $2}'" + _waiter = waiter.Waiter(timeout=60, interval=3) + for w in _waiter: + ret, out, err = oc_rsh(hostname, pod, killed_pid_cmd % pid) + if ret != 0: + err_msg = ("failed to get killed process id '%s' details " + "from pod '%s' err: %s" % (pid, pod, err)) + g.log.error(err_msg) + raise AssertionError(err_msg) + + if not out.strip() == pid: + g.log.info("brick process '%s' killed on pod '%s'" % (pid, pod)) + break + + if w.expired: + error_msg = ("process id '%s' still exists on pod '%s' after waiting " + "for it '%s' seconds to get kill" % (pid, pod, timeout)) + g.log.error(error_msg) + raise exceptions.ExecutionError(error_msg) diff --git a/tests/functional/common/gluster_stability/test_gluster_services_restart.py 
b/tests/functional/common/gluster_stability/test_gluster_services_restart.py index 0a5d4e5e..82511900 100644 --- a/tests/functional/common/gluster_stability/test_gluster_services_restart.py +++ b/tests/functional/common/gluster_stability/test_gluster_services_restart.py @@ -1,13 +1,15 @@ +from unittest import skip import ddt import re +import time -from cnslibs.common.heketi_ops import ( - heketi_blockvolume_list, - match_heketi_and_gluster_block_volumes -) +from datetime import datetime +from glusto.core import Glusto as g +from cnslibs.common.heketi_ops import heketi_blockvolume_list from cnslibs.common.openshift_ops import ( check_service_status, + oc_get_custom_resource, get_ocp_gluster_pod_names, get_pod_name_from_dc, match_pv_and_heketi_block_volumes, @@ -18,12 +20,20 @@ from cnslibs.common.openshift_ops import ( oc_create_secret, oc_delete, oc_get_yaml, + oc_rsh, restart_service_on_pod, scale_dc_pod_amount_and_wait, verify_pvc_status_is_bound, wait_for_pod_be_ready, wait_for_resource_absence ) +from cnslibs.common.gluster_ops import ( + get_block_hosting_volume_name, + match_heketi_and_gluster_block_volumes_by_prefix, + restart_block_hosting_volume, + restart_brick_process, + wait_to_heal_complete +) from cnslibs.cns.cns_baseclass import CnsBaseClass from cnslibs.common import podcmd @@ -46,6 +56,7 @@ class GlusterStabilityTestSetup(CnsBaseClass): """ self.oc_node = self.ocp_master_node[0] self.gluster_pod = get_ocp_gluster_pod_names(self.oc_node)[0] + self.gluster_pod_obj = podcmd.Pod(self.oc_node, self.gluster_pod) # prefix used to create resources, generating using glusto_test_id # which uses time and date of test case @@ -140,6 +151,31 @@ class GlusterStabilityTestSetup(CnsBaseClass): return sc_name, pvc_name, dc_name, secretname + def get_block_hosting_volume_by_pvc_name(self, pvc_name): + """Get block hosting volume of pvc name given + + Args: + pvc_name (str): pvc name of which host name is need + to be returned + """ + pv_name = 
oc_get_custom_resource( + self.oc_node, 'pvc', ':.spec.volumeName', name=pvc_name + )[0] + + block_volume = oc_get_custom_resource( + self.oc_node, 'pv', + r':.metadata.annotations."gluster\.org\/volume\-id"', + name=pv_name + )[0] + + # get block hosting volume from pvc name + block_hosting_vol = get_block_hosting_volume_name( + self.heketi_client_node, self.heketi_server_url, + block_volume, self.gluster_pod, self.oc_node + ) + + return block_hosting_vol + def get_heketi_block_volumes(self): """lists heketi block volumes @@ -197,11 +233,52 @@ class GlusterStabilityTestSetup(CnsBaseClass): ) # validate block volumes listed by heketi and gluster - gluster_pod_obj = podcmd.Pod(self.heketi_client_node, self.gluster_pod) - match_heketi_and_gluster_block_volumes( - gluster_pod_obj, heketi_block_volume_names, "%s_" % self.prefix + match_heketi_and_gluster_block_volumes_by_prefix( + self.gluster_pod_obj, heketi_block_volume_names, + "%s_" % self.prefix ) + def get_io_time(self): + """Gets last io time of io pod by listing log file directory + /mnt on pod + """ + ret, stdout, stderr = oc_rsh( + self.oc_node, self.pod_name, "ls -l /mnt/ | awk '{print $8}'" + ) + if ret != 0: + err_msg = "failed to get io time for pod %s" % self.pod_name + g.log.error(err_msg) + raise AssertionError(err_msg) + + get_time = None + try: + get_time = datetime.strptime(stdout.strip(), "%H:%M") + except Exception: + g.log.error("invalid time format ret %s, stout: %s, " + "stderr: %s" % (ret, stdout, stderr)) + raise + + return get_time + + def restart_block_hosting_volume_wait_for_heal(self, block_hosting_vol): + """restarts block hosting volume and wait for heal to complete + + Args: + block_hosting_vol (str): block hosting volume which need to + restart + """ + start_io_time = self.get_io_time() + + restart_block_hosting_volume(self.gluster_pod_obj, block_hosting_vol) + + # Explicit wait to start ios on pvc after volume start + time.sleep(5) + resume_io_time = self.get_io_time() + + 
self.assertGreater(resume_io_time, start_io_time, "IO has not stopped") + + wait_to_heal_complete(self.gluster_pod_obj) + @ddt.data(SERVICE_BLOCKD, SERVICE_TCMU, SERVICE_TARGET) def test_restart_services_provision_volume_and_run_io(self, service): """[CNS-1393-1395] Restart gluster service then validate volumes @@ -228,3 +305,49 @@ class GlusterStabilityTestSetup(CnsBaseClass): # validates pvc, pv, heketi block and gluster block count after # service restarts self.validate_volumes_and_blocks() + + @skip("Blocked by BZ-1634745, BZ-1635736, BZ-1636477") + def test_target_side_failures_brick_failure_on_block_hosting_volume(self): + """[CNS-1285] Target side failures - Brick failure on block + hosting volume + """ + # get block hosting volume from pvc name + block_hosting_vol = self.get_block_hosting_volume_by_pvc_name( + self.pvc_name + ) + + # restarts brick 2 process of block hosting volume + restart_brick_process( + self.oc_node, self.gluster_pod_obj, block_hosting_vol + ) + + # checks if all glusterfs services are in running state + for service in (SERVICE_BLOCKD, SERVICE_TCMU, SERVICE_TARGET): + status = "exited" if service == SERVICE_TARGET else "running" + self.assertTrue( + check_service_status( + self.oc_node, self.gluster_pod, service, status + ), + "service %s is not in %s state" % (service, status) + ) + + # validates pvc, pv, heketi block and gluster block count after + # service restarts + self.validate_volumes_and_blocks() + + @skip("Blocked by BZ-1634745, BZ-1635736, BZ-1636477") + def test_start_stop_block_volume_service(self): + """[CNS-1314] Block hosting volume - stop/start block hosting + volume when IO's and provisioning are going on + """ + # get block hosting volume from pvc name + block_hosting_vol = self.get_block_hosting_volume_by_pvc_name( + self.pvc_name + ) + + # restarts one of the block hosting volume and checks heal + self.restart_block_hosting_volume_wait_for_heal(block_hosting_vol) + + # validates pvc, pv, heketi block and gluster 
block count after + # service restarts + self.validate_volumes_and_blocks() |
