try:
    # Prefer simplejson when it is available
    import simplejson as json
except ImportError:
    # Fall back to the stdlib json module
    import json
import time

import ddt
from glusto.core import Glusto as g
from glustolibs.gluster import rebalance_ops
import pytest

from openshiftstoragelibs import baseclass
from openshiftstoragelibs import exceptions
from openshiftstoragelibs import heketi_ops
from openshiftstoragelibs import openshift_ops
from openshiftstoragelibs import podcmd
from openshiftstoragelibs import waiter


@ddt.ddt
class TestPrometheusValidationFile(baseclass.BaseClass):
    """Prometheus Validations for file volumes"""

    @classmethod
    def setUpClass(cls):
        super(TestPrometheusValidationFile, cls).setUpClass()

        # Metrics whose data will be retrieved in this class
        cls.metrics = ('kubelet_volume_stats_inodes_free',
                       'kubelet_volume_stats_inodes',
                       'kubelet_volume_stats_inodes_used',
                       'kubelet_volume_stats_available_bytes',
                       'kubelet_volume_stats_capacity_bytes',
                       'kubelet_volume_stats_used_bytes')

    def setUp(self):
        """Initialize all the variables which are necessary for test cases"""
        super(TestPrometheusValidationFile, self).setUp()

        try:
            prometheus_config = g.config['openshift']['prometheus']
            self._prometheus_project_name = prometheus_config[
                'prometheus_project_name']
            self._prometheus_resources_selector = prometheus_config[
                'prometheus_resources_selector']
            self._alertmanager_resources_selector = prometheus_config[
                'alertmanager_resources_selector']
        except KeyError as err:
            self.skipTest("Config file doesn't have key {}".format(err))

        self._master = self.ocp_master_node[0]

    def _fetch_metric_from_prometheus_pod(self, metric):
        """Fetch metric from prometheus pod using api call"""

        prometheus_pods = list(openshift_ops.oc_get_pods(
            self._master, selector=self._prometheus_resources_selector).keys())
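        # Query the Prometheus instant-query HTTP API from inside the pod;
        # the JSON response has the form
        # {"status": "success", "data": {"result": [
        #     {"metric": {<labels>}, "value": [<timestamp>, "<value>"]}]}}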
        fetch_metric_cmd = ("curl 'http://localhost:9090/api/v1/query"
                            "?query={}'".format(metric))
        ret, metric_data, _ = openshift_ops.oc_rsh(
            self._master, prometheus_pods[0], fetch_metric_cmd)
        if ret:
            raise exceptions.ExecutionError(
                "Failed to fetch data for metric {}, output {}".format(
                    metric, metric_data))
        metric_result = json.loads(metric_data)["data"]["result"]
        if not metric_result:
            raise exceptions.ExecutionError(
                "Got empty result for metric {}, output {}".format(
                    metric, metric_data))
        return metric_result

    def _get_and_manipulate_metric_data(self, metrics, pvc):
        """Create a dict of metric names and total values"""

        # Switch to namespace containing prometheus pods
        openshift_ops.switch_oc_project(self._master,
                                        self._prometheus_project_name)
        self.addCleanup(openshift_ops.switch_oc_project,
                        self._master, self.storage_project_name)

        metric_data = dict()
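        # Keep only the samples whose 'persistentvolumeclaim' label matches
        # the given PVC; index 1 of 'value' holds the sample value string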
        for metric in metrics:
            out = self._fetch_metric_from_prometheus_pod(metric)
            for metric_result in out:
                if metric_result["metric"]["persistentvolumeclaim"] == pvc:
                    metric_data[metric_result["metric"][
                        "__name__"]] = metric_result["value"][1]
        return metric_data

    def _fetch_initial_metrics(self, vol_name_prefix=None,
                               volume_expansion=False):
        """Create a PVC and app pod and fetch the initial volume metrics"""
        # Create PVC and wait for it to be in 'Bound' state
        sc_name = self.create_storage_class(
            vol_name_prefix=vol_name_prefix,
            allow_volume_expansion=volume_expansion)
        pvc_name = self.create_and_wait_for_pvc(
            pvc_name_prefix=vol_name_prefix, sc_name=sc_name)

        # Create a DC with the PVC attached and get the app pod name
        self.dc_name, pod_name = self.create_dc_with_pvc(pvc_name)
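        # Poll (for up to 120s, every 10s) until kubelet exposes all of the
        # volume metrics for the new PVC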
        for w in waiter.Waiter(120, 10):
            initial_metrics = self._get_and_manipulate_metric_data(
                self.metrics, pvc_name)
            if len(initial_metrics) == len(self.metrics):
                break
        if w.expired:
            raise AssertionError("Unable to fetch metrics for the pvc")
        return pvc_name, pod_name, initial_metrics

    def _perform_io_and_fetch_metrics(
            self, pod_name, pvc_name, filename, dirname,
            metric_data, operation):
        """Create 1000 files and dirs and validate with old metrics"""
        openshift_ops.switch_oc_project(
            self._master, self.storage_project_name)
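        # {1..1000} relies on shell brace expansion to create (or remove)
        # 1000 files and 1000 directories under the /mnt mount point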
        if operation == "create":
            cmds = ("touch /mnt/{}{{1..1000}}".format(filename),
                    "mkdir /mnt/{}{{1..1000}}".format(dirname))
        else:
            cmds = ("rm -rf /mnt/large_file",
                    "rm -rf /mnt/{}{{1..1000}}".format(filename),
                    "rm -rf /mnt/{}{{1..1000}}".format(dirname))
        for cmd in cmds:
            self.cmd_run("oc rsh {} {}".format(pod_name, cmd))

        # Fetch the new metrics and compare the inodes used and bytes used
        for w in waiter.Waiter(120, 10):
            after_io_metrics = self._get_and_manipulate_metric_data(
                self.metrics, pvc_name)
            inodes_used = 'kubelet_volume_stats_inodes_used'
            bytes_used = 'kubelet_volume_stats_used_bytes'
            if operation == "create":
                if (int(after_io_metrics[inodes_used])
                        > int(metric_data[inodes_used])
                        and int(after_io_metrics[bytes_used])
                        > int(metric_data[bytes_used])):
                    break
            else:
                if (int(metric_data[bytes_used])
                        > int(after_io_metrics[bytes_used])):
                    break
        if w.expired:
            raise AssertionError(
                "Changes in bytes used and inodes used are not reflected "
                "in prometheus after the data was modified")

    def _run_io_on_the_pod(self, pod_name, number_of_files):
        """Create the given number of files on the pod's mounted volume"""
        for each in range(number_of_files):
            cmd = "touch /mnt/file{}".format(each)
            ret, _, err = openshift_ops.oc_rsh(self._master, pod_name, cmd)
            self.assertFalse(ret, "Failed to run the IO with error msg {}".
                             format(err))

    @podcmd.GlustoPod()
    def _rebalance_completion(self, volume_name):
        """Rebalance start and completion after expansion."""
        ret, _, err = rebalance_ops.rebalance_start(
            'auto_get_gluster_endpoint', volume_name)
        self.assertFalse(
            ret, "Rebalance for {} volume not started with error {}".format(
                volume_name, err))

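        # Poll rebalance status until the aggregate status is 'completed'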
        for w in waiter.Waiter(240, 10):
            reb_status = rebalance_ops.get_rebalance_status(
                'auto_get_gluster_endpoint', volume_name)
            if reb_status["aggregate"]["statusStr"] == "completed":
                break
        if w.expired:
            raise AssertionError(
                "Failed to complete the rebalance in 240 seconds")

    @pytest.mark.tier2
    def test_prometheus_volume_metrics_on_pod_restart(self):
        """Validate volume metrics using prometheus before and after pod
        restart"""

        # Create PVC and wait for it to be in 'Bound' state
        pvc_name = self.create_and_wait_for_pvc()
        pod_name = openshift_ops.oc_create_tiny_pod_with_volume(
            self._master, pvc_name, "autotest-volume",
            image=self.io_container_image_cirros)
        self.addCleanup(openshift_ops.oc_delete, self._master, 'pod', pod_name,
                        raise_on_absence=False)

        # Wait for the pod to be up and running
        openshift_ops.wait_for_pod_be_ready(
            self._master, pod_name, timeout=60, wait_step=2)

        # Write data on the volume, then sleep for 2 minutes; the sleep is
        # needed for prometheus to scrape the updated metric values
        self._run_io_on_the_pod(pod_name, 30)
        time.sleep(120)

        # Fetch the metrics and store them in the initial_metrics dict
        initial_metrics = self._get_and_manipulate_metric_data(
            self.metrics, pvc_name)

        # Mark the node on which the app pod is running as unschedulable
        openshift_ops.switch_oc_project(
            self._master, self.storage_project_name)
        pod_info = openshift_ops.oc_get_pods(self._master, name=pod_name)
        openshift_ops.oc_adm_manage_node(
            self._master, '--schedulable=false',
            nodes=[pod_info[pod_name]["node"]])
        self.addCleanup(
            openshift_ops.oc_adm_manage_node, self._master,
            '--schedulable=true', nodes=[pod_info[pod_name]["node"]])

        # Delete the existing pod and create a new pod
        openshift_ops.oc_delete(self._master, 'pod', pod_name)
        pod_name = openshift_ops.oc_create_tiny_pod_with_volume(
            self._master, pvc_name, "autotest-volume")
        self.addCleanup(openshift_ops.oc_delete, self._master, 'pod', pod_name)

        # Wait for the pod to be up and running and for prometheus to
        # refresh its data
        openshift_ops.wait_for_pod_be_ready(
            self._master, pod_name, timeout=60, wait_step=2)
        time.sleep(120)

        # Fetch the metrics into the final_metrics dict and validate them
        # against initial_metrics
        final_metrics = self._get_and_manipulate_metric_data(
            self.metrics, pvc_name)
        self.assertEqual(dict(initial_metrics), dict(final_metrics),
                         "Metrics are different post pod restart")

    @pytest.mark.tier2
    def test_prometheus_basic_validation(self):
        """ Validate basic volume metrics using prometheus """

        # Fetch the metrics and store them in the initial_metrics dict
        pvc_name, pod_name, initial_metrics = self._fetch_initial_metrics(
            volume_expansion=False)

        # Create 1000 files and dirs and verify the metrics get updated
        self._perform_io_and_fetch_metrics(
            pod_name=pod_name, pvc_name=pvc_name,
            filename="filename1", dirname="dirname1",
            metric_data=initial_metrics, operation="create")

        # Write IO amounting to half the size of the volume and validate
        # from the prometheus pod that the size change is reflected
        size_to_write = int(initial_metrics[
            'kubelet_volume_stats_capacity_bytes']) // 2
        openshift_ops.switch_oc_project(
            self._master, self.storage_project_name)
        cmd = ("dd if=/dev/urandom of=/mnt/large_file bs={} count=1024".
               format(size_to_write // 1024))
        ret, _, err = openshift_ops.oc_rsh(self._master, pod_name, cmd)
        self.assertFalse(ret, 'Failed to write file due to err {}'.format(err))

        # Fetch the metrics and validate that the data change is reflected
        for w in waiter.Waiter(120, 10):
            half_io_metrics = self._get_and_manipulate_metric_data(
                ['kubelet_volume_stats_used_bytes'], pvc_name)
            if (half_io_metrics and int(half_io_metrics[
                    'kubelet_volume_stats_used_bytes']) > size_to_write):
                break
        if w.expired:
            raise AssertionError(
                "The increase in bytes used is not reflected in prometheus "
                "after data was written on the pvc")

        # Delete the files from the volume and wait for the
        # updated details reflected in prometheus
        self._perform_io_and_fetch_metrics(
            pod_name=pod_name, pvc_name=pvc_name,
            filename="filename1", dirname="dirname1",
            metric_data=half_io_metrics, operation="delete")

    @pytest.mark.tier2
    def test_prometheus_pv_resize(self):
        """ Validate prometheus metrics with pv resize"""

        # Fetch the metrics and store them in the initial_metrics dict
        pvc_name, pod_name, initial_metrics = self._fetch_initial_metrics(
            vol_name_prefix="for-pv-resize", volume_expansion=True)

        # Write data on the pvc and confirm it is reflected in prometheus
        self._perform_io_and_fetch_metrics(
            pod_name=pod_name, pvc_name=pvc_name,
            filename="filename1", dirname="dirname1",
            metric_data=initial_metrics, operation="create")

        # Resize the pvc to 2GiB
        openshift_ops.switch_oc_project(
            self._master, self.storage_project_name)
        pvc_size = 2
        openshift_ops.resize_pvc(self._master, pvc_name, pvc_size)
        openshift_ops.wait_for_events(self._master, obj_name=pvc_name,
                                      event_reason='VolumeResizeSuccessful')
        openshift_ops.verify_pvc_size(self._master, pvc_name, pvc_size)
        pv_name = openshift_ops.get_pv_name_from_pvc(
            self._master, pvc_name)
        openshift_ops.verify_pv_size(self._master, pv_name, pvc_size)

        # Assert the lookup result before indexing so an empty volume list
        # fails with a clear message instead of an IndexError
        heketi_volumes = heketi_ops.heketi_volume_list_by_name_prefix(
            self.heketi_client_node, self.heketi_server_url,
            "for-pv-resize", json=True)
        self.assertTrue(
            heketi_volumes, "Failed to fetch volume with prefix {}".format(
                "for-pv-resize"))
        heketi_volume_name = heketi_volumes[0][2]

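        # Restart the app pod from the DC so kubelet reports stats for the
        # resized volume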
        openshift_ops.oc_delete(self._master, 'pod', pod_name)
        openshift_ops.wait_for_resource_absence(self._master, 'pod', pod_name)
        pod_name = openshift_ops.get_pod_name_from_dc(
            self._master, self.dc_name)
        openshift_ops.wait_for_pod_be_ready(self._master, pod_name)

        # Check whether the metrics are updated or not
        for w in waiter.Waiter(120, 10):
            resize_metrics = self._get_and_manipulate_metric_data(
                self.metrics, pvc_name)
            capacity = 'kubelet_volume_stats_capacity_bytes'
            if (resize_metrics and int(resize_metrics[capacity])
                    > int(initial_metrics[capacity])):
                break
        if w.expired:
            raise AssertionError(
                "PVC size change is not reflected in metrics after resizing")
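        # Switch back to the storage project; the sleep presumably gives
        # the cluster time to settle after the resize before rebalance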
        openshift_ops.switch_oc_project(
            self._master, self.storage_project_name)
        time.sleep(240)

        # Trigger lookups, then start rebalance and wait for its completion
        for _ in range(100):
            self.cmd_run("oc rsh {} ls /mnt/".format(pod_name))
        self._rebalance_completion(heketi_volume_name)

        # Write data on the resized pvc and compare with resize_metrics
        self._perform_io_and_fetch_metrics(
            pod_name=pod_name, pvc_name=pvc_name,
            filename="secondfilename", dirname="seconddirname",
            metric_data=resize_metrics, operation="create")