"""Tests that shard mode changes and shard read errors are reflected in storage node metrics."""
# Source path: pytest_tests/testsuites/metrics/test_shard_metrics.py

import random
import re
from collections.abc import Iterator

import allure
import pytest
from frostfs_testlib import reporter
from frostfs_testlib.resources.wellknown_acl import EACL_PUBLIC_READ_WRITE
from frostfs_testlib.shell import Shell
from frostfs_testlib.steps.cli.container import create_container, delete_container
from frostfs_testlib.steps.cli.object import get_object, get_object_nodes, put_object
from frostfs_testlib.steps.node_management import node_shard_list, node_shard_set_mode
from frostfs_testlib.storage.cluster import Cluster, ClusterNode
from frostfs_testlib.storage.controllers import ShardsWatcher
from frostfs_testlib.storage.dataclasses.wallet import WalletInfo
from frostfs_testlib.testing import parallel, wait_for_success
from frostfs_testlib.testing.cluster_test_base import ClusterTestBase
from frostfs_testlib.utils.file_utils import generate_file


class TestShardMetrics(ClusterTestBase):
    """Metric checks for shard mode (`frostfs_node_engine_mode_info`) and shard errors
    (`frostfs_node_engine_errors_total`)."""

    @pytest.fixture()
    @allure.title("Get two shards for set mode")
    def two_shards_and_node(self, cluster: Cluster) -> Iterator[tuple[str, str, ClusterNode]]:
        """Yield two randomly chosen shards of a randomly chosen node.

        After the test, both shards are switched back to "read-write" and the
        shard list is queried once more.
        """
        node = random.choice(cluster.cluster_nodes)
        shards = node_shard_list(node.storage_node)
        two_shards = random.sample(shards, k=2)

        yield two_shards[0], two_shards[1], node

        for shard in two_shards:
            node_shard_set_mode(node.storage_node, shard, "read-write")

        node_shard_list(node.storage_node)

    @pytest.fixture()
    @allure.title("Revert all shards mode")
    def revert_all_shards_mode(self):
        """After the test, switch every shard on every cluster node back to "read-write"."""
        yield
        parallel(self.set_shard_rw_mode, self.cluster.cluster_nodes)

    def set_shard_rw_mode(self, node: ClusterNode):
        """Set all shards of `node` to "read-write" and wait until they report that status."""
        watcher = ShardsWatcher(node)
        shards = watcher.get_shards()
        for shard in shards:
            watcher.set_shard_mode(shard["shard_id"], mode="read-write")
        watcher.await_for_all_shards_status(status="read-write")

    @wait_for_success(interval=10)
    def check_metrics_in_node(self, cluster_node: ClusterNode, counter_exp: int, **metrics_greps: str):
        """Assert that the summed value of the metrics matching `metrics_greps` equals `counter_exp`.

        Args:
            cluster_node: node whose storage metrics are queried.
            counter_exp: expected sum of the matched metric values.
            **metrics_greps: filters passed through to `get_metrics_search_by_greps`.

        Raises:
            AssertionError: if the actual sum differs from `counter_exp`.
        """
        try:
            metric_result = cluster_node.metrics.storage.get_metrics_search_by_greps(**metrics_greps)
        except RuntimeError:
            # Deliberate best-effort: a failed metrics query counts as zero, so the
            # assertion below decides (and wait_for_success presumably retries on failure).
            counter_act = 0
        else:
            counter_act = self.calc_metrics_count_from_stdout(metric_result.stdout)
        assert counter_act == counter_exp, f"Expected: {counter_exp}, Actual: {counter_act} in node: {cluster_node}"

    @staticmethod
    def calc_metrics_count_from_stdout(metric_result_stdout: str) -> int:
        """Return the sum of all integer values that follow a closing `}` and one whitespace character."""
        result = re.findall(r"}\s(\d+)", metric_result_stdout)
        return sum(map(int, result))

    @staticmethod
    def get_error_count_from_logs(shell: Shell, object_path: str, object_name: str) -> int:
        """Sum the "error count" values from frostfs-storage journal lines that mention the object.

        Only lines containing both `object_path` and `object_name` are counted.
        Returns 0 if the journalctl command fails.
        """
        try:
            logs = shell.exec("journalctl -u frostfs-storage --grep='error count' --no-pager")
        except RuntimeError:
            # Deliberate best-effort: a failed journal query is treated as "no errors found".
            return 0

        error_count = 0
        # search error logs for current object
        for error_line in logs.stdout.split("\n"):
            if object_path in error_line and object_name in error_line:
                result = re.findall(r'"error\scount":\s(\d+)', error_line)
                error_count += sum(map(int, result))

        return error_count

    @staticmethod
    @wait_for_success(180, 30)
    def get_object_path_and_name_file(oid: str, cid: str, node: ClusterNode) -> tuple[str, str]:
        """Locate the directory and file name of an object under the node's data directory.

        Each entry of `<data_dir>/data` is probed for the sub-directory
        `data/<oid[0]>/<oid[1]>/<oid[2]>/<oid[3]>`; the first hit wins.

        Returns:
            Tuple of (directory containing the object file, object file name `<oid[4:]>.<cid>`).

        Raises:
            AssertionError: if no entry contains the expected directory.
        """
        oid_path = f"{oid[0]}/{oid[1]}/{oid[2]}/{oid[3]}"
        object_path = None
        object_name = None

        with reporter.step("Search object file"):
            node_shell = node.storage_node.host.get_shell()
            data_path = node.storage_node.get_data_directory()
            all_datas = node_shell.exec(f"ls -la {data_path}/data | awk '{{ print $9 }}'").stdout.strip()
            for data_dir in all_datas.split("\n"):
                data_dir = data_dir.strip()
                # `ls -la` also yields a blank name (the "total" line), "." and "..";
                # skip those instead of stripping dots out of real directory names.
                if data_dir in ("", ".", ".."):
                    continue
                check_dir = node_shell.exec(
                    f" [ -d {data_path}/data/{data_dir}/data/{oid_path} ] && echo 1 || echo 0"
                ).stdout
                if "1" in check_dir:
                    object_path = f"{data_path}/data/{data_dir}/data/{oid_path}"
                    object_name = f"{oid[4:]}.{cid}"
                    break

            assert object_path is not None, f"{oid} object not found in directory - {data_path}/data"
            return object_path, object_name

    @allure.title("Metric for shard mode")
    def test_shard_metrics_set_mode(self, two_shards_and_node: tuple[str, str, ClusterNode]):
        """Switch two shards through several modes and check the mode metric after each change."""
        metrics_counter = 1
        shard1, shard2, node = two_shards_and_node

        with reporter.step("Shard1 set to mode 'read-only'"):
            node_shard_set_mode(node.storage_node, shard1, "read-only")

        with reporter.step("Check shard metrics, 'the mode will change to 'READ_ONLY'"):
            self.check_metrics_in_node(
                node, metrics_counter, command="frostfs_node_engine_mode_info", mode="READ_ONLY", shard_id=shard1
            )

        with reporter.step("Shard2 set to mode 'degraded-read-only'"):
            node_shard_set_mode(node.storage_node, shard2, "degraded-read-only")

        with reporter.step("Check shard metrics, 'the mode will change to 'DEGRADED_READ_ONLY'"):
            self.check_metrics_in_node(
                node,
                metrics_counter,
                command="frostfs_node_engine_mode_info",
                mode="DEGRADED_READ_ONLY",
                shard_id=shard2,
            )

        with reporter.step("Both shards set to mode 'read-write'"):
            for shard in [shard1, shard2]:
                node_shard_set_mode(node.storage_node, shard, "read-write")

        with reporter.step("Check shard metrics, 'the mode will change to 'READ_WRITE'"):
            # Verify each shard explicitly; relying on the leftover loop variable
            # would only ever check the last shard.
            for shard in [shard1, shard2]:
                self.check_metrics_in_node(
                    node, metrics_counter, command="frostfs_node_engine_mode_info", mode="READ_WRITE", shard_id=shard
                )

    @allure.title("Metric for error count on shard")
    def test_shard_metrics_error_count(
        self, max_object_size: int, default_wallet: WalletInfo, cluster: Cluster, revert_all_shards_mode
    ):
        """Make an object file unreadable, trigger a failing GET and compare the shard error
        metric with the error count found in the storage service logs."""
        file_path = generate_file(round(max_object_size * 0.8))

        with reporter.step("Create container"):
            cid = create_container(
                wallet=default_wallet,
                shell=self.shell,
                endpoint=self.cluster.default_rpc_endpoint,
                rule="REP 1 CBF 1",
                basic_acl=EACL_PUBLIC_READ_WRITE,
            )

        try:
            with reporter.step("Put object"):
                oid = put_object(default_wallet, file_path, cid, self.shell, self.cluster.default_rpc_endpoint)

            with reporter.step("Get object nodes"):
                object_nodes = get_object_nodes(cluster, cid, oid, cluster.cluster_nodes[0])
                node = object_nodes[0]

            with reporter.step("Search object in system."):
                object_path, object_name = self.get_object_path_and_name_file(oid, cid, node)

            with reporter.step("Block read file"):
                node.host.get_shell().exec(f"chmod a-r {object_path}/{object_name}")

            with reporter.step("Get object, expect error"):
                with pytest.raises(RuntimeError):
                    get_object(
                        wallet=default_wallet,
                        cid=cid,
                        oid=oid,
                        shell=self.shell,
                        endpoint=node.storage_node.get_rpc_endpoint(),
                    )

            with reporter.step("Get shard error count from logs"):
                counter = self.get_error_count_from_logs(node.host.get_shell(), object_path, object_name)

            with reporter.step("Check shard error metrics"):
                self.check_metrics_in_node(node, counter, command="frostfs_node_engine_errors_total")
        finally:
            # Always remove the container so a failed step does not leak it.
            with reporter.step("Delete container"):
                delete_container(default_wallet, cid, self.shell, cluster.default_rpc_endpoint)