[#255] refactore metrics tests

This commit is contained in:
Ilyas Niyazov 2024-06-25 16:23:25 +03:00
parent b8d5706515
commit d5b9dc571e
6 changed files with 345 additions and 372 deletions

View file

@ -1,47 +1,21 @@
import math
import re
import allure
import pytest
from frostfs_testlib import reporter
from frostfs_testlib.steps.cli.container import create_container, delete_container
from frostfs_testlib.steps.cli.object import delete_object, get_object_nodes, put_object_to_random_node
from frostfs_testlib.storage.cluster import Cluster, ClusterNode
from frostfs_testlib.steps.cli.container import create_container
from frostfs_testlib.steps.cli.object import delete_object, put_object_to_random_node
from frostfs_testlib.steps.metrics import check_metrics_counter
from frostfs_testlib.steps.storage_policy import get_nodes_with_object
from frostfs_testlib.storage.cluster import Cluster
from frostfs_testlib.storage.dataclasses.object_size import ObjectSize
from frostfs_testlib.storage.dataclasses.wallet import WalletInfo
from frostfs_testlib.testing.cluster_test_base import ClusterTestBase
from frostfs_testlib.testing.test_control import wait_for_success
from frostfs_testlib.utils.file_utils import generate_file
@pytest.mark.container
class TestContainerMetrics(ClusterTestBase):
@wait_for_success(interval=10)
def check_sum_counter_metrics_in_nodes(
self, cluster_nodes: list[ClusterNode], cid: str, phy_exp: int, logic_exp: int, user_exp: int
):
counter_phy = 0
counter_logic = 0
counter_user = 0
for cluster_node in cluster_nodes:
metric_result = cluster_node.metrics.storage.get_metric_container(f"container_objects_total", cid)
counter_phy += self.get_count_metric_type_from_stdout(metric_result.stdout, "phy")
counter_logic += self.get_count_metric_type_from_stdout(metric_result.stdout, "logic")
counter_user += self.get_count_metric_type_from_stdout(metric_result.stdout, "user")
assert counter_phy == phy_exp, f"Expected metric Phy={phy_exp}, Actual: {counter_phy} in nodes: {cluster_nodes}"
assert (
counter_logic == logic_exp
), f"Expected metric logic={logic_exp}, Actual: {counter_logic} in nodes: {cluster_nodes}"
assert (
counter_user == user_exp
), f"Expected metric User={user_exp}, Actual: {counter_user} in nodes: {cluster_nodes}"
@staticmethod
def get_count_metric_type_from_stdout(metric_result_stdout: str, metric_type: str):
result = re.findall(rf'type="{metric_type}"}}\s(\d+)', metric_result_stdout)
return sum(map(int, result))
@allure.title("Container metrics (obj_size={object_size})")
def test_container_metrics(
self, object_size: ObjectSize, max_object_size: int, default_wallet: WalletInfo, cluster: Cluster
@ -57,15 +31,10 @@ class TestContainerMetrics(ClusterTestBase):
link_object = 1
with reporter.step(f"Create container with policy {placement_policy}"):
cid = create_container(
default_wallet,
rule=placement_policy,
shell=self.shell,
endpoint=self.cluster.default_rpc_endpoint,
)
cid = create_container(default_wallet, self.shell, cluster.default_rpc_endpoint, placement_policy)
with reporter.step("Put object to random node"):
storage_object_id = put_object_to_random_node(
oid = put_object_to_random_node(
wallet=default_wallet,
path=file_path,
cid=cid,
@ -73,25 +42,46 @@ class TestContainerMetrics(ClusterTestBase):
cluster=cluster,
)
with reporter.step("Get object nodes"):
object_storage_nodes = get_nodes_with_object(cid, oid, self.shell, cluster.storage_nodes)
object_nodes = [
cluster_node
for cluster_node in cluster.cluster_nodes
if cluster_node.storage_node in object_storage_nodes
]
with reporter.step("Check metric appears in node where the object is located"):
object_nodes = get_object_nodes(
cluster=cluster, cid=cid, oid=storage_object_id, alive_node=cluster.cluster_nodes[0]
count_metrics = (object_chunks + head_object + link_object) * copies
check_metrics_counter(
object_nodes, counter_exp=count_metrics, command="container_objects_total", cid=cid, type="phy"
)
count_metrics_exp = (object_chunks + head_object + link_object) * copies
self.check_sum_counter_metrics_in_nodes(
object_nodes, cid, phy_exp=count_metrics_exp, logic_exp=count_metrics_exp, user_exp=copies
check_metrics_counter(
object_nodes, counter_exp=count_metrics, command="container_objects_total", cid=cid, type="logic"
)
check_metrics_counter(
object_nodes, counter_exp=copies, command="container_objects_total", cid=cid, type="user"
)
with reporter.step("Delete file, wait until gc remove object"):
delete_object(default_wallet, cid, storage_object_id, self.shell, self.cluster.default_rpc_endpoint)
count_metrics_exp = len(object_nodes)
self.check_sum_counter_metrics_in_nodes(
object_nodes, cid, phy_exp=count_metrics_exp, logic_exp=count_metrics_exp, user_exp=0
delete_object(default_wallet, cid, oid, self.shell, cluster.default_rpc_endpoint)
with reporter.step(f"Check container metrics 'the counter should equal {len(object_nodes)}' in object nodes"):
check_metrics_counter(
object_nodes, counter_exp=len(object_nodes), command="container_objects_total", cid=cid, type="phy"
)
check_metrics_counter(
object_nodes, counter_exp=len(object_nodes), command="container_objects_total", cid=cid, type="logic"
)
check_metrics_counter(object_nodes, counter_exp=0, command="container_objects_total", cid=cid, type="user")
with reporter.step("Check metrics(Phy, Logic, User) in each nodes"):
# Phy and Logic metrics are 4, because in rule 'CBF 2 SELECT 2 FROM', cbf2*sel2=4
self.check_sum_counter_metrics_in_nodes(cluster.cluster_nodes, cid, phy_exp=4, logic_exp=4, user_exp=0)
with reporter.step("Delete container"):
delete_container(default_wallet, cid, shell=self.shell, endpoint=self.cluster.default_rpc_endpoint)
check_metrics_counter(
cluster.cluster_nodes, counter_exp=4, command="container_objects_total", cid=cid, type="phy"
)
check_metrics_counter(
cluster.cluster_nodes, counter_exp=4, command="container_objects_total", cid=cid, type="logic"
)
check_metrics_counter(
cluster.cluster_nodes, counter_exp=0, command="container_objects_total", cid=cid, type="user"
)

View file

@ -2,9 +2,12 @@ import random
import re
import allure
import pytest
from frostfs_testlib import reporter
from frostfs_testlib.steps.cli.container import create_container, delete_container
from frostfs_testlib.steps.cli.object import delete_object, get_object_nodes, put_object, put_object_to_random_node
from frostfs_testlib.steps.cli.container import create_container
from frostfs_testlib.steps.cli.object import delete_object, put_object, put_object_to_random_node
from frostfs_testlib.steps.metrics import check_metrics_counter, get_metrics_value
from frostfs_testlib.steps.storage_policy import get_nodes_with_object
from frostfs_testlib.storage.cluster import Cluster, ClusterNode
from frostfs_testlib.storage.dataclasses.object_size import ObjectSize
from frostfs_testlib.storage.dataclasses.wallet import WalletInfo
@ -40,10 +43,9 @@ class TestGarbageCollectorMetrics(ClusterTestBase):
with reporter.step("Get current garbage collector metrics for each nodes"):
metrics_counter = {}
for node in cluster.cluster_nodes:
command_result = node.metrics.storage.get_metrics_search_by_greps(
command="frostfs_node_garbage_collector_marked_for_removal_objects_total"
metrics_counter[node] = get_metrics_value(
node, command="frostfs_node_garbage_collector_marked_for_removal_objects_total"
)
metrics_counter[node] = self.calc_metrics_count_from_stdout(command_result.stdout)
with reporter.step(f"Create container with policy {placement_policy}"):
cid = create_container(default_wallet, self.shell, cluster.default_rpc_endpoint, placement_policy)
@ -60,23 +62,29 @@ class TestGarbageCollectorMetrics(ClusterTestBase):
)
with reporter.step("Get object nodes"):
object_nodes = get_object_nodes(cluster, cid, oid, cluster.cluster_nodes[0])
object_storage_nodes = get_nodes_with_object(cid, oid, self.shell, cluster.storage_nodes)
object_nodes = [
cluster_node
for cluster_node in cluster.cluster_nodes
if cluster_node.storage_node in object_storage_nodes
]
with reporter.step("Tick Epoch"):
self.tick_epochs(epochs_to_tick=2, wait_block=2)
with reporter.step(f"Check garbage collector metrics 'the counter should increase by {metrics_step}'"):
with reporter.step(
f"Check garbage collector metrics 'the counter should increase by {metrics_step}' in object nodes"
):
for node in object_nodes:
metrics_counter[node] += metrics_step
for node, counter in metrics_counter.items():
self.check_metrics_in_node(
node, counter, command="frostfs_node_garbage_collector_marked_for_removal_objects_total"
check_metrics_counter(
[node],
counter_exp=counter,
command="frostfs_node_garbage_collector_marked_for_removal_objects_total",
)
with reporter.step("Delete container"):
delete_container(default_wallet, cid, self.shell, cluster.default_rpc_endpoint)
@allure.title("Garbage collector delete object")
def test_garbage_collector_metrics_deleted_objects(
self, simple_object_size: ObjectSize, default_wallet: WalletInfo, cluster: Cluster
@ -89,10 +97,7 @@ class TestGarbageCollectorMetrics(ClusterTestBase):
node = random.choice(cluster.cluster_nodes)
with reporter.step("Get current garbage collector metrics for selected node"):
command_result = node.metrics.storage.get_metrics_search_by_greps(
command="frostfs_node_garbage_collector_deleted_objects_total"
)
metrics_counter = self.calc_metrics_count_from_stdout(command_result.stdout)
metrics_counter = get_metrics_value(node, command="frostfs_node_garbage_collector_deleted_objects_total")
with reporter.step(f"Create container with policy {placement_policy}"):
cid = create_container(default_wallet, self.shell, node.storage_node.get_rpc_endpoint(), placement_policy)
@ -105,9 +110,6 @@ class TestGarbageCollectorMetrics(ClusterTestBase):
with reporter.step(f"Check garbage collector metrics 'the counter should increase by {metrics_step}'"):
metrics_counter += metrics_step
self.check_metrics_in_node(
node, metrics_counter, command="frostfs_node_garbage_collector_deleted_objects_total"
check_metrics_counter(
[node], counter_exp=metrics_counter, command="frostfs_node_garbage_collector_deleted_objects_total"
)
with reporter.step("Delete container"):
delete_container(default_wallet, cid, self.shell, cluster.default_rpc_endpoint)

View file

@ -1,21 +1,20 @@
import random
import re
import allure
import pytest
from frostfs_testlib import reporter
from frostfs_testlib.healthcheck.interfaces import Healthcheck
from frostfs_testlib.steps.cli.container import create_container, delete_container, get_container, list_containers
from frostfs_testlib.steps.cli.container import create_container, get_container, list_containers
from frostfs_testlib.steps.cli.object import get_object, head_object, put_object, search_object
from frostfs_testlib.steps.cli.tree import get_tree_list
from frostfs_testlib.storage.cluster import Cluster, ClusterNode
from frostfs_testlib.steps.metrics import check_metrics_counter, get_metrics_value
from frostfs_testlib.storage.cluster import Cluster
from frostfs_testlib.storage.controllers.cluster_state_controller import ClusterStateController
from frostfs_testlib.storage.controllers.state_managers.config_state_manager import ConfigStateManager
from frostfs_testlib.storage.dataclasses.frostfs_services import StorageNode
from frostfs_testlib.storage.dataclasses.object_size import ObjectSize
from frostfs_testlib.storage.dataclasses.wallet import WalletInfo
from frostfs_testlib.testing.cluster_test_base import ClusterTestBase
from frostfs_testlib.testing.test_control import wait_for_success
from frostfs_testlib.utils.file_utils import generate_file
@ -27,25 +26,6 @@ class TestGRPCMetrics(ClusterTestBase):
yield
cluster_state_controller.manager(ConfigStateManager).revert_all()
@wait_for_success(interval=10)
def check_metrics_in_node(self, cluster_node: ClusterNode, counter_exp: int, **metrics_greps: str):
counter_act = self.get_metrics_value(cluster_node, **metrics_greps)
assert counter_act == counter_exp, f"Expected: {counter_exp}, Actual: {counter_act} in node: {cluster_node}"
def get_metrics_value(self, node: ClusterNode, **metrics_greps: str):
try:
command_result = node.metrics.storage.get_metrics_search_by_greps(**metrics_greps)
metrics_counter = self.calc_metrics_count_from_stdout(command_result.stdout)
except RuntimeError as e:
metrics_counter = 0
return metrics_counter
@staticmethod
def calc_metrics_count_from_stdout(metric_result_stdout: str):
result = re.findall(r"}\s(\d+)", metric_result_stdout)
return sum(map(int, result))
@allure.title("GRPC metrics container operations")
def test_grpc_metrics_container_operations(self, default_wallet: WalletInfo, cluster: Cluster):
placement_policy = "REP 2 IN X CBF 1 SELECT 4 FROM * AS X"
@ -54,7 +34,7 @@ class TestGRPCMetrics(ClusterTestBase):
node = random.choice(cluster.cluster_nodes)
with reporter.step("Get current gRPC metrics for method 'Put'"):
metrics_counter_put = self.get_metrics_value(
metrics_counter_put = get_metrics_value(
node, command="grpc_server_handled_total", service="ContainerService", method="Put"
)
@ -63,12 +43,16 @@ class TestGRPCMetrics(ClusterTestBase):
with reporter.step(f"Check gRPC metrics method 'Put', 'the counter should increase by 1'"):
metrics_counter_put += 1
self.check_metrics_in_node(
node, metrics_counter_put, command="grpc_server_handled_total", service="ContainerService", method="Put"
check_metrics_counter(
[node],
counter_exp=metrics_counter_put,
command="grpc_server_handled_total",
service="ContainerService",
method="Put",
)
with reporter.step("Get current gRPC metrics for method 'Get'"):
metrics_counter_get = self.get_metrics_value(
metrics_counter_get = get_metrics_value(
node, command="grpc_server_handled_total", service="ContainerService", method="Get"
)
@ -77,12 +61,16 @@ class TestGRPCMetrics(ClusterTestBase):
with reporter.step(f"Check gRPC metrics method=Get, 'the counter should increase by 1'"):
metrics_counter_get += 1
self.check_metrics_in_node(
node, metrics_counter_get, command="grpc_server_handled_total", service="ContainerService", method="Get"
check_metrics_counter(
[node],
counter_exp=metrics_counter_get,
command="grpc_server_handled_total",
service="ContainerService",
method="Get",
)
with reporter.step("Get current gRPC metrics for method 'List'"):
metrics_counter_list = self.get_metrics_value(
metrics_counter_list = get_metrics_value(
node, command="grpc_server_handled_total", service="ContainerService", method="List"
)
@ -91,17 +79,14 @@ class TestGRPCMetrics(ClusterTestBase):
with reporter.step(f"Check gRPC metrics method=List, 'the counter should increase by 1'"):
metrics_counter_list += 1
self.check_metrics_in_node(
node,
metrics_counter_list,
check_metrics_counter(
[node],
counter_exp=metrics_counter_list,
command="grpc_server_handled_total",
service="ContainerService",
method="List",
)
with reporter.step("Delete container"):
delete_container(default_wallet, cid, self.shell, self.cluster.default_rpc_endpoint)
@allure.title("GRPC metrics object operations")
def test_grpc_metrics_object_operations(
self, simple_object_size: ObjectSize, default_wallet: WalletInfo, cluster: Cluster, disable_policer
@ -116,7 +101,7 @@ class TestGRPCMetrics(ClusterTestBase):
cid = create_container(default_wallet, self.shell, node.storage_node.get_rpc_endpoint(), placement_policy)
with reporter.step("Get current gRPC metrics for method 'Put'"):
metrics_counter_put = self.get_metrics_value(
metrics_counter_put = get_metrics_value(
node, command="grpc_server_handled_total", service="ObjectService", method="Put"
)
@ -125,12 +110,16 @@ class TestGRPCMetrics(ClusterTestBase):
with reporter.step(f"Check gRPC metrics method 'Put', 'the counter should increase by 1'"):
metrics_counter_put += 1
self.check_metrics_in_node(
node, metrics_counter_put, command="grpc_server_handled_total", service="ObjectService", method="Put"
check_metrics_counter(
[node],
counter_exp=metrics_counter_put,
command="grpc_server_handled_total",
service="ObjectService",
method="Put",
)
with reporter.step("Get current gRPC metrics for method 'Get'"):
metrics_counter_get = self.get_metrics_value(
metrics_counter_get = get_metrics_value(
node, command="grpc_server_handled_total", service="ObjectService", method="Get"
)
@ -139,12 +128,16 @@ class TestGRPCMetrics(ClusterTestBase):
with reporter.step(f"Check gRPC metrics method=Get, 'the counter should increase by 1'"):
metrics_counter_get += 1
self.check_metrics_in_node(
node, metrics_counter_get, command="grpc_server_handled_total", service="ObjectService", method="Get"
check_metrics_counter(
[node],
counter_exp=metrics_counter_get,
command="grpc_server_handled_total",
service="ObjectService",
method="Get",
)
with reporter.step("Get current gRPC metrics for method 'Search'"):
metrics_counter_search = self.get_metrics_value(
metrics_counter_search = get_metrics_value(
node, command="grpc_server_handled_total", service="ObjectService", method="Search"
)
@ -153,16 +146,16 @@ class TestGRPCMetrics(ClusterTestBase):
with reporter.step(f"Check gRPC metrics method=Search, 'the counter should increase by 1'"):
metrics_counter_search += 1
self.check_metrics_in_node(
node,
metrics_counter_search,
check_metrics_counter(
[node],
counter_exp=metrics_counter_search,
command="grpc_server_handled_total",
service="ObjectService",
method="Search",
)
with reporter.step("Get current gRPC metrics for method 'Head'"):
metrics_counter_head = self.get_metrics_value(
metrics_counter_head = get_metrics_value(
node, command="grpc_server_handled_total", service="ObjectService", method="Head"
)
@ -171,20 +164,21 @@ class TestGRPCMetrics(ClusterTestBase):
with reporter.step(f"Check gRPC metrics method=Head, 'the counter should increase by 1'"):
metrics_counter_head += 1
self.check_metrics_in_node(
node, metrics_counter_head, command="grpc_server_handled_total", service="ObjectService", method="Head"
check_metrics_counter(
[node],
counter_exp=metrics_counter_head,
command="grpc_server_handled_total",
service="ObjectService",
method="Head",
)
with reporter.step("Delete container"):
delete_container(default_wallet, cid, self.shell, self.cluster.default_rpc_endpoint)
@allure.title("GRPC metrics Tree healthcheck")
def test_grpc_metrics_tree_service(self, cluster: Cluster, healthcheck: Healthcheck):
with reporter.step("Select random node"):
node = random.choice(cluster.cluster_nodes)
with reporter.step("Get current gRPC metrics for Healthcheck"):
metrics_counter = self.get_metrics_value(
metrics_counter = get_metrics_value(
node, command="grpc_server_handled_total", service="TreeService", method="Healthcheck"
)
@ -192,10 +186,14 @@ class TestGRPCMetrics(ClusterTestBase):
healthcheck.tree_healthcheck(node)
with reporter.step(f"Check gRPC metrics for Healthcheck, 'the counter should increase'"):
metrics_counter_new = self.get_metrics_value(
node, command="grpc_server_handled_total", service="TreeService", method="Healthcheck"
check_metrics_counter(
[node],
">",
metrics_counter,
command="grpc_server_handled_total",
service="TreeService",
method="Healthcheck",
)
assert metrics_counter_new > metrics_counter, "the metrics has not increased"
@allure.title("GRPC metrics Tree list")
def test_grpc_metrics_tree_list(self, default_wallet: WalletInfo, cluster: Cluster):
@ -208,7 +206,7 @@ class TestGRPCMetrics(ClusterTestBase):
cid = create_container(default_wallet, self.shell, node.storage_node.get_rpc_endpoint(), placement_policy)
with reporter.step("Get current gRPC metrics for Tree List"):
metrics_counter = self.get_metrics_value(
metrics_counter = get_metrics_value(
node, command="grpc_server_handled_total", service="TreeService", method="TreeList"
)
@ -217,9 +215,10 @@ class TestGRPCMetrics(ClusterTestBase):
with reporter.step(f"Check gRPC metrics for Tree List, 'the counter should increase by 1'"):
metrics_counter += 1
self.check_metrics_in_node(
node, metrics_counter, command="grpc_server_handled_total", service="TreeService", method="TreeList"
check_metrics_counter(
[node],
counter_exp=metrics_counter,
command="grpc_server_handled_total",
service="TreeService",
method="TreeList",
)
with reporter.step("Delete container"):
delete_container(default_wallet, cid, self.shell, self.cluster.default_rpc_endpoint)

View file

@ -1,11 +1,11 @@
import random
import re
from datetime import datetime
from datetime import datetime, timezone
import allure
import pytest
from frostfs_testlib import reporter
from frostfs_testlib.shell import Shell
from frostfs_testlib.steps.metrics import get_metrics_value
from frostfs_testlib.storage.cluster import Cluster, ClusterNode
from frostfs_testlib.storage.controllers.cluster_state_controller import ClusterStateController
from frostfs_testlib.storage.controllers.state_managers.config_state_manager import ConfigStateManager
@ -16,59 +16,40 @@ from frostfs_testlib.testing.test_control import wait_for_success
class TestLogsMetrics(ClusterTestBase):
@pytest.fixture
def restart_storage_service(self, cluster_state_controller: ClusterStateController) -> str:
def restart_storage_service(self, cluster_state_controller: ClusterStateController) -> datetime:
config_manager = cluster_state_controller.manager(ConfigStateManager)
config_manager.csc.stop_services_of_type(StorageNode)
restart_time = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")
restart_time = datetime.now(timezone.utc)
config_manager.csc.start_services_of_type(StorageNode)
yield restart_time
cluster_state_controller.manager(ConfigStateManager).revert_all()
@wait_for_success(interval=10)
def check_metrics_in_node(self, cluster_node: ClusterNode, counter_exp: int, **metrics_greps: str):
counter_act = self.get_metrics_value(cluster_node, **metrics_greps)
def check_metrics_in_node(self, cluster_node: ClusterNode, restart_time: datetime, **metrics_greps):
counter_exp = self.get_count_logs_by_level(cluster_node, metrics_greps.get("level"), restart_time)
counter_act = get_metrics_value(cluster_node, **metrics_greps)
assert counter_act == counter_exp, f"Expected: {counter_exp}, Actual: {counter_act} in node: {cluster_node}"
def get_metrics_value(self, node: ClusterNode, **metrics_greps: str):
try:
command_result = node.metrics.storage.get_metrics_search_by_greps(**metrics_greps)
metrics_counter = self.calc_metrics_count_from_stdout(command_result.stdout)
except RuntimeError as e:
metrics_counter = 0
return metrics_counter
@staticmethod
def calc_metrics_count_from_stdout(metric_result_stdout: str):
result = re.findall(r"}\s(\d+)", metric_result_stdout)
return sum(map(int, result))
@staticmethod
def get_count_logs_by_level(shell: Shell, log_level: str, after_time: str):
def get_count_logs_by_level(cluster_node: ClusterNode, log_level: str, after_time: datetime):
count_logs = 0
try:
logs = shell.exec(f"journalctl -u frostfs-storage --grep='{log_level}' --since '{after_time}'")
result = re.findall(rf"Z\s+{log_level}\s+", logs.stdout)
logs = cluster_node.host.get_filtered_logs(log_level, unit="frostfs-storage", since=after_time)
result = re.findall(rf"Z\s+{log_level}\s+", logs)
count_logs += len(result)
except RuntimeError as e:
...
return count_logs
@allure.title("Metrics for the log counter")
def test_log_counter_metrics(self, cluster: Cluster, restart_storage_service: str):
def test_log_counter_metrics(self, cluster: Cluster, restart_storage_service: datetime):
restart_time = restart_storage_service
with reporter.step("Select random node"):
node = random.choice(cluster.cluster_nodes)
with reporter.step("Get count logs from journalctl with level 'info'"):
count_logs_info = self.get_count_logs_by_level(node.host.get_shell(), "info", restart_time)
with reporter.step(f"Check metrics count logs with level 'info'"):
self.check_metrics_in_node(node, count_logs_info, command="frostfs_node_logger_entry_count", level="info")
with reporter.step("Get count logs from journalctl with level 'error'"):
count_logs_error = self.get_count_logs_by_level(node.host.get_shell(), "error", restart_time)
self.check_metrics_in_node(node, restart_time, command="frostfs_node_logger_entry_count", level="info")
with reporter.step(f"Check metrics count logs with level 'error'"):
self.check_metrics_in_node(node, count_logs_error, command="frostfs_node_logger_entry_count", level="error")
self.check_metrics_in_node(node, restart_time, command="frostfs_node_logger_entry_count", level="error")

View file

@ -5,92 +5,19 @@ import allure
import pytest
from frostfs_testlib import reporter
from frostfs_testlib.steps.cli.container import create_container, delete_container, search_nodes_with_container
from frostfs_testlib.steps.cli.object import (
delete_object,
get_object_nodes,
lock_object,
put_object,
put_object_to_random_node,
)
from frostfs_testlib.steps.cli.object import delete_object, lock_object, put_object, put_object_to_random_node
from frostfs_testlib.steps.metrics import check_metrics_counter, get_metrics_value
from frostfs_testlib.steps.storage_policy import get_nodes_with_object
from frostfs_testlib.storage.cluster import Cluster, ClusterNode
from frostfs_testlib.storage.controllers.cluster_state_controller import ClusterStateController
from frostfs_testlib.storage.dataclasses.object_size import ObjectSize
from frostfs_testlib.storage.dataclasses.wallet import WalletInfo
from frostfs_testlib.testing.cluster_test_base import ClusterTestBase
from frostfs_testlib.testing.test_control import wait_for_success
from frostfs_testlib.utils.file_utils import generate_file
class TestObjectMetrics(ClusterTestBase):
@wait_for_success(interval=10)
def check_metrics_by_type(
self, cluster_nodes: list[ClusterNode], metric_command: str, grep_by: str, metric_type: str, counter_exp: int
):
counter_act = 0
for cluster_node in cluster_nodes:
try:
metric_result = cluster_node.metrics.storage.get_metrics_search_by_greps(
command=metric_command, grep_by=grep_by
)
counter_act += self.calc_metrics_count_from_stdout(metric_result.stdout, metric_type)
except RuntimeError as e:
...
assert (
counter_act == counter_exp
), f"Expected metric {metric_type}={counter_exp}, Actual: {counter_act} in nodes: {cluster_nodes}"
@staticmethod
def calc_metrics_count_from_stdout(metric_result_stdout: str, metric_type: str):
result = re.findall(rf'type="{metric_type}"}}\s(\d+)', metric_result_stdout)
return sum(map(int, result))
@wait_for_success(interval=10)
def check_object_metrics_total_and_container(
self, cluster_nodes: list[ClusterNode], cid: str, objects_metric_total: int, objects_metric_container: int
):
self.check_metrics_by_type(
cluster_nodes,
"frostfs_node_engine_objects_total",
grep_by="user",
metric_type="user",
counter_exp=objects_metric_total,
)
objects_metric_container_act = 0
for node in cluster_nodes:
try:
metrics_container = node.metrics.storage.get_metrics_search_by_greps(
command="frostfs_node_engine_container_objects_total", cid=cid, type="user"
)
objects_metric_container_act += self.calc_metrics_count_from_stdout(
metrics_container.stdout, metric_type="user"
)
except RuntimeError as e:
...
assert (
objects_metric_container_act == objects_metric_container
), f"Expected {objects_metric_container} objects in container"
@wait_for_success(max_wait_time=120, interval=10)
def check_object_metrics_container(
self, cluster_nodes: list[ClusterNode], cid: str, objects_metric_container_exp: int
):
objects_metric_container_act = 0
for node in cluster_nodes:
try:
metrics_container = node.metrics.storage.get_metrics_search_by_greps(
command="frostfs_node_engine_container_objects_total", cid=cid, type="user"
)
objects_metric_container_act += self.calc_metrics_count_from_stdout(
metrics_container.stdout, metric_type="user"
)
except RuntimeError as e:
...
assert (
objects_metric_container_act == objects_metric_container_exp
), f"Expected {objects_metric_container_exp} objects in container"
@allure.title("Object metrics of removed container")
@allure.title("Object metrics of removed container (obj_size={object_size})")
def test_object_metrics_removed_container(
self, object_size: ObjectSize, default_wallet: WalletInfo, cluster: Cluster
):
@ -99,28 +26,26 @@ class TestObjectMetrics(ClusterTestBase):
copies = 2
with reporter.step(f"Create container with policy {placement_policy}"):
cid = create_container(
default_wallet,
rule=placement_policy,
shell=self.shell,
endpoint=self.cluster.default_rpc_endpoint,
)
cid = create_container(default_wallet, self.shell, cluster.default_rpc_endpoint, placement_policy)
with reporter.step("Put object to random node"):
storage_object_id = put_object_to_random_node(
wallet=default_wallet,
path=file_path,
cid=cid,
shell=self.shell,
cluster=cluster,
)
oid = put_object_to_random_node(default_wallet, file_path, cid, self.shell, cluster)
with reporter.step("Check metric appears in node where the object is located"):
object_nodes = get_object_nodes(
cluster=cluster, cid=cid, oid=storage_object_id, alive_node=cluster.cluster_nodes[0]
)
object_storage_nodes = get_nodes_with_object(cid, oid, self.shell, cluster.storage_nodes)
object_nodes = [
cluster_node
for cluster_node in cluster.cluster_nodes
if cluster_node.storage_node in object_storage_nodes
]
self.check_metrics_by_type(object_nodes, "frostfs_node_engine_container_objects_total", cid, "user", copies)
check_metrics_counter(
object_nodes,
counter_exp=copies,
command="frostfs_node_engine_container_objects_total",
cid=cid,
type="user",
)
with reporter.step("Delete container"):
delete_container(default_wallet, cid, shell=self.shell, endpoint=self.cluster.default_rpc_endpoint)
@ -129,13 +54,20 @@ class TestObjectMetrics(ClusterTestBase):
self.tick_epochs(epochs_to_tick=2, wait_block=2)
with reporter.step("Check metrics of removed containers doesn't appear in the storage node"):
self.check_metrics_by_type(object_nodes, "frostfs_node_engine_container_objects_total", cid, "user", 0)
check_metrics_counter(
object_nodes, counter_exp=0, command="frostfs_node_engine_container_objects_total", cid=cid, type="user"
)
check_metrics_counter(
object_nodes, counter_exp=0, command="frostfs_node_engine_container_size_byte", cid=cid
)
for node in object_nodes:
with pytest.raises(RuntimeError):
node.metrics.storage.get_metric_container(f"frostfs_node_engine_container_size_byte", cid)
all_metrics = node.metrics.storage.get_all_metrics()
assert (
cid not in all_metrics.stdout
), "metrics of removed containers shouldn't appear in the storage node"
@allure.title("Object metrics, locked object, (policy={placement_policy})")
@allure.title("Object metrics, locked object (obj_size={object_size}, policy={placement_policy})")
@pytest.mark.parametrize(
"placement_policy", ["REP 1 IN X CBF 1 SELECT 1 FROM * AS X", "REP 2 IN X CBF 2 SELECT 2 FROM * AS X"]
)
@ -146,12 +78,7 @@ class TestObjectMetrics(ClusterTestBase):
metric_step = int(re.search(r"REP\s(\d+)", placement_policy).group(1))
with reporter.step(f"Create container with policy {placement_policy}"):
cid = create_container(
wallet=default_wallet,
rule=placement_policy,
shell=self.shell,
endpoint=self.cluster.default_rpc_endpoint,
)
cid = create_container(default_wallet, self.shell, cluster.default_rpc_endpoint, placement_policy)
with reporter.step("Search container nodes"):
container_nodes = search_nodes_with_container(
@ -165,34 +92,51 @@ class TestObjectMetrics(ClusterTestBase):
with reporter.step("Get current metrics for metric_type=user"):
objects_metric_counter = 0
for node in container_nodes:
metric_objects_total = node.metrics.storage.get_metrics_search_by_greps(
command="frostfs_node_engine_objects_total", type="user"
)
objects_metric_counter += self.calc_metrics_count_from_stdout(
metric_objects_total.stdout, metric_type="user"
objects_metric_counter += get_metrics_value(
node, command="frostfs_node_engine_objects_total", type="user"
)
with reporter.step("Put object to container node"):
oid = put_object(
wallet=default_wallet,
path=file_path,
cid=cid,
shell=self.shell,
endpoint=container_nodes[0].storage_node.get_rpc_endpoint(),
default_wallet, file_path, cid, self.shell, container_nodes[0].storage_node.get_rpc_endpoint()
)
with reporter.step(f"Check metric user 'the counter should increase by {metric_step}'"):
objects_metric_counter += metric_step
self.check_object_metrics_total_and_container(container_nodes, cid, objects_metric_counter, metric_step)
check_metrics_counter(
container_nodes,
counter_exp=objects_metric_counter,
command="frostfs_node_engine_objects_total",
type="user",
)
check_metrics_counter(
container_nodes,
counter_exp=metric_step,
command="frostfs_node_engine_container_objects_total",
cid=cid,
type="user",
)
with reporter.step("Delete object"):
delete_object(default_wallet, cid, oid, self.shell, self.cluster.default_rpc_endpoint)
with reporter.step(f"Check metric user 'the counter should decrease by {metric_step}'"):
objects_metric_counter -= metric_step
self.check_object_metrics_total_and_container(container_nodes, cid, objects_metric_counter, 0)
check_metrics_counter(
container_nodes,
counter_exp=objects_metric_counter,
command="frostfs_node_engine_objects_total",
type="user",
)
check_metrics_counter(
container_nodes,
counter_exp=0,
command="frostfs_node_engine_container_objects_total",
cid=cid,
type="user",
)
with reporter.step("Put object and lock it"):
with reporter.step("Put object and lock it to next epoch"):
oid = put_object(
default_wallet, file_path, cid, self.shell, container_nodes[0].storage_node.get_rpc_endpoint()
)
@ -208,18 +152,47 @@ class TestObjectMetrics(ClusterTestBase):
with reporter.step(f"Check metric user 'the counter should increase by {metric_step}'"):
objects_metric_counter += metric_step
self.check_object_metrics_total_and_container(container_nodes, cid, objects_metric_counter, metric_step)
check_metrics_counter(
container_nodes,
counter_exp=objects_metric_counter,
command="frostfs_node_engine_objects_total",
type="user",
)
check_metrics_counter(
container_nodes,
counter_exp=metric_step,
command="frostfs_node_engine_container_objects_total",
cid=cid,
type="user",
)
with reporter.step(f"Wait until remove locking 'the counter doesn't change'"):
self.tick_epochs(epochs_to_tick=2)
self.check_object_metrics_total_and_container(container_nodes, cid, objects_metric_counter, metric_step)
check_metrics_counter(
container_nodes,
counter_exp=objects_metric_counter,
command="frostfs_node_engine_objects_total",
type="user",
)
with reporter.step("Delete object"):
delete_object(default_wallet, cid, oid, self.shell, self.cluster.default_rpc_endpoint)
with reporter.step(f"Check metric user 'the counter should decrease by {metric_step}'"):
objects_metric_counter -= metric_step
self.check_object_metrics_total_and_container(container_nodes, cid, objects_metric_counter, 0)
check_metrics_counter(
container_nodes,
counter_exp=objects_metric_counter,
command="frostfs_node_engine_objects_total",
type="user",
)
check_metrics_counter(
container_nodes,
counter_exp=0,
command="frostfs_node_engine_container_objects_total",
cid=cid,
type="user",
)
with reporter.step("Put object with expire_at"):
current_epoch = self.get_epoch()
@ -234,23 +207,43 @@ class TestObjectMetrics(ClusterTestBase):
with reporter.step(f"Check metric user 'the counter should increase by {metric_step}'"):
objects_metric_counter += metric_step
self.check_object_metrics_total_and_container(container_nodes, cid, objects_metric_counter, metric_step)
check_metrics_counter(
container_nodes,
counter_exp=objects_metric_counter,
command="frostfs_node_engine_objects_total",
type="user",
)
check_metrics_counter(
container_nodes,
counter_exp=metric_step,
command="frostfs_node_engine_container_objects_total",
cid=cid,
type="user",
)
with reporter.step("Tick Epoch"):
self.tick_epochs(epochs_to_tick=2)
with reporter.step(f"Check metric user 'the counter should decrease by {metric_step}'"):
objects_metric_counter -= metric_step
self.check_object_metrics_total_and_container(container_nodes, cid, objects_metric_counter, 0)
check_metrics_counter(
container_nodes,
counter_exp=objects_metric_counter,
command="frostfs_node_engine_objects_total",
type="user",
)
check_metrics_counter(
container_nodes,
counter_exp=0,
command="frostfs_node_engine_container_objects_total",
cid=cid,
type="user",
)
with reporter.step("Delete container"):
delete_container(default_wallet, cid, shell=self.shell, endpoint=self.cluster.default_rpc_endpoint)
@allure.title("Object metrics, stop the node")
@allure.title("Object metrics, stop the node (obj_size={object_size})")
def test_object_metrics_stop_node(
self,
object_size: ObjectSize,
max_object_size: int,
default_wallet: WalletInfo,
cluster_state_controller: ClusterStateController,
):
@ -259,60 +252,70 @@ class TestObjectMetrics(ClusterTestBase):
copies = 2
with reporter.step(f"Create container with policy {placement_policy}"):
cid = create_container(
wallet=default_wallet,
rule=placement_policy,
shell=self.shell,
endpoint=self.cluster.default_rpc_endpoint,
)
cid = create_container(default_wallet, self.shell, self.cluster.default_rpc_endpoint, placement_policy)
with reporter.step("Search container nodes"):
container_nodes = search_nodes_with_container(
wallet=default_wallet,
with reporter.step(f"Check object metrics in container 'should be zero'"):
check_metrics_counter(
self.cluster.cluster_nodes,
counter_exp=0,
command="frostfs_node_engine_container_objects_total",
type="user",
cid=cid,
shell=self.shell,
endpoint=self.cluster.default_rpc_endpoint,
cluster=self.cluster,
)
with reporter.step("Get current metrics for container nodes"):
objects_metric_counter = 0
for node in container_nodes:
metric_objects_total = node.metrics.storage.get_metrics_search_by_greps(
command="frostfs_node_engine_objects_total", type="user"
)
objects_metric_counter += self.calc_metrics_count_from_stdout(
metric_objects_total.stdout, metric_type="user"
with reporter.step("Get current metrics for each nodes"):
objects_metric_counter: dict[ClusterNode:int] = {}
for node in self.cluster.cluster_nodes:
objects_metric_counter[node] = get_metrics_value(
node, command="frostfs_node_engine_objects_total", type="user"
)
with reporter.step("Put object to container node"):
oid = put_object(
wallet=default_wallet,
path=file_path,
cid=cid,
shell=self.shell,
endpoint=container_nodes[0].storage_node.get_rpc_endpoint(),
)
with reporter.step("Put object"):
oid = put_object(default_wallet, file_path, cid, self.shell, self.cluster.default_rpc_endpoint)
with reporter.step(f"Check metric in container nodes 'the counter should increase by {copies}'"):
objects_metric_counter += copies
self.check_object_metrics_total_and_container(container_nodes, cid, objects_metric_counter, copies)
with reporter.step("Get object nodes"):
object_storage_nodes = get_nodes_with_object(cid, oid, self.shell, self.cluster.storage_nodes)
object_nodes = [
cluster_node
for cluster_node in self.cluster.cluster_nodes
if cluster_node.storage_node in object_storage_nodes
]
with reporter.step(f"Check metrics in object nodes 'the counter should increase by {copies}'"):
counter_exp = sum(objects_metric_counter[node] for node in object_nodes) + copies
check_metrics_counter(
object_nodes, counter_exp=counter_exp, command="frostfs_node_engine_objects_total", type="user"
)
check_metrics_counter(
object_nodes,
counter_exp=copies,
command="frostfs_node_engine_container_objects_total",
type="user",
cid=cid,
)
with reporter.step(f"Select node to stop"):
node_to_stop = container_nodes[0]
alive_nodes = [node for node in container_nodes if node != node_to_stop]
node_to_stop = random.choice(object_nodes)
alive_nodes = set(object_nodes).difference({node_to_stop})
with reporter.step(f"Stop the node, wait until the object is replicated to another node"):
cluster_state_controller.stop_node_host(node_to_stop, "hard")
objects_metric_counter[node_to_stop] += 1
with reporter.step(f"Check metric in alive nodes 'the counter should increase by 1'"):
self.check_object_metrics_container(alive_nodes, cid, copies)
with reporter.step(f"Check metric in alive nodes 'the counter should increase'"):
counter_exp = sum(objects_metric_counter[node] for node in alive_nodes)
check_metrics_counter(
alive_nodes, ">=", counter_exp, command="frostfs_node_engine_objects_total", type="user"
)
with reporter.step("Start node"):
cluster_state_controller.start_node_host(node_to_stop)
with reporter.step(f"Check metric in container nodes 'the counter doesn't change'"):
self.check_object_metrics_total_and_container(container_nodes, cid, objects_metric_counter, copies)
with reporter.step("Delete container"):
delete_container(default_wallet, cid, shell=self.shell, endpoint=self.cluster.default_rpc_endpoint)
with reporter.step(f"Check metric in restarted node, 'the counter doesn't change'"):
check_metrics_counter(
object_nodes,
counter_exp=copies,
command="frostfs_node_engine_container_objects_total",
type="user",
cid=cid,
)

View file

@ -4,11 +4,13 @@ import re
import allure
import pytest
from frostfs_testlib import reporter
from frostfs_testlib.resources.error_patterns import OBJECT_NOT_FOUND
from frostfs_testlib.resources.wellknown_acl import EACL_PUBLIC_READ_WRITE
from frostfs_testlib.shell import Shell
from frostfs_testlib.steps.cli.container import create_container, delete_container
from frostfs_testlib.steps.cli.object import get_object, get_object_nodes, put_object
from frostfs_testlib.steps.cli.container import create_container
from frostfs_testlib.steps.cli.object import get_object, put_object
from frostfs_testlib.steps.metrics import check_metrics_counter
from frostfs_testlib.steps.node_management import node_shard_list, node_shard_set_mode
from frostfs_testlib.steps.storage_policy import get_nodes_with_object
from frostfs_testlib.storage.cluster import Cluster, ClusterNode
from frostfs_testlib.storage.controllers import ShardsWatcher
from frostfs_testlib.storage.dataclasses.wallet import WalletInfo
@ -45,28 +47,13 @@ class TestShardMetrics(ClusterTestBase):
watcher.set_shard_mode(shard["shard_id"], mode="read-write")
watcher.await_for_all_shards_status(status="read-write")
@wait_for_success(interval=10)
def check_metrics_in_node(self, cluster_node: ClusterNode, counter_exp: int, **metrics_greps: str):
counter_act = 0
try:
metric_result = cluster_node.metrics.storage.get_metrics_search_by_greps(**metrics_greps)
counter_act += self.calc_metrics_count_from_stdout(metric_result.stdout)
except RuntimeError as e:
...
assert counter_act == counter_exp, f"Expected: {counter_exp}, Actual: {counter_act} in node: {cluster_node}"
@staticmethod
def calc_metrics_count_from_stdout(metric_result_stdout: str):
result = re.findall(r"}\s(\d+)", metric_result_stdout)
return sum(map(int, result))
@staticmethod
def get_error_count_from_logs(shell: Shell, object_path: str, object_name: str):
def get_error_count_from_logs(cluster_node: ClusterNode, object_path: str, object_name: str):
error_count = 0
try:
logs = shell.exec(f"journalctl -u frostfs-storage --grep='error count' --no-pager")
logs = cluster_node.host.get_filtered_logs("error count", unit="frostfs-storage")
# search error logs for current object
for error_line in logs.stdout.split("\n"):
for error_line in logs.split("\n"):
if object_path in error_line and object_name in error_line:
result = re.findall(r'"error\scount":\s(\d+)', error_line)
error_count += sum(map(int, result))
@ -106,17 +93,21 @@ class TestShardMetrics(ClusterTestBase):
node_shard_set_mode(node.storage_node, shard1, "read-only")
with reporter.step(f"Check shard metrics, 'the mode will change to 'READ_ONLY'"):
self.check_metrics_in_node(
node, metrics_counter, command="frostfs_node_engine_mode_info", mode="READ_ONLY", shard_id=shard1
check_metrics_counter(
[node],
counter_exp=metrics_counter,
command="frostfs_node_engine_mode_info",
mode="READ_ONLY",
shard_id=shard1,
)
with reporter.step("Shard2 set to mode 'degraded-read-only'"):
node_shard_set_mode(node.storage_node, shard2, "degraded-read-only")
with reporter.step(f"Check shard metrics, 'the mode will change to 'DEGRADED_READ_ONLY'"):
self.check_metrics_in_node(
node,
metrics_counter,
check_metrics_counter(
[node],
counter_exp=metrics_counter,
command="frostfs_node_engine_mode_info",
mode="DEGRADED_READ_ONLY",
shard_id=shard2,
@ -126,9 +117,14 @@ class TestShardMetrics(ClusterTestBase):
for shard in [shard1, shard2]:
node_shard_set_mode(node.storage_node, shard, "read-write")
with reporter.step(f"Check shard metrics, 'the mode will change to 'READ_WRITE'"):
self.check_metrics_in_node(
node, metrics_counter, command="frostfs_node_engine_mode_info", mode="READ_WRITE", shard_id=shard
with reporter.step(f"Check shard metrics, 'the mode will change to 'READ_WRITE'"):
for shard in [shard1, shard2]:
check_metrics_counter(
[node],
counter_exp=metrics_counter,
command="frostfs_node_engine_mode_info",
mode="READ_WRITE",
shard_id=shard,
)
@allure.title("Metric for error count on shard")
@ -141,17 +137,22 @@ class TestShardMetrics(ClusterTestBase):
cid = create_container(
wallet=default_wallet,
shell=self.shell,
endpoint=self.cluster.default_rpc_endpoint,
endpoint=cluster.default_rpc_endpoint,
rule="REP 1 CBF 1",
basic_acl=EACL_PUBLIC_READ_WRITE,
)
with reporter.step("Put object"):
oid = put_object(default_wallet, file_path, cid, self.shell, self.cluster.default_rpc_endpoint)
oid = put_object(default_wallet, file_path, cid, self.shell, cluster.default_rpc_endpoint)
with reporter.step("Get object nodes"):
object_nodes = get_object_nodes(cluster, cid, oid, cluster.cluster_nodes[0])
node = object_nodes[0]
object_storage_nodes = get_nodes_with_object(cid, oid, self.shell, cluster.storage_nodes)
object_nodes = [
cluster_node
for cluster_node in cluster.cluster_nodes
if cluster_node.storage_node in object_storage_nodes
]
node = random.choice(object_nodes)
with reporter.step("Search object in system."):
object_path, object_name = self.get_object_path_and_name_file(oid, cid, node)
@ -160,7 +161,7 @@ class TestShardMetrics(ClusterTestBase):
node.host.get_shell().exec(f"chmod a-r {object_path}/{object_name}")
with reporter.step("Get object, expect error"):
with pytest.raises(RuntimeError):
with pytest.raises(RuntimeError, match=OBJECT_NOT_FOUND):
get_object(
wallet=default_wallet,
cid=cid,
@ -170,10 +171,7 @@ class TestShardMetrics(ClusterTestBase):
)
with reporter.step(f"Get shard error count from logs"):
counter = self.get_error_count_from_logs(node.host.get_shell(), object_path, object_name)
counter = self.get_error_count_from_logs(node, object_path, object_name)
with reporter.step(f"Check shard error metrics"):
self.check_metrics_in_node(node, counter, command="frostfs_node_engine_errors_total")
with reporter.step("Delete container"):
delete_container(default_wallet, cid, self.shell, cluster.default_rpc_endpoint)
check_metrics_counter([node], counter_exp=counter, command="frostfs_node_engine_errors_total")