import logging
import random
from time import sleep

import allure
import pytest
from frostfs_testlib import reporter
from frostfs_testlib.healthcheck.interfaces import Healthcheck
from frostfs_testlib.resources.wellknown_acl import EACL_PUBLIC_READ_WRITE, PUBLIC_ACL
from frostfs_testlib.steps.cli.container import create_container
from frostfs_testlib.steps.cli.object import get_object, get_object_nodes, neo_go_query_height, put_object, put_object_to_random_node
from frostfs_testlib.steps.storage_object import delete_objects
from frostfs_testlib.storage.cluster import ClusterNode
from frostfs_testlib.storage.controllers import ClusterStateController
from frostfs_testlib.storage.dataclasses.object_size import ObjectSize
from frostfs_testlib.storage.dataclasses.storage_object_info import Interfaces, StorageObjectInfo
from frostfs_testlib.storage.dataclasses.wallet import WalletInfo
from frostfs_testlib.testing.cluster_test_base import ClusterTestBase
from frostfs_testlib.testing.parallel import parallel
from frostfs_testlib.utils.failover_utils import wait_object_replication
from frostfs_testlib.utils.file_utils import generate_file, get_file_hash

logger = logging.getLogger("NeoLogger")

STORAGE_NODE_COMMUNICATION_PORT = "8080"
STORAGE_NODE_COMMUNICATION_PORT_TLS = "8082"
PORTS_TO_BLOCK = [STORAGE_NODE_COMMUNICATION_PORT, STORAGE_NODE_COMMUNICATION_PORT_TLS]
blocked_nodes: list[ClusterNode] = []

OBJECT_ATTRIBUTES = [
    None,
    {"key1": 1, "key2": "abc", "common_key": "common_value"},
    {"key1": 2, "common_key": "common_value"},
]


@pytest.mark.failover
@pytest.mark.failover_network
class TestFailoverNetwork(ClusterTestBase):
    @pytest.fixture(autouse=True)
    @allure.title("Restore network")
    def restore_network(self, healthcheck: Healthcheck, cluster_state_controller: ClusterStateController):
        yield

        with reporter.step(f"Count blocked nodes {len(blocked_nodes)}"):
            not_empty = len(blocked_nodes) != 0
            for node in list(blocked_nodes):
                with reporter.step(f"Restore network for {node}"):
                    cluster_state_controller.restore_traffic(node=node)
                blocked_nodes.remove(node)
            if not_empty:
                parallel(healthcheck.storage_healthcheck, self.cluster.cluster_nodes)

    @pytest.fixture()
    @allure.title("Restore drop traffic to system")
    def restore_down_interfaces(self, cluster_state_controller: ClusterStateController):
        yield
        cluster_state_controller.restore_interfaces()

    @pytest.fixture()
    def storage_objects(
        self,
        simple_object_size: ObjectSize,
        default_wallet: WalletInfo,
    ) -> list[StorageObjectInfo]:
        file_path = generate_file(simple_object_size.value)
        file_hash = get_file_hash(file_path)

        with reporter.step("Create container"):
            placement_rule = "REP 1 CBF 1"
            cid = create_container(
                wallet=default_wallet,
                shell=self.shell,
                endpoint=self.cluster.default_rpc_endpoint,
                rule=placement_rule,
                await_mode=True,
                basic_acl=EACL_PUBLIC_READ_WRITE,
            )

        storage_objects = []

        with reporter.step("Put object"):
            for attribute in OBJECT_ATTRIBUTES:
                oid = put_object_to_random_node(
                    wallet=default_wallet,
                    path=file_path,
                    cid=cid,
                    shell=self.shell,
                    cluster=self.cluster,
                )

                storage_object = StorageObjectInfo(cid=cid, oid=oid)
                storage_object.size = simple_object_size.value
                storage_object.wallet = default_wallet
                storage_object.file_path = file_path
                storage_object.file_hash = file_hash
                storage_object.attributes = attribute

                storage_objects.append(storage_object)

        return storage_objects

    @allure.title("Block Storage node traffic")
    def test_block_storage_node_traffic(
        self,
        default_wallet: WalletInfo,
        require_multiple_hosts,
        simple_object_size: ObjectSize,
        cluster_state_controller: ClusterStateController,
    ):
        """
        Block storage node traffic using iptables and wait for objects to be replicated.
        """
        wallet = default_wallet
        placement_rule = "REP 2 IN X CBF 2 SELECT 2 FROM * AS X"
        wakeup_node_timeout = 10  # timeout to let nodes detect that traffic has been blocked
        nodes_to_block_count = 2

        source_file_path = generate_file(simple_object_size.value)
        cid = create_container(
            wallet,
            shell=self.shell,
            endpoint=self.cluster.default_rpc_endpoint,
            rule=placement_rule,
            basic_acl=PUBLIC_ACL,
        )
        oid = put_object_to_random_node(wallet, source_file_path, cid, shell=self.shell, cluster=self.cluster)

        nodes = wait_object_replication(cid, oid, 2, shell=self.shell, nodes=self.cluster.storage_nodes)
        logger.info(f"Nodes are {nodes}")

        nodes_to_block = nodes
        if nodes_to_block_count > len(nodes):
            # TODO: the intent of this logic is not clear, need to revisit
            nodes_to_block = random.choices(nodes, k=2)

        nodes_non_block = list(set(self.cluster.storage_nodes) - set(nodes_to_block))
        nodes_non_block_cluster = [
            cluster_node for cluster_node in self.cluster.cluster_nodes if cluster_node.storage_node in nodes_non_block
        ]

        with reporter.step("Block traffic and check the object is not corrupted"):
            for node in nodes_non_block_cluster:
                with reporter.step(f"Block incoming traffic at node {node}"):
                    blocked_nodes.append(node)
                    cluster_state_controller.drop_traffic(
                        node=node, wakeup_timeout=wakeup_node_timeout, name_interface="data", block_nodes=nodes_to_block
                    )

                with reporter.step(f"Check object is not stored on node {node}"):
                    new_nodes = wait_object_replication(
                        cid,
                        oid,
                        2,
                        shell=self.shell,
                        nodes=list(set(self.cluster.storage_nodes) - set(nodes_non_block)),
                    )
                    assert node.storage_node not in new_nodes

                with reporter.step("Check object data is not corrupted"):
                    got_file_path = get_object(wallet, cid, oid, endpoint=new_nodes[0].get_rpc_endpoint(), shell=self.shell)
                    assert get_file_hash(source_file_path) == get_file_hash(got_file_path)

        with reporter.step("Unblock incoming traffic"):
            for node in nodes_non_block_cluster:
                with reporter.step(f"Unblock at host {node}"):
                    cluster_state_controller.restore_traffic(node=node)
                    block_node = [
                        cluster_node for cluster_node in self.cluster.cluster_nodes if cluster_node.storage_node == node.storage_node
                    ]
                    blocked_nodes.remove(*block_node)
                    sleep(wakeup_node_timeout)

        with reporter.step("Check object data is not corrupted"):
            new_nodes = wait_object_replication(cid, oid, 2, shell=self.shell, nodes=self.cluster.storage_nodes)
            got_file_path = get_object(wallet, cid, oid, shell=self.shell, endpoint=new_nodes[0].get_rpc_endpoint())
            assert get_file_hash(source_file_path) == get_file_hash(got_file_path)

    @pytest.mark.interfaces
    @allure.title("Block DATA interface on node")
    def test_block_data_interface(
        self,
        cluster_state_controller: ClusterStateController,
        default_wallet: WalletInfo,
        restore_down_interfaces: None,
        storage_objects: list[StorageObjectInfo],
    ):
        """
        Bring down the data interfaces of the nodes holding the object and check that the object
        cannot be fetched through them but is still available through the remaining nodes.
        """
        storage_object = storage_objects[0]

        with reporter.step("Search nodes with object"):
            nodes_with_object = get_object_nodes(
                cluster=self.cluster,
                cid=storage_object.cid,
                oid=storage_object.oid,
                alive_node=self.cluster.cluster_nodes[0],
            )

        with reporter.step("Get interfaces of node"):
            config_interfaces = list(nodes_with_object[0].host.config.interfaces.keys())

        with reporter.step(f"Get data interfaces from {config_interfaces}"):
            data_interfaces = [interface for interface in config_interfaces if "data" in interface]

        with reporter.step("Block data interfaces for node"):
            for interface in data_interfaces:
                cluster_state_controller.down_interface(nodes=nodes_with_object, interface=interface)

        with reporter.step("Tick epoch and wait 2 blocks"):
            nodes_without_an_object = list(set(self.cluster.cluster_nodes) - set(nodes_with_object))
            self.tick_epochs(1, alive_node=nodes_without_an_object[0].storage_node, wait_block=2)

        with reporter.step("Get object from target node via data interface, expect failure"):
            with pytest.raises(RuntimeError, match="can't create API client: can't init SDK client: gRPC dial: context deadline exceeded"):
                get_object(
                    wallet=default_wallet,
                    cid=storage_object.cid,
                    oid=storage_object.oid,
                    shell=self.shell,
                    endpoint=nodes_with_object[0].storage_node.get_rpc_endpoint(),
                )

        with reporter.step("Get object from other nodes, expect success"):
            input_file = get_object(
                wallet=default_wallet,
                cid=storage_object.cid,
                oid=storage_object.oid,
                shell=self.shell,
                endpoint=nodes_without_an_object[0].storage_node.get_rpc_endpoint(),
            )

        with reporter.step("Restore interfaces, tick 1 epoch and wait 2 blocks"):
            cluster_state_controller.restore_interfaces()
            self.tick_epochs(1, alive_node=nodes_without_an_object[0].storage_node, wait_block=2)

    @pytest.mark.interfaces
    @allure.title("Block INTERNAL interface on node")
    def test_block_internal_interface(
        self,
        cluster_state_controller: ClusterStateController,
        default_wallet: WalletInfo,
        restore_down_interfaces: None,
        storage_objects: list[StorageObjectInfo],
        simple_object_size: ObjectSize,
    ):
        """
        Bring down the internal interfaces of the nodes holding the object and check that get/put
        fail through other nodes but still succeed through the nodes that hold the object.
        """
        storage_object = storage_objects[0]

        with reporter.step("Search nodes with object"):
            nodes_with_object = get_object_nodes(
                cluster=self.cluster,
                cid=storage_object.cid,
                oid=storage_object.oid,
                alive_node=self.cluster.cluster_nodes[0],
            )

        with reporter.step("Get interfaces of node"):
            config_interfaces = list(nodes_with_object[0].host.config.interfaces.keys())

        with reporter.step(f"Get internal interfaces from {config_interfaces}"):
            internal_interfaces = [interface for interface in config_interfaces if "internal" in interface]

        with reporter.step("Block internal interfaces for node"):
            for interface in internal_interfaces:
                cluster_state_controller.down_interface(nodes=nodes_with_object, interface=interface)

        with reporter.step("Tick epoch and wait 2 blocks"):
            nodes_without_an_object = list(set(self.cluster.cluster_nodes) - set(nodes_with_object))
            self.tick_epochs(1, alive_node=nodes_without_an_object[0].storage_node, wait_block=2)

        with reporter.step("Get object via other nodes, expect failure"):
            with pytest.raises(RuntimeError, match="rpc error"):
                get_object(
                    wallet=default_wallet,
                    cid=storage_object.cid,
                    oid=storage_object.oid,
                    shell=self.shell,
                    endpoint=nodes_without_an_object[0].storage_node.get_rpc_endpoint(),
                )

        with reporter.step("Put object via other nodes, expect failure"):
            with pytest.raises(RuntimeError, match="rpc error"):
                put_object(
                    wallet=default_wallet,
                    path=storage_object.file_path,
                    cid=storage_object.cid,
                    shell=self.shell,
                    endpoint=nodes_without_an_object[0].storage_node.get_rpc_endpoint(),
                )

        with reporter.step("Get object from nodes with the object, expect success"):
            input_file = get_object(
                wallet=default_wallet,
                cid=storage_object.cid,
                oid=storage_object.oid,
                shell=self.shell,
                endpoint=nodes_with_object[0].storage_node.get_rpc_endpoint(),
            )

        with reporter.step("Put object to nodes with the object, expect success"):
            temp_file_path = generate_file(simple_object_size.value)
            _ = put_object(
                wallet=default_wallet,
                path=temp_file_path,
                cid=storage_object.cid,
                shell=self.shell,
                endpoint=nodes_with_object[0].storage_node.get_rpc_endpoint(),
            )

        with reporter.step("Restore interfaces, tick 1 epoch and wait 2 blocks"):
            cluster_state_controller.restore_interfaces()
            self.tick_epochs(1, alive_node=nodes_without_an_object[0].storage_node, wait_block=2)

    @pytest.mark.interfaces
    @pytest.mark.failover_baremetal
    @pytest.mark.parametrize(
        "block_interface, other_interface",
        [(Interfaces.DATA_O, Interfaces.DATA_1), (Interfaces.DATA_1, Interfaces.DATA_O)],
    )
    @allure.title("Down data interfaces on all nodes (interface={block_interface})")
    def test_down_data_interface(
        self,
        require_multiple_interfaces,
        cluster_state_controller: ClusterStateController,
        default_wallet: WalletInfo,
        simple_object_size: ObjectSize,
        restore_down_interfaces: None,
        block_interface: Interfaces,
        other_interface: Interfaces,
    ):
        """
        Bring down one data interface on every node and check that container and object
        operations still work through the other data interface.
        """
        cluster_nodes = self.cluster.cluster_nodes

        with reporter.step(f"Block {block_interface.value} interfaces"):
            cluster_state_controller.down_interface(cluster_nodes, block_interface.value)

        with reporter.step("Tick 1 epoch and wait 2 blocks to sync all nodes"):
            self.tick_epochs(1, alive_node=cluster_nodes[0].storage_node, wait_block=2)

        with reporter.step("Create container"):
            cid = create_container(
                wallet=default_wallet,
                shell=self.shell,
                endpoint=f"{cluster_nodes[0].get_data_interface(other_interface.value)[0]}:8080",
                rule="REP 4 CBF 1",
            )

        with reporter.step("Put object"):
            file_path = generate_file(simple_object_size.value)
            oid = put_object(
                wallet=default_wallet,
                path=file_path,
                cid=cid,
                shell=self.shell,
                endpoint=f"{cluster_nodes[0].get_data_interface(other_interface.value)[0]}:8080",
            )

        with reporter.step("Get object"):
            file_get_path = get_object(
                wallet=default_wallet,
                cid=cid,
                oid=oid,
                shell=self.shell,
                endpoint=f"{cluster_nodes[0].get_data_interface(other_interface.value)[0]}:8080",
            )

        with reporter.step("Restore interfaces on all nodes"):
            cluster_state_controller.restore_interfaces()
            self.tick_epochs(1, alive_node=cluster_nodes[0].storage_node, wait_block=2)

    @pytest.mark.interfaces
    @pytest.mark.failover_baremetal
    @pytest.mark.parametrize("interface", [Interfaces.INTERNAL_0, Interfaces.INTERNAL_1])
    @allure.title("Down internal interfaces on all nodes (interface={interface})")
    def test_down_internal_interface(
        self,
        require_multiple_interfaces,
        cluster_state_controller: ClusterStateController,
        default_wallet: WalletInfo,
        simple_object_size: ObjectSize,
        restore_down_interfaces: None,
        interface: Interfaces,
    ):
        """
        Bring down one internal interface on every node and check that object operations
        keep working and the morph chain keeps producing blocks.
        """
        cluster_nodes = self.cluster.cluster_nodes
        latest_block = {}

        with reporter.step("Get latest block on all nodes"):
            for cluster_node in cluster_nodes:
                latest_block[cluster_node] = neo_go_query_height(
                    shell=cluster_node.host.get_shell(), endpoint=cluster_node.morph_chain.get_http_endpoint()
                )

        with reporter.step(f"Block {interface} interfaces"):
            cluster_state_controller.down_interface(cluster_nodes, interface.value)

        with reporter.step("Tick 1 epoch and wait 2 blocks to sync all nodes"):
            self.tick_epochs(1, alive_node=cluster_nodes[0].storage_node, wait_block=2)

        with reporter.step("Create container"):
            cid = create_container(
                wallet=default_wallet,
                shell=self.shell,
                endpoint=self.cluster.default_rpc_endpoint,
                rule="REP 4 CBF 1",
            )

        with reporter.step(f"Put object after downing {interface}"):
            file_path = generate_file(simple_object_size.value)
            oid = put_object(
                wallet=default_wallet,
                path=file_path,
                cid=cid,
                shell=self.shell,
                endpoint=self.cluster.default_rpc_endpoint,
            )

        with reporter.step("Get object"):
            file_get_path = get_object(
                wallet=default_wallet,
                cid=cid,
                oid=oid,
                shell=self.shell,
                endpoint=self.cluster.default_rpc_endpoint,
            )

        now_block = {}
        with reporter.step("Get current block on all nodes"):
            for cluster_node in cluster_nodes:
                now_block[cluster_node] = neo_go_query_height(
                    shell=cluster_node.host.get_shell(), endpoint=cluster_node.morph_chain.get_http_endpoint()
                )

        with reporter.step("Compare blocks"):
            for cluster_node in now_block:
                with reporter.step(
                    f"Node - {cluster_node.host_ip}, old block - {latest_block[cluster_node]['Latest block']}, "
                    f"now block - {now_block[cluster_node]['Latest block']}"
                ):
                    assert latest_block[cluster_node]["Latest block"] < now_block[cluster_node]["Latest block"]

        with reporter.step("Restore interfaces on all nodes"):
            cluster_state_controller.restore_interfaces()
            self.tick_epochs(1, alive_node=self.cluster.cluster_nodes[0].storage_node, wait_block=2)