frostfs-testcases/pytest_tests/testsuites/failovers/test_failover_network.py
Dmitriy Zayakin b01fff85f6 Delete split-brain test
Signed-off-by: Dmitriy Zayakin <d.zayakin@yadro.com>
2023-10-04 08:27:52 +00:00

133 lines
5.3 KiB
Python

import logging
import random
from time import sleep
import allure
import pytest
from frostfs_testlib.resources.wellknown_acl import PUBLIC_ACL
from frostfs_testlib.steps.cli.container import create_container
from frostfs_testlib.steps.cli.object import get_object, put_object_to_random_node
from frostfs_testlib.storage.cluster import ClusterNode
from frostfs_testlib.storage.controllers import ClusterStateController
from frostfs_testlib.storage.dataclasses.object_size import ObjectSize
from frostfs_testlib.testing.cluster_test_base import ClusterTestBase
from frostfs_testlib.utils.failover_utils import (
wait_all_storage_nodes_returned,
wait_object_replication,
)
from frostfs_testlib.utils.file_utils import generate_file, get_file_hash
logger = logging.getLogger("NeoLogger")
STORAGE_NODE_COMMUNICATION_PORT = "8080"
STORAGE_NODE_COMMUNICATION_PORT_TLS = "8082"
PORTS_TO_BLOCK = [STORAGE_NODE_COMMUNICATION_PORT, STORAGE_NODE_COMMUNICATION_PORT_TLS]
blocked_nodes: list[ClusterNode] = []
@pytest.mark.failover
@pytest.mark.failover_network
class TestFailoverNetwork(ClusterTestBase):
@pytest.fixture(autouse=True)
@allure.title("Restore network")
def restore_network(self, cluster_state_controller: ClusterStateController):
yield
with allure.step(f"Count blocked nodes {len(blocked_nodes)}"):
not_empty = len(blocked_nodes) != 0
for node in list(blocked_nodes):
with allure.step(f"Restore network for {node}"):
cluster_state_controller.restore_traffic(mode="ports", node=node)
blocked_nodes.remove(node)
if not_empty:
wait_all_storage_nodes_returned(self.shell, self.cluster)
@allure.title("Block Storage node traffic")
def test_block_storage_node_traffic(
self,
default_wallet: str,
require_multiple_hosts,
simple_object_size: ObjectSize,
cluster_state_controller: ClusterStateController,
):
"""
Block storage nodes traffic using iptables and wait for replication for objects.
"""
wallet = default_wallet
placement_rule = "REP 2 IN X CBF 2 SELECT 2 FROM * AS X"
wakeup_node_timeout = 10 # timeout to let nodes detect that traffic has blocked
nodes_to_block_count = 2
source_file_path = generate_file(simple_object_size.value)
cid = create_container(
wallet,
shell=self.shell,
endpoint=self.cluster.default_rpc_endpoint,
rule=placement_rule,
basic_acl=PUBLIC_ACL,
)
oid = put_object_to_random_node(
wallet, source_file_path, cid, shell=self.shell, cluster=self.cluster
)
nodes = wait_object_replication(
cid, oid, 2, shell=self.shell, nodes=self.cluster.storage_nodes
)
logger.info(f"Nodes are {nodes}")
nodes_to_block = nodes
if nodes_to_block_count > len(nodes):
# TODO: the intent of this logic is not clear, need to revisit
nodes_to_block = random.choices(nodes, k=2)
excluded_nodes = []
for node in nodes_to_block:
with allure.step(f"Block incoming traffic at node {node} on port {PORTS_TO_BLOCK}"):
block_node = [
cluster_node
for cluster_node in self.cluster.cluster_nodes
if cluster_node.storage_node == node
]
blocked_nodes.append(*block_node)
excluded_nodes.append(node)
cluster_state_controller.drop_traffic(
mode="ports",
node=node,
wakeup_timeout=wakeup_node_timeout,
ports=PORTS_TO_BLOCK,
)
with allure.step(f"Check object is not stored on node {node}"):
new_nodes = wait_object_replication(
cid,
oid,
2,
shell=self.shell,
nodes=list(set(self.cluster.storage_nodes) - set(excluded_nodes)),
)
assert node not in new_nodes
with allure.step("Check object data is not corrupted"):
got_file_path = get_object(
wallet, cid, oid, endpoint=new_nodes[0].get_rpc_endpoint(), shell=self.shell
)
assert get_file_hash(source_file_path) == get_file_hash(got_file_path)
for node in nodes_to_block:
with allure.step(f"Unblock incoming traffic at host {node} on port {PORTS_TO_BLOCK}"):
cluster_state_controller.restore_traffic(mode="ports", node=node)
block_node = [
cluster_node
for cluster_node in self.cluster.cluster_nodes
if cluster_node.storage_node == node
]
blocked_nodes.remove(*block_node)
sleep(wakeup_node_timeout)
with allure.step("Check object data is not corrupted"):
new_nodes = wait_object_replication(
cid, oid, 2, shell=self.shell, nodes=self.cluster.storage_nodes
)
got_file_path = get_object(
wallet, cid, oid, shell=self.shell, endpoint=new_nodes[0].get_rpc_endpoint()
)
assert get_file_hash(source_file_path) == get_file_hash(got_file_path)