frostfs-testcases/pytest_tests/testsuites/failovers/test_failover_server.py
Andrey Berezin f06e44642a [#210] Update failover case to not use stopped node
Signed-off-by: Andrey Berezin <a.berezin@yadro.com>
2024-03-19 18:40:42 +03:00

306 lines
13 KiB
Python

import logging
import os.path
import random
import allure
import pytest
from frostfs_testlib import reporter
from frostfs_testlib.resources.wellknown_acl import PUBLIC_ACL
from frostfs_testlib.steps.cli.container import StorageContainer, StorageContainerInfo, create_container
from frostfs_testlib.steps.cli.object import get_object, get_object_nodes, put_object
from frostfs_testlib.steps.node_management import check_node_in_map, check_node_not_in_map
from frostfs_testlib.storage.cluster import ClusterNode, StorageNode
from frostfs_testlib.storage.controllers import ClusterStateController
from frostfs_testlib.storage.dataclasses.object_size import ObjectSize
from frostfs_testlib.storage.dataclasses.storage_object_info import StorageObjectInfo
from frostfs_testlib.storage.dataclasses.wallet import WalletInfo
from frostfs_testlib.testing.cluster_test_base import ClusterTestBase
from frostfs_testlib.testing.parallel import parallel
from frostfs_testlib.testing.test_control import wait_for_success
from frostfs_testlib.utils.failover_utils import wait_object_replication
from frostfs_testlib.utils.file_utils import get_file_hash
from pytest import FixtureRequest
logger = logging.getLogger("NeoLogger")
@pytest.mark.failover
@pytest.mark.failover_server
class TestFailoverServer(ClusterTestBase):
@wait_for_success(max_wait_time=120, interval=1)
def wait_node_not_in_map(self, *args, **kwargs):
check_node_not_in_map(*args, **kwargs)
@wait_for_success(max_wait_time=120, interval=1)
def wait_node_in_map(self, *args, **kwargs):
check_node_in_map(*args, **kwargs)
@reporter.step("Create {count_containers} containers and {count_files} objects")
@pytest.fixture
def containers(
self,
request: FixtureRequest,
default_wallet: WalletInfo,
) -> list[StorageContainer]:
placement_rule = "REP 2 CBF 2 SELECT 2 FROM *"
containers = []
for _ in range(request.param):
cont_id = create_container(
default_wallet,
shell=self.shell,
endpoint=self.cluster.default_rpc_endpoint,
rule=placement_rule,
basic_acl=PUBLIC_ACL,
)
storage_cont_info = StorageContainerInfo(cont_id, default_wallet)
containers.append(StorageContainer(storage_cont_info, self.shell, self.cluster))
return containers
@reporter.step("Creation container")
@pytest.fixture()
def container(self, default_wallet: WalletInfo) -> StorageContainer:
select = len(self.cluster.cluster_nodes)
placement_rule = f"REP {select - 1} CBF 1 SELECT {select} FROM *"
cont_id = create_container(
default_wallet,
shell=self.shell,
endpoint=self.cluster.default_rpc_endpoint,
rule=placement_rule,
basic_acl=PUBLIC_ACL,
)
storage_cont_info = StorageContainerInfo(cont_id, default_wallet)
return StorageContainer(storage_cont_info, self.shell, self.cluster)
@reporter.step("Create object and delete after test")
@pytest.fixture(scope="class")
def storage_objects(
self,
request: FixtureRequest,
containers: list[StorageContainer],
simple_object_size: ObjectSize,
complex_object_size: ObjectSize,
) -> StorageObjectInfo:
count_object = request.param
object_sizes = [simple_object_size, complex_object_size]
object_list: list[StorageObjectInfo] = []
for cont in containers:
for _ in range(count_object):
object_list.append(cont.generate_object(size=random.choice(object_sizes).value))
for storage_object in object_list:
os.remove(storage_object.file_path)
yield object_list
@reporter.step("Select random node to stop and start it after test")
@pytest.fixture
def node_to_stop(
self, node_under_test: ClusterNode, cluster_state_controller: ClusterStateController
) -> ClusterNode:
yield node_under_test
with reporter.step(f"start {node_under_test.storage_node}"):
cluster_state_controller.start_stopped_hosts()
@reporter.step("Upload object with nodes and compare")
def get_corrupted_objects_list(
self, nodes: list[StorageNode], storage_objects: list[StorageObjectInfo]
) -> list[StorageObjectInfo]:
corrupted_objects = []
errors_get = []
for node in nodes:
for storage_object in storage_objects:
try:
got_file_path = get_object(
storage_object.wallet,
storage_object.cid,
storage_object.oid,
endpoint=node.get_rpc_endpoint(),
shell=self.shell,
timeout="60s",
)
if storage_object.file_hash != get_file_hash(got_file_path):
corrupted_objects.append(storage_object)
os.remove(got_file_path)
except RuntimeError:
errors_get.append(storage_object.oid)
assert len(errors_get) == 0, f"Get failed - {errors_get}"
return corrupted_objects
def check_objects_replication(
self, storage_objects: list[StorageObjectInfo], storage_nodes: list[StorageNode]
) -> None:
for storage_object in storage_objects:
wait_object_replication(
storage_object.cid,
storage_object.oid,
2,
shell=self.shell,
nodes=storage_nodes,
sleep_interval=45,
attempts=60,
)
@pytest.fixture()
def object_and_nodes(
self, simple_object_size: ObjectSize, container: StorageContainer
) -> tuple[StorageObjectInfo, list[ClusterNode]]:
object_info = container.generate_object(simple_object_size.value)
object_nodes = get_object_nodes(
cluster=self.cluster, cid=object_info.cid, oid=object_info.oid, alive_node=self.cluster.cluster_nodes[0]
)
return object_info, object_nodes
@pytest.fixture()
def up_stop_nodes(self, cluster_state_controller: ClusterStateController):
yield
cluster_state_controller.start_stopped_hosts()
@allure.title("Full shutdown node")
@pytest.mark.parametrize("containers, storage_objects", [(5, 10)], indirect=True)
def test_complete_node_shutdown(
self,
storage_objects: list[StorageObjectInfo],
node_to_stop: ClusterNode,
cluster_state_controller: ClusterStateController,
):
with reporter.step(f"Remove {node_to_stop} from the list of nodes"):
alive_nodes = list(set(self.cluster.cluster_nodes) - {node_to_stop})
storage_nodes = [cluster.storage_node for cluster in alive_nodes]
with reporter.step("Tick epoch and wait for 2 blocks"):
self.tick_epochs(1, storage_nodes[0], wait_block=2)
with reporter.step(f"Stop node"):
cluster_state_controller.stop_node_host(node=node_to_stop, mode="hard")
with reporter.step("Verify that there are no corrupted objects"):
corrupted_objects_list = self.get_corrupted_objects_list(storage_nodes, storage_objects)
assert not corrupted_objects_list
with reporter.step(f"check {node_to_stop.storage_node} in map"):
self.wait_node_in_map(node_to_stop.storage_node, self.shell, alive_node=storage_nodes[0])
count_tick_epoch = int(alive_nodes[0].ir_node.get_netmap_cleaner_threshold()) + 4
with reporter.step(f"Tick {count_tick_epoch} epochs and wait for 2 blocks"):
self.tick_epochs(count_tick_epoch, storage_nodes[0], wait_block=2)
with reporter.step(f"Check {node_to_stop} in not map"):
self.wait_node_not_in_map(node_to_stop.storage_node, self.shell, alive_node=storage_nodes[0])
with reporter.step(f"Verify that there are no corrupted objects after {count_tick_epoch} epoch"):
corrupted_objects_list = self.get_corrupted_objects_list(storage_nodes, storage_objects)
assert not corrupted_objects_list
@allure.title("Temporarily disable a node")
@pytest.mark.parametrize("containers, storage_objects", [(5, 10)], indirect=True)
def test_temporarily_disable_a_node(
self,
storage_objects: list[StorageObjectInfo],
node_to_stop: ClusterNode,
cluster_state_controller: ClusterStateController,
):
with reporter.step(f"Remove {node_to_stop} from the list of nodes"):
storage_nodes = list(set(self.cluster.storage_nodes) - {node_to_stop.storage_node})
with reporter.step("Tick epoch and wait for 2 blocks"):
self.tick_epochs(1, storage_nodes[0], wait_block=2)
with reporter.step(f"Stop node"):
cluster_state_controller.stop_node_host(node_to_stop, "hard")
with reporter.step("Verify that there are no corrupted objects"):
corrupted_objects_list = self.get_corrupted_objects_list(storage_nodes, storage_objects)
assert not corrupted_objects_list
with reporter.step(f"Check {node_to_stop} in map"):
self.wait_node_in_map(node_to_stop.storage_node, self.shell, alive_node=storage_nodes[0])
cluster_state_controller.start_node_host(node_to_stop)
with reporter.step("Verify that there are no corrupted objects"):
corrupted_objects_list = self.get_corrupted_objects_list(storage_nodes, storage_objects)
assert not corrupted_objects_list
@allure.title("Not enough nodes in the container with policy - 'REP 3 CBF 1 SELECT 4 FROM *'")
def test_not_enough_nodes_in_container_rep_3(
self,
object_and_nodes: tuple[StorageObjectInfo, list[ClusterNode]],
default_wallet: WalletInfo,
cluster_state_controller: ClusterStateController,
simple_file: str,
up_stop_nodes: None,
):
object_info, object_nodes = object_and_nodes
endpoint_without_object = list(set(self.cluster.cluster_nodes) - set(object_nodes))[
0
].storage_node.get_rpc_endpoint()
endpoint_with_object = object_nodes[0].storage_node.get_rpc_endpoint()
with reporter.step("Stop all nodes with object except first one"):
parallel(cluster_state_controller.stop_node_host, object_nodes[1:], mode="hard")
with reporter.step(f"Get object from node without object"):
get_object(default_wallet, object_info.cid, object_info.oid, self.shell, endpoint_without_object)
with reporter.step(f"Get object from node with object"):
get_object(default_wallet, object_info.cid, object_info.oid, self.shell, endpoint_with_object)
with reporter.step(f"Put operation to node with object, expect error"):
with pytest.raises(RuntimeError):
put_object(default_wallet, simple_file, object_info.cid, self.shell, endpoint_with_object)
@allure.title("Not enough nodes in the container with policy - 'REP 2 CBF 2 SELECT 4 FROM *'")
def test_not_enough_nodes_in_container_rep_2(
self,
default_wallet: WalletInfo,
cluster_state_controller: ClusterStateController,
simple_file: str,
up_stop_nodes: None,
):
with reporter.step("Create container with full network map"):
node_count = len(self.cluster.cluster_nodes)
placement_rule = f"REP {node_count - 2} IN X CBF 2 SELECT {node_count} FROM * AS X"
cid = create_container(
default_wallet,
self.shell,
self.cluster.default_rpc_endpoint,
rule=placement_rule,
basic_acl=PUBLIC_ACL,
)
with reporter.step("Put object"):
oid = put_object(default_wallet, simple_file, cid, self.shell, self.cluster.default_rpc_endpoint)
with reporter.step("Search nodes with object"):
object_nodes = get_object_nodes(self.cluster, cid, oid, self.cluster.cluster_nodes[0])
with reporter.step("Choose node to stop"):
node_to_stop = random.choice(object_nodes)
alive_node_with_object = random.choice(list(set(object_nodes) - {node_to_stop}))
alive_endpoint_with_object = alive_node_with_object.storage_node.get_rpc_endpoint()
with reporter.step("Stop random node with object"):
cluster_state_controller.stop_node_host(node_to_stop, "hard")
with reporter.step("Put object to alive node with object"):
oid_2 = put_object(default_wallet, simple_file, cid, self.shell, alive_endpoint_with_object)
with reporter.step("Get object from alive node with object"):
get_file = get_object(default_wallet, cid, oid_2, self.shell, alive_endpoint_with_object)
os.remove(get_file)
with reporter.step("Create container on alive node"):
create_container(
default_wallet,
self.shell,
alive_endpoint_with_object,
rule=placement_rule,
basic_acl=PUBLIC_ACL,
)