frostfs-testcases/pytest_tests/helpers/failover_utils.py
Dmitriy Zayakin df1ffbdcb6 Add tests for node shutdown and object replication integrity
Signed-off-by: Dmitriy Zayakin <d.zayakin@yadro.com>
2023-05-03 11:45:44 +03:00

75 lines
2.6 KiB
Python

import logging
from time import sleep
import allure
from frostfs_testlib.hosting import Host
from frostfs_testlib.shell import CommandOptions, Shell
from pytest_tests.helpers.cluster import Cluster, StorageNode
from pytest_tests.helpers.node_management import storage_node_healthcheck
from pytest_tests.helpers.storage_policy import get_nodes_with_object
from pytest_tests.helpers.test_control import retry
# Module-level logger used by the helpers in this file; it is looked up by the
# fixed name "NeoLogger" rather than by this module's own name.
logger = logging.getLogger("NeoLogger")
@allure.step("Wait for object replication")
def wait_object_replication(
    cid: str,
    oid: str,
    expected_copies: int,
    shell: Shell,
    nodes: list[StorageNode],
    sleep_interval: int = 15,
    attempts: int = 20,
) -> list[StorageNode]:
    """Poll the given nodes until the object is found on enough of them.

    Each poll calls ``get_nodes_with_object`` and succeeds once the number of
    returned nodes reaches ``expected_copies``.

    Args:
        cid: Forwarded to ``get_nodes_with_object`` (presumably the container ID).
        oid: Forwarded to ``get_nodes_with_object`` (presumably the object ID).
        expected_copies: Minimum number of nodes that must hold the object.
        shell: Shell forwarded to ``get_nodes_with_object``.
        nodes: Storage nodes to inspect on every poll.
        sleep_interval: Pause, in seconds, between two consecutive polls.
        attempts: Maximum number of polls.

    Returns:
        The nodes reported by the first poll that found at least
        ``expected_copies`` of them.

    Raises:
        AssertionError: If no poll found enough nodes holding the object.
    """
    nodes_with_object = []
    for attempt in range(attempts):
        nodes_with_object = get_nodes_with_object(cid, oid, shell=shell, nodes=nodes)
        if len(nodes_with_object) >= expected_copies:
            return nodes_with_object
        # Pause only between polls: sleeping after the final attempt would
        # merely delay the failure by one more interval.
        if attempt < attempts - 1:
            sleep(sleep_interval)

    # Report the time actually spent sleeping (one pause fewer than polls).
    waited = sleep_interval * max(attempts - 1, 0)
    raise AssertionError(
        f"Expected {expected_copies} copies of object, but found {len(nodes_with_object)}. "
        f"Waiting time {waited}"
    )
@allure.step("Wait for storage nodes returned to cluster")
def wait_all_storage_nodes_returned(
    cluster: Cluster,
    sleep_interval: int = 15,
    attempts: int = 20,
) -> None:
    """Block until every storage node of the cluster passes its health check.

    Args:
        cluster: Cluster whose storage nodes are checked via
            ``is_all_storage_nodes_returned``.
        sleep_interval: Pause, in seconds, between two consecutive checks.
        attempts: Maximum number of checks.

    Raises:
        AssertionError: If the nodes are still not all healthy after the last
            check.
    """
    for attempt in range(attempts):
        if is_all_storage_nodes_returned(cluster):
            return
        # Pause only between checks: sleeping after the final attempt would
        # merely delay the failure by one more interval.
        if attempt < attempts - 1:
            sleep(sleep_interval)
    raise AssertionError("Storage node(s) is broken")
def is_all_storage_nodes_returned(cluster: Cluster) -> bool:
    """Tell whether every storage node of the cluster is healthy right now.

    A node counts as healthy when its health check reports health status
    "READY" and network status "ONLINE". The check stops at the first node
    that is not healthy or whose health check raises.

    Args:
        cluster: Cluster whose ``storage_nodes`` are checked in order.

    Returns:
        True if all nodes are healthy, False otherwise.
    """
    with allure.step("Run health check for all storage nodes"):
        for storage_node in cluster.storage_nodes:
            try:
                status = storage_node_healthcheck(storage_node)
            except Exception as err:
                # A failing health check is treated as "node not back yet",
                # not as an error of the caller.
                logger.warning(f"Node healthcheck fails with error {err}")
                return False
            else:
                node_is_up = (
                    status.health_status == "READY" and status.network_status == "ONLINE"
                )
                if not node_is_up:
                    return False
    return True
@allure.step("Ping node")
def ping_host(shell: Shell, host: Host):
    """Send one ping to the host's configured address and return the exit code.

    Args:
        shell: Shell used to execute the ``ping`` command.
        host: Host whose ``config.address`` is pinged.

    Returns:
        The ``return_code`` of the executed command.
    """
    # check=False presumably keeps a non-zero exit code from raising, so the
    # code can be returned to the caller -- confirm against CommandOptions.
    no_check = CommandOptions(check=False)
    ping_command = f"ping {host.config.address} -c 1"
    ping_result = shell.exec(ping_command, no_check)
    return ping_result.return_code
@retry(max_attempts=60, sleep_interval=5, expected_result=1)
@allure.step("Waiting for host of {node} to go offline")
def wait_for_host_offline(shell: Shell, node: StorageNode):
    """Ping the node's host once and return the ping exit code.

    The ``retry`` decorator is configured with ``expected_result=1``;
    presumably it repeats the call until this function returns 1 -- confirm
    against the ``retry`` helper.

    Args:
        shell: Shell used to run the ping.
        node: Storage node whose ``host`` is pinged.

    Returns:
        The return code from ``ping_host``, or 0 if the ping attempt raised.
    """
    try:
        # TODO: Quick solution for now, should be replaced by lib interactions
        ping_code = ping_host(shell, node.host)
    except Exception as err:
        logger.warning(f"Host ping fails with error {err}")
        # 0 differs from the expected result of 1, so an error is never
        # reported as "host is offline".
        ping_code = 0
    return ping_code