From 253bb3b1d81eb66b3e145bd777e08c99e3377ee5 Mon Sep 17 00:00:00 2001
From: Andrey Berezin
Date: Wed, 22 Nov 2023 17:10:09 +0300
Subject: [PATCH] [126] small healthcheck and stop start hosts rework

Signed-off-by: Andrey Berezin
---
 .../healthcheck/basic_healthcheck.py          | 67 ++++++++++++++---
 src/frostfs_testlib/healthcheck/interfaces.py | 10 +--
 .../controllers/cluster_state_controller.py   | 74 +++++++++++++------
 src/frostfs_testlib/utils/failover_utils.py   | 66 +----------------
 4 files changed, 112 insertions(+), 105 deletions(-)

diff --git a/src/frostfs_testlib/healthcheck/basic_healthcheck.py b/src/frostfs_testlib/healthcheck/basic_healthcheck.py
index 9c1d151..6f21534 100644
--- a/src/frostfs_testlib/healthcheck/basic_healthcheck.py
+++ b/src/frostfs_testlib/healthcheck/basic_healthcheck.py
@@ -1,22 +1,65 @@
+from typing import Callable
+
 from frostfs_testlib.cli.frostfs_cli.cli import FrostfsCli
 from frostfs_testlib.healthcheck.interfaces import Healthcheck
 from frostfs_testlib.reporter import get_reporter
 from frostfs_testlib.resources.cli import FROSTFS_CLI_EXEC
+from frostfs_testlib.shell import CommandOptions
 from frostfs_testlib.steps.node_management import storage_node_healthcheck
 from frostfs_testlib.storage.cluster import ClusterNode
+from frostfs_testlib.testing.test_control import wait_for_success
 
 reporter = get_reporter()
 
 
 class BasicHealthcheck(Healthcheck):
-    @reporter.step_deco("Perform healthcheck for {cluster_node}")
-    def perform(self, cluster_node: ClusterNode):
-        result = self.storage_healthcheck(cluster_node)
-        if result:
-            raise AssertionError(result)
+    def _perform(self, cluster_node: ClusterNode, checks: dict[Callable, dict]):
+        issues: list[str] = []
+        for check, kwargs in checks.items():
+            issue = check(cluster_node, **kwargs)
+            if issue:
+                issues.append(issue)
+
+        assert not issues, "Issues found:\n" + "\n".join(issues)
+
+    @wait_for_success(900, 30)
+    def full_healthcheck(self, cluster_node: ClusterNode):
+        checks = {
+            self.storage_healthcheck: {},
+            self._tree_healthcheck: {},
+        }
+
+        with reporter.step(f"Perform full healthcheck for {cluster_node}"):
+            self._perform(cluster_node, checks)
+
+    @wait_for_success(900, 30)
+    def startup_healthcheck(self, cluster_node: ClusterNode):
+        checks = {
+            self.storage_healthcheck: {},
+            self._tree_healthcheck: {},
+        }
+
+        with reporter.step(f"Perform startup healthcheck on {cluster_node}"):
+            self._perform(cluster_node, checks)
+
+    @wait_for_success(900, 30)
+    def storage_healthcheck(self, cluster_node: ClusterNode) -> str | None:
+        checks = {
+            self._storage_healthcheck: {},
+        }
+
+        with reporter.step(f"Perform storage healthcheck on {cluster_node}"):
+            self._perform(cluster_node, checks)
+
+    @reporter.step_deco("Storage healthcheck on {cluster_node}")
+    def _storage_healthcheck(self, cluster_node: ClusterNode) -> str | None:
+        result = storage_node_healthcheck(cluster_node.storage_node)
+        self._gather_socket_info(cluster_node)
+        if result.health_status != "READY" or result.network_status != "ONLINE":
+            return f"Node {cluster_node} is not healthy. Health={result.health_status}. Network={result.network_status}"
 
     @reporter.step_deco("Tree healthcheck on {cluster_node}")
-    def tree_healthcheck(self, cluster_node: ClusterNode) -> str | None:
+    def _tree_healthcheck(self, cluster_node: ClusterNode) -> str | None:
         host = cluster_node.host
         service_config = host.get_service_config(cluster_node.storage_node.name)
         wallet_path = service_config.attributes["wallet_path"]
@@ -34,10 +77,10 @@ class BasicHealthcheck(Healthcheck):
         )
         result = remote_cli.tree.healthcheck(rpc_endpoint="127.0.0.1:8080")
         if result.return_code != 0:
-            return f"Error during tree healthcheck (rc={result.return_code}): {result.stdout}. \n Stderr: {result.stderr}"
+            return (
+                f"Error during tree healthcheck (rc={result.return_code}): {result.stdout}. \n Stderr: {result.stderr}"
+            )
 
-    @reporter.step_deco("Storage healthcheck on {cluster_node}")
-    def storage_healthcheck(self, cluster_node: ClusterNode) -> str | None:
-        result = storage_node_healthcheck(cluster_node.storage_node)
-        if result.health_status != "READY" or result.network_status != "ONLINE":
-            return f"Node {cluster_node} is not healthy. Health={result.health_status}. Network={result.network_status}"
+    @reporter.step_deco("Gather socket info for {cluster_node}")
+    def _gather_socket_info(self, cluster_node: ClusterNode):
+        cluster_node.host.get_shell().exec("ss -tuln | grep 8080", CommandOptions(check=False))
diff --git a/src/frostfs_testlib/healthcheck/interfaces.py b/src/frostfs_testlib/healthcheck/interfaces.py
index a036a82..83fa021 100644
--- a/src/frostfs_testlib/healthcheck/interfaces.py
+++ b/src/frostfs_testlib/healthcheck/interfaces.py
@@ -5,13 +5,13 @@ from frostfs_testlib.storage.cluster import ClusterNode
 
 class Healthcheck(ABC):
     @abstractmethod
-    def perform(self, cluster_node: ClusterNode):
-        """Perform healthcheck on the target cluster node"""
+    def full_healthcheck(self, cluster_node: ClusterNode):
+        """Perform full healthcheck on the target cluster node"""
 
     @abstractmethod
-    def tree_healthcheck(self, cluster_node: ClusterNode):
-        """Check tree sync status on target cluster node"""
+    def startup_healthcheck(self, cluster_node: ClusterNode):
+        """Perform healthcheck required on startup of target cluster node"""
 
     @abstractmethod
     def storage_healthcheck(self, cluster_node: ClusterNode):
-        """Perform storage node healthcheck on target cluster node"""
+        """Perform storage service healthcheck on target cluster node"""
diff --git a/src/frostfs_testlib/storage/controllers/cluster_state_controller.py b/src/frostfs_testlib/storage/controllers/cluster_state_controller.py
index 000bdd8..7020671 100644
--- a/src/frostfs_testlib/storage/controllers/cluster_state_controller.py
+++ b/src/frostfs_testlib/storage/controllers/cluster_state_controller.py
@@ -1,4 +1,5 @@
 import datetime
+import logging
 import time
 from typing import TypeVar
 
@@ -6,6 +7,7 @@ import frostfs_testlib.resources.optionals as optionals
 from frostfs_testlib.cli import FrostfsAdm, FrostfsCli
 from frostfs_testlib.cli.netmap_parser import NetmapParser
 from frostfs_testlib.healthcheck.interfaces import Healthcheck
+from frostfs_testlib.hosting.interfaces import HostStatus
 from frostfs_testlib.plugins import load_all
 from frostfs_testlib.reporter import get_reporter
 from frostfs_testlib.resources.cli import FROSTFS_ADM_CONFIG_PATH, FROSTFS_ADM_EXEC, FROSTFS_CLI_EXEC
@@ -16,16 +18,11 @@ from frostfs_testlib.storage.cluster import Cluster, ClusterNode, S3Gate, Storag
 from frostfs_testlib.storage.controllers.disk_controller import DiskController
 from frostfs_testlib.storage.dataclasses.node_base import NodeBase, ServiceClass
 from frostfs_testlib.testing import parallel
-from frostfs_testlib.testing.test_control import run_optionally, wait_for_success
+from frostfs_testlib.testing.test_control import retry, run_optionally, wait_for_success
 from frostfs_testlib.utils.datetime_utils import parse_time
-from frostfs_testlib.utils.failover_utils import (
-    wait_all_storage_nodes_returned,
-    wait_for_host_offline,
-    wait_for_host_online,
-    wait_for_node_online,
-)
 
 reporter = get_reporter()
+logger = logging.getLogger("NeoLogger")
 if_up_down_helper = IfUpDownHelper()
@@ -88,7 +85,7 @@ class ClusterStateController:
         self.stopped_nodes.append(node)
         with reporter.step(f"Stop host {node.host.config.address}"):
             node.host.stop_host(mode=mode)
-            wait_for_host_offline(self.shell, node.storage_node)
+            self._wait_for_host_offline(node)
 
     @run_optionally(optionals.OPTIONAL_FAILOVER_ENABLED)
     @reporter.step_deco("Shutdown whole cluster")
@@ -105,18 +102,17 @@
                 node.host.stop_host(mode=mode)
 
         for node in nodes:
-            wait_for_host_offline(self.shell, node.storage_node)
+            self._wait_for_host_offline(node)
 
     @run_optionally(optionals.OPTIONAL_FAILOVER_ENABLED)
     @reporter.step_deco("Start host of node {node}")
-    def start_node_host(self, node: ClusterNode, tree_healthcheck: bool = True):
+    def start_node_host(self, node: ClusterNode, startup_healthcheck: bool = True):
         with reporter.step(f"Start host {node.host.config.address}"):
            node.host.start_host()
-            wait_for_host_online(self.shell, node.storage_node)
+            self._wait_for_host_online(node)
             self.stopped_nodes.remove(node)
-            wait_for_node_online(node.storage_node)
-            if tree_healthcheck:
-                self.wait_tree_healthcheck()
+            if startup_healthcheck:
+                self.wait_startup_healthcheck()
 
     @run_optionally(optionals.OPTIONAL_FAILOVER_ENABLED)
     @reporter.step_deco("Start stopped hosts")
@@ -131,6 +127,9 @@
             self.stopped_services.difference_update(self._get_stopped_by_node(node))
 
         self.stopped_nodes = []
+        with reporter.step("Wait for all nodes to go online"):
+            parallel(self._wait_for_host_online, self.cluster.cluster_nodes)
+
         self.wait_after_storage_startup()
 
     @run_optionally(optionals.OPTIONAL_FAILOVER_ENABLED)
@@ -183,16 +182,15 @@
         if online_s3gates:
             parallel(self.wait_s3gate, online_s3gates)
 
-    @wait_for_success(600, 60)
-    def wait_tree_healthcheck(self):
+    @reporter.step_deco("Wait for cluster startup healthcheck")
+    def wait_startup_healthcheck(self):
         nodes = self.cluster.nodes(self._get_online(StorageNode))
-        parallel(self.healthcheck.tree_healthcheck, nodes)
+        parallel(self.healthcheck.startup_healthcheck, nodes)
 
     @reporter.step_deco("Wait for storage reconnection to the system")
     def wait_after_storage_startup(self):
-        wait_all_storage_nodes_returned(self.shell, self.cluster)
+        self.wait_startup_healthcheck()
         self.wait_s3gates()
-        self.wait_tree_healthcheck()
 
     @run_optionally(optionals.OPTIONAL_FAILOVER_ENABLED)
     @reporter.step_deco("Start all stopped services")
@@ -366,7 +364,7 @@
 
     @run_optionally(optionals.OPTIONAL_FAILOVER_ENABLED)
     @reporter.step_deco("Hard reboot host {node} via magic SysRq option")
-    def panic_reboot_host(self, node: ClusterNode, wait_for_return: bool = True, tree_healthcheck: bool = True):
+    def panic_reboot_host(self, node: ClusterNode, wait_for_return: bool = True, startup_healthcheck: bool = True):
         shell = node.host.get_shell()
         shell.exec('sudo sh -c "echo 1 > /proc/sys/kernel/sysrq"')
 
@@ -381,10 +379,9 @@ class ClusterStateController:
         # Let the things to be settled
         # A little wait here to prevent ssh stuck during panic
         time.sleep(10)
-        wait_for_host_online(self.shell, node.storage_node)
-        wait_for_node_online(node.storage_node)
-        if tree_healthcheck:
-            self.wait_tree_healthcheck()
+        self._wait_for_host_online(node)
+        if startup_healthcheck:
+            self.wait_startup_healthcheck()
 
     @reporter.step_deco("Down {interface} to {nodes}")
     def down_interface(self, nodes: list[ClusterNode], interface: str):
@@ -539,3 +536,32 @@ class ClusterStateController:
             if "mgmt" not in type:
                 interfaces.append(ip)
         return interfaces
+
+    @reporter.step_deco("Ping node")
+    def _ping_host(self, node: ClusterNode):
+        options = CommandOptions(check=False)
+        return self.shell.exec(f"ping {node.host.config.address} -c 1", options).return_code
+
+    @retry(max_attempts=60, sleep_interval=5, expected_result=HostStatus.ONLINE)
+    @reporter.step_deco("Waiting for {node} to go online")
+    def _wait_for_host_online(self, node: ClusterNode):
+        try:
+            ping_result = self._ping_host(node)
+            if ping_result != 0:
+                return HostStatus.OFFLINE
+            return node.host.get_host_status()
+        except Exception as err:
+            logger.warning(f"Host ping fails with error {err}")
+            return HostStatus.OFFLINE
+
+    @retry(max_attempts=60, sleep_interval=5, expected_result=HostStatus.OFFLINE)
+    @reporter.step_deco("Waiting for {node} to go offline")
+    def _wait_for_host_offline(self, node: ClusterNode):
+        try:
+            ping_result = self._ping_host(node)
+            if ping_result == 0:
+                return HostStatus.ONLINE
+            return node.host.get_host_status()
+        except Exception as err:
+            logger.warning(f"Host ping fails with error {err}")
+            return HostStatus.ONLINE
diff --git a/src/frostfs_testlib/utils/failover_utils.py b/src/frostfs_testlib/utils/failover_utils.py
index 27cd181..d4892c4 100644
--- a/src/frostfs_testlib/utils/failover_utils.py
+++ b/src/frostfs_testlib/utils/failover_utils.py
@@ -3,17 +3,15 @@ from dataclasses import dataclass
 from time import sleep
 from typing import Optional
 
-from frostfs_testlib.hosting import Host
 from frostfs_testlib.reporter import get_reporter
 from frostfs_testlib.resources.common import SERVICE_MAX_STARTUP_TIME
-from frostfs_testlib.shell import CommandOptions, Shell
+from frostfs_testlib.shell import Shell
 from frostfs_testlib.steps.cli.object import neo_go_dump_keys
 from frostfs_testlib.steps.node_management import storage_node_healthcheck
 from frostfs_testlib.steps.storage_policy import get_nodes_with_object
 from frostfs_testlib.storage.cluster import Cluster, ClusterNode, NodeBase, StorageNode
 from frostfs_testlib.storage.dataclasses.frostfs_services import MorphChain
-from frostfs_testlib.testing.parallel import parallel
-from frostfs_testlib.testing.test_control import retry, wait_for_success
+from frostfs_testlib.testing.test_control import wait_for_success
 from frostfs_testlib.utils.datetime_utils import parse_time
 
 reporter = get_reporter()
@@ -21,66 +19,6 @@ reporter = get_reporter()
 
 logger = logging.getLogger("NeoLogger")
 
 
-@reporter.step_deco("Ping node")
-def ping_host(shell: Shell, host: Host):
-    options = CommandOptions(check=False)
-    return shell.exec(f"ping {host.config.address} -c 1", options).return_code
-
-
-# TODO: Move to ClusterStateController
-@reporter.step_deco("Wait for storage nodes returned to cluster")
-def wait_all_storage_nodes_returned(shell: Shell, cluster: Cluster) -> None:
-    nodes = cluster.services(StorageNode)
-    parallel(_wait_for_storage_node, nodes, shell=shell)
-
-
-@reporter.step_deco("Run health check for storage at '{node}'")
-def _wait_for_storage_node(node: StorageNode, shell: Shell) -> None:
-    wait_for_host_online(shell, node)
-    wait_for_node_online(node)
-
-
-@retry(max_attempts=60, sleep_interval=5, expected_result=0)
-@reporter.step_deco("Waiting for host of {node} to go online")
-def wait_for_host_online(shell: Shell, node: StorageNode):
-    try:
-        # TODO: Quick solution for now, should be replaced by lib interactions
-        return ping_host(shell, node.host)
-    except Exception as err:
-        logger.warning(f"Host ping fails with error {err}")
-        return 1
-
-
-@retry(max_attempts=60, sleep_interval=5, expected_result=1)
-@reporter.step_deco("Waiting for host of {node} to go offline")
-def wait_for_host_offline(shell: Shell, node: StorageNode):
-    try:
-        # TODO: Quick solution for now, should be replaced by lib interactions
-        return ping_host(shell, node.host)
-    except Exception as err:
-        logger.warning(f"Host ping fails with error {err}")
-        return 0
-
-
-@retry(max_attempts=20, sleep_interval=30, expected_result=True)
-@reporter.step_deco("Waiting for node {node} to go online")
-def wait_for_node_online(node: StorageNode):
-    try:
-        health_check = storage_node_healthcheck(node)
-    except Exception as err:
-        logger.warning(f"Node healthcheck fails with error {err}")
-        return False
-    finally:
-        gather_socket_info(node)
-
-    return health_check.health_status == "READY" and health_check.network_status == "ONLINE"
-
-
-@reporter.step_deco("Gather socket info for {node}")
-def gather_socket_info(node: StorageNode):
-    node.host.get_shell().exec("ss -tuln | grep 8080", CommandOptions(check=False))
-
-
 @reporter.step_deco("Check and return status of given service")
 def service_status(service: str, shell: Shell) -> str:
     return shell.exec(f"sudo systemctl is-active {service}").stdout.rstrip()
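
Reviewer note (not part of the patch): a minimal usage sketch of the reworked API. The pytest-style fixtures and the ClusterStateController constructor arguments are assumptions inferred from the attributes the diff relies on (self.shell, self.cluster, self.healthcheck); only start_node_host, panic_reboot_host, cluster_nodes and the Healthcheck methods come from this diff itself.

    from frostfs_testlib.healthcheck.basic_healthcheck import BasicHealthcheck
    from frostfs_testlib.storage.controllers.cluster_state_controller import ClusterStateController


    def test_node_restart_stays_healthy(cluster, shell):  # hypothetical pytest fixtures
        healthcheck = BasicHealthcheck()
        # Constructor signature assumed; the diff only shows the attributes it uses.
        controller = ClusterStateController(shell, cluster, healthcheck)
        node = cluster.cluster_nodes[0]

        # Hard reboot: the controller now waits until the host answers ping and
        # reports ONLINE, then runs the startup healthcheck (storage + tree
        # checks, retried for up to 900s) on every online storage node.
        controller.panic_reboot_host(node, wait_for_return=True, startup_healthcheck=True)

        # Per-node assertion: raises AssertionError listing every issue found.
        healthcheck.full_healthcheck(node)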