[126] small healthcheck and stop start hosts rework
Signed-off-by: Andrey Berezin <a.berezin@yadro.com>
This commit is contained in:
parent
9ab4def44f
commit
253bb3b1d8
4 changed files with 112 additions and 105 deletions
|
@ -1,22 +1,65 @@
|
|||
from typing import Callable
|
||||
|
||||
from frostfs_testlib.cli.frostfs_cli.cli import FrostfsCli
|
||||
from frostfs_testlib.healthcheck.interfaces import Healthcheck
|
||||
from frostfs_testlib.reporter import get_reporter
|
||||
from frostfs_testlib.resources.cli import FROSTFS_CLI_EXEC
|
||||
from frostfs_testlib.shell import CommandOptions
|
||||
from frostfs_testlib.steps.node_management import storage_node_healthcheck
|
||||
from frostfs_testlib.storage.cluster import ClusterNode
|
||||
from frostfs_testlib.testing.test_control import wait_for_success
|
||||
|
||||
reporter = get_reporter()
|
||||
|
||||
|
||||
class BasicHealthcheck(Healthcheck):
|
||||
@reporter.step_deco("Perform healthcheck for {cluster_node}")
def perform(self, cluster_node: ClusterNode):
    """Run the storage healthcheck for the node and fail when it reports an issue.

    Args:
        cluster_node: Node passed through to ``storage_healthcheck``.

    Raises:
        AssertionError: Carries the value returned by ``storage_healthcheck``
            whenever that value is truthy.
    """
    issue = self.storage_healthcheck(cluster_node)
    if not issue:
        # Nothing reported: the node is considered fine.
        return
    raise AssertionError(issue)
|
||||
def _perform(self, cluster_node: ClusterNode, checks: dict[Callable, dict]):
    """Run every check against the node and fail if any of them reports an issue.

    Args:
        cluster_node: Node handed to each check as its first positional argument.
        checks: Mapping of check callable to the keyword arguments it is called
            with. A check signals a problem by returning a truthy value, which
            is used as the issue text.

    Raises:
        AssertionError: If at least one check returned an issue; the message
            lists all collected issues, one per line.
    """
    # Run all checks before failing so the message contains every issue,
    # not only the first one found.
    issues: list[str] = [
        issue
        for check, kwargs in checks.items()
        if (issue := check(cluster_node, **kwargs))
    ]

    # Raise explicitly instead of using `assert`: assert statements are removed
    # under `python -O`, which would turn a failed healthcheck into a silent pass.
    if issues:
        raise AssertionError("Issues found:\n" + "\n".join(issues))
|
||||
|
||||
@wait_for_success(900, 30)
def full_healthcheck(self, cluster_node: ClusterNode):
    """Run the storage and tree checks for ``cluster_node`` under one report step.

    Both checks are called without extra keyword arguments; ``_perform`` is the
    part that turns reported issues into a failure.

    NOTE(review): presumably ``wait_for_success(900, 30)`` re-runs this method
    until it stops raising (timeout / interval) - confirm in test_control.
    """
    full_checks = {
        check: {}
        for check in (self.storage_healthcheck, self._tree_healthcheck)
    }

    with reporter.step(f"Perform full healthcheck for {cluster_node}"):
        self._perform(cluster_node, full_checks)
|
||||
|
||||
@wait_for_success(900, 30)
def startup_healthcheck(self, cluster_node: ClusterNode):
    """Run the checks required after startup of ``cluster_node`` under one report step.

    The storage check and the tree check are both executed, each without extra
    keyword arguments; ``_perform`` fails the call when any of them reports an issue.

    NOTE(review): presumably ``wait_for_success(900, 30)`` re-runs this method
    until it stops raising (timeout / interval) - confirm in test_control.
    """
    startup_checks = {
        check: {}
        for check in (self.storage_healthcheck, self._tree_healthcheck)
    }

    with reporter.step(f"Perform startup healthcheck on {cluster_node}"):
        self._perform(cluster_node, startup_checks)
|
||||
|
||||
@wait_for_success(900, 30)
def storage_healthcheck(self, cluster_node: ClusterNode) -> None:
    """Run the storage service check for ``cluster_node`` under one report step.

    The method has no return value: a detected issue surfaces as the failure
    raised by ``_perform``, never as a returned string. The return annotation
    says ``None`` so callers do not expect an issue text from this method.

    NOTE(review): presumably ``wait_for_success(900, 30)`` re-runs this method
    until it stops raising (timeout / interval) - confirm in test_control.
    """
    checks = {
        # The single low-level check, called without extra keyword arguments.
        self._storage_healthcheck: {},
    }

    with reporter.step(f"Perform storage healthcheck on {cluster_node}"):
        self._perform(cluster_node, checks)
|
||||
|
||||
@reporter.step_deco("Storage healthcheck on {cluster_node}")
def _storage_healthcheck(self, cluster_node: ClusterNode) -> str | None:
    """Check the storage node status and describe the problem, if there is one.

    Returns:
        ``None`` when the health status is "READY" and the network status is
        "ONLINE"; otherwise a message containing both reported statuses.
    """
    status = storage_node_healthcheck(cluster_node.storage_node)
    # Socket info is gathered on every call, whatever the status turned out to be.
    self._gather_socket_info(cluster_node)

    is_healthy = status.health_status == "READY" and status.network_status == "ONLINE"
    if is_healthy:
        return None
    return f"Node {cluster_node} is not healthy. Health={status.health_status}. Network={status.network_status}"
|
||||
|
||||
@reporter.step_deco("Tree healthcheck on {cluster_node}")
|
||||
def tree_healthcheck(self, cluster_node: ClusterNode) -> str | None:
|
||||
def _tree_healthcheck(self, cluster_node: ClusterNode) -> str | None:
|
||||
host = cluster_node.host
|
||||
service_config = host.get_service_config(cluster_node.storage_node.name)
|
||||
wallet_path = service_config.attributes["wallet_path"]
|
||||
|
@ -34,10 +77,10 @@ class BasicHealthcheck(Healthcheck):
|
|||
)
|
||||
result = remote_cli.tree.healthcheck(rpc_endpoint="127.0.0.1:8080")
|
||||
if result.return_code != 0:
|
||||
return f"Error during tree healthcheck (rc={result.return_code}): {result.stdout}. \n Stderr: {result.stderr}"
|
||||
return (
|
||||
f"Error during tree healthcheck (rc={result.return_code}): {result.stdout}. \n Stderr: {result.stderr}"
|
||||
)
|
||||
|
||||
@reporter.step_deco("Storage healthcheck on {cluster_node}")
def storage_healthcheck(self, cluster_node: ClusterNode) -> str | None:
    """Check the storage node status and describe the problem, if there is one.

    Returns:
        ``None`` when the health status is "READY" and the network status is
        "ONLINE"; otherwise a message containing both reported statuses.
    """
    status = storage_node_healthcheck(cluster_node.storage_node)

    is_healthy = status.health_status == "READY" and status.network_status == "ONLINE"
    if is_healthy:
        return None
    return f"Node {cluster_node} is not healthy. Health={status.health_status}. Network={status.network_status}"
|
||||
@reporter.step_deco("Gather socket info for {cluster_node}")
def _gather_socket_info(self, cluster_node: ClusterNode):
    """Run ``ss -tuln | grep 8080`` in the shell of the node's host.

    The command result is not returned to the caller.
    """
    host_shell = cluster_node.host.get_shell()
    # NOTE(review): check=False presumably keeps a non-zero exit code (e.g. grep
    # matching nothing) from raising - confirm against CommandOptions.
    host_shell.exec("ss -tuln | grep 8080", CommandOptions(check=False))
|
||||
|
|
|
@ -5,13 +5,13 @@ from frostfs_testlib.storage.cluster import ClusterNode
|
|||
|
||||
class Healthcheck(ABC):
|
||||
@abstractmethod
|
||||
def perform(self, cluster_node: ClusterNode):
|
||||
"""Perform healthcheck on the target cluster node"""
|
||||
def full_healthcheck(self, cluster_node: ClusterNode):
|
||||
"""Perform full healthcheck on the target cluster node"""
|
||||
|
||||
@abstractmethod
|
||||
def tree_healthcheck(self, cluster_node: ClusterNode):
|
||||
"""Check tree sync status on target cluster node"""
|
||||
def startup_healthcheck(self, cluster_node: ClusterNode):
|
||||
"""Perform healthcheck required on startup of target cluster node"""
|
||||
|
||||
@abstractmethod
|
||||
def storage_healthcheck(self, cluster_node: ClusterNode):
|
||||
"""Perform storage node healthcheck on target cluster node"""
|
||||
"""Perform storage service healthcheck on target cluster node"""
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
import datetime
|
||||
import logging
|
||||
import time
|
||||
from typing import TypeVar
|
||||
|
||||
|
@ -6,6 +7,7 @@ import frostfs_testlib.resources.optionals as optionals
|
|||
from frostfs_testlib.cli import FrostfsAdm, FrostfsCli
|
||||
from frostfs_testlib.cli.netmap_parser import NetmapParser
|
||||
from frostfs_testlib.healthcheck.interfaces import Healthcheck
|
||||
from frostfs_testlib.hosting.interfaces import HostStatus
|
||||
from frostfs_testlib.plugins import load_all
|
||||
from frostfs_testlib.reporter import get_reporter
|
||||
from frostfs_testlib.resources.cli import FROSTFS_ADM_CONFIG_PATH, FROSTFS_ADM_EXEC, FROSTFS_CLI_EXEC
|
||||
|
@ -16,16 +18,11 @@ from frostfs_testlib.storage.cluster import Cluster, ClusterNode, S3Gate, Storag
|
|||
from frostfs_testlib.storage.controllers.disk_controller import DiskController
|
||||
from frostfs_testlib.storage.dataclasses.node_base import NodeBase, ServiceClass
|
||||
from frostfs_testlib.testing import parallel
|
||||
from frostfs_testlib.testing.test_control import run_optionally, wait_for_success
|
||||
from frostfs_testlib.testing.test_control import retry, run_optionally, wait_for_success
|
||||
from frostfs_testlib.utils.datetime_utils import parse_time
|
||||
from frostfs_testlib.utils.failover_utils import (
|
||||
wait_all_storage_nodes_returned,
|
||||
wait_for_host_offline,
|
||||
wait_for_host_online,
|
||||
wait_for_node_online,
|
||||
)
|
||||
|
||||
reporter = get_reporter()
|
||||
logger = logging.getLogger("NeoLogger")
|
||||
if_up_down_helper = IfUpDownHelper()
|
||||
|
||||
|
||||
|
@ -88,7 +85,7 @@ class ClusterStateController:
|
|||
self.stopped_nodes.append(node)
|
||||
with reporter.step(f"Stop host {node.host.config.address}"):
|
||||
node.host.stop_host(mode=mode)
|
||||
wait_for_host_offline(self.shell, node.storage_node)
|
||||
self._wait_for_host_offline(node)
|
||||
|
||||
@run_optionally(optionals.OPTIONAL_FAILOVER_ENABLED)
|
||||
@reporter.step_deco("Shutdown whole cluster")
|
||||
|
@ -105,18 +102,17 @@ class ClusterStateController:
|
|||
node.host.stop_host(mode=mode)
|
||||
|
||||
for node in nodes:
|
||||
wait_for_host_offline(self.shell, node.storage_node)
|
||||
self._wait_for_host_offline(node)
|
||||
|
||||
@run_optionally(optionals.OPTIONAL_FAILOVER_ENABLED)
|
||||
@reporter.step_deco("Start host of node {node}")
|
||||
def start_node_host(self, node: ClusterNode, tree_healthcheck: bool = True):
|
||||
def start_node_host(self, node: ClusterNode, startup_healthcheck: bool = True):
|
||||
with reporter.step(f"Start host {node.host.config.address}"):
|
||||
node.host.start_host()
|
||||
wait_for_host_online(self.shell, node.storage_node)
|
||||
self._wait_for_host_online(node)
|
||||
self.stopped_nodes.remove(node)
|
||||
wait_for_node_online(node.storage_node)
|
||||
if tree_healthcheck:
|
||||
self.wait_tree_healthcheck()
|
||||
if startup_healthcheck:
|
||||
self.wait_startup_healthcheck()
|
||||
|
||||
@run_optionally(optionals.OPTIONAL_FAILOVER_ENABLED)
|
||||
@reporter.step_deco("Start stopped hosts")
|
||||
|
@ -131,6 +127,9 @@ class ClusterStateController:
|
|||
self.stopped_services.difference_update(self._get_stopped_by_node(node))
|
||||
|
||||
self.stopped_nodes = []
|
||||
with reporter.step("Wait for all nodes to go online"):
|
||||
parallel(self._wait_for_host_online, self.cluster.cluster_nodes)
|
||||
|
||||
self.wait_after_storage_startup()
|
||||
|
||||
@run_optionally(optionals.OPTIONAL_FAILOVER_ENABLED)
|
||||
|
@ -183,16 +182,15 @@ class ClusterStateController:
|
|||
if online_s3gates:
|
||||
parallel(self.wait_s3gate, online_s3gates)
|
||||
|
||||
@wait_for_success(600, 60)
|
||||
def wait_tree_healthcheck(self):
|
||||
@reporter.step_deco("Wait for cluster startup healtcheck")
|
||||
def wait_startup_healthcheck(self):
|
||||
nodes = self.cluster.nodes(self._get_online(StorageNode))
|
||||
parallel(self.healthcheck.tree_healthcheck, nodes)
|
||||
parallel(self.healthcheck.startup_healthcheck, nodes)
|
||||
|
||||
@reporter.step_deco("Wait for storage reconnection to the system")
|
||||
def wait_after_storage_startup(self):
|
||||
wait_all_storage_nodes_returned(self.shell, self.cluster)
|
||||
self.wait_startup_healthcheck()
|
||||
self.wait_s3gates()
|
||||
self.wait_tree_healthcheck()
|
||||
|
||||
@run_optionally(optionals.OPTIONAL_FAILOVER_ENABLED)
|
||||
@reporter.step_deco("Start all stopped services")
|
||||
|
@ -366,7 +364,7 @@ class ClusterStateController:
|
|||
|
||||
@run_optionally(optionals.OPTIONAL_FAILOVER_ENABLED)
|
||||
@reporter.step_deco("Hard reboot host {node} via magic SysRq option")
|
||||
def panic_reboot_host(self, node: ClusterNode, wait_for_return: bool = True, tree_healthcheck: bool = True):
|
||||
def panic_reboot_host(self, node: ClusterNode, wait_for_return: bool = True, startup_healthcheck: bool = True):
|
||||
shell = node.host.get_shell()
|
||||
shell.exec('sudo sh -c "echo 1 > /proc/sys/kernel/sysrq"')
|
||||
|
||||
|
@ -381,10 +379,9 @@ class ClusterStateController:
|
|||
# Let the things to be settled
|
||||
# A little wait here to prevent ssh stuck during panic
|
||||
time.sleep(10)
|
||||
wait_for_host_online(self.shell, node.storage_node)
|
||||
wait_for_node_online(node.storage_node)
|
||||
if tree_healthcheck:
|
||||
self.wait_tree_healthcheck()
|
||||
self._wait_for_host_online(node)
|
||||
if startup_healthcheck:
|
||||
self.wait_startup_healthcheck()
|
||||
|
||||
@reporter.step_deco("Down {interface} to {nodes}")
|
||||
def down_interface(self, nodes: list[ClusterNode], interface: str):
|
||||
|
@ -539,3 +536,32 @@ class ClusterStateController:
|
|||
if "mgmt" not in type:
|
||||
interfaces.append(ip)
|
||||
return interfaces
|
||||
|
||||
@reporter.step_deco("Ping node")
def _ping_host(self, node: ClusterNode):
    """Send a single ping to the node's host address from ``self.shell``.

    Returns:
        The return code of the ``ping`` command.
    """
    ping_command = f"ping {node.host.config.address} -c 1"
    # NOTE(review): check=False presumably lets a failed ping come back as a
    # return code instead of an exception - confirm against CommandOptions.
    ping_result = self.shell.exec(ping_command, CommandOptions(check=False))
    return ping_result.return_code
|
||||
|
||||
@retry(max_attempts=60, sleep_interval=5, expected_result=HostStatus.ONLINE)
@reporter.step_deco("Waiting for {node} to go online")
def _wait_for_host_online(self, node: ClusterNode):
    """Report the current host status for one attempt of the "go online" wait.

    Returns:
        ``HostStatus.OFFLINE`` when the ping return code is non-zero or any
        exception occurs; otherwise whatever ``node.host.get_host_status()`` returns.

    NOTE(review): the ``retry`` decorator presumably repeats the call until the
    result equals ``HostStatus.ONLINE`` - confirm in test_control.
    """
    try:
        reachable = self._ping_host(node) == 0
        # The host status is only queried once the ping has succeeded.
        status = node.host.get_host_status() if reachable else HostStatus.OFFLINE
    except Exception as err:
        logger.warning(f"Host ping fails with error {err}")
        status = HostStatus.OFFLINE
    return status
|
||||
|
||||
@retry(max_attempts=60, sleep_interval=5, expected_result=HostStatus.OFFLINE)
@reporter.step_deco("Waiting for {node} to go offline")
def _wait_for_host_offline(self, node: ClusterNode):
    """Report the current host status for one attempt of the "go offline" wait.

    Returns:
        ``HostStatus.ONLINE`` when the ping return code is zero or any exception
        occurs; otherwise whatever ``node.host.get_host_status()`` returns.

    NOTE(review): the ``retry`` decorator presumably repeats the call until the
    result equals ``HostStatus.OFFLINE`` - confirm in test_control.
    """
    try:
        still_pinging = self._ping_host(node) == 0
        # The host status is only queried once the ping no longer succeeds.
        status = HostStatus.ONLINE if still_pinging else node.host.get_host_status()
    except Exception as err:
        logger.warning(f"Host ping fails with error {err}")
        status = HostStatus.ONLINE
    return status
|
||||
|
|
|
@ -3,17 +3,15 @@ from dataclasses import dataclass
|
|||
from time import sleep
|
||||
from typing import Optional
|
||||
|
||||
from frostfs_testlib.hosting import Host
|
||||
from frostfs_testlib.reporter import get_reporter
|
||||
from frostfs_testlib.resources.common import SERVICE_MAX_STARTUP_TIME
|
||||
from frostfs_testlib.shell import CommandOptions, Shell
|
||||
from frostfs_testlib.shell import Shell
|
||||
from frostfs_testlib.steps.cli.object import neo_go_dump_keys
|
||||
from frostfs_testlib.steps.node_management import storage_node_healthcheck
|
||||
from frostfs_testlib.steps.storage_policy import get_nodes_with_object
|
||||
from frostfs_testlib.storage.cluster import Cluster, ClusterNode, NodeBase, StorageNode
|
||||
from frostfs_testlib.storage.dataclasses.frostfs_services import MorphChain
|
||||
from frostfs_testlib.testing.parallel import parallel
|
||||
from frostfs_testlib.testing.test_control import retry, wait_for_success
|
||||
from frostfs_testlib.testing.test_control import wait_for_success
|
||||
from frostfs_testlib.utils.datetime_utils import parse_time
|
||||
|
||||
reporter = get_reporter()
|
||||
|
@ -21,66 +19,6 @@ reporter = get_reporter()
|
|||
logger = logging.getLogger("NeoLogger")
|
||||
|
||||
|
||||
@reporter.step_deco("Ping node")
def ping_host(shell: Shell, host: Host):
    """Send a single ping to the host address using the given shell.

    Returns:
        The return code of the ``ping`` command.
    """
    ping_command = f"ping {host.config.address} -c 1"
    # NOTE(review): check=False presumably lets a failed ping come back as a
    # return code instead of an exception - confirm against CommandOptions.
    ping_result = shell.exec(ping_command, CommandOptions(check=False))
    return ping_result.return_code
|
||||
|
||||
|
||||
# TODO: Move to ClusterStateController
@reporter.step_deco("Wait for storage nodes returned to cluster")
def wait_all_storage_nodes_returned(shell: Shell, cluster: Cluster) -> None:
    """Wait for every storage node service of the cluster via ``parallel``.

    Args:
        shell: Shell forwarded to ``_wait_for_storage_node`` as the ``shell`` keyword.
        cluster: Cluster whose ``StorageNode`` services are waited for.
    """
    parallel(_wait_for_storage_node, cluster.services(StorageNode), shell=shell)
|
||||
|
||||
|
||||
@reporter.step_deco("Run health check for storage at '{node}'")
def _wait_for_storage_node(node: StorageNode, shell: Shell) -> None:
    """Wait for one storage node: first its host, then the node itself.

    Args:
        node: Storage node to wait for.
        shell: Shell passed to the host-level wait.
    """
    # Host-level wait runs first; the node-level wait follows it.
    wait_for_host_online(shell, node)
    wait_for_node_online(node)
|
||||
|
||||
|
||||
@retry(max_attempts=60, sleep_interval=5, expected_result=0)
@reporter.step_deco("Waiting for host of {node} to go online")
def wait_for_host_online(shell: Shell, node: StorageNode):
    """Ping the node's host once and return the ping return code.

    Returns:
        The value returned by ``ping_host``, or ``1`` when any exception occurs.

    NOTE(review): the ``retry`` decorator presumably repeats the call until the
    result equals ``0`` - confirm in test_control.
    """
    try:
        # TODO: Quick solution for now, should be replaced by lib interactions
        exit_code = ping_host(shell, node.host)
    except Exception as err:
        logger.warning(f"Host ping fails with error {err}")
        exit_code = 1
    return exit_code
|
||||
|
||||
|
||||
@retry(max_attempts=60, sleep_interval=5, expected_result=1)
@reporter.step_deco("Waiting for host of {node} to go offline")
def wait_for_host_offline(shell: Shell, node: StorageNode):
    """Ping the node's host once and return the ping return code.

    Returns:
        The value returned by ``ping_host``, or ``0`` when any exception occurs.

    NOTE(review): the ``retry`` decorator presumably repeats the call until the
    result equals ``1`` - confirm in test_control.
    """
    try:
        # TODO: Quick solution for now, should be replaced by lib interactions
        exit_code = ping_host(shell, node.host)
    except Exception as err:
        logger.warning(f"Host ping fails with error {err}")
        exit_code = 0
    return exit_code
|
||||
|
||||
|
||||
@retry(max_attempts=20, sleep_interval=30, expected_result=True)
@reporter.step_deco("Waiting for node {node} to go online")
def wait_for_node_online(node: StorageNode):
    """Tell whether the storage node currently reports itself ready and online.

    Returns:
        ``False`` when the healthcheck call raises; otherwise whether the health
        status is "READY" and the network status is "ONLINE".

    NOTE(review): the ``retry`` decorator presumably repeats the call until the
    result equals ``True`` - confirm in test_control.
    """
    try:
        node_health = storage_node_healthcheck(node)
    except Exception as err:
        logger.warning(f"Node healthcheck fails with error {err}")
        return False
    finally:
        # Socket info is gathered whether or not the healthcheck call succeeded.
        gather_socket_info(node)

    is_ready = node_health.health_status == "READY"
    return is_ready and node_health.network_status == "ONLINE"
|
||||
|
||||
|
||||
@reporter.step_deco("Gather socket info for {node}")
def gather_socket_info(node: StorageNode):
    """Run ``ss -tuln | grep 8080`` in the shell of the node's host.

    The command result is not returned to the caller.
    """
    host_shell = node.host.get_shell()
    # NOTE(review): check=False presumably keeps a non-zero exit code (e.g. grep
    # matching nothing) from raising - confirm against CommandOptions.
    host_shell.exec("ss -tuln | grep 8080", CommandOptions(check=False))
|
||||
|
||||
|
||||
@reporter.step_deco("Check and return status of given service")
def service_status(service: str, shell: Shell) -> str:
    """Return the output of ``systemctl is-active`` for the given service.

    Args:
        service: Name of the service handed to ``systemctl is-active``.
        shell: Shell used to run the command.

    Returns:
        The command's stdout with trailing whitespace removed.
    """
    status_command = f"sudo systemctl is-active {service}"
    command_result = shell.exec(status_command)
    return command_result.stdout.rstrip()
|
||||
|
|
Loading…
Reference in a new issue