[126] small healthcheck and stop start hosts rework

Signed-off-by: Andrey Berezin <a.berezin@yadro.com>
This commit is contained in:
Andrey Berezin 2023-11-22 17:10:09 +03:00
parent 9ab4def44f
commit 253bb3b1d8
4 changed files with 112 additions and 105 deletions

View file

@ -3,17 +3,15 @@ from dataclasses import dataclass
from time import sleep
from typing import Optional
from frostfs_testlib.hosting import Host
from frostfs_testlib.reporter import get_reporter
from frostfs_testlib.resources.common import SERVICE_MAX_STARTUP_TIME
from frostfs_testlib.shell import CommandOptions, Shell
from frostfs_testlib.shell import Shell
from frostfs_testlib.steps.cli.object import neo_go_dump_keys
from frostfs_testlib.steps.node_management import storage_node_healthcheck
from frostfs_testlib.steps.storage_policy import get_nodes_with_object
from frostfs_testlib.storage.cluster import Cluster, ClusterNode, NodeBase, StorageNode
from frostfs_testlib.storage.dataclasses.frostfs_services import MorphChain
from frostfs_testlib.testing.parallel import parallel
from frostfs_testlib.testing.test_control import retry, wait_for_success
from frostfs_testlib.testing.test_control import wait_for_success
from frostfs_testlib.utils.datetime_utils import parse_time
reporter = get_reporter()
@ -21,66 +19,6 @@ reporter = get_reporter()
logger = logging.getLogger("NeoLogger")
@reporter.step_deco("Ping node")
def ping_host(shell: Shell, host: Host):
options = CommandOptions(check=False)
return shell.exec(f"ping {host.config.address} -c 1", options).return_code
# TODO: Move to ClusterStateController
@reporter.step_deco("Wait for storage nodes returned to cluster")
def wait_all_storage_nodes_returned(shell: Shell, cluster: Cluster) -> None:
nodes = cluster.services(StorageNode)
parallel(_wait_for_storage_node, nodes, shell=shell)
@reporter.step_deco("Run health check for storage at '{node}'")
def _wait_for_storage_node(node: StorageNode, shell: Shell) -> None:
wait_for_host_online(shell, node)
wait_for_node_online(node)
@retry(max_attempts=60, sleep_interval=5, expected_result=0)
@reporter.step_deco("Waiting for host of {node} to go online")
def wait_for_host_online(shell: Shell, node: StorageNode):
try:
# TODO: Quick solution for now, should be replaced by lib interactions
return ping_host(shell, node.host)
except Exception as err:
logger.warning(f"Host ping fails with error {err}")
return 1
@retry(max_attempts=60, sleep_interval=5, expected_result=1)
@reporter.step_deco("Waiting for host of {node} to go offline")
def wait_for_host_offline(shell: Shell, node: StorageNode):
try:
# TODO: Quick solution for now, should be replaced by lib interactions
return ping_host(shell, node.host)
except Exception as err:
logger.warning(f"Host ping fails with error {err}")
return 0
@retry(max_attempts=20, sleep_interval=30, expected_result=True)
@reporter.step_deco("Waiting for node {node} to go online")
def wait_for_node_online(node: StorageNode):
try:
health_check = storage_node_healthcheck(node)
except Exception as err:
logger.warning(f"Node healthcheck fails with error {err}")
return False
finally:
gather_socket_info(node)
return health_check.health_status == "READY" and health_check.network_status == "ONLINE"
@reporter.step_deco("Gather socket info for {node}")
def gather_socket_info(node: StorageNode):
node.host.get_shell().exec("ss -tuln | grep 8080", CommandOptions(check=False))
@reporter.step_deco("Check and return status of given service")
def service_status(service: str, shell: Shell) -> str:
return shell.exec(f"sudo systemctl is-active {service}").stdout.rstrip()