forked from TrueCloudLab/frostfs-testlib
[126] small healthcheck and stop start hosts rework
Signed-off-by: Andrey Berezin <a.berezin@yadro.com>
This commit is contained in:
parent
9ab4def44f
commit
253bb3b1d8
4 changed files with 112 additions and 105 deletions
|
@ -3,17 +3,15 @@ from dataclasses import dataclass
|
|||
from time import sleep
|
||||
from typing import Optional
|
||||
|
||||
from frostfs_testlib.hosting import Host
|
||||
from frostfs_testlib.reporter import get_reporter
|
||||
from frostfs_testlib.resources.common import SERVICE_MAX_STARTUP_TIME
|
||||
from frostfs_testlib.shell import CommandOptions, Shell
|
||||
from frostfs_testlib.shell import Shell
|
||||
from frostfs_testlib.steps.cli.object import neo_go_dump_keys
|
||||
from frostfs_testlib.steps.node_management import storage_node_healthcheck
|
||||
from frostfs_testlib.steps.storage_policy import get_nodes_with_object
|
||||
from frostfs_testlib.storage.cluster import Cluster, ClusterNode, NodeBase, StorageNode
|
||||
from frostfs_testlib.storage.dataclasses.frostfs_services import MorphChain
|
||||
from frostfs_testlib.testing.parallel import parallel
|
||||
from frostfs_testlib.testing.test_control import retry, wait_for_success
|
||||
from frostfs_testlib.testing.test_control import wait_for_success
|
||||
from frostfs_testlib.utils.datetime_utils import parse_time
|
||||
|
||||
reporter = get_reporter()
|
||||
|
@ -21,66 +19,6 @@ reporter = get_reporter()
|
|||
logger = logging.getLogger("NeoLogger")
|
||||
|
||||
|
||||
@reporter.step_deco("Ping node")
|
||||
def ping_host(shell: Shell, host: Host):
|
||||
options = CommandOptions(check=False)
|
||||
return shell.exec(f"ping {host.config.address} -c 1", options).return_code
|
||||
|
||||
|
||||
# TODO: Move to ClusterStateController
|
||||
@reporter.step_deco("Wait for storage nodes returned to cluster")
|
||||
def wait_all_storage_nodes_returned(shell: Shell, cluster: Cluster) -> None:
|
||||
nodes = cluster.services(StorageNode)
|
||||
parallel(_wait_for_storage_node, nodes, shell=shell)
|
||||
|
||||
|
||||
@reporter.step_deco("Run health check for storage at '{node}'")
|
||||
def _wait_for_storage_node(node: StorageNode, shell: Shell) -> None:
|
||||
wait_for_host_online(shell, node)
|
||||
wait_for_node_online(node)
|
||||
|
||||
|
||||
@retry(max_attempts=60, sleep_interval=5, expected_result=0)
|
||||
@reporter.step_deco("Waiting for host of {node} to go online")
|
||||
def wait_for_host_online(shell: Shell, node: StorageNode):
|
||||
try:
|
||||
# TODO: Quick solution for now, should be replaced by lib interactions
|
||||
return ping_host(shell, node.host)
|
||||
except Exception as err:
|
||||
logger.warning(f"Host ping fails with error {err}")
|
||||
return 1
|
||||
|
||||
|
||||
@retry(max_attempts=60, sleep_interval=5, expected_result=1)
|
||||
@reporter.step_deco("Waiting for host of {node} to go offline")
|
||||
def wait_for_host_offline(shell: Shell, node: StorageNode):
|
||||
try:
|
||||
# TODO: Quick solution for now, should be replaced by lib interactions
|
||||
return ping_host(shell, node.host)
|
||||
except Exception as err:
|
||||
logger.warning(f"Host ping fails with error {err}")
|
||||
return 0
|
||||
|
||||
|
||||
@retry(max_attempts=20, sleep_interval=30, expected_result=True)
|
||||
@reporter.step_deco("Waiting for node {node} to go online")
|
||||
def wait_for_node_online(node: StorageNode):
|
||||
try:
|
||||
health_check = storage_node_healthcheck(node)
|
||||
except Exception as err:
|
||||
logger.warning(f"Node healthcheck fails with error {err}")
|
||||
return False
|
||||
finally:
|
||||
gather_socket_info(node)
|
||||
|
||||
return health_check.health_status == "READY" and health_check.network_status == "ONLINE"
|
||||
|
||||
|
||||
@reporter.step_deco("Gather socket info for {node}")
|
||||
def gather_socket_info(node: StorageNode):
|
||||
node.host.get_shell().exec("ss -tuln | grep 8080", CommandOptions(check=False))
|
||||
|
||||
|
||||
@reporter.step_deco("Check and return status of given service")
|
||||
def service_status(service: str, shell: Shell) -> str:
|
||||
return shell.exec(f"sudo systemctl is-active {service}").stdout.rstrip()
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue