import logging
from dataclasses import dataclass
from time import sleep
from typing import Optional

from frostfs_testlib.hosting import Host
from frostfs_testlib.reporter import get_reporter
from frostfs_testlib.resources.common import SERVICE_MAX_STARTUP_TIME
from frostfs_testlib.shell import CommandOptions, Shell
from frostfs_testlib.steps.cli.object import neo_go_dump_keys
from frostfs_testlib.steps.node_management import storage_node_healthcheck
from frostfs_testlib.steps.storage_policy import get_nodes_with_object
from frostfs_testlib.storage.cluster import Cluster, ClusterNode, NodeBase, StorageNode
from frostfs_testlib.storage.dataclasses.frostfs_services import MorphChain
from frostfs_testlib.testing.test_control import retry, wait_for_success
from frostfs_testlib.utils.datetime_utils import parse_time

reporter = get_reporter()
logger = logging.getLogger("NeoLogger")


@reporter.step_deco("Ping node")
def ping_host(shell: Shell, host: Host):
    options = CommandOptions(check=False)
    return shell.exec(f"ping {host.config.address} -c 1", options).return_code
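
# CommandOptions(check=False) is passed so a failed ping is reported via the returned
# exit code rather than an exception (0 means the host answered the ping).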


@reporter.step_deco("Wait for storage nodes to return to the cluster")
def wait_all_storage_nodes_returned(shell: Shell, cluster: Cluster) -> None:
    for node in cluster.services(StorageNode):
        with reporter.step(f"Run health check for storage at '{node}'"):
            wait_for_host_online(shell, node)
            wait_for_node_online(node)


@retry(max_attempts=60, sleep_interval=5, expected_result=0)
@reporter.step_deco("Waiting for host of {node} to go online")
def wait_for_host_online(shell: Shell, node: StorageNode):
    try:
        # TODO: Quick solution for now, should be replaced by lib interactions
        return ping_host(shell, node.host)
    except Exception as err:
        logger.warning(f"Host ping fails with error {err}")
        return 1


@retry(max_attempts=60, sleep_interval=5, expected_result=1)
@reporter.step_deco("Waiting for host of {node} to go offline")
def wait_for_host_offline(shell: Shell, node: StorageNode):
    try:
        # TODO: Quick solution for now, should be replaced by lib interactions
        return ping_host(shell, node.host)
    except Exception as err:
        logger.warning(f"Host ping fails with error {err}")
        return 0
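
# Both helpers poll with ping: @retry re-invokes the function until the expected exit code
# appears (0 means the host answers, 1 means no reply), giving each check roughly
# 60 * 5 = 300 seconds before the retry decorator gives up.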


@retry(max_attempts=20, sleep_interval=30, expected_result=True)
@reporter.step_deco("Waiting for node {node} to go online")
def wait_for_node_online(node: StorageNode):
    try:
        health_check = storage_node_healthcheck(node)
    except Exception as err:
        logger.warning(f"Node healthcheck fails with error {err}")
        return False

    return health_check.health_status == "READY" and health_check.network_status == "ONLINE"
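
# The node counts as online only when the healthcheck reports both "READY" and "ONLINE";
# with max_attempts=20 and sleep_interval=30 this allows up to about 10 minutes for startup.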


@reporter.step_deco("Check and return status of given service")
def service_status(service: str, shell: Shell) -> str:
    return shell.exec(f"sudo systemctl is-active {service}").stdout.rstrip()
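
# `systemctl is-active` prints a single state word such as "active", "inactive" or "failed".
# It also exits non-zero for anything other than "active", so depending on how the Shell
# handles exit codes a non-active service may surface as an exception instead of a value.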


@dataclass
class TopCommand:
    """
    Parses `top` output via `from_stdout` and holds the resulting row for a single PID.

    pid: process PID
    output: stdout of the `top` command
    """

    pid: Optional[str] = None
    user: Optional[str] = None
    pr: Optional[str] = None
    ni: Optional[str] = None
    virt: Optional[str] = None
    res: Optional[str] = None
    shr: Optional[str] = None
    status: Optional[str] = None
    cpu_percent: Optional[str] = None
    mem_percent: Optional[str] = None
    time: Optional[str] = None
    cmd: Optional[str] = None
    STATUS_RUNNING = "R"
    STATUS_SLEEP = "S"
    STATUS_ZOMBIE = "Z"
    STATUS_UNSLEEP = "D"
    STATUS_TRACED = "T"

    @staticmethod
    def from_stdout(output: str, requested_pid: int) -> "TopCommand":
        list_var = [None for i in range(12)]
        for line in output.split("\n"):
            if str(requested_pid) in line:
                list_var = line.split()
        return TopCommand(
            pid=list_var[0],
            user=list_var[1],
            pr=list_var[2],
            ni=list_var[3],
            virt=list_var[4],
            res=list_var[5],
            shr=list_var[6],
            status=list_var[7],
            cpu_percent=list_var[8],
            mem_percent=list_var[9],
            time=list_var[10],
            cmd=list_var[11],
        )
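
# Illustrative example (values are made up): for a `top -b -n 1 -p 1234` row such as
#   1234 frostfs   20   0 1200200 310100  20100 S  12.5  3.8   1:23.45 frostfs-node
# TopCommand.from_stdout(output, 1234) yields pid="1234", user="frostfs", status="S",
# cpu_percent="12.5", mem_percent="3.8", cmd="frostfs-node".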


@reporter.step_deco("Run `top` command with specified PID")
def service_status_top(service: str, shell: Shell) -> TopCommand:
    pid = service_pid(service, shell)
    output = shell.exec(f"sudo top -b -n 1 -p {pid}").stdout
    return TopCommand.from_stdout(output, pid)


@reporter.step_deco("Restart service n times with sleep")
def multiple_restart(
    service_type: type[NodeBase],
    node: ClusterNode,
    count: int = 5,
    sleep_interval: int = 2,
):
    service_systemctl_name = node.service(service_type).get_service_systemctl_name()
    service_name = node.service(service_type).name
    for _ in range(count):
        node.host.restart_service(service_name)
        logger.info(
            f"Restart {service_systemctl_name}; sleep {sleep_interval} seconds and continue"
        )
        sleep(sleep_interval)
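
# Illustrative call (the concrete service type depends on the test): passing StorageNode as
# service_type restarts that node's storage service `count` times, pausing `sleep_interval`
# seconds between restarts.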


@reporter.step_deco("Get status of list of services and check expected status")
@wait_for_success(60, 5)
def check_services_status(service_list: list[str], expected_status: str, shell: Shell):
    cmd = ""
    for service in service_list:
        cmd += f' sudo systemctl status {service} --lines=0 | grep "Active:";'
    result = shell.exec(cmd).stdout.rstrip()
    statuses = list()
    for line in result.split("\n"):
        status_substring = line.split()
        statuses.append(status_substring[1])
    unique_statuses = list(set(statuses))
    assert (
        len(unique_statuses) == 1 and expected_status in unique_statuses
    ), f"Requested status={expected_status} not found in requested services={service_list}, list of statuses={result}"
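
# For service_list=["svc-a", "svc-b"] (hypothetical unit names) the composed command is
#   sudo systemctl status svc-a --lines=0 | grep "Active:"; sudo systemctl status svc-b --lines=0 | grep "Active:";
# and the second word of every "Active:" line (e.g. "active") must equal expected_status.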


@reporter.step_deco("Wait for active status of passed service")
@wait_for_success(60, 5)
def wait_service_in_desired_state(
    service: str, shell: Shell, expected_status: Optional[str] = "active"
):
    real_status = service_status(service=service, shell=shell)
    assert (
        expected_status == real_status
    ), f"Service {service}: expected status={expected_status}, real status={real_status}"


@reporter.step_deco("Run healthcheck against passed service")
@wait_for_success(parse_time(SERVICE_MAX_STARTUP_TIME), 1)
def service_type_healthcheck(
    service_type: type[NodeBase],
    node: ClusterNode,
):
    service = node.service(service_type)
    assert (
        service.service_healthcheck()
    ), f"Healthcheck failed for {service.get_service_systemctl_name()}, IP={node.host_ip}"


@reporter.step_deco("Kill process by service name")
def kill_by_service_name(service_type: type[NodeBase], node: ClusterNode):
    service_systemctl_name = node.service(service_type).get_service_systemctl_name()
    pid = service_pid(service_systemctl_name, node.host.get_shell())
    node.host.get_shell().exec(f"sudo kill -9 {pid}")


@reporter.step_deco("Suspend service {service}")
def suspend_service(shell: Shell, service: str):
    shell.exec(f"sudo kill -STOP {service_pid(service, shell)}")


@reporter.step_deco("Resume service {service}")
def resume_service(shell: Shell, service: str):
    shell.exec(f"sudo kill -CONT {service_pid(service, shell)}")
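
# SIGSTOP freezes the process and SIGCONT resumes it; the unit is not restarted, so the
# same PID remains valid across a suspend/resume pair.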


@reporter.step_deco("Retrieve service's pid")
# Retried because systemd may report MainPID=0 for a unit that has only just (re)started.
@wait_for_success(10, 1)
def service_pid(service: str, shell: Shell) -> int:
    output = shell.exec(f"systemctl show --property MainPID {service}").stdout.rstrip()
    splitted = output.split("=")
    PID = int(splitted[1])
    assert PID > 0, f"Service {service} has invalid PID={PID}"
    return PID
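
# `systemctl show --property MainPID <service>` prints a single line such as "MainPID=12345",
# so splitting on "=" and taking index 1 yields the PID as a string.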


@reporter.step_deco("Wrapper for neo-go dump keys command")
def dump_keys(shell: Shell, node: ClusterNode) -> dict:
    host = node.host
    service_config = host.get_service_config(node.service(MorphChain).name)
    wallet = service_config.attributes["wallet_path"]
    return neo_go_dump_keys(shell=shell, wallet=wallet)


@reporter.step_deco("Wait for object replication")
def wait_object_replication(
    cid: str,
    oid: str,
    expected_copies: int,
    shell: Shell,
    nodes: list[StorageNode],
    sleep_interval: int = 15,
    attempts: int = 20,
) -> list[StorageNode]:
    nodes_with_object = []
    for _ in range(attempts):
        nodes_with_object = get_nodes_with_object(cid, oid, shell=shell, nodes=nodes)
        if len(nodes_with_object) >= expected_copies:
            return nodes_with_object
        sleep(sleep_interval)
    raise AssertionError(
        f"Expected {expected_copies} copies of object, but found {len(nodes_with_object)}. "
        f"Waiting time {sleep_interval * attempts}"
    )
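
# Sketch of a typical call (cid, oid and cluster are placeholders): with the defaults this
# polls placement every 15 seconds and gives replication up to 20 * 15 = 300 seconds to
# reach expected_copies before raising AssertionError.
#   replica_nodes = wait_object_replication(cid, oid, 2, shell, cluster.services(StorageNode))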


def is_all_storage_nodes_returned(cluster: Cluster) -> bool:
    with reporter.step("Run health check for all storage nodes"):
        for node in cluster.services(StorageNode):
            try:
                health_check = storage_node_healthcheck(node)
            except Exception as err:
                logger.warning(f"Node healthcheck fails with error {err}")
                return False
            if health_check.health_status != "READY" or health_check.network_status != "ONLINE":
                return False
    return True