import logging
from dataclasses import dataclass
from time import sleep
from typing import Optional

from frostfs_testlib.hosting import Host
from frostfs_testlib.reporter import get_reporter
from frostfs_testlib.resources.common import SERVICE_MAX_STARTUP_TIME
from frostfs_testlib.shell import CommandOptions, Shell
from frostfs_testlib.steps.cli.object import neo_go_dump_keys
from frostfs_testlib.steps.node_management import storage_node_healthcheck
from frostfs_testlib.steps.storage_policy import get_nodes_with_object
from frostfs_testlib.storage.cluster import Cluster, ClusterNode, NodeBase, StorageNode
from frostfs_testlib.storage.dataclasses.frostfs_services import MorphChain
from frostfs_testlib.testing.test_control import retry, wait_for_success
from frostfs_testlib.utils.datetime_utils import parse_time

reporter = get_reporter()
logger = logging.getLogger("NeoLogger")


@reporter.step_deco("Ping node")
def ping_host(shell: Shell, host: Host):
    options = CommandOptions(check=False)
    return shell.exec(f"ping {host.config.address} -c 1", options).return_code


@reporter.step_deco("Wait for storage nodes returned to cluster")
def wait_all_storage_nodes_returned(shell: Shell, cluster: Cluster) -> None:
    for node in cluster.services(StorageNode):
        with reporter.step(f"Run health check for storage at '{node}'"):
            wait_for_host_online(shell, node)
            wait_for_node_online(node)


@retry(max_attempts=60, sleep_interval=5, expected_result=0)
@reporter.step_deco("Waiting for host of {node} to go online")
def wait_for_host_online(shell: Shell, node: StorageNode):
    try:
        # TODO: Quick solution for now, should be replaced by lib interactions
        return ping_host(shell, node.host)
    except Exception as err:
        logger.warning(f"Host ping fails with error {err}")
        return 1


@retry(max_attempts=60, sleep_interval=5, expected_result=1)
@reporter.step_deco("Waiting for host of {node} to go offline")
def wait_for_host_offline(shell: Shell, node: StorageNode):
    try:
        # TODO: Quick solution for now, should be replaced by lib interactions
        return ping_host(shell, node.host)
    except Exception as err:
        logger.warning(f"Host ping fails with error {err}")
        return 0


@retry(max_attempts=20, sleep_interval=30, expected_result=True)
@reporter.step_deco("Waiting for node {node} to go online")
def wait_for_node_online(node: StorageNode):
    try:
        health_check = storage_node_healthcheck(node)
    except Exception as err:
        logger.warning(f"Node healthcheck fails with error {err}")
        return False
    return health_check.health_status == "READY" and health_check.network_status == "ONLINE"


@reporter.step_deco("Check and return status of given service")
def service_status(service: str, shell: Shell) -> str:
    return shell.exec(f"sudo systemctl is-active {service}").stdout.rstrip()


@dataclass
class TopCommand:
    """
    Parsed result of the `top` command for a single PID; build it with `from_stdout`.

    pid: process PID
    output: stdout of the `top` command
    """

    pid: Optional[str] = None
    user: Optional[str] = None
    pr: Optional[str] = None
    ni: Optional[str] = None
    virt: Optional[str] = None
    res: Optional[str] = None
    shr: Optional[str] = None
    status: Optional[str] = None
    cpu_percent: Optional[str] = None
    mem_percent: Optional[str] = None
    time: Optional[str] = None
    cmd: Optional[str] = None
    STATUS_RUNNING = "R"
    STATUS_SLEEP = "S"
    STATUS_ZOMBIE = "Z"
    STATUS_UNSLEEP = "D"
    STATUS_TRACED = "T"

    @staticmethod
    def from_stdout(output: str, requested_pid: int) -> "TopCommand":
        list_var = [None for i in range(12)]
        for line in output.split("\n"):
            if str(requested_pid) in line:
                list_var = line.split()
        return TopCommand(
            pid=list_var[0],
            user=list_var[1],
            pr=list_var[2],
            ni=list_var[3],
            virt=list_var[4],
            res=list_var[5],
            shr=list_var[6],
            status=list_var[7],
            cpu_percent=list_var[8],
            mem_percent=list_var[9],
            time=list_var[10],
            cmd=list_var[11],
        )
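# Illustrative sketch (not part of the library API): shows how `TopCommand.from_stdout`
# maps the columns of a single `top -b -n 1 -p <pid>` row onto fields. The sample row and
# the helper name below are assumptions made only to document the expected column order.
def _example_parse_top_row() -> TopCommand:
    sample_row = "  1234 frostfs   20   0 1024512  51200   6400 S   1.3   0.7   0:42.00 frostfs-node"
    parsed = TopCommand.from_stdout(sample_row, requested_pid=1234)
    assert parsed.status == TopCommand.STATUS_SLEEP  # "S" is the eighth column of the row
    return parsed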
@reporter.step_deco("Run `top` command with specified PID")
def service_status_top(service: str, shell: Shell) -> TopCommand:
    pid = service_pid(service, shell)
    output = shell.exec(f"sudo top -b -n 1 -p {pid}").stdout
    return TopCommand.from_stdout(output, pid)


@reporter.step_deco("Restart service n times with sleep")
def multiple_restart(
    service_type: type[NodeBase],
    node: ClusterNode,
    count: int = 5,
    sleep_interval: int = 2,
):
    service_systemctl_name = node.service(service_type).get_service_systemctl_name()
    service_name = node.service(service_type).name
    for _ in range(count):
        node.host.restart_service(service_name)
        logger.info(
            f"Restart {service_systemctl_name}; sleep {sleep_interval} seconds and continue"
        )
        sleep(sleep_interval)


@reporter.step_deco("Get status of list of services and check expected status")
@wait_for_success(60, 5)
def check_services_status(service_list: list[str], expected_status: str, shell: Shell):
    cmd = ""
    for service in service_list:
        cmd += f' sudo systemctl status {service} --lines=0 | grep "Active:";'
    result = shell.exec(cmd).stdout.rstrip()
    statuses = list()
    for line in result.split("\n"):
        status_substring = line.split()
        statuses.append(status_substring[1])
    unique_statuses = list(set(statuses))
    assert (
        len(unique_statuses) == 1 and expected_status in unique_statuses
    ), f"Requested status={expected_status} not found in requested services={service_list}, list of statuses={result}"


@reporter.step_deco("Wait for active status of passed service")
@wait_for_success(60, 5)
def wait_service_in_desired_state(
    service: str, shell: Shell, expected_status: Optional[str] = "active"
):
    real_status = service_status(service=service, shell=shell)
    assert (
        expected_status == real_status
    ), f"Service {service}: expected status={expected_status}, real status={real_status}"


@reporter.step_deco("Run healthcheck against passed service")
@wait_for_success(parse_time(SERVICE_MAX_STARTUP_TIME), 1)
def service_type_healthcheck(
    service_type: type[NodeBase],
    node: ClusterNode,
):
    service = node.service(service_type)
    assert (
        service.service_healthcheck()
    ), f"Healthcheck failed for {service.get_service_systemctl_name()}, IP={node.host_ip}"


@reporter.step_deco("Kill by process name")
def kill_by_service_name(service_type: type[NodeBase], node: ClusterNode):
    service_systemctl_name = node.service(service_type).get_service_systemctl_name()
    pid = service_pid(service_systemctl_name, node.host.get_shell())
    node.host.get_shell().exec(f"sudo kill -9 {pid}")


@reporter.step_deco("Service {service} suspend")
def suspend_service(shell: Shell, service: str):
    shell.exec(f"sudo kill -STOP {service_pid(service, shell)}")


@reporter.step_deco("Service {service} resume")
def resume_service(shell: Shell, service: str):
    shell.exec(f"sudo kill -CONT {service_pid(service, shell)}")


@reporter.step_deco("Retrieve service's pid")
# Retry needed because a '0' PID may be returned when the service has only just been started
@wait_for_success(10, 1)
def service_pid(service: str, shell: Shell) -> int:
    output = shell.exec(f"systemctl show --property MainPID {service}").stdout.rstrip()
    splitted = output.split("=")
    PID = int(splitted[1])
    assert PID > 0, f"Service {service} has invalid PID={PID}"
    return PID
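# Illustrative sketch (assumed test scenario, not used by the library itself): temporarily
# freeze a service with SIGSTOP and let it continue with SIGCONT, using the helpers above.
# The function name and `pause_seconds` parameter are illustrative only.
def _example_freeze_service(shell: Shell, service: str, pause_seconds: int = 10) -> None:
    suspend_service(shell, service)  # SIGSTOP the main PID resolved via service_pid()
    sleep(pause_seconds)             # keep the process frozen for the requested interval
    resume_service(shell, service)   # SIGCONT so the process resumes where it was stopped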
@reporter.step_deco("Wrapper for neo-go dump keys command")
def dump_keys(shell: Shell, node: ClusterNode) -> dict:
    host = node.host
    service_config = host.get_service_config(node.service(MorphChain).name)
    wallet = service_config.attributes["wallet_path"]
    return neo_go_dump_keys(shell=shell, wallet=wallet)


@reporter.step_deco("Wait for object replication")
def wait_object_replication(
    cid: str,
    oid: str,
    expected_copies: int,
    shell: Shell,
    nodes: list[StorageNode],
    sleep_interval: int = 15,
    attempts: int = 20,
) -> list[StorageNode]:
    nodes_with_object = []
    for _ in range(attempts):
        nodes_with_object = get_nodes_with_object(cid, oid, shell=shell, nodes=nodes)
        if len(nodes_with_object) >= expected_copies:
            return nodes_with_object
        sleep(sleep_interval)
    raise AssertionError(
        f"Expected {expected_copies} copies of object, but found {len(nodes_with_object)}. "
        f"Waiting time {sleep_interval * attempts}"
    )


def is_all_storage_nodes_returned(cluster: Cluster) -> bool:
    with reporter.step("Run health check for all storage nodes"):
        for node in cluster.services(StorageNode):
            try:
                health_check = storage_node_healthcheck(node)
            except Exception as err:
                logger.warning(f"Node healthcheck fails with error {err}")
                return False
            if health_check.health_status != "READY" or health_check.network_status != "ONLINE":
                return False
    return True
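# Illustrative sketch (an assumption about how a failover test might combine these helpers;
# the `cluster`, `node` and `shell` arguments are expected to come from the test fixtures,
# and the function name is hypothetical):
def _example_restart_storage_and_wait(cluster: Cluster, node: ClusterNode, shell: Shell) -> None:
    multiple_restart(StorageNode, node, count=3, sleep_interval=5)  # bounce the storage service
    service_type_healthcheck(StorageNode, node)                     # wait until the service reports healthy
    wait_all_storage_nodes_returned(shell, cluster)                 # then wait for the whole cluster
    assert is_all_storage_nodes_returned(cluster), "Storage nodes did not return to the cluster"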