Fix node transition to online state

Node hangs up if we attempt to transfer it to online state immediately after start.

Signed-off-by: Vladimir Domnich <v.domnich@yadro.com>
develop
Vladimir Domnich 2022-08-15 10:40:04 -04:00 committed by Vladimir Domnich
parent a76614b40d
commit b270f39387
3 changed files with 53 additions and 29 deletions

View File

@ -1,3 +1,4 @@
import json
import logging
import re
import time
@ -19,25 +20,21 @@ class LocalDevEnvStorageServiceHelper:
"""
Manages storage services running on local devenv.
"""
def stop_node(self, node_name: str) -> None:
def stop_node(self, node_name: str, wait: bool = True) -> None:
container_name = _get_storage_container_name(node_name)
client = self._get_docker_client(node_name)
client.stop(container_name)
def start_node(self, node_name: str) -> None:
if wait:
self._wait_for_container_to_be_in_state(node_name, container_name, "exited")
def start_node(self, node_name: str, wait: bool = True) -> None:
container_name = _get_storage_container_name(node_name)
client = self._get_docker_client(node_name)
client.start(container_name)
def wait_for_node_to_start(self, node_name: str) -> None:
container_name = _get_storage_container_name(node_name)
expected_state = "running"
for __attempt in range(10):
container = self._get_container_by_name(node_name, container_name)
if container and container["State"] == expected_state:
return
time.sleep(3)
raise AssertionError(f'Container {container_name} is not in {expected_state} state')
if wait:
self._wait_for_container_to_be_in_state(node_name, container_name, "running")
def run_control_command(self, node_name: str, command: str) -> str:
control_endpoint = NEOFS_NETMAP_DICT[node_name]["control"]
@ -64,14 +61,30 @@ class LocalDevEnvStorageServiceHelper:
def _get_container_by_name(self, node_name: str, container_name: str) -> dict:
client = self._get_docker_client(node_name)
containers = client.containers()
containers = client.containers(all=True)
logger.info(f"Current containers state\n:{json.dumps(containers, indent=2)}")
for container in containers:
if container_name in container["Names"]:
# Names in local docker environment are prefixed with /
clean_names = set(name.strip("/") for name in container["Names"])
if container_name in clean_names:
return container
return None
def _wait_for_container_to_be_in_state(self, node_name: str, container_name: str,
expected_state: str) -> None:
for __attempt in range(10):
container = self._get_container_by_name(node_name, container_name)
logger.info(f"Container info:\n{json.dumps(container, indent=2)}")
if container and container["State"] == expected_state:
return
time.sleep(5)
raise AssertionError(f'Container {container_name} is not in {expected_state} state.')
def _get_docker_client(self, node_name: str) -> docker.APIClient:
# For local devenv we use default docker client that talks to unix socket
# For local docker we use default docker client that talks to unix socket
client = docker.APIClient()
return client
@ -79,29 +92,23 @@ class LocalDevEnvStorageServiceHelper:
class CloudVmStorageServiceHelper:
STORAGE_SERVICE = "neofs-storage.service"
def stop_node(self, node_name: str) -> None:
def stop_node(self, node_name: str, wait: bool = True) -> None:
with _create_ssh_client(node_name) as ssh_client:
cmd = f"sudo systemctl stop {self.STORAGE_SERVICE}"
output = ssh_client.exec_with_confirmation(cmd, [""])
logger.info(f"Stop command output: {output.stdout}")
def start_node(self, node_name: str) -> None:
if wait:
self._wait_for_service_to_be_in_state(node_name, self.STORAGE_SERVICE, "inactive")
def start_node(self, node_name: str, wait: bool = True) -> None:
with _create_ssh_client(node_name) as ssh_client:
cmd = f"sudo systemctl start {self.STORAGE_SERVICE}"
output = ssh_client.exec_with_confirmation(cmd, [""])
logger.info(f"Start command output: {output.stdout}")
def wait_for_node_to_start(self, node_name: str) -> None:
expected_state = 'active (running)'
with _create_ssh_client(node_name) as ssh_client:
for __attempt in range(10):
output = ssh_client.exec(f'sudo systemctl status {self.STORAGE_SERVICE}')
if expected_state in output.stdout:
return
time.sleep(3)
raise AssertionError(
f'Service {self.STORAGE_SERVICE} is not in {expected_state} state'
)
if wait:
self._wait_for_service_to_be_in_state(node_name, self.STORAGE_SERVICE, "active (running)")
def run_control_command(self, node_name: str, command: str) -> str:
control_endpoint = NEOFS_NETMAP_DICT[node_name]["control"]
@ -129,6 +136,19 @@ class CloudVmStorageServiceHelper:
output = ssh_client.exec_with_confirmation(cmd, [""])
return output.stdout
def _wait_for_service_to_be_in_state(self, node_name: str, service_name: str,
expected_state: str) -> None:
with _create_ssh_client(node_name) as ssh_client:
for __attempt in range(10):
# Run command to get service status (set --lines=0 to suppress logs output)
# Also we don't verify return code, because for an inactive service return code will be 3
command = f'sudo systemctl status {service_name} --lines=0'
output = ssh_client.exec(command, verify=False)
if expected_state in output.stdout:
return
time.sleep(3)
raise AssertionError(f'Service {service_name} is not in {expected_state} state')
def delete_node_data(self, node_name: str) -> None:
with _create_ssh_client(node_name) as ssh_client:
ssh_client.exec("sudo rm -rf /srv/neofs/*")
@ -174,7 +194,7 @@ class RemoteDevEnvStorageServiceHelper(LocalDevEnvStorageServiceHelper):
Manages storage services running on remote devenv.
Most of operations are identical to local devenv, however, any interactions
with host resources (files, etc.) require ssh into the devenv host machine.
with host resources (files, etc.) require ssh into the remote host machine.
"""
def _get_docker_client(self, node_name: str) -> docker.APIClient:
# For remote devenv we use docker client that talks to tcp socket 2375:

View File

@ -78,7 +78,10 @@ def return_nodes(alive_node: str = None):
for node in list(check_nodes):
with allure.step(f'Start node {node}'):
helper.start_node(node)
helper.wait_for_node_to_start(node)
# Wait for node to receive notifications from morph-chain
sleep(robot_time_to_int(MAINNET_BLOCK_TIME))
tick_epoch()
with allure.step(f'Move node {node} to online state'):
node_set_status(node, status='online', retries=2)

View File

@ -191,6 +191,7 @@ def drop_object(node_name: str, cid: str, oid: str) -> str:
@keyword('Delete data of node {node_name}')
def delete_node_data(node_name: str) -> None:
helper = get_storage_service_helper()
helper.stop_node(node_name)
helper.delete_node_data(node_name)
time.sleep(robot_time_to_int(MAINNET_BLOCK_TIME))