Fix node transition to online state
The node hangs if we attempt to move it to the online state immediately after it starts.
Signed-off-by: Vladimir Domnich <v.domnich@yadro.com>
This commit is contained in:
parent
a76614b40d
commit
b270f39387
3 changed files with 53 additions and 29 deletions
|
@ -1,3 +1,4 @@
|
||||||
|
import json
|
||||||
import logging
|
import logging
|
||||||
import re
|
import re
|
||||||
import time
|
import time
|
||||||
|
@ -19,25 +20,21 @@ class LocalDevEnvStorageServiceHelper:
|
||||||
"""
|
"""
|
||||||
Manages storage services running on local devenv.
|
Manages storage services running on local devenv.
|
||||||
"""
|
"""
|
||||||
def stop_node(self, node_name: str) -> None:
|
def stop_node(self, node_name: str, wait: bool = True) -> None:
|
||||||
container_name = _get_storage_container_name(node_name)
|
container_name = _get_storage_container_name(node_name)
|
||||||
client = self._get_docker_client(node_name)
|
client = self._get_docker_client(node_name)
|
||||||
client.stop(container_name)
|
client.stop(container_name)
|
||||||
|
|
||||||
def start_node(self, node_name: str) -> None:
|
if wait:
|
||||||
|
self._wait_for_container_to_be_in_state(node_name, container_name, "exited")
|
||||||
|
|
||||||
|
def start_node(self, node_name: str, wait: bool = True) -> None:
|
||||||
container_name = _get_storage_container_name(node_name)
|
container_name = _get_storage_container_name(node_name)
|
||||||
client = self._get_docker_client(node_name)
|
client = self._get_docker_client(node_name)
|
||||||
client.start(container_name)
|
client.start(container_name)
|
||||||
|
|
||||||
def wait_for_node_to_start(self, node_name: str) -> None:
|
if wait:
|
||||||
container_name = _get_storage_container_name(node_name)
|
self._wait_for_container_to_be_in_state(node_name, container_name, "running")
|
||||||
expected_state = "running"
|
|
||||||
for __attempt in range(10):
|
|
||||||
container = self._get_container_by_name(node_name, container_name)
|
|
||||||
if container and container["State"] == expected_state:
|
|
||||||
return
|
|
||||||
time.sleep(3)
|
|
||||||
raise AssertionError(f'Container {container_name} is not in {expected_state} state')
|
|
||||||
|
|
||||||
def run_control_command(self, node_name: str, command: str) -> str:
|
def run_control_command(self, node_name: str, command: str) -> str:
|
||||||
control_endpoint = NEOFS_NETMAP_DICT[node_name]["control"]
|
control_endpoint = NEOFS_NETMAP_DICT[node_name]["control"]
|
||||||
|
@ -64,14 +61,30 @@ class LocalDevEnvStorageServiceHelper:
|
||||||
|
|
||||||
def _get_container_by_name(self, node_name: str, container_name: str) -> dict:
|
def _get_container_by_name(self, node_name: str, container_name: str) -> dict:
|
||||||
client = self._get_docker_client(node_name)
|
client = self._get_docker_client(node_name)
|
||||||
containers = client.containers()
|
containers = client.containers(all=True)
|
||||||
|
|
||||||
|
logger.info(f"Current containers state\n:{json.dumps(containers, indent=2)}")
|
||||||
|
|
||||||
for container in containers:
|
for container in containers:
|
||||||
if container_name in container["Names"]:
|
# Names in local docker environment are prefixed with /
|
||||||
|
clean_names = set(name.strip("/") for name in container["Names"])
|
||||||
|
if container_name in clean_names:
|
||||||
return container
|
return container
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
def _wait_for_container_to_be_in_state(self, node_name: str, container_name: str,
|
||||||
|
expected_state: str) -> None:
|
||||||
|
for __attempt in range(10):
|
||||||
|
container = self._get_container_by_name(node_name, container_name)
|
||||||
|
logger.info(f"Container info:\n{json.dumps(container, indent=2)}")
|
||||||
|
if container and container["State"] == expected_state:
|
||||||
|
return
|
||||||
|
time.sleep(5)
|
||||||
|
|
||||||
|
raise AssertionError(f'Container {container_name} is not in {expected_state} state.')
|
||||||
|
|
||||||
def _get_docker_client(self, node_name: str) -> docker.APIClient:
|
def _get_docker_client(self, node_name: str) -> docker.APIClient:
|
||||||
# For local devenv we use default docker client that talks to unix socket
|
# For local docker we use default docker client that talks to unix socket
|
||||||
client = docker.APIClient()
|
client = docker.APIClient()
|
||||||
return client
|
return client
|
||||||
|
|
||||||
|
@ -79,29 +92,23 @@ class LocalDevEnvStorageServiceHelper:
|
||||||
class CloudVmStorageServiceHelper:
|
class CloudVmStorageServiceHelper:
|
||||||
STORAGE_SERVICE = "neofs-storage.service"
|
STORAGE_SERVICE = "neofs-storage.service"
|
||||||
|
|
||||||
def stop_node(self, node_name: str) -> None:
|
def stop_node(self, node_name: str, wait: bool = True) -> None:
|
||||||
with _create_ssh_client(node_name) as ssh_client:
|
with _create_ssh_client(node_name) as ssh_client:
|
||||||
cmd = f"sudo systemctl stop {self.STORAGE_SERVICE}"
|
cmd = f"sudo systemctl stop {self.STORAGE_SERVICE}"
|
||||||
output = ssh_client.exec_with_confirmation(cmd, [""])
|
output = ssh_client.exec_with_confirmation(cmd, [""])
|
||||||
logger.info(f"Stop command output: {output.stdout}")
|
logger.info(f"Stop command output: {output.stdout}")
|
||||||
|
|
||||||
def start_node(self, node_name: str) -> None:
|
if wait:
|
||||||
|
self._wait_for_service_to_be_in_state(node_name, self.STORAGE_SERVICE, "inactive")
|
||||||
|
|
||||||
|
def start_node(self, node_name: str, wait: bool = True) -> None:
|
||||||
with _create_ssh_client(node_name) as ssh_client:
|
with _create_ssh_client(node_name) as ssh_client:
|
||||||
cmd = f"sudo systemctl start {self.STORAGE_SERVICE}"
|
cmd = f"sudo systemctl start {self.STORAGE_SERVICE}"
|
||||||
output = ssh_client.exec_with_confirmation(cmd, [""])
|
output = ssh_client.exec_with_confirmation(cmd, [""])
|
||||||
logger.info(f"Start command output: {output.stdout}")
|
logger.info(f"Start command output: {output.stdout}")
|
||||||
|
|
||||||
def wait_for_node_to_start(self, node_name: str) -> None:
|
if wait:
|
||||||
expected_state = 'active (running)'
|
self._wait_for_service_to_be_in_state(node_name, self.STORAGE_SERVICE, "active (running)")
|
||||||
with _create_ssh_client(node_name) as ssh_client:
|
|
||||||
for __attempt in range(10):
|
|
||||||
output = ssh_client.exec(f'sudo systemctl status {self.STORAGE_SERVICE}')
|
|
||||||
if expected_state in output.stdout:
|
|
||||||
return
|
|
||||||
time.sleep(3)
|
|
||||||
raise AssertionError(
|
|
||||||
f'Service {self.STORAGE_SERVICE} is not in {expected_state} state'
|
|
||||||
)
|
|
||||||
|
|
||||||
def run_control_command(self, node_name: str, command: str) -> str:
|
def run_control_command(self, node_name: str, command: str) -> str:
|
||||||
control_endpoint = NEOFS_NETMAP_DICT[node_name]["control"]
|
control_endpoint = NEOFS_NETMAP_DICT[node_name]["control"]
|
||||||
|
@ -129,6 +136,19 @@ class CloudVmStorageServiceHelper:
|
||||||
output = ssh_client.exec_with_confirmation(cmd, [""])
|
output = ssh_client.exec_with_confirmation(cmd, [""])
|
||||||
return output.stdout
|
return output.stdout
|
||||||
|
|
||||||
|
def _wait_for_service_to_be_in_state(self, node_name: str, service_name: str,
|
||||||
|
expected_state: str) -> None:
|
||||||
|
with _create_ssh_client(node_name) as ssh_client:
|
||||||
|
for __attempt in range(10):
|
||||||
|
# Run the command that reports service status (--lines=0 suppresses log output)
|
||||||
|
# We also don't verify the return code, because for an inactive service it will be 3
|
||||||
|
command = f'sudo systemctl status {service_name} --lines=0'
|
||||||
|
output = ssh_client.exec(command, verify=False)
|
||||||
|
if expected_state in output.stdout:
|
||||||
|
return
|
||||||
|
time.sleep(3)
|
||||||
|
raise AssertionError(f'Service {service_name} is not in {expected_state} state')
|
||||||
|
|
||||||
def delete_node_data(self, node_name: str) -> None:
|
def delete_node_data(self, node_name: str) -> None:
|
||||||
with _create_ssh_client(node_name) as ssh_client:
|
with _create_ssh_client(node_name) as ssh_client:
|
||||||
ssh_client.exec("sudo rm -rf /srv/neofs/*")
|
ssh_client.exec("sudo rm -rf /srv/neofs/*")
|
||||||
|
@ -174,7 +194,7 @@ class RemoteDevEnvStorageServiceHelper(LocalDevEnvStorageServiceHelper):
|
||||||
Manages storage services running on remote devenv.
|
Manages storage services running on remote devenv.
|
||||||
|
|
||||||
Most of operations are identical to local devenv, however, any interactions
|
Most of operations are identical to local devenv, however, any interactions
|
||||||
with host resources (files, etc.) require ssh into the devenv host machine.
|
with host resources (files, etc.) require ssh into the remote host machine.
|
||||||
"""
|
"""
|
||||||
def _get_docker_client(self, node_name: str) -> docker.APIClient:
|
def _get_docker_client(self, node_name: str) -> docker.APIClient:
|
||||||
# For remote devenv we use docker client that talks to tcp socket 2375:
|
# For remote devenv we use docker client that talks to tcp socket 2375:
|
||||||
|
|
|
@ -78,7 +78,10 @@ def return_nodes(alive_node: str = None):
|
||||||
for node in list(check_nodes):
|
for node in list(check_nodes):
|
||||||
with allure.step(f'Start node {node}'):
|
with allure.step(f'Start node {node}'):
|
||||||
helper.start_node(node)
|
helper.start_node(node)
|
||||||
helper.wait_for_node_to_start(node)
|
|
||||||
|
# Wait for node to receive notifications from morph-chain
|
||||||
|
sleep(robot_time_to_int(MAINNET_BLOCK_TIME))
|
||||||
|
tick_epoch()
|
||||||
|
|
||||||
with allure.step(f'Move node {node} to online state'):
|
with allure.step(f'Move node {node} to online state'):
|
||||||
node_set_status(node, status='online', retries=2)
|
node_set_status(node, status='online', retries=2)
|
||||||
|
|
|
@ -191,6 +191,7 @@ def drop_object(node_name: str, cid: str, oid: str) -> str:
|
||||||
@keyword('Delete data of node {node_name}')
|
@keyword('Delete data of node {node_name}')
|
||||||
def delete_node_data(node_name: str) -> None:
|
def delete_node_data(node_name: str) -> None:
|
||||||
helper = get_storage_service_helper()
|
helper = get_storage_service_helper()
|
||||||
|
helper.stop_node(node_name)
|
||||||
helper.delete_node_data(node_name)
|
helper.delete_node_data(node_name)
|
||||||
time.sleep(robot_time_to_int(MAINNET_BLOCK_TIME))
|
time.sleep(robot_time_to_int(MAINNET_BLOCK_TIME))
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue