2022-07-11 14:11:26 +00:00
import logging
2023-10-25 12:47:29 +00:00
import random
2023-05-31 15:31:35 +00:00
from datetime import datetime
2023-03-09 10:19:41 +00:00
from time import sleep
2022-07-11 14:11:26 +00:00
import allure
import pytest
2023-11-29 13:34:59 +00:00
from frostfs_testlib import reporter
2023-11-24 10:22:00 +00:00
from frostfs_testlib . resources . common import MORPH_BLOCK_TIME
2023-05-15 09:59:33 +00:00
from frostfs_testlib . resources . wellknown_acl import PUBLIC_ACL
2023-10-31 14:51:09 +00:00
from frostfs_testlib . s3 import S3ClientWrapper , VersioningStatus
2023-11-24 10:22:00 +00:00
from frostfs_testlib . steps . cli . container import StorageContainer , StorageContainerInfo , create_container
from frostfs_testlib . steps . cli . object import get_object , put_object_to_random_node
2023-05-15 09:59:33 +00:00
from frostfs_testlib . steps . node_management import (
2023-03-09 10:19:41 +00:00
check_node_in_map ,
check_node_not_in_map ,
exclude_node_from_network_map ,
include_node_to_network_map ,
2023-05-15 09:59:33 +00:00
remove_nodes_from_map_morph ,
2023-03-09 10:19:41 +00:00
wait_for_node_to_be_ready ,
)
2023-05-15 09:59:33 +00:00
from frostfs_testlib . steps . s3 import s3_helper
2024-02-13 16:26:11 +00:00
from frostfs_testlib . steps . s3 . s3_helper import search_nodes_with_bucket
2023-10-31 11:28:44 +00:00
from frostfs_testlib . storage . cluster import Cluster , ClusterNode , S3Gate , StorageNode
2023-05-31 15:31:35 +00:00
from frostfs_testlib . storage . controllers import ClusterStateController , ShardsWatcher
2023-08-02 11:54:03 +00:00
from frostfs_testlib . storage . dataclasses . object_size import ObjectSize
2023-11-24 10:22:00 +00:00
from frostfs_testlib . storage . dataclasses . storage_object_info import StorageObjectInfo
2023-05-31 15:31:35 +00:00
from frostfs_testlib . storage . dataclasses . wallet import WalletInfo
2023-05-15 09:59:33 +00:00
from frostfs_testlib . testing . cluster_test_base import ClusterTestBase
2023-05-30 13:42:27 +00:00
from frostfs_testlib . testing . test_control import expect_not_raises
2023-05-15 09:59:33 +00:00
from frostfs_testlib . utils import datetime_utils
2023-10-25 12:47:29 +00:00
from frostfs_testlib . utils . failover_utils import wait_object_replication
2023-05-30 13:42:27 +00:00
from frostfs_testlib . utils . file_keeper import FileKeeper
2023-05-15 09:59:33 +00:00
from frostfs_testlib . utils . file_utils import generate_file , get_file_hash
2023-03-09 10:19:41 +00:00
2022-09-28 12:07:16 +00:00
# Logger used by this test module.
logger = logging.getLogger("NeoLogger")

# Module-level registry of storage nodes that a test has excluded from the
# network map and not yet returned. Tests append to / remove from it, and the
# EmptyMap teardown fixtures read it to handle whatever is left after a failure.
stopped_nodes: list[StorageNode] = []
2022-07-11 14:11:26 +00:00
2023-05-30 13:42:27 +00:00
@pytest.fixture(scope="function")
@allure.title("Provide File Keeper")
def file_keeper():
    """Provide a fresh ``FileKeeper`` to the test and call ``restore_files()`` on it afterwards."""
    file_guard = FileKeeper()
    yield file_guard
    # Everything below the yield is the fixture's teardown phase.
    file_guard.restore_files()
2022-07-14 06:21:20 +00:00
@pytest.mark.failover
@pytest.mark.failover_storage
class TestFailoverStorage(ClusterTestBase):
    """Failover tests that stop storage hosts or services and check that stored data stays readable."""

    @allure.title("Shutdown and start node (stop_mode={stop_mode})")
    @pytest.mark.parametrize("stop_mode", ["hard", "soft"])
    @pytest.mark.failover_reboot
    def test_lose_storage_node_host(
        self,
        default_wallet,
        stop_mode: str,
        require_multiple_hosts,
        simple_object_size: ObjectSize,
        cluster: Cluster,
        cluster_state_controller: ClusterStateController,
    ):
        """
        Stop the hosts of two nodes that hold an object and verify the object is intact.

        Two of the nodes returned by the initial replication wait are stopped one at a time.
        After each stop ``wait_object_replication`` is called against the nodes that are still
        up. The object's hash is compared with the source file while the hosts are down and
        once more after all stopped hosts have been started again.
        """
        down_nodes = []
        source_file_path = generate_file(simple_object_size.value)

        with reporter.step("Create container and put object"):
            cid = create_container(
                default_wallet,
                shell=self.shell,
                endpoint=self.cluster.default_rpc_endpoint,
                rule="REP 2 IN X CBF 2 SELECT 2 FROM * AS X",
                basic_acl=PUBLIC_ACL,
            )
            oid = put_object_to_random_node(
                default_wallet, source_file_path, cid, shell=self.shell, cluster=self.cluster
            )

        with reporter.step("Wait for replication and get nodes with object"):
            nodes_with_object = wait_object_replication(
                cid, oid, 2, shell=self.shell, nodes=self.cluster.storage_nodes
            )

        with reporter.step("Stop 2 nodes with object and wait replication one by one"):
            for victim in random.sample(nodes_with_object, 2):
                down_nodes.append(victim)
                cluster_state_controller.stop_node_host(cluster.node(victim), stop_mode)

                # Only nodes whose hosts are still running can receive replicas.
                alive_nodes = list(set(self.cluster.storage_nodes) - {*down_nodes})
                replicated_nodes = wait_object_replication(cid, oid, 2, shell=self.shell, nodes=alive_nodes)

        with reporter.step("Check object data is not corrupted"):
            got_file_path = get_object(
                default_wallet,
                cid,
                oid,
                endpoint=replicated_nodes[0].get_rpc_endpoint(),
                shell=self.shell,
            )
            assert get_file_hash(source_file_path) == get_file_hash(got_file_path)

        with reporter.step("Return all hosts"):
            cluster_state_controller.start_stopped_hosts()

        with reporter.step("Check object data is not corrupted"):
            replicated_nodes = wait_object_replication(
                cid, oid, 2, shell=self.shell, nodes=self.cluster.storage_nodes
            )
            got_file_path = get_object(
                default_wallet,
                cid,
                oid,
                shell=self.shell,
                endpoint=replicated_nodes[0].get_rpc_endpoint(),
            )
            assert get_file_hash(source_file_path) == get_file_hash(got_file_path)

    @allure.title("Do not ignore unhealthy tree endpoints (s3_client={s3_client})")
    def test_unhealthy_tree(
        self,
        s3_client: S3ClientWrapper,
        default_wallet: WalletInfo,
        simple_object_size: ObjectSize,
        cluster_state_controller: ClusterStateController,
    ):
        """
        Check that an S3 object stays listable when only the bucket's node runs storage.

        The S3 gate and storage service of the first cluster node are restarted in a fixed
        order, an object is put into a new bucket, then the storage service is stopped on
        every cluster node except the one found by ``search_nodes_with_bucket``. The S3 client
        is pointed at that node's gate and the bucket listing is checked again.
        """
        default_node = self.cluster.cluster_nodes[0]

        # (step title, controller action, service type) - executed strictly in this order.
        restart_sequence = (
            ("Turn S3 GW off on default node", cluster_state_controller.stop_service_of_type, S3Gate),
            ("Turn off storage on default node", cluster_state_controller.stop_service_of_type, StorageNode),
            ("Turn on S3 GW on default node", cluster_state_controller.start_service_of_type, S3Gate),
            ("Turn on storage on default node", cluster_state_controller.start_service_of_type, StorageNode),
        )
        for step_title, action, service_type in restart_sequence:
            with reporter.step(step_title):
                action(default_node, service_type)

        with reporter.step("Create bucket with REP 1 SELECT 1 policy"):
            bucket = s3_client.create_bucket(location_constraint="load-1-1")

        file_path = generate_file(simple_object_size.value)
        file_name = s3_helper.object_key_from_file_path(file_path)

        with reporter.step("Put object into bucket"):
            s3_client.put_object(bucket, file_path)
            s3_helper.check_objects_in_bucket(s3_client, bucket, expected_objects=[file_name])

        nodes_with_bucket = search_nodes_with_bucket(
            cluster=self.cluster,
            bucket_name=bucket,
            wallet=default_wallet,
            shell=self.shell,
            endpoint=self.cluster.storage_nodes[0].get_rpc_endpoint(),
        )
        node_bucket = nodes_with_bucket[0]

        with reporter.step("Turn off all storage nodes except bucket node"):
            nodes_to_stop = [candidate for candidate in self.cluster.cluster_nodes if candidate != node_bucket]
            for node in nodes_to_stop:
                with reporter.step(f"Stop storage service on node: {node}"):
                    cluster_state_controller.stop_service_of_type(node, StorageNode)

        with reporter.step("Change s3 endpoint to bucket node"):
            s3_client.set_endpoint(node_bucket.s3_gate.get_endpoint())

        with reporter.step("Check that object is available"):
            s3_helper.check_objects_in_bucket(s3_client, bucket, expected_objects=[file_name])

        with reporter.step("Start storage nodes"):
            cluster_state_controller.start_all_stopped_services()
2023-10-25 12:47:29 +00:00
2023-03-09 10:19:41 +00:00
@pytest.mark.failover
@pytest.mark.failover_empty_map
class TestEmptyMap(ClusterTestBase):
    """
    Tests that empty the network map (or wipe node-local storage files) and verify
    how objects behave through S3 afterwards.
    """

    @reporter.step("Teardown after EmptyMap offline test")
    @pytest.fixture()
    def empty_map_offline_teardown(self):
        """Return to the network map every node still registered in ``stopped_nodes`` after the test."""
        yield

        with reporter.step("Return all storage nodes to network map"):
            # Iterate over a snapshot: removing items from the list that is being
            # iterated would skip every second node and leave it out of the map.
            for node in list(stopped_nodes):
                include_node_to_network_map(node, node, shell=self.shell, cluster=self.cluster)
                stopped_nodes.remove(node)

    # NOTE(review): the marker name is spelled "offlne"; it is kept as is because marker
    # names may be referenced by external test selection - confirm before renaming.
    @pytest.mark.failover_empty_map_offlne
    @allure.title("Empty network map via offline all storage nodes (s3_client={s3_client})")
    def test_offline_all_storage_nodes(
        self,
        s3_client: S3ClientWrapper,
        bucket: str,
        simple_object_size: ObjectSize,
        empty_map_offline_teardown,
    ):
        """
        Exclude every storage node from the network map, return them all, then check
        that the object can still be read through S3.

        Steps:
        1. PUT object into bucket
        2. Check that object exists in bucket
        3. Exclude all storage nodes from network map
        4. Return all storage nodes to network map
        5. Check that we can read object from #1

        Args:
            s3_client: S3 client used for all bucket operations
            bucket: bucket which contains tested object
            simple_object_size: size of object
            empty_map_offline_teardown: teardown that returns leftover nodes to the map
        """
        file_path = generate_file(simple_object_size.value)
        file_name = s3_helper.object_key_from_file_path(file_path)
        bucket_objects = [file_name]

        with reporter.step("Put object into bucket"):
            s3_client.put_object(bucket, file_path)

        with reporter.step("Check that object exists in bucket"):
            s3_helper.check_objects_in_bucket(s3_client, bucket, bucket_objects)

        storage_nodes = self.cluster.storage_nodes
        with reporter.step("Exclude all storage nodes from network map"):
            for node in storage_nodes:
                # Register the node first so the teardown can return it even if exclusion fails.
                stopped_nodes.append(node)
                exclude_node_from_network_map(node, node, shell=self.shell, cluster=self.cluster)

        with reporter.step("Return all storage nodes to network map"):
            for node in storage_nodes:
                include_node_to_network_map(node, node, shell=self.shell, cluster=self.cluster)
                stopped_nodes.remove(node)

        with reporter.step("Check that we can read object"):
            s3_helper.check_objects_in_bucket(s3_client, bucket, bucket_objects)

    @reporter.step("Teardown after EmptyMap stop service test")
    @pytest.fixture()
    def empty_map_stop_service_teardown(self, cluster_state_controller: ClusterStateController):
        """Start all stopped services after the test and check nodes from ``stopped_nodes`` are in the map."""
        yield

        with reporter.step("Return all storage nodes to network map"):
            cluster_state_controller.start_all_stopped_services()
            for node in stopped_nodes:
                check_node_in_map(node, shell=self.shell, alive_node=node)

    @pytest.mark.failover_empty_map_stop_service
    @allure.title("Empty network map via stop all storage services (s3_client={s3_client})")
    def test_stop_all_storage_nodes(
        self,
        s3_client: S3ClientWrapper,
        bucket: str,
        simple_object_size: ObjectSize,
        empty_map_stop_service_teardown,
        cluster_state_controller: ClusterStateController,
    ):
        """
        Stop the storage service on all nodes, remove the nodes from the network map,
        start the services again and check that the object can be read through S3.

        Steps:
        1. PUT object into bucket
        2. Check that object exists in bucket
        3. Stop all storage services and remove all nodes from network map
        4. Return all storage nodes to network map
        5. Check that we can read object from #1

        Args:
            s3_client: S3 client used for all bucket operations
            bucket: bucket which contains tested object
            simple_object_size: size of object
            empty_map_stop_service_teardown: teardown that restarts stopped services
            cluster_state_controller: controller used to stop and start services
        """
        file_path = generate_file(simple_object_size.value)
        file_name = s3_helper.object_key_from_file_path(file_path)
        bucket_objects = [file_name]

        with reporter.step("Put object into bucket"):
            s3_client.put_object(bucket, file_path)

        with reporter.step("Check that object exists in bucket"):
            s3_helper.check_objects_in_bucket(s3_client, bucket, bucket_objects)

        with reporter.step("Stop all storage nodes"):
            cluster_state_controller.stop_services_of_type(StorageNode)

        with reporter.step("Remove all nodes from network map"):
            remove_nodes_from_map_morph(
                shell=self.shell, cluster=self.cluster, remove_nodes=self.cluster.services(StorageNode)
            )

        with reporter.step("Return all storage nodes to network map"):
            self.return_nodes_after_stop_with_check_empty_map(cluster_state_controller)

        with reporter.step("Check that object exists in bucket"):
            s3_helper.check_objects_in_bucket(s3_client, bucket, bucket_objects)

    @reporter.step("Return all nodes to cluster with check empty map first")
    def return_nodes_after_stop_with_check_empty_map(self, cluster_state_controller: ClusterStateController) -> None:
        """
        Start storage services node by node, checking the network map along the way.

        The first cluster node is started and used as the reference node: every storage
        node is checked to be absent from the map. Each remaining node is then started,
        and after a sleep of MORPH_BLOCK_TIME and one epoch tick it is checked to be
        present in the map.
        """
        first_node = self.cluster.cluster_nodes[0].service(StorageNode)

        with reporter.step("Start first node and check network map"):
            cluster_state_controller.start_service_of_type(self.cluster.cluster_nodes[0], StorageNode)
            wait_for_node_to_be_ready(first_node)

            for check_node in self.cluster.storage_nodes:
                check_node_not_in_map(check_node, shell=self.shell, alive_node=first_node)

        for node in self.cluster.cluster_nodes[1:]:
            storage_node = node.service(StorageNode)

            cluster_state_controller.start_service_of_type(node, StorageNode)
            wait_for_node_to_be_ready(storage_node)

            sleep(datetime_utils.parse_time(MORPH_BLOCK_TIME))
            self.tick_epochs(1)

            check_node_in_map(storage_node, shell=self.shell, alive_node=first_node)

    @allure.title("Object loss from fstree/blobovnicza (versioning=enabled, s3_client={s3_client})")
    def test_s3_fstree_blobovnicza_loss_versioning_on(
        self,
        s3_client: S3ClientWrapper,
        simple_object_size: ObjectSize,
        cluster_state_controller: ClusterStateController,
        bucket: str,
    ):
        """
        With versioning enabled, delete blobovnicza and fstree on all nodes and check that
        the object, all of its versions and the bucket can still be deleted through S3.
        """
        s3_helper.set_bucket_versioning(s3_client, bucket, VersioningStatus.ENABLED)

        file_path = generate_file(simple_object_size.value)
        file_name = s3_helper.object_key_from_file_path(file_path)

        object_versions = []
        with reporter.step("Put object into one bucket"):
            put_object = s3_client.put_object(bucket, file_path)
            s3_helper.check_objects_in_bucket(s3_client, bucket, expected_objects=[file_name])
            object_versions.append(put_object)

        with reporter.step("Stop all storage nodes"):
            cluster_state_controller.stop_services_of_type(StorageNode)

        with reporter.step("Delete blobovnicza and fstree from all nodes"):
            for node in self.cluster.storage_nodes:
                node.delete_blobovnicza()
                node.delete_fstree()

        with reporter.step("Start all storage nodes"):
            cluster_state_controller.start_all_stopped_services()

        # need to get Delete Marker first
        with reporter.step("Delete the object from the bucket"):
            delete_object = s3_client.delete_object(bucket, file_name)
            object_versions.append(delete_object["VersionId"])

        # and now delete all versions of object (including Delete Markers)
        with reporter.step("Delete all versions of the object from the bucket"):
            for version in object_versions:
                s3_client.delete_object(bucket, file_name, version_id=version)

        with reporter.step("Delete bucket"):
            s3_client.delete_bucket(bucket)

    @allure.title("Object loss from fstree/blobovnicza (versioning=disabled, s3_client={s3_client})")
    def test_s3_fstree_blobovnicza_loss_versioning_off(
        self,
        s3_client: S3ClientWrapper,
        simple_object_size: ObjectSize,
        cluster_state_controller: ClusterStateController,
        bucket: str,
    ):
        """
        Without touching bucket versioning, delete blobovnicza and fstree on all nodes and
        check that the object and the bucket can still be deleted through S3.
        """
        file_path = generate_file(simple_object_size.value)
        file_name = s3_helper.object_key_from_file_path(file_path)

        with reporter.step("Put object into one bucket"):
            s3_client.put_object(bucket, file_path)
            s3_helper.check_objects_in_bucket(s3_client, bucket, expected_objects=[file_name])

        with reporter.step("Stop all storage nodes"):
            cluster_state_controller.stop_services_of_type(StorageNode)

        with reporter.step("Delete blobovnicza and fstree from all nodes"):
            for node in self.cluster.storage_nodes:
                node.delete_blobovnicza()
                node.delete_fstree()

        with reporter.step("Start all storage nodes"):
            cluster_state_controller.start_all_stopped_services()

        with reporter.step("Delete the object from the bucket"):
            s3_client.delete_object(bucket, file_name)

        with reporter.step("Delete bucket"):
            s3_client.delete_bucket(bucket)

    @pytest.mark.skip(reason="Need to increase cache lifetime")
    @pytest.mark.parametrize(
        # versioning should NOT be VersioningStatus.SUSPENDED, it needs to be undefined
        "versioning_status",
        [VersioningStatus.ENABLED, VersioningStatus.UNDEFINED],
    )
    @allure.title(
        "After Pilorama.db loss on all nodes list objects should return nothing in second listing (versioning_status={versioning_status}, s3_client={s3_client})"
    )
    def test_s3_pilorama_loss(
        self,
        s3_client: S3ClientWrapper,
        simple_object_size: ObjectSize,
        versioning_status: VersioningStatus,
        cluster_state_controller: ClusterStateController,
        bucket: str,
    ):
        """
        Delete the pilorama file of every shard on all nodes and check bucket listings.

        The first listing after restart is expected to be non-empty and the second one
        empty. The test is currently skipped (see the skip reason on the decorator).
        """
        s3_helper.set_bucket_versioning(s3_client, bucket, versioning_status)

        file_path = generate_file(simple_object_size.value)
        file_name = s3_helper.object_key_from_file_path(file_path)

        with reporter.step("Put object into one bucket"):
            s3_client.put_object(bucket, file_path)
            s3_helper.check_objects_in_bucket(s3_client, bucket, expected_objects=[file_name])

        with reporter.step("Stop all storage nodes"):
            cluster_state_controller.stop_services_of_type(StorageNode)

        with reporter.step("Delete pilorama.db from all nodes"):
            for node in self.cluster.storage_nodes:
                for shard in node.get_shards():
                    node.delete_file(shard.pilorama)

        with reporter.step("Start all storage nodes"):
            cluster_state_controller.start_all_stopped_services()

        with reporter.step("Check list objects first time"):
            objects_list = s3_client.list_objects(bucket)
            assert objects_list, "Expected not empty bucket"

        with reporter.step("Check list objects second time"):
            objects_list = s3_client.list_objects(bucket)
            assert not objects_list, f"Expected empty bucket, got {objects_list}"

        with reporter.step("Delete bucket"):
            s3_client.delete_bucket(bucket)
2023-05-30 13:42:27 +00:00
@pytest.mark.failover
@pytest.mark.failover_data_loss
class TestStorageDataLoss ( ClusterTestBase ) :
@allure.title(
    "After metabase loss on all nodes operations on objects and buckets should be still available via S3 (s3_client={s3_client})"
)
@pytest.mark.metabase_loss
def test_metabase_loss(
    self,
    s3_client: S3ClientWrapper,
    simple_object_size: ObjectSize,
    complex_object_size: ObjectSize,
    cluster_state_controller: ClusterStateController,
    file_keeper: FileKeeper,
    bucket: str,
):
    """
    Delete the metabase on every storage node and check that S3 deletes still work.

    A simple and a complex object are put into the bucket, all storage services are
    stopped, the metabase is deleted and ``resync_metabase`` is switched on in the shard
    config of nodes where it was off (the config file is registered with ``file_keeper``
    first). After restart and a fixed wait, both objects and the bucket are deleted and
    none of these calls is expected to raise.
    """
    with reporter.step("Put objects into bucket"):
        simple_object_path = generate_file(simple_object_size.value)
        simple_object_key = s3_helper.object_key_from_file_path(simple_object_path)

        complex_object_path = generate_file(complex_object_size.value)
        complex_object_key = s3_helper.object_key_from_file_path(complex_object_path)

        for object_path in (simple_object_path, complex_object_path):
            s3_client.put_object(bucket, object_path)

    with reporter.step("Check objects are in bucket"):
        expected_keys = [simple_object_key, complex_object_key]
        s3_helper.check_objects_in_bucket(s3_client, bucket, expected_objects=expected_keys)

    with reporter.step("Stop storage services on all nodes"):
        cluster_state_controller.stop_services_of_type(StorageNode)

    with reporter.step("Delete metabase from all nodes"):
        for node in cluster_state_controller.cluster.storage_nodes:
            node.delete_metabase()

    with reporter.step("Enable resync_metabase option for storage services"):
        for storage_node in cluster_state_controller.cluster.storage_nodes:
            with reporter.step(f"Enable resync_metabase option for {storage_node}"):
                config_file_path, config = storage_node.get_shards_config()
                default_shard = config["storage"]["shard"]["default"]
                if default_shard["resync_metabase"]:
                    # Option is already on - leave this node's config untouched.
                    continue

                # Register the file with the keeper before modifying it.
                file_keeper.add(storage_node, config_file_path)
                default_shard["resync_metabase"] = True
                storage_node.save_config(config, config_file_path)

    with reporter.step("Start storage services on all nodes"):
        cluster_state_controller.start_all_stopped_services()

    with reporter.step("Wait for tree rebalance"):
        # TODO: Use product metric when we have proper ones for this check
        sleep(30)

    with reporter.step("Delete objects from bucket"):
        with reporter.step("Delete simple object from bucket"), expect_not_raises():
            s3_client.delete_object(bucket, simple_object_key)

        with reporter.step("Delete complex object from bucket"), expect_not_raises():
            s3_client.delete_object(bucket, complex_object_key)

    with reporter.step("Delete bucket"), expect_not_raises():
        s3_client.delete_bucket(bucket)
2023-05-31 15:31:35 +00:00
2023-10-13 08:52:42 +00:00
@allure.title("Write cache loss on one node should not affect shards and should not produce errors in log")
@pytest.mark.write_cache_loss
def test_write_cache_loss_on_one_node(
    self,
    node_under_test: ClusterNode,
    simple_object_size: ObjectSize,
    cluster_state_controller: ClusterStateController,
    shards_watcher: ShardsWatcher,
    default_wallet: WalletInfo,
    test_start_time: datetime,
):
    """
    Delete the write cache of one node and check that its objects and shards are unaffected.

    Five objects are put into a container whose placement rule filters on the UN-LOCODE
    of ``node_under_test``. The node's storage service is stopped, its write cache is
    deleted and the service is started again. Afterwards the objects are fetched, shard
    snapshots are checked for new errors and for modes other than "read-write", and the
    host log is searched for "no such file or directory". All findings are collected and
    asserted together at the end.
    """
    exception_messages = []

    with reporter.step(f"Create container on node {node_under_test}"):
        locode = node_under_test.storage_node.get_un_locode()
        placement_rule = "\n".join(
            (
                "REP 1 IN X",
                "CBF 1",
                "SELECT 1 FROM C AS X",
                f"FILTER 'UN-LOCODE' EQ '{locode}' AS C",
            )
        )
        cid = create_container(
            default_wallet,
            self.shell,
            node_under_test.storage_node.get_rpc_endpoint(),
            rule=placement_rule,
        )
        container_info = StorageContainerInfo(cid, default_wallet)
        container = StorageContainer(container_info, self.shell, cluster_state_controller.cluster)

    with reporter.step(f"Put couple objects to container on node {node_under_test}"):
        storage_objects: list[StorageObjectInfo] = [
            container.generate_object(
                simple_object_size.value,
                endpoint=node_under_test.storage_node.get_rpc_endpoint(),
            )
            for _ in range(5)
        ]

    with reporter.step("Take shards snapshot"):
        shards_watcher.take_shards_snapshot()

    with reporter.step(f"Stop storage service on node {node_under_test}"):
        cluster_state_controller.stop_service_of_type(node_under_test, StorageNode)

    with reporter.step(f"Delete write cache from node {node_under_test}"):
        node_under_test.storage_node.delete_write_cache()

    with reporter.step(f"Start storage service on node {node_under_test}"):
        cluster_state_controller.start_all_stopped_services()

    with reporter.step("Objects should be available"):
        for stored in storage_objects:
            get_object(
                stored.wallet,
                container.get_id(),
                stored.oid,
                self.shell,
                node_under_test.storage_node.get_rpc_endpoint(),
            )

    with reporter.step("No shards should have new errors"):
        shards_watcher.take_shards_snapshot()
        shards_with_errors = shards_watcher.get_shards_with_new_errors()
        if shards_with_errors:
            exception_messages.append(f"Shards have new errors: {shards_with_errors}")

    with reporter.step("No shards should have degraded status"):
        snapshot = shards_watcher.get_shards_snapshot()
        for shard in snapshot:
            status = snapshot[shard]["mode"]
            if status == "read-write":
                continue
            exception_messages.append(f"Shard {shard} changed status to {status}")

    with reporter.step("No related errors should be in log"):
        missing_file_logged = node_under_test.host.is_message_in_logs(
            message_regex=r"\Wno such file or directory\W",
            since=test_start_time,
        )
        if missing_file_logged:
            exception_messages.append(f"Node {node_under_test} have shard errors in logs")

    with reporter.step("Pass test if no errors found"):
        # Report every collected problem at once instead of failing on the first one.
        assert not exception_messages, "\n".join(exception_messages)
2023-05-31 10:16:38 +00:00
@allure.title(
    "Loss of one node should trigger use of tree and storage service in another node (s3_client={s3_client})"
)
def test_s3_one_endpoint_loss(
    self,
    bucket,
    s3_client: S3ClientWrapper,
    simple_object_size: ObjectSize,
    cluster_state_controller: ClusterStateController,
):
    """Check that an object can be put into a bucket after one storage service is stopped.

    The storage service of the first cluster node is stopped, the test waits
    a fixed 60 seconds, then a freshly generated file is put into ``bucket``
    and the bucket listing is checked to contain exactly that object key.

    Args:
        bucket: Bucket to put the object into (supplied as a fixture).
        s3_client: S3 client wrapper used for the put operation and listing check.
        simple_object_size: Its ``value`` is passed to ``generate_file``.
        cluster_state_controller: Used to stop the storage service on the node.
    """
    # TODO: need to check that s3 gate is connected to localhost (such metric will be supported in 1.3)

    with reporter.step("Stop one node and wait for rebalance connection of s3 gate to storage service"):
        current_node = self.cluster.cluster_nodes[0]
        cluster_state_controller.stop_service_of_type(current_node, StorageNode)
        # waiting for rebalance connection of s3 gate to storage service
        sleep(60)

    file_path = generate_file(simple_object_size.value)
    file_name = s3_helper.object_key_from_file_path(file_path)

    with reporter.step("Put object into one bucket"):
        # The return value of put_object is not needed here; success is
        # verified through the bucket listing check below.
        s3_client.put_object(bucket, file_path)
        s3_helper.check_objects_in_bucket(s3_client, bucket, expected_objects=[file_name])
2023-10-13 08:52:42 +00:00
@allure.title("After Pilorama.db loss on one node object is retrievable (s3_client={s3_client})")
def test_s3_one_pilorama_loss(
    self,
    s3_client: S3ClientWrapper,
    simple_object_size: ObjectSize,
    cluster_state_controller: ClusterStateController,
):
    """Check that a versioned bucket stays usable after pilorama.db files are deleted on one node.

    Flow:
        1. Create a bucket, enable versioning and put one object.
        2. Collect the ``pilorama`` path of every shard on the first storage
           node and assert that each file exists.
        3. Stop all storage services, delete the collected files, start the
           stopped services again.
        4. Tick one epoch, wait 120 seconds, and assert the files exist again.
        5. Verify bucket versioning and listing, then delete the object, all
           of its versions (including the delete marker) and the bucket.

    Args:
        s3_client: S3 client wrapper used for all bucket and object operations.
        simple_object_size: Its ``value`` is passed to ``generate_file``.
        cluster_state_controller: Used to stop and start storage services.
    """
    bucket = s3_client.create_bucket(
        location_constraint="load-1-4",
        grant_read="uri=http://acs.amazonaws.com/groups/global/AllUsers",
    )
    s3_helper.set_bucket_versioning(s3_client, bucket, VersioningStatus.ENABLED)

    with reporter.step("Check bucket versioning"):
        bucket_versioning = s3_client.get_bucket_versioning_status(bucket)
        assert bucket_versioning == "Enabled", "Bucket should have enabled versioning"

    file_path = generate_file(simple_object_size.value)
    file_name = s3_helper.object_key_from_file_path(file_path)

    # Every version id created by this test (the put and, later, the delete
    # marker) is collected here so all of them can be removed at the end.
    object_versions = []

    with reporter.step("Put object into one bucket"):
        put_object = s3_client.put_object(bucket, file_path)
        s3_helper.check_objects_in_bucket(s3_client, bucket, expected_objects=[file_name])
        object_versions.append(put_object)

    node_to_check = self.cluster.storage_nodes[0]

    with reporter.step("Get list of all pilorama.db on shards"):
        piloramas_list_before_removing = [shard.pilorama for shard in node_to_check.get_shards()]

    with reporter.step("Check that all pilorama.db files exist on node"):
        for pilorama in piloramas_list_before_removing:
            assert node_to_check.is_file_exist(pilorama), f"File {pilorama} does not exist"

    with reporter.step("Stop all storage nodes"):
        cluster_state_controller.stop_services_of_type(StorageNode)

    with reporter.step("Delete pilorama.db from one node"):
        for pilorama in piloramas_list_before_removing:
            node_to_check.delete_file(pilorama)

    with reporter.step("Start all storage nodes"):
        cluster_state_controller.start_all_stopped_services()

    # The step title states the actual wait: sleep(120) is two minutes.
    with reporter.step("Tick epoch to trigger sync and then wait for 2 minutes"):
        self.tick_epochs(1)
        sleep(120)

    with reporter.step("Get list of all pilorama.db after sync"):
        # The same paths that were deleted above must be present again.
        for pilorama in piloramas_list_before_removing:
            assert node_to_check.is_file_exist(pilorama), f"File {pilorama} does not exist"

    with reporter.step("Check bucket versioning"):
        bucket_versioning = s3_client.get_bucket_versioning_status(bucket)
        assert bucket_versioning == "Enabled", "Bucket should have enabled versioning"

    with reporter.step("Check list objects"):
        objects_list = s3_client.list_objects(bucket)
        assert objects_list, "Expected not empty bucket"

    with reporter.step("Delete the object from the bucket"):
        delete_object = s3_client.delete_object(bucket, file_name)
        assert "DeleteMarker" in delete_object, "Delete markers not found"

    with reporter.step("Check list objects"):
        objects_list = s3_client.list_objects_versions(bucket)
        assert objects_list, "Expected not empty bucket"
        object_versions.append(delete_object["VersionId"])

    # and now delete all versions of object (including Delete Markers)
    with reporter.step("Delete all versions of the object from the bucket"):
        for version in object_versions:
            s3_client.delete_object(bucket, file_name, version_id=version)

    with reporter.step("Check list objects"):
        objects_list = s3_client.list_objects_versions(bucket)
        assert not objects_list, "Expected empty bucket"

    with reporter.step("Delete bucket"):
        s3_client.delete_bucket(bucket)