package putsvc import ( "context" "fmt" "sync" "sync/atomic" "git.frostfs.info/TrueCloudLab/frostfs-node/internal/logs" "git.frostfs.info/TrueCloudLab/frostfs-node/pkg/core/object" svcutil "git.frostfs.info/TrueCloudLab/frostfs-node/pkg/services/object/util" "git.frostfs.info/TrueCloudLab/frostfs-node/pkg/services/object_manager/placement" "git.frostfs.info/TrueCloudLab/frostfs-node/pkg/services/object_manager/transformer" "git.frostfs.info/TrueCloudLab/frostfs-node/pkg/util" "git.frostfs.info/TrueCloudLab/frostfs-node/pkg/util/logger" objectSDK "git.frostfs.info/TrueCloudLab/frostfs-sdk-go/object" "go.uber.org/zap" ) type preparedObjectTarget interface { WriteObject(*objectSDK.Object, object.ContentMeta) error Close(ctx context.Context) (*transformer.AccessIdentifiers, error) } type distributedTarget struct { traversal traversal remotePool, localPool util.WorkerPool obj *objectSDK.Object objMeta object.ContentMeta payload *payload nodeTargetInitializer func(nodeDesc) preparedObjectTarget isLocalKey func([]byte) bool relay func(context.Context, nodeDesc) error fmt *object.FormatValidator log *logger.Logger } // parameters and state of container traversal. type traversal struct { opts []placement.Option // need of additional broadcast after the object is saved extraBroadcastEnabled bool // mtx protects mExclude map. mtx sync.RWMutex // container nodes which was processed during the primary object placement mExclude map[string]struct{} } // updates traversal parameters after the primary placement finish and // returns true if additional container broadcast is needed. func (x *traversal) submitPrimaryPlacementFinish() bool { if x.extraBroadcastEnabled { // do not track success during container broadcast (best-effort) x.opts = append(x.opts, placement.WithoutSuccessTracking()) // avoid 2nd broadcast x.extraBroadcastEnabled = false return true } return false } // marks the container node as processed during the primary object placement. func (x *traversal) submitProcessed(n placement.Node) { if x.extraBroadcastEnabled { key := string(n.PublicKey()) x.mtx.Lock() if x.mExclude == nil { x.mExclude = make(map[string]struct{}, 1) } x.mExclude[key] = struct{}{} x.mtx.Unlock() } } // checks if specified node was processed during the primary object placement. func (x *traversal) processed(n placement.Node) bool { x.mtx.RLock() _, ok := x.mExclude[string(n.PublicKey())] x.mtx.RUnlock() return ok } type nodeDesc struct { local bool info placement.Node } // errIncompletePut is returned if processing on a container fails. type errIncompletePut struct { singleErr error // error from the last responding node } func (x errIncompletePut) Error() string { const commonMsg = "incomplete object PUT by placement" if x.singleErr != nil { return fmt.Sprintf("%s: %v", commonMsg, x.singleErr) } return commonMsg } func (t *distributedTarget) WriteHeader(_ context.Context, obj *objectSDK.Object) error { t.obj = obj return nil } func (t *distributedTarget) Write(_ context.Context, p []byte) (n int, err error) { t.payload.Data = append(t.payload.Data, p...) return len(p), nil } func (t *distributedTarget) Close(ctx context.Context) (*transformer.AccessIdentifiers, error) { defer func() { putPayload(t.payload) t.payload = nil }() t.obj.SetPayload(t.payload.Data) var err error if t.objMeta, err = t.fmt.ValidateContent(t.obj); err != nil { return nil, fmt.Errorf("(%T) could not validate payload content: %w", t, err) } if len(t.obj.Children()) > 0 { // enabling extra broadcast for linking objects t.traversal.extraBroadcastEnabled = true } return t.iteratePlacement(ctx) } func (t *distributedTarget) sendObject(ctx context.Context, node nodeDesc) error { if !node.local && t.relay != nil { return t.relay(ctx, node) } target := t.nodeTargetInitializer(node) if err := target.WriteObject(t.obj, t.objMeta); err != nil { return fmt.Errorf("could not write header: %w", err) } else if _, err := target.Close(ctx); err != nil { return fmt.Errorf("could not close object stream: %w", err) } return nil } func (t *distributedTarget) iteratePlacement(ctx context.Context) (*transformer.AccessIdentifiers, error) { id, _ := t.obj.ID() traverser, err := placement.NewTraverser( append(t.traversal.opts, placement.ForObject(id))..., ) if err != nil { return nil, fmt.Errorf("(%T) could not create object placement traverser: %w", t, err) } resErr := &atomic.Value{} for { addrs := traverser.Next() if len(addrs) == 0 { break } if t.iterateAddresses(ctx, traverser, addrs, resErr) { break } } if !traverser.Success() { var err errIncompletePut err.singleErr, _ = resErr.Load().(error) return nil, err } // perform additional container broadcast if needed if t.traversal.submitPrimaryPlacementFinish() { _, err = t.iteratePlacement(ctx) if err != nil { t.log.Error(logs.PutAdditionalContainerBroadcastFailure, zap.Error(err)) // we don't fail primary operation because of broadcast failure } } id, _ = t.obj.ID() return new(transformer.AccessIdentifiers). WithSelfID(id), nil } func (t *distributedTarget) iterateAddresses(ctx context.Context, traverser *placement.Traverser, addrs []placement.Node, resErr *atomic.Value) bool { wg := &sync.WaitGroup{} for i := range addrs { if t.traversal.processed(addrs[i]) { // it can happen only during additional container broadcast continue } wg.Add(1) addr := addrs[i] isLocal := t.isLocalKey(addr.PublicKey()) workerPool := t.remotePool if isLocal { workerPool = t.localPool } if err := workerPool.Submit(func() { defer wg.Done() err := t.sendObject(ctx, nodeDesc{local: isLocal, info: addr}) // mark the container node as processed in order to exclude it // in subsequent container broadcast. Note that we don't // process this node during broadcast if primary placement // on it failed. t.traversal.submitProcessed(addr) if err != nil { resErr.Store(err) svcutil.LogServiceError(t.log, "PUT", addr.Addresses(), err) return } traverser.SubmitSuccess() }); err != nil { wg.Done() svcutil.LogWorkerPoolError(t.log, "PUT", err) return true } } wg.Wait() return false }