forked from TrueCloudLab/frostfs-node
Anton Nikiforov
8d589314b5
Previously, once a node had been processed it was skipped on subsequent replica vectors; when every node of a vector was skipped because it had already been processed for a previous REP, the service marked the whole request as incomplete at the result-forming step. Examples of policies unblocked by this change:
- REP 1 REP 1 CBF 1
- REP 4 IN X REP 4 IN Y CBF 4 SELECT 2 FROM FX AS X SELECT 2 FROM FY AS Y FILTER Country EQ Russia OR Country EQ Sweden OR Country EQ Finland AS FY FILTER Price GE 0 AS FX
Signed-off-by: Anton Nikiforov <an.nikiforov@yadro.com>
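The gist of the change, shown as a minimal standalone sketch (the node names, vectors and counters below are illustrative stand-ins, not the real placement.Traverser API): a node already processed for an earlier REP is no longer silently skipped; its recorded success is counted again for the current replica vector, so overlapping policies such as REP 1 REP 1 CBF 1 can still reach the required number of successful writes.

// Sketch only: plain-Go illustration of the success re-submission idea.
// The replica vectors and the "processed" map are stand-ins for the real
// traverser state; they are not part of the service code below.
package main

import "fmt"

func main() {
	// A policy such as "REP 1 REP 1 CBF 1" may yield replica vectors
	// that contain the same node.
	vectors := [][]string{{"node-A"}, {"node-A"}}

	processed := make(map[string]bool) // node key -> succeeded earlier
	for _, vec := range vectors {
		success := 0
		for _, node := range vec {
			if done, ok := processed[node]; ok {
				if done {
					success++ // count the earlier success for this vector too
				}
				continue // the object is not sent to the node twice
			}
			processed[node] = true // pretend the write succeeded
			success++
		}
		fmt.Printf("vector %v: %d/%d successful writes\n", vec, success, len(vec))
	}
}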
255 lines
6.2 KiB
Go
package putsvc

import (
	"context"
	"fmt"
	"sync"
	"sync/atomic"

	"git.frostfs.info/TrueCloudLab/frostfs-node/internal/logs"
	"git.frostfs.info/TrueCloudLab/frostfs-node/pkg/core/object"
	svcutil "git.frostfs.info/TrueCloudLab/frostfs-node/pkg/services/object/util"
	"git.frostfs.info/TrueCloudLab/frostfs-node/pkg/services/object_manager/placement"
	"git.frostfs.info/TrueCloudLab/frostfs-node/pkg/util"
	"git.frostfs.info/TrueCloudLab/frostfs-node/pkg/util/logger"
	objectSDK "git.frostfs.info/TrueCloudLab/frostfs-sdk-go/object"
	"git.frostfs.info/TrueCloudLab/frostfs-sdk-go/object/transformer"
	"go.uber.org/zap"
)

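// preparedObjectTarget writes a fully formed object, together with its
// validated content metadata, to a single node.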
type preparedObjectTarget interface {
	WriteObject(context.Context, *objectSDK.Object, object.ContentMeta) error
}

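// distributedTarget writes an object to the container nodes selected by the
// placement policy: the payload is accumulated via Write and the assembled
// object is validated and distributed on Close.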
type distributedTarget struct {
	traversal traversal

	obj     *objectSDK.Object
	objMeta object.ContentMeta

	payload *payload

	nodeTargetInitializer func(nodeDesc) preparedObjectTarget

	getWorkerPool func([]byte) (util.WorkerPool, bool)

	relay func(context.Context, nodeDesc) error

	fmt *object.FormatValidator

	log *logger.Logger
}

// parameters and state of container traversal.
type traversal struct {
	opts []placement.Option

	// whether an additional broadcast is needed after the object is saved
	extraBroadcastEnabled bool

	// container nodes which were processed during the primary object placement
	mExclude map[string]*bool
}

// updates traversal parameters after the primary placement finishes and
// returns true if an additional container broadcast is needed.
func (x *traversal) submitPrimaryPlacementFinish() bool {
	if x.extraBroadcastEnabled {
		// do not track success during container broadcast (best-effort)
		x.opts = append(x.opts, placement.WithoutSuccessTracking())

		// avoid 2nd broadcast
		x.extraBroadcastEnabled = false

		return true
	}

	return false
}

// marks the container node as processed during the primary object placement.
func (x *traversal) submitProcessed(n placement.Node, item *bool) {
	if x.extraBroadcastEnabled {
		key := string(n.PublicKey())

		if x.mExclude == nil {
			x.mExclude = make(map[string]*bool, 1)
		}

		x.mExclude[key] = item
	}
}

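// nodeDesc describes a single placement node and whether it is the local one.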
type nodeDesc struct {
	local bool

	info placement.Node
}

// errIncompletePut is returned if processing on a container fails.
type errIncompletePut struct {
	singleErr error // error from the last responding node
}

func (x errIncompletePut) Error() string {
	const commonMsg = "incomplete object PUT by placement"

	if x.singleErr != nil {
		return fmt.Sprintf("%s: %v", commonMsg, x.singleErr)
	}

	return commonMsg
}

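// WriteHeader saves the incoming object; its payload is appended via Write
// and the object is distributed on Close.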
func (t *distributedTarget) WriteHeader(_ context.Context, obj *objectSDK.Object) error {
	t.obj = obj

	return nil
}

func (t *distributedTarget) Write(_ context.Context, p []byte) (n int, err error) {
	t.payload.Data = append(t.payload.Data, p...)

	return len(p), nil
}

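// Close attaches the accumulated payload to the object, distributes the
// object via WriteObject and returns the identifier of the stored object.
// The payload buffer is returned to the pool regardless of the result.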
func (t *distributedTarget) Close(ctx context.Context) (*transformer.AccessIdentifiers, error) {
	defer func() {
		putPayload(t.payload)
		t.payload = nil
	}()

	t.obj.SetPayload(t.payload.Data)

	if err := t.WriteObject(ctx, t.obj); err != nil {
		return nil, err
	}

	id, _ := t.obj.ID()
	return &transformer.AccessIdentifiers{
		SelfID: id,
	}, nil
}

// WriteObject implements the transformer.ObjectWriter interface.
func (t *distributedTarget) WriteObject(ctx context.Context, obj *objectSDK.Object) error {
	t.obj = obj

	var err error

	if t.objMeta, err = t.fmt.ValidateContent(t.obj); err != nil {
		return fmt.Errorf("(%T) could not validate payload content: %w", t, err)
	}

	if len(t.obj.Children()) > 0 {
		// enabling extra broadcast for linking objects
		t.traversal.extraBroadcastEnabled = true
	}

	return t.iteratePlacement(ctx)
}

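// sendObject writes the object to the given node: remote nodes are served by
// the relay function when it is set, otherwise a per-node target is
// initialized and the validated object is written to it.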
func (t *distributedTarget) sendObject(ctx context.Context, node nodeDesc) error {
	if !node.local && t.relay != nil {
		return t.relay(ctx, node)
	}

	target := t.nodeTargetInitializer(node)

	err := target.WriteObject(ctx, t.obj, t.objMeta)
	if err != nil {
		return fmt.Errorf("could not write header: %w", err)
	}
	return nil
}

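// iteratePlacement traverses the placement vectors built for the object and
// writes the object to the selected nodes. If the required number of
// successful writes is not reached, errIncompletePut wrapping the last node
// error is returned. After a successful primary placement, a best-effort
// container broadcast may additionally be performed (e.g. for linking
// objects); its failure is only logged.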
func (t *distributedTarget) iteratePlacement(ctx context.Context) error {
	id, _ := t.obj.ID()

	traverser, err := placement.NewTraverser(
		append(t.traversal.opts, placement.ForObject(id))...,
	)
	if err != nil {
		return fmt.Errorf("(%T) could not create object placement traverser: %w", t, err)
	}

	resErr := &atomic.Value{}

	// Iterate over all replica vectors even if they contain nodes that already
	// appeared in previous ones; such nodes are excluded from repeated processing.
	for {
		addrs := traverser.Next()
		if len(addrs) == 0 {
			break
		}

		if t.iterateAddresses(ctx, traverser, addrs, resErr) {
			break
		}
	}

	if !traverser.Success() {
		var err errIncompletePut
		err.singleErr, _ = resErr.Load().(error)
		return err
	}

	// perform additional container broadcast if needed
	if t.traversal.submitPrimaryPlacementFinish() {
		err = t.iteratePlacement(ctx)
		if err != nil {
			t.log.Error(logs.PutAdditionalContainerBroadcastFailure, zap.Error(err))
			// we don't fail the primary operation because of a broadcast failure
		}
	}

	return nil
}

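// iterateAddresses submits object writes for the given batch of nodes to the
// worker pools and waits for them to complete. Nodes already recorded in the
// exclusion map are not written to again; if such a node succeeded earlier,
// its success is re-submitted to the traverser so that replica vectors sharing
// nodes are still counted as satisfied. Returns true if iteration must stop
// because a worker pool rejected the task.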
func (t *distributedTarget) iterateAddresses(ctx context.Context, traverser *placement.Traverser, addrs []placement.Node, resErr *atomic.Value) bool {
	wg := &sync.WaitGroup{}

	for i := range addrs {
		addr := addrs[i]
		if val := t.traversal.mExclude[string(addr.PublicKey())]; val != nil {
			// Check whether the node was processed successfully on a previous iteration.
			if *val {
				traverser.SubmitSuccess()
			}
			// it can happen only during additional container broadcast
			continue
		}

		wg.Add(1)
		item := new(bool)

		workerPool, isLocal := t.getWorkerPool(addr.PublicKey())
		if err := workerPool.Submit(func() {
			defer wg.Done()

			err := t.sendObject(ctx, nodeDesc{local: isLocal, info: addr})
			if err != nil {
				resErr.Store(err)
				svcutil.LogServiceError(t.log, "PUT", addr.Addresses(), err)
				return
			}

			traverser.SubmitSuccess()
			*item = true
		}); err != nil {
			wg.Done()
			svcutil.LogWorkerPoolError(t.log, "PUT", err)
			return true
		}

		// mark the container node as processed in order to exclude it
		// from the subsequent container broadcast. Note that we don't
		// process this node during broadcast if primary placement
		// on it failed.
		t.traversal.submitProcessed(addr, item)
	}

	wg.Wait()

	return false
}