From 165a600624b59797b532576e291c55cdf3e018cb Mon Sep 17 00:00:00 2001 From: Evgenii Stratonikov Date: Tue, 17 Jan 2023 16:16:50 +0300 Subject: [PATCH] [#2210] pilorama: Reduce the amount of keys per node Under high load we are limited by the _amount_ of keys we need to update in a single transaction. In this commit we try storing all state with a single key. Signed-off-by: Evgenii Stratonikov --- pkg/local_object_storage/pilorama/boltdb.go | 216 +++++++++--------- pkg/local_object_storage/pilorama/forest.go | 12 +- pkg/local_object_storage/pilorama/inmemory.go | 14 +- pkg/local_object_storage/pilorama/types.go | 6 +- 4 files changed, 126 insertions(+), 122 deletions(-) diff --git a/pkg/local_object_storage/pilorama/boltdb.go b/pkg/local_object_storage/pilorama/boltdb.go index 6df455be0b..42401e4d88 100644 --- a/pkg/local_object_storage/pilorama/boltdb.go +++ b/pkg/local_object_storage/pilorama/boltdb.go @@ -156,8 +156,7 @@ func (t *boltForest) TreeMove(d CIDDescriptor, treeID string, m *Move) (*LogMove return nil, ErrReadOnlyMode } - var lm LogMove - lm.Move = *m + lm := *m return &lm, t.db.Batch(func(tx *bbolt.Tx) error { bLog, bTree, err := t.getTreeBuckets(tx, d.CID, treeID) if err != nil { @@ -227,7 +226,7 @@ func (t *boltForest) TreeAddByPath(d CIDDescriptor, treeID string, attr string, ts := t.getLatestTimestamp(bLog, d.Position, d.Size) lm = make([]LogMove, len(path)-i+1) for j := i; j < len(path); j++ { - lm[j-i].Move = Move{ + lm[j-i] = Move{ Parent: node, Meta: Meta{ Time: ts, @@ -245,7 +244,7 @@ func (t *boltForest) TreeAddByPath(d CIDDescriptor, treeID string, attr string, node = lm[j-i].Child } - lm[len(lm)-1].Move = Move{ + lm[len(lm)-1] = Move{ Parent: node, Meta: Meta{ Time: ts, @@ -274,17 +273,14 @@ func (t *boltForest) getLatestTimestamp(bLog *bbolt.Bucket, pos, size int) uint6 // findSpareID returns random unused ID. func (t *boltForest) findSpareID(bTree *bbolt.Bucket) uint64 { id := uint64(rand.Int63()) - - var key [9]byte - key[0] = 't' - binary.LittleEndian.PutUint64(key[1:], id) + key := make([]byte, 9) for { - if bTree.Get(key[:]) == nil { + _, _, _, ok := t.getState(bTree, stateKey(key, id)) + if !ok { return id } id = uint64(rand.Int63()) - binary.LittleEndian.PutUint64(key[1:], id) } } @@ -420,8 +416,12 @@ func (t *boltForest) applyOperation(logBucket, treeBucket *bbolt.Bucket, ms []*M // 1. Undo up until the desired timestamp is here. for len(key) == 8 && ms[0].Time < binary.BigEndian.Uint64(key) { b.Reset(value) - if err := t.logFromBytes(&tmp, r); err != nil { - return err + + tmp.Child = r.ReadU64LE() + tmp.Parent = r.ReadU64LE() + tmp.Time = r.ReadVarUint() + if r.Err != nil { + return r.Err } if err := t.undo(&tmp, treeBucket, cKey[:]); err != nil { return err @@ -433,7 +433,7 @@ func (t *boltForest) applyOperation(logBucket, treeBucket *bbolt.Bucket, ms []*M // Loop invariant: key represents the next stored timestamp after ms[i].Time. // 2. Insert the operation. - lm.Move = *ms[i] + *lm = *ms[i] if err := t.do(logBucket, treeBucket, cKey[:], lm); err != nil { return err } @@ -445,11 +445,10 @@ func (t *boltForest) applyOperation(logBucket, treeBucket *bbolt.Bucket, ms []*M // 3. Re-apply all other operations. for len(key) == 8 && (i == len(ms)-1 || binary.BigEndian.Uint64(key) < ms[i+1].Time) { - b.Reset(value) - if err := t.logFromBytes(&tmp, r); err != nil { + if err := t.logFromBytes(&tmp, value); err != nil { return err } - if err := t.do(logBucket, treeBucket, cKey[:], &tmp); err != nil { + if err := t.redo(treeBucket, cKey[:], &tmp, value[16:]); err != nil { return err } key, value = c.Next() @@ -460,39 +459,42 @@ func (t *boltForest) applyOperation(logBucket, treeBucket *bbolt.Bucket, ms []*M } func (t *boltForest) do(lb *bbolt.Bucket, b *bbolt.Bucket, key []byte, op *LogMove) error { - currParent := b.Get(parentKey(key, op.Child)) - op.HasOld = currParent != nil - if currParent != nil { // node is already in tree - op.Old.Parent = binary.LittleEndian.Uint64(currParent) - if err := op.Old.Meta.FromBytes(b.Get(metaKey(key, op.Child))); err != nil { - return err - } - } else { - op.HasOld = false - op.Old = nodeInfo{} - } - binary.BigEndian.PutUint64(key, op.Time) - if err := lb.Put(key[:8], t.logToBytes(op)); err != nil { + rawLog := t.logToBytes(op) + if err := lb.Put(key[:8], rawLog); err != nil { return err } - if op.Child == op.Parent || t.isAncestor(b, op.Child, op.Parent) { - return nil + return t.redo(b, key, op, rawLog[16:]) +} + +func (t *boltForest) redo(b *bbolt.Bucket, key []byte, op *LogMove, rawMeta []byte) error { + var err error + + parent, ts, currMeta, inTree := t.getState(b, stateKey(key, op.Child)) + if inTree { + err = t.putState(b, oldKey(key, op.Time), parent, ts, currMeta) + } else { + ts = op.Time + err = b.Delete(oldKey(key, op.Time)) } - if currParent == nil { - if err := b.Put(timestampKey(key, op.Child), toUint64(op.Time)); err != nil { - return err - } - } else { - parent := binary.LittleEndian.Uint64(currParent) + if err != nil || op.Child == op.Parent || t.isAncestor(b, op.Child, op.Parent) { + return err + } + + if inTree { if err := b.Delete(childrenKey(key, op.Child, parent)); err != nil { return err } - for i := range op.Old.Meta.Items { - if isAttributeInternal(op.Old.Meta.Items[i].Key) { - key = internalKey(key, op.Old.Meta.Items[i].Key, string(op.Old.Meta.Items[i].Value), parent, op.Child) + + var meta Meta + if err := meta.FromBytes(currMeta); err != nil { + return err + } + for i := range meta.Items { + if isAttributeInternal(meta.Items[i].Key) { + key = internalKey(key, meta.Items[i].Key, string(meta.Items[i].Value), parent, op.Child) err := b.Delete(key) if err != nil { return err @@ -500,17 +502,16 @@ func (t *boltForest) do(lb *bbolt.Bucket, b *bbolt.Bucket, key []byte, op *LogMo } } } - return t.addNode(b, key, op.Child, op.Parent, op.Meta) + return t.addNode(b, key, op.Child, op.Parent, ts, op.Meta, rawMeta) } // removeNode removes node keys from the tree except the children key or its parent. func (t *boltForest) removeNode(b *bbolt.Bucket, key []byte, node, parent Node) error { - if err := b.Delete(parentKey(key, node)); err != nil { - return err - } + k := stateKey(key, node) + _, _, rawMeta, _ := t.getState(b, k) + var meta Meta - var k = metaKey(key, node) - if err := meta.FromBytes(b.Get(k)); err == nil { + if err := meta.FromBytes(rawMeta); err == nil { for i := range meta.Items { if isAttributeInternal(meta.Items[i].Key) { err := b.Delete(internalKey(nil, meta.Items[i].Key, string(meta.Items[i].Value), parent, node)) @@ -520,23 +521,16 @@ func (t *boltForest) removeNode(b *bbolt.Bucket, key []byte, node, parent Node) } } } - if err := b.Delete(metaKey(key, node)); err != nil { - return err - } - return b.Delete(timestampKey(key, node)) + return b.Delete(k) } // addNode adds node keys to the tree except the timestamp key. -func (t *boltForest) addNode(b *bbolt.Bucket, key []byte, child, parent Node, meta Meta) error { - err := b.Put(parentKey(key, child), toUint64(parent)) - if err != nil { +func (t *boltForest) addNode(b *bbolt.Bucket, key []byte, child, parent Node, time Timestamp, meta Meta, rawMeta []byte) error { + if err := t.putState(b, stateKey(key, child), parent, time, rawMeta); err != nil { return err } - err = b.Put(childrenKey(key, child, parent), []byte{1}) - if err != nil { - return err - } - err = b.Put(metaKey(key, child), meta.Bytes()) + + err := b.Put(childrenKey(key, child, parent), []byte{1}) if err != nil { return err } @@ -564,22 +558,28 @@ func (t *boltForest) undo(m *LogMove, b *bbolt.Bucket, key []byte) error { return err } - if !m.HasOld { + parent, ts, rawMeta, ok := t.getState(b, oldKey(key, m.Time)) + if !ok { return t.removeNode(b, key, m.Child, m.Parent) } - return t.addNode(b, key, m.Child, m.Old.Parent, m.Old.Meta) + + var meta Meta + if err := meta.FromBytes(rawMeta); err != nil { + return err + } + return t.addNode(b, key, m.Child, parent, ts, meta, rawMeta) } func (t *boltForest) isAncestor(b *bbolt.Bucket, parent, child Node) bool { key := make([]byte, 9) - key[0] = 'p' + key[0] = 's' for node := child; node != parent; { binary.LittleEndian.PutUint64(key[1:], node) - rawParent := b.Get(key) - if len(rawParent) != 8 { + parent, _, _, ok := t.getState(b, key) + if !ok { return false } - node = binary.LittleEndian.Uint64(rawParent) + node = parent } return true } @@ -619,10 +619,7 @@ func (t *boltForest) TreeGetByPath(cid cidSDK.ID, treeID string, attr string, pa return nil } - var ( - childID [9]byte - maxTimestamp uint64 - ) + var maxTimestamp uint64 c := b.Cursor() @@ -632,7 +629,7 @@ func (t *boltForest) TreeGetByPath(cid cidSDK.ID, treeID string, attr string, pa for len(childKey) == len(attrKey)+8 && bytes.Equal(attrKey, childKey[:len(childKey)-8]) { child := binary.LittleEndian.Uint64(childKey[len(childKey)-8:]) if latest { - ts := binary.LittleEndian.Uint64(b.Get(timestampKey(childID[:], child))) + _, ts, _, _ := t.getState(b, stateKey(make([]byte, 9), child)) if ts >= maxTimestamp { nodes = append(nodes[:0], child) maxTimestamp = ts @@ -655,7 +652,7 @@ func (t *boltForest) TreeGetMeta(cid cidSDK.ID, treeID string, nodeID Node) (Met return Meta{}, 0, ErrDegradedMode } - key := parentKey(make([]byte, 9), nodeID) + key := stateKey(make([]byte, 9), nodeID) var m Meta var parentID uint64 @@ -667,10 +664,11 @@ func (t *boltForest) TreeGetMeta(cid cidSDK.ID, treeID string, nodeID Node) (Met } b := treeRoot.Bucket(dataBucket) - if data := b.Get(key); len(data) == 8 { + if data := b.Get(key); len(data) != 0 { parentID = binary.LittleEndian.Uint64(data) } - return m.FromBytes(b.Get(metaKey(key, nodeID))) + _, _, meta, _ := t.getState(b, stateKey(key, nodeID)) + return m.FromBytes(meta) }) return m, parentID, err @@ -828,41 +826,31 @@ loop: } func (t *boltForest) moveFromBytes(m *Move, data []byte) error { - r := io.NewBinReaderFromBuf(data) - m.Child = r.ReadU64LE() - m.Parent = r.ReadU64LE() - m.Meta.DecodeBinary(r) - return r.Err + return t.logFromBytes(m, data) } -func (t *boltForest) logFromBytes(lm *LogMove, r *io.BinReader) error { - lm.Child = r.ReadU64LE() - lm.Parent = r.ReadU64LE() - lm.Meta.DecodeBinary(r) - lm.HasOld = r.ReadBool() - if lm.HasOld { - lm.Old.Parent = r.ReadU64LE() - lm.Old.Meta.DecodeBinary(r) - } - return r.Err +func (t *boltForest) logFromBytes(lm *LogMove, data []byte) error { + lm.Child = binary.LittleEndian.Uint64(data) + lm.Parent = binary.LittleEndian.Uint64(data[8:]) + return lm.Meta.FromBytes(data[16:]) } func (t *boltForest) logToBytes(lm *LogMove) []byte { w := io.NewBufBinWriter() size := 8 + 8 + lm.Meta.Size() + 1 - if lm.HasOld { - size += 8 + lm.Old.Meta.Size() - } + //if lm.HasOld { + // size += 8 + lm.Old.Meta.Size() + //} w.Grow(size) w.WriteU64LE(lm.Child) w.WriteU64LE(lm.Parent) lm.Meta.EncodeBinary(w.BinWriter) - w.WriteBool(lm.HasOld) - if lm.HasOld { - w.WriteU64LE(lm.Old.Parent) - lm.Old.Meta.EncodeBinary(w.BinWriter) - } + //w.WriteBool(lm.HasOld) + //if lm.HasOld { + // w.WriteU64LE(lm.Old.Parent) + // lm.Old.Meta.EncodeBinary(w.BinWriter) + //} return w.Bytes() } @@ -870,25 +858,37 @@ func bucketName(cid cidSDK.ID, treeID string) []byte { return []byte(cid.String() + treeID) } -// 't' + node (id) -> timestamp when the node first appeared. -func timestampKey(key []byte, child Node) []byte { - key[0] = 't' +// 'o' + time -> old meta. +func oldKey(key []byte, ts Timestamp) []byte { + key[0] = 'o' + binary.LittleEndian.PutUint64(key[1:], ts) + return key[:9] +} + +// 's' + child ID -> parent + timestamp of the first appearance + meta. +func stateKey(key []byte, child Node) []byte { + key[0] = 's' binary.LittleEndian.PutUint64(key[1:], child) return key[:9] } -// 'p' + node (id) -> parent (id). -func parentKey(key []byte, child Node) []byte { - key[0] = 'p' - binary.LittleEndian.PutUint64(key[1:], child) - return key[:9] +func (t *boltForest) putState(b *bbolt.Bucket, key []byte, parent Node, timestamp Timestamp, meta []byte) error { + data := make([]byte, len(meta)+8+8) + binary.LittleEndian.PutUint64(data, parent) + binary.LittleEndian.PutUint64(data[8:], timestamp) + copy(data[16:], meta) + return b.Put(key, data) } -// 'm' + node (id) -> serialized meta. -func metaKey(key []byte, child Node) []byte { - key[0] = 'm' - binary.LittleEndian.PutUint64(key[1:], child) - return key[:9] +func (t *boltForest) getState(b *bbolt.Bucket, key []byte) (Node, Timestamp, []byte, bool) { + data := b.Get(key) + if data == nil { + return 0, 0, nil, false + } + + parent := binary.LittleEndian.Uint64(data) + timestamp := binary.LittleEndian.Uint64(data[8:]) + return parent, timestamp, data[16:], true } // 'c' + parent (id) + child (id) -> 0/1. diff --git a/pkg/local_object_storage/pilorama/forest.go b/pkg/local_object_storage/pilorama/forest.go index ead1334ba9..975eb8b0ac 100644 --- a/pkg/local_object_storage/pilorama/forest.go +++ b/pkg/local_object_storage/pilorama/forest.go @@ -44,7 +44,7 @@ func (f *memoryForest) TreeMove(d CIDDescriptor, treeID string, op *Move) (*LogM lm := s.do(op) s.operations = append(s.operations, lm) - return &lm, nil + return &lm.Move, nil } // TreeAddByPath implements the Forest interface. @@ -66,20 +66,21 @@ func (f *memoryForest) TreeAddByPath(d CIDDescriptor, treeID string, attr string i, node := s.getPathPrefix(attr, path) lm := make([]LogMove, len(path)-i+1) for j := i; j < len(path); j++ { - lm[j-i] = s.do(&Move{ + op := s.do(&Move{ Parent: node, Meta: Meta{ Time: s.timestamp(d.Position, d.Size), Items: []KeyValue{{Key: attr, Value: []byte(path[j])}}}, Child: s.findSpareID(), }) - node = lm[j-i].Child - s.operations = append(s.operations, lm[j-i]) + lm[j-i] = op.Move + node = op.Child + s.operations = append(s.operations, op) } mCopy := make([]KeyValue, len(m)) copy(mCopy, m) - lm[len(lm)-1] = s.do(&Move{ + op := s.do(&Move{ Parent: node, Meta: Meta{ Time: s.timestamp(d.Position, d.Size), @@ -87,6 +88,7 @@ func (f *memoryForest) TreeAddByPath(d CIDDescriptor, treeID string, attr string }, Child: s.findSpareID(), }) + lm[len(lm)-1] = op.Move return lm, nil } diff --git a/pkg/local_object_storage/pilorama/inmemory.go b/pkg/local_object_storage/pilorama/inmemory.go index 8a6a3d8635..fbd84162c2 100644 --- a/pkg/local_object_storage/pilorama/inmemory.go +++ b/pkg/local_object_storage/pilorama/inmemory.go @@ -6,9 +6,15 @@ type nodeInfo struct { Meta Meta } +type move struct { + Move + HasOld bool + Old nodeInfo +} + // state represents state being replicated. type state struct { - operations []LogMove + operations []move tree } @@ -20,7 +26,7 @@ func newState() *state { } // undo un-does op and changes s in-place. -func (s *state) undo(op *LogMove) { +func (s *state) undo(op *move) { children := s.tree.childMap[op.Parent] for i := range children { if children[i] == op.Child { @@ -76,8 +82,8 @@ func (s *state) Apply(op *Move) error { } // do performs a single move operation on a tree. -func (s *state) do(op *Move) LogMove { - lm := LogMove{ +func (s *state) do(op *Move) move { + lm := move{ Move: Move{ Parent: op.Parent, Meta: op.Meta, diff --git a/pkg/local_object_storage/pilorama/types.go b/pkg/local_object_storage/pilorama/types.go index 090fe3dc08..4a859a7e4a 100644 --- a/pkg/local_object_storage/pilorama/types.go +++ b/pkg/local_object_storage/pilorama/types.go @@ -36,11 +36,7 @@ type Move struct { } // LogMove represents log record for a single move operation. -type LogMove struct { - Move - HasOld bool - Old nodeInfo -} +type LogMove = Move const ( // RootID represents the ID of a root node.