From fc31b9c947bf7185ec252e2e63d615a6c59b16ab Mon Sep 17 00:00:00 2001 From: Roman Khimov Date: Sat, 9 Sep 2023 11:16:06 +0300 Subject: [PATCH] [#970] fstree: Add linux-specific file writer using O_TMPFILE MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit O_TMPFILE is implemented for all modern FSes and it's much easier and safer to use. If application crashes in the middle of writing this file would be gone and won't leave any garbage. Notice that this implementation makes a different choice wrt EEXIST handling, generic one always overwrites, while this one keeps the old data. There is no real performance difference. SSD (slow&old), XFS, Core i7-8565U: Sync ``` name old time/op new time/op delta Put/size=1024,thread=1/fstree-8 1.74ms ± 3% 0.06ms ± 7% -96.31% (p=0.000 n=10+10) Put/size=1024,thread=20/fstree-8 10.0ms ±41% 1.1ms ±18% -88.95% (p=0.000 n=9+10) Put/size=1024,thread=100/fstree-8 32.3ms ±60% 6.5ms ±14% -79.97% (p=0.000 n=10+10) Put/size=1048576,thread=1/fstree-8 17.8ms ±90% 3.4ms ±70% -81.08% (p=0.000 n=10+10) Put/size=1048576,thread=20/fstree-8 103ms ±174% 112ms ±158% ~ (p=0.971 n=10+10) Put/size=1048576,thread=100/fstree-8 949ms ±78% 583ms ±132% ~ (p=0.089 n=10+10) name old alloc/op new alloc/op delta Put/size=1024,thread=1/fstree-8 3.17kB ± 1% 1.96kB ± 0% -38.09% (p=0.000 n=10+10) Put/size=1024,thread=20/fstree-8 59.6kB ± 1% 39.2kB ± 1% -34.30% (p=0.000 n=8+10) Put/size=1024,thread=100/fstree-8 299kB ± 0% 198kB ± 0% -33.90% (p=0.000 n=7+9) Put/size=1048576,thread=1/fstree-8 3.38kB ± 1% 2.36kB ± 1% -30.22% (p=0.000 n=10+10) Put/size=1048576,thread=20/fstree-8 65.7kB ± 4% 47.7kB ± 6% -27.27% (p=0.000 n=10+10) Put/size=1048576,thread=100/fstree-8 351kB ± 8% 245kB ± 8% -30.22% (p=0.000 n=10+10) name old allocs/op new allocs/op delta Put/size=1024,thread=1/fstree-8 30.3 ± 2% 21.0 ± 0% -30.69% (p=0.000 n=10+10) Put/size=1024,thread=20/fstree-8 554 ± 1% 413 ± 0% -25.35% (p=0.000 n=8+10) Put/size=1024,thread=100/fstree-8 2.77k ± 0% 2.07k ± 0% -25.27% (p=0.000 n=7+10) Put/size=1048576,thread=1/fstree-8 32.0 ± 0% 25.0 ± 0% -21.88% (p=0.000 n=9+8) Put/size=1048576,thread=20/fstree-8 609 ± 5% 494 ± 6% -18.93% (p=0.000 n=10+10) Put/size=1048576,thread=100/fstree-8 3.25k ± 9% 2.50k ± 8% -23.21% (p=0.000 n=10+10) ``` No sync ``` name old time/op new time/op delta Put/size=1024,thread=1/fstree-8 71.3µs ±10% 59.8µs ±10% -16.21% (p=0.000 n=10+10) Put/size=1024,thread=20/fstree-8 1.43ms ± 6% 1.22ms ±13% -14.53% (p=0.000 n=10+10) Put/size=1024,thread=100/fstree-8 8.12ms ± 3% 6.36ms ± 2% -21.67% (p=0.000 n=8+9) Put/size=1048576,thread=1/fstree-8 1.88ms ±70% 1.61ms ±78% ~ (p=0.393 n=10+10) Put/size=1048576,thread=20/fstree-8 32.7ms ±28% 34.2ms ±112% ~ (p=0.968 n=9+10) Put/size=1048576,thread=100/fstree-8 262ms ±56% 226ms ±34% ~ (p=0.447 n=10+9) name old alloc/op new alloc/op delta Put/size=1024,thread=1/fstree-8 2.89kB ± 0% 1.96kB ± 0% -32.28% (p=0.000 n=10+10) Put/size=1024,thread=20/fstree-8 58.2kB ± 0% 39.5kB ± 0% -32.09% (p=0.000 n=8+8) Put/size=1024,thread=100/fstree-8 291kB ± 0% 198kB ± 0% -32.19% (p=0.000 n=9+9) Put/size=1048576,thread=1/fstree-8 3.05kB ± 1% 2.13kB ± 1% -30.16% (p=0.000 n=10+9) Put/size=1048576,thread=20/fstree-8 62.6kB ± 0% 44.3kB ± 0% -29.23% (p=0.000 n=9+9) Put/size=1048576,thread=100/fstree-8 302kB ± 0% 210kB ± 1% -30.39% (p=0.000 n=9+9) name old allocs/op new allocs/op delta Put/size=1024,thread=1/fstree-8 27.0 ± 0% 21.0 ± 0% -22.22% (p=0.000 n=10+10) Put/size=1024,thread=20/fstree-8 539 ± 0% 415 ± 0% -22.98% (p=0.000 n=10+10) Put/size=1024,thread=100/fstree-8 2.69k ± 0% 2.07k ± 0% -23.09% (p=0.000 n=9+9) Put/size=1048576,thread=1/fstree-8 28.0 ± 0% 22.3 ± 3% -20.36% (p=0.000 n=8+10) Put/size=1048576,thread=20/fstree-8 577 ± 0% 458 ± 0% -20.72% (p=0.000 n=9+9) Put/size=1048576,thread=100/fstree-8 2.76k ± 0% 2.15k ± 0% -22.05% (p=0.000 n=9+8) ``` HDD (LVM), ext4, Ryzen 5 1600: Sync ``` │ fs.sync-generic │ fs.sync-linux │ │ sec/op │ sec/op vs base │ Put/size=1024,thread=1/fstree-12 34.70m ± 19% 33.59m ± 16% ~ (p=0.529 n=10) Put/size=1024,thread=20/fstree-12 188.8m ± 8% 189.2m ± 16% ~ (p=0.739 n=10) Put/size=1024,thread=100/fstree-12 264.8m ± 22% 273.6m ± 28% ~ (p=0.353 n=10) Put/size=1048576,thread=1/fstree-12 54.90m ± 14% 47.08m ± 18% ~ (p=0.063 n=10) Put/size=1048576,thread=20/fstree-12 244.1m ± 14% 220.4m ± 22% ~ (p=0.579 n=10) Put/size=1048576,thread=100/fstree-12 847.2m ± 5% 893.6m ± 3% +5.48% (p=0.000 n=10) geomean 164.3m 158.9m -3.29% │ fs.sync-generic │ fs.sync-linux │ │ B/op │ B/op vs base │ Put/size=1024,thread=1/fstree-12 3.375Ki ± 1% 2.471Ki ± 1% -26.80% (p=0.000 n=10) Put/size=1024,thread=20/fstree-12 66.62Ki ± 6% 49.21Ki ± 6% -26.15% (p=0.000 n=10) Put/size=1024,thread=100/fstree-12 319.2Ki ± 1% 230.9Ki ± 2% -27.64% (p=0.000 n=10) Put/size=1048576,thread=1/fstree-12 3.457Ki ± 1% 2.559Ki ± 1% -25.97% (p=0.000 n=10) Put/size=1048576,thread=20/fstree-12 66.91Ki ± 1% 49.16Ki ± 1% -26.52% (p=0.000 n=10) Put/size=1048576,thread=100/fstree-12 338.8Ki ± 2% 252.3Ki ± 3% -25.54% (p=0.000 n=10) geomean 42.17Ki 31.02Ki -26.44% │ fs.sync-generic │ fs.sync-linux │ │ allocs/op │ allocs/op vs base │ Put/size=1024,thread=1/fstree-12 33.00 ± 0% 27.00 ± 0% -18.18% (p=0.000 n=10) Put/size=1024,thread=20/fstree-12 639.5 ± 1% 519.0 ± 2% -18.84% (p=0.000 n=10) Put/size=1024,thread=100/fstree-12 3.059k ± 1% 2.478k ± 2% -18.99% (p=0.000 n=10) Put/size=1048576,thread=1/fstree-12 33.50 ± 1% 28.00 ± 4% -16.42% (p=0.000 n=10) Put/size=1048576,thread=20/fstree-12 638.5 ± 1% 520.0 ± 1% -18.56% (p=0.000 n=10) Put/size=1048576,thread=100/fstree-12 3.209k ± 2% 2.655k ± 2% -17.28% (p=0.000 n=10) geomean 405.3 332.1 -18.05% ``` No sync ``` │ fs.nosync-generic │ fs.nosync-linux │ │ sec/op │ sec/op vs base │ Put/size=1024,thread=1/fstree-12 148.2µ ± 20% 136.6µ ± 19% -7.89% (p=0.029 n=10) Put/size=1024,thread=20/fstree-12 1.140m ± 26% 1.364m ± 16% ~ (p=0.143 n=10) Put/size=1024,thread=100/fstree-12 11.93m ± 68% 26.89m ± 62% ~ (p=0.123 n=10) Put/size=1048576,thread=1/fstree-12 1.302m ± 3% 1.287m ± 5% ~ (p=0.481 n=10) Put/size=1048576,thread=20/fstree-12 77.52m ± 8% 74.07m ± 7% ~ (p=0.278 n=10+9) Put/size=1048576,thread=100/fstree-12 226.1m ± ∞ ¹ geomean 5.986m 3.434m +18.60% ² ¹ need >= 6 samples for confidence interval at level 0.95 ² benchmark set differs from baseline; geomeans may not be comparable │ fs.nosync-generic │ fs.nosync-linux │ │ B/op │ B/op vs base │ Put/size=1024,thread=1/fstree-12 2.879Ki ± 0% 1.972Ki ± 0% -31.51% (p=0.000 n=10) Put/size=1024,thread=20/fstree-12 55.94Ki ± 1% 37.90Ki ± 1% -32.25% (p=0.000 n=10) Put/size=1024,thread=100/fstree-12 272.6Ki ± 0% 182.1Ki ± 9% -33.21% (p=0.000 n=10) Put/size=1048576,thread=1/fstree-12 3.158Ki ± 0% 2.259Ki ± 0% -28.46% (p=0.000 n=10) Put/size=1048576,thread=20/fstree-12 58.87Ki ± 0% 41.03Ki ± 0% -30.30% (p=0.000 n=10+9) Put/size=1048576,thread=100/fstree-12 299.8Ki ± ∞ ¹ geomean 36.71Ki 16.60Ki -31.17% ² ¹ need >= 6 samples for confidence interval at level 0.95 ² benchmark set differs from baseline; geomeans may not be comparable │ fs.nosync-generic │ fs.nosync-linux │ │ allocs/op │ allocs/op vs base │ Put/size=1024,thread=1/fstree-12 28.00 ± 0% 22.00 ± 0% -21.43% (p=0.000 n=10) Put/size=1024,thread=20/fstree-12 530.0 ± 0% 407.5 ± 1% -23.11% (p=0.000 n=10) Put/size=1024,thread=100/fstree-12 2.567k ± 0% 1.956k ± 9% -23.77% (p=0.000 n=10) Put/size=1048576,thread=1/fstree-12 30.00 ± 0% 24.00 ± 0% -20.00% (p=0.000 n=10) Put/size=1048576,thread=20/fstree-12 553.5 ± 0% 434.0 ± 0% -21.59% (n=10+9) Put/size=1048576,thread=100/fstree-12 2.803k ± ∞ ¹ geomean 347.9 178.8 -21.99% ² ¹ need >= 6 samples for confidence interval at level 0.95 ² benchmark set differs from baseline; geomeans may not be comparable ``` Signed-off-by: Roman Khimov Signed-off-by: Evgenii Stratonikov --- go.mod | 2 +- .../blobstor/fstree/fstree_write_generic.go | 2 + .../blobstor/fstree/fstree_write_linux.go | 48 +++++++++++++++++++ 3 files changed, 51 insertions(+), 1 deletion(-) create mode 100644 pkg/local_object_storage/blobstor/fstree/fstree_write_linux.go diff --git a/go.mod b/go.mod index 5d6edc22..06d2d329 100644 --- a/go.mod +++ b/go.mod @@ -37,6 +37,7 @@ require ( go.uber.org/zap v1.26.0 golang.org/x/exp v0.0.0-20240119083558-1b970713d09a golang.org/x/sync v0.6.0 + golang.org/x/sys v0.16.0 golang.org/x/term v0.16.0 google.golang.org/grpc v1.61.0 google.golang.org/protobuf v1.32.0 @@ -120,7 +121,6 @@ require ( go.uber.org/multierr v1.11.0 // indirect golang.org/x/crypto v0.18.0 // indirect golang.org/x/net v0.20.0 // indirect - golang.org/x/sys v0.16.0 // indirect golang.org/x/text v0.14.0 // indirect google.golang.org/genproto/googleapis/api v0.0.0-20240123012728-ef4313101c80 // indirect google.golang.org/genproto/googleapis/rpc v0.0.0-20240123012728-ef4313101c80 // indirect diff --git a/pkg/local_object_storage/blobstor/fstree/fstree_write_generic.go b/pkg/local_object_storage/blobstor/fstree/fstree_write_generic.go index 41191403..0882be0d 100644 --- a/pkg/local_object_storage/blobstor/fstree/fstree_write_generic.go +++ b/pkg/local_object_storage/blobstor/fstree/fstree_write_generic.go @@ -1,3 +1,5 @@ +//go:build !linux + package fstree import ( diff --git a/pkg/local_object_storage/blobstor/fstree/fstree_write_linux.go b/pkg/local_object_storage/blobstor/fstree/fstree_write_linux.go new file mode 100644 index 00000000..edb7f0dc --- /dev/null +++ b/pkg/local_object_storage/blobstor/fstree/fstree_write_linux.go @@ -0,0 +1,48 @@ +//go:build linux + +package fstree + +import ( + "errors" + "strconv" + + "git.frostfs.info/TrueCloudLab/frostfs-node/pkg/local_object_storage/blobstor/common" + "golang.org/x/sys/unix" +) + +func (t *FSTree) writeData(p string, data []byte) error { + err := t.writeFile(p, data) + if errors.Is(err, unix.ENOSPC) { + return common.ErrNoSpace + } + return err +} + +func (t *FSTree) writeFile(p string, data []byte) error { + flags := unix.O_WRONLY | unix.O_TMPFILE | unix.O_CLOEXEC + if !t.noSync { + flags |= unix.O_DSYNC + } + fd, err := unix.Open(t.RootPath, flags, uint32(t.Permissions)) + if err != nil { + return err + } + tmpPath := "/proc/self/fd/" + strconv.FormatUint(uint64(fd), 10) + n, err := unix.Write(fd, data) + if err == nil { + if n == len(data) { + err = unix.Linkat(unix.AT_FDCWD, tmpPath, unix.AT_FDCWD, p, unix.AT_SYMLINK_FOLLOW) + if errors.Is(err, unix.EEXIST) { + // https://github.com/nspcc-dev/neofs-node/issues/2563 + err = nil + } + } else { + err = errors.New("incomplete write") + } + } + errClose := unix.Close(fd) + if err != nil { + return err // Close() error is ignored, we have a better one. + } + return errClose +}