forked from TrueCloudLab/frostfs-node
[#970] fstree: Add linux-specific file writer using O_TMPFILE
O_TMPFILE is implemented for all modern FSes and it's much easier and safer to use. If application crashes in the middle of writing this file would be gone and won't leave any garbage. Notice that this implementation makes a different choice wrt EEXIST handling, generic one always overwrites, while this one keeps the old data. There is no real performance difference. SSD (slow&old), XFS, Core i7-8565U: Sync ``` name old time/op new time/op delta Put/size=1024,thread=1/fstree-8 1.74ms ± 3% 0.06ms ± 7% -96.31% (p=0.000 n=10+10) Put/size=1024,thread=20/fstree-8 10.0ms ±41% 1.1ms ±18% -88.95% (p=0.000 n=9+10) Put/size=1024,thread=100/fstree-8 32.3ms ±60% 6.5ms ±14% -79.97% (p=0.000 n=10+10) Put/size=1048576,thread=1/fstree-8 17.8ms ±90% 3.4ms ±70% -81.08% (p=0.000 n=10+10) Put/size=1048576,thread=20/fstree-8 103ms ±174% 112ms ±158% ~ (p=0.971 n=10+10) Put/size=1048576,thread=100/fstree-8 949ms ±78% 583ms ±132% ~ (p=0.089 n=10+10) name old alloc/op new alloc/op delta Put/size=1024,thread=1/fstree-8 3.17kB ± 1% 1.96kB ± 0% -38.09% (p=0.000 n=10+10) Put/size=1024,thread=20/fstree-8 59.6kB ± 1% 39.2kB ± 1% -34.30% (p=0.000 n=8+10) Put/size=1024,thread=100/fstree-8 299kB ± 0% 198kB ± 0% -33.90% (p=0.000 n=7+9) Put/size=1048576,thread=1/fstree-8 3.38kB ± 1% 2.36kB ± 1% -30.22% (p=0.000 n=10+10) Put/size=1048576,thread=20/fstree-8 65.7kB ± 4% 47.7kB ± 6% -27.27% (p=0.000 n=10+10) Put/size=1048576,thread=100/fstree-8 351kB ± 8% 245kB ± 8% -30.22% (p=0.000 n=10+10) name old allocs/op new allocs/op delta Put/size=1024,thread=1/fstree-8 30.3 ± 2% 21.0 ± 0% -30.69% (p=0.000 n=10+10) Put/size=1024,thread=20/fstree-8 554 ± 1% 413 ± 0% -25.35% (p=0.000 n=8+10) Put/size=1024,thread=100/fstree-8 2.77k ± 0% 2.07k ± 0% -25.27% (p=0.000 n=7+10) Put/size=1048576,thread=1/fstree-8 32.0 ± 0% 25.0 ± 0% -21.88% (p=0.000 n=9+8) Put/size=1048576,thread=20/fstree-8 609 ± 5% 494 ± 6% -18.93% (p=0.000 n=10+10) Put/size=1048576,thread=100/fstree-8 3.25k ± 9% 2.50k ± 8% -23.21% (p=0.000 n=10+10) ``` No sync ``` name old time/op new time/op delta Put/size=1024,thread=1/fstree-8 71.3µs ±10% 59.8µs ±10% -16.21% (p=0.000 n=10+10) Put/size=1024,thread=20/fstree-8 1.43ms ± 6% 1.22ms ±13% -14.53% (p=0.000 n=10+10) Put/size=1024,thread=100/fstree-8 8.12ms ± 3% 6.36ms ± 2% -21.67% (p=0.000 n=8+9) Put/size=1048576,thread=1/fstree-8 1.88ms ±70% 1.61ms ±78% ~ (p=0.393 n=10+10) Put/size=1048576,thread=20/fstree-8 32.7ms ±28% 34.2ms ±112% ~ (p=0.968 n=9+10) Put/size=1048576,thread=100/fstree-8 262ms ±56% 226ms ±34% ~ (p=0.447 n=10+9) name old alloc/op new alloc/op delta Put/size=1024,thread=1/fstree-8 2.89kB ± 0% 1.96kB ± 0% -32.28% (p=0.000 n=10+10) Put/size=1024,thread=20/fstree-8 58.2kB ± 0% 39.5kB ± 0% -32.09% (p=0.000 n=8+8) Put/size=1024,thread=100/fstree-8 291kB ± 0% 198kB ± 0% -32.19% (p=0.000 n=9+9) Put/size=1048576,thread=1/fstree-8 3.05kB ± 1% 2.13kB ± 1% -30.16% (p=0.000 n=10+9) Put/size=1048576,thread=20/fstree-8 62.6kB ± 0% 44.3kB ± 0% -29.23% (p=0.000 n=9+9) Put/size=1048576,thread=100/fstree-8 302kB ± 0% 210kB ± 1% -30.39% (p=0.000 n=9+9) name old allocs/op new allocs/op delta Put/size=1024,thread=1/fstree-8 27.0 ± 0% 21.0 ± 0% -22.22% (p=0.000 n=10+10) Put/size=1024,thread=20/fstree-8 539 ± 0% 415 ± 0% -22.98% (p=0.000 n=10+10) Put/size=1024,thread=100/fstree-8 2.69k ± 0% 2.07k ± 0% -23.09% (p=0.000 n=9+9) Put/size=1048576,thread=1/fstree-8 28.0 ± 0% 22.3 ± 3% -20.36% (p=0.000 n=8+10) Put/size=1048576,thread=20/fstree-8 577 ± 0% 458 ± 0% -20.72% (p=0.000 n=9+9) Put/size=1048576,thread=100/fstree-8 2.76k ± 0% 2.15k ± 0% -22.05% (p=0.000 n=9+8) ``` HDD (LVM), ext4, Ryzen 5 1600: Sync ``` │ fs.sync-generic │ fs.sync-linux │ │ sec/op │ sec/op vs base │ Put/size=1024,thread=1/fstree-12 34.70m ± 19% 33.59m ± 16% ~ (p=0.529 n=10) Put/size=1024,thread=20/fstree-12 188.8m ± 8% 189.2m ± 16% ~ (p=0.739 n=10) Put/size=1024,thread=100/fstree-12 264.8m ± 22% 273.6m ± 28% ~ (p=0.353 n=10) Put/size=1048576,thread=1/fstree-12 54.90m ± 14% 47.08m ± 18% ~ (p=0.063 n=10) Put/size=1048576,thread=20/fstree-12 244.1m ± 14% 220.4m ± 22% ~ (p=0.579 n=10) Put/size=1048576,thread=100/fstree-12 847.2m ± 5% 893.6m ± 3% +5.48% (p=0.000 n=10) geomean 164.3m 158.9m -3.29% │ fs.sync-generic │ fs.sync-linux │ │ B/op │ B/op vs base │ Put/size=1024,thread=1/fstree-12 3.375Ki ± 1% 2.471Ki ± 1% -26.80% (p=0.000 n=10) Put/size=1024,thread=20/fstree-12 66.62Ki ± 6% 49.21Ki ± 6% -26.15% (p=0.000 n=10) Put/size=1024,thread=100/fstree-12 319.2Ki ± 1% 230.9Ki ± 2% -27.64% (p=0.000 n=10) Put/size=1048576,thread=1/fstree-12 3.457Ki ± 1% 2.559Ki ± 1% -25.97% (p=0.000 n=10) Put/size=1048576,thread=20/fstree-12 66.91Ki ± 1% 49.16Ki ± 1% -26.52% (p=0.000 n=10) Put/size=1048576,thread=100/fstree-12 338.8Ki ± 2% 252.3Ki ± 3% -25.54% (p=0.000 n=10) geomean 42.17Ki 31.02Ki -26.44% │ fs.sync-generic │ fs.sync-linux │ │ allocs/op │ allocs/op vs base │ Put/size=1024,thread=1/fstree-12 33.00 ± 0% 27.00 ± 0% -18.18% (p=0.000 n=10) Put/size=1024,thread=20/fstree-12 639.5 ± 1% 519.0 ± 2% -18.84% (p=0.000 n=10) Put/size=1024,thread=100/fstree-12 3.059k ± 1% 2.478k ± 2% -18.99% (p=0.000 n=10) Put/size=1048576,thread=1/fstree-12 33.50 ± 1% 28.00 ± 4% -16.42% (p=0.000 n=10) Put/size=1048576,thread=20/fstree-12 638.5 ± 1% 520.0 ± 1% -18.56% (p=0.000 n=10) Put/size=1048576,thread=100/fstree-12 3.209k ± 2% 2.655k ± 2% -17.28% (p=0.000 n=10) geomean 405.3 332.1 -18.05% ``` No sync ``` │ fs.nosync-generic │ fs.nosync-linux │ │ sec/op │ sec/op vs base │ Put/size=1024,thread=1/fstree-12 148.2µ ± 20% 136.6µ ± 19% -7.89% (p=0.029 n=10) Put/size=1024,thread=20/fstree-12 1.140m ± 26% 1.364m ± 16% ~ (p=0.143 n=10) Put/size=1024,thread=100/fstree-12 11.93m ± 68% 26.89m ± 62% ~ (p=0.123 n=10) Put/size=1048576,thread=1/fstree-12 1.302m ± 3% 1.287m ± 5% ~ (p=0.481 n=10) Put/size=1048576,thread=20/fstree-12 77.52m ± 8% 74.07m ± 7% ~ (p=0.278 n=10+9) Put/size=1048576,thread=100/fstree-12 226.1m ± ∞ ¹ geomean 5.986m 3.434m +18.60% ² ¹ need >= 6 samples for confidence interval at level 0.95 ² benchmark set differs from baseline; geomeans may not be comparable │ fs.nosync-generic │ fs.nosync-linux │ │ B/op │ B/op vs base │ Put/size=1024,thread=1/fstree-12 2.879Ki ± 0% 1.972Ki ± 0% -31.51% (p=0.000 n=10) Put/size=1024,thread=20/fstree-12 55.94Ki ± 1% 37.90Ki ± 1% -32.25% (p=0.000 n=10) Put/size=1024,thread=100/fstree-12 272.6Ki ± 0% 182.1Ki ± 9% -33.21% (p=0.000 n=10) Put/size=1048576,thread=1/fstree-12 3.158Ki ± 0% 2.259Ki ± 0% -28.46% (p=0.000 n=10) Put/size=1048576,thread=20/fstree-12 58.87Ki ± 0% 41.03Ki ± 0% -30.30% (p=0.000 n=10+9) Put/size=1048576,thread=100/fstree-12 299.8Ki ± ∞ ¹ geomean 36.71Ki 16.60Ki -31.17% ² ¹ need >= 6 samples for confidence interval at level 0.95 ² benchmark set differs from baseline; geomeans may not be comparable │ fs.nosync-generic │ fs.nosync-linux │ │ allocs/op │ allocs/op vs base │ Put/size=1024,thread=1/fstree-12 28.00 ± 0% 22.00 ± 0% -21.43% (p=0.000 n=10) Put/size=1024,thread=20/fstree-12 530.0 ± 0% 407.5 ± 1% -23.11% (p=0.000 n=10) Put/size=1024,thread=100/fstree-12 2.567k ± 0% 1.956k ± 9% -23.77% (p=0.000 n=10) Put/size=1048576,thread=1/fstree-12 30.00 ± 0% 24.00 ± 0% -20.00% (p=0.000 n=10) Put/size=1048576,thread=20/fstree-12 553.5 ± 0% 434.0 ± 0% -21.59% (n=10+9) Put/size=1048576,thread=100/fstree-12 2.803k ± ∞ ¹ geomean 347.9 178.8 -21.99% ² ¹ need >= 6 samples for confidence interval at level 0.95 ² benchmark set differs from baseline; geomeans may not be comparable ``` Signed-off-by: Roman Khimov <roman@nspcc.ru> Signed-off-by: Evgenii Stratonikov <e.stratonikov@yadro.com>
This commit is contained in:
parent
ff488b53a1
commit
fc31b9c947
3 changed files with 51 additions and 1 deletions
2
go.mod
2
go.mod
|
@ -37,6 +37,7 @@ require (
|
||||||
go.uber.org/zap v1.26.0
|
go.uber.org/zap v1.26.0
|
||||||
golang.org/x/exp v0.0.0-20240119083558-1b970713d09a
|
golang.org/x/exp v0.0.0-20240119083558-1b970713d09a
|
||||||
golang.org/x/sync v0.6.0
|
golang.org/x/sync v0.6.0
|
||||||
|
golang.org/x/sys v0.16.0
|
||||||
golang.org/x/term v0.16.0
|
golang.org/x/term v0.16.0
|
||||||
google.golang.org/grpc v1.61.0
|
google.golang.org/grpc v1.61.0
|
||||||
google.golang.org/protobuf v1.32.0
|
google.golang.org/protobuf v1.32.0
|
||||||
|
@ -120,7 +121,6 @@ require (
|
||||||
go.uber.org/multierr v1.11.0 // indirect
|
go.uber.org/multierr v1.11.0 // indirect
|
||||||
golang.org/x/crypto v0.18.0 // indirect
|
golang.org/x/crypto v0.18.0 // indirect
|
||||||
golang.org/x/net v0.20.0 // indirect
|
golang.org/x/net v0.20.0 // indirect
|
||||||
golang.org/x/sys v0.16.0 // indirect
|
|
||||||
golang.org/x/text v0.14.0 // indirect
|
golang.org/x/text v0.14.0 // indirect
|
||||||
google.golang.org/genproto/googleapis/api v0.0.0-20240123012728-ef4313101c80 // indirect
|
google.golang.org/genproto/googleapis/api v0.0.0-20240123012728-ef4313101c80 // indirect
|
||||||
google.golang.org/genproto/googleapis/rpc v0.0.0-20240123012728-ef4313101c80 // indirect
|
google.golang.org/genproto/googleapis/rpc v0.0.0-20240123012728-ef4313101c80 // indirect
|
||||||
|
|
|
@ -1,3 +1,5 @@
|
||||||
|
//go:build !linux
|
||||||
|
|
||||||
package fstree
|
package fstree
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
|
|
@ -0,0 +1,48 @@
|
||||||
|
//go:build linux
|
||||||
|
|
||||||
|
package fstree
|
||||||
|
|
||||||
|
import (
|
||||||
|
"errors"
|
||||||
|
"strconv"
|
||||||
|
|
||||||
|
"git.frostfs.info/TrueCloudLab/frostfs-node/pkg/local_object_storage/blobstor/common"
|
||||||
|
"golang.org/x/sys/unix"
|
||||||
|
)
|
||||||
|
|
||||||
|
func (t *FSTree) writeData(p string, data []byte) error {
|
||||||
|
err := t.writeFile(p, data)
|
||||||
|
if errors.Is(err, unix.ENOSPC) {
|
||||||
|
return common.ErrNoSpace
|
||||||
|
}
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
func (t *FSTree) writeFile(p string, data []byte) error {
|
||||||
|
flags := unix.O_WRONLY | unix.O_TMPFILE | unix.O_CLOEXEC
|
||||||
|
if !t.noSync {
|
||||||
|
flags |= unix.O_DSYNC
|
||||||
|
}
|
||||||
|
fd, err := unix.Open(t.RootPath, flags, uint32(t.Permissions))
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
tmpPath := "/proc/self/fd/" + strconv.FormatUint(uint64(fd), 10)
|
||||||
|
n, err := unix.Write(fd, data)
|
||||||
|
if err == nil {
|
||||||
|
if n == len(data) {
|
||||||
|
err = unix.Linkat(unix.AT_FDCWD, tmpPath, unix.AT_FDCWD, p, unix.AT_SYMLINK_FOLLOW)
|
||||||
|
if errors.Is(err, unix.EEXIST) {
|
||||||
|
// https://github.com/nspcc-dev/neofs-node/issues/2563
|
||||||
|
err = nil
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
err = errors.New("incomplete write")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
errClose := unix.Close(fd)
|
||||||
|
if err != nil {
|
||||||
|
return err // Close() error is ignored, we have a better one.
|
||||||
|
}
|
||||||
|
return errClose
|
||||||
|
}
|
Loading…
Reference in a new issue