forked from TrueCloudLab/rclone
f396550934
This patch makes rclone keep linux page cache usage under control when uploading local files to remote backends. When opening a file it issues FADV_SEQUENTIAL to configure read ahead strategy. While reading the file it issues FADV_DONTNEED every 128kB to free page cache from already consumed pages. ``` fadvise64(5, 0, 0, POSIX_FADV_SEQUENTIAL) = 0 read(5, "\324\375\251\376\213\361\240\224>\t5E\301\331X\274^\203oA\353\303.2'\206z\177N\27fB"..., 32768) = 32768 read(5, "\361\311\vW!\354_\317hf\276t\307\30L\351\272T\342C\243\370\240\213\355\210\v\221\201\177[\333"..., 32768) = 32768 read(5, ":\371\337Gn\355C\322\334 \253f\373\277\301;\215\n\240\347\305\6N\257\313\4\365\276ANq!"..., 32768) = 32768 read(5, "\312\243\360P\263\242\267H\304\240Y\310\367sT\321\256\6[b\310\224\361\344$Ms\234\5\314\306i"..., 32768) = 32768 fadvise64(5, 0, 131072, POSIX_FADV_DONTNEED) = 0 read(5, "m\251\7a\306\226\366-\v~\"\216\353\342~0\fht\315DK0\236.\\\201!A#\177\320"..., 32768) = 32768 read(5, "\7\324\207,\205\360\376\307\276\254\250\232\21G\323n\255\354\234\257P\322y\3502\37\246\21\334^42"..., 32768) = 32768 read(5, "e{*\225\223R\320\212EG:^\302\377\242\337\10\222J\16A\305\0\353\354\326P\336\357A|-"..., 32768) = 32768 read(5, "n\23XA4*R\352\234\257\364\355Y\204t9T\363\33\357\333\3674\246\221T\360\226\326G\354\374"..., 32768) = 32768 fadvise64(5, 131072, 131072, POSIX_FADV_DONTNEED) = 0 read(5, "SX\331\251}\24\353\37\310#\307|h%\372\34\310\3070YX\250s\2269\242\236\371\302z\357_"..., 32768) = 32768 read(5, "\177\3500\236Y\245\376NIY\177\360p!\337L]\2726\206@\240\246pG\213\254N\274\226\303\357"..., 32768) = 32768 read(5, "\242$*\364\217U\264]\221Y\245\342r\t\253\25Hr\363\263\364\336\322\t\325\325\f\37z\324\201\351"..., 32768) = 32768 read(5, "\2305\242\366\370\203tM\226<\230\25\316(9\25x\2\376\212\346Q\223 \353\225\323\264jf|\216"..., 32768) = 32768 fadvise64(5, 262144, 131072, POSIX_FADV_DONTNEED) = 0 ``` Page cache consumption per file can be checked with tools like 
[pcstat](https://github.com/tobert/pcstat). This patch does not have a performance impact. Please find below results of an experiment comparing local copy of 1GB file with and without this patch. With the patch: ``` (mmt/fadvise)$ pcstat 1GB.bin.1 +-----------+----------------+------------+-----------+---------+ | Name | Size (bytes) | Pages | Cached | Percent | |-----------+----------------+------------+-----------+---------| | 1GB.bin.1 | 1073741824 | 262144 | 0 | 000.000 | +-----------+----------------+------------+-----------+---------+ (mmt/fadvise)$ taskset -c 0 /usr/bin/time -v ./rclone copy 1GB.bin.1 /var/empty/rclone Command being timed: "./rclone copy 1GB.bin.1 /var/empty/rclone" User time (seconds): 13.19 System time (seconds): 1.12 Percent of CPU this job got: 96% Elapsed (wall clock) time (h:mm:ss or m:ss): 0:14.81 Average shared text size (kbytes): 0 Average unshared data size (kbytes): 0 Average stack size (kbytes): 0 Average total size (kbytes): 0 Maximum resident set size (kbytes): 27660 Average resident set size (kbytes): 0 Major (requiring I/O) page faults: 0 Minor (reclaiming a frame) page faults: 2212 Voluntary context switches: 5755 Involuntary context switches: 9782 Swaps: 0 File system inputs: 4155264 File system outputs: 2097152 Socket messages sent: 0 Socket messages received: 0 Signals delivered: 0 Page size (bytes): 4096 Exit status: 0 (mmt/fadvise)$ pcstat 1GB.bin.1 +-----------+----------------+------------+-----------+---------+ | Name | Size (bytes) | Pages | Cached | Percent | |-----------+----------------+------------+-----------+---------| | 1GB.bin.1 | 1073741824 | 262144 | 0 | 000.000 | +-----------+----------------+------------+-----------+---------+ ``` Without the patch: ``` (master)$ taskset -c 0 /usr/bin/time -v ./rclone copy 1GB.bin.1 /var/empty/rclone Command being timed: "./rclone copy 1GB.bin.1 /var/empty/rclone" User time (seconds): 14.46 System time (seconds): 0.81 Percent of CPU this job got: 93% Elapsed (wall clock) 
time (h:mm:ss or m:ss): 0:16.41 Average shared text size (kbytes): 0 Average unshared data size (kbytes): 0 Average stack size (kbytes): 0 Average total size (kbytes): 0 Maximum resident set size (kbytes): 27600 Average resident set size (kbytes): 0 Major (requiring I/O) page faults: 0 Minor (reclaiming a frame) page faults: 2228 Voluntary context switches: 7190 Involuntary context switches: 1980 Swaps: 0 File system inputs: 2097152 File system outputs: 2097152 Socket messages sent: 0 Socket messages received: 0 Signals delivered: 0 Page size (bytes): 4096 Exit status: 0 (master)$ pcstat 1GB.bin.1 +-----------+----------------+------------+-----------+---------+ | Name | Size (bytes) | Pages | Cached | Percent | |-----------+----------------+------------+-----------+---------| | 1GB.bin.1 | 1073741824 | 262144 | 262144 | 100.000 | +-----------+----------------+------------+-----------+---------+ ```
129 lines
3.7 KiB
Go
129 lines
3.7 KiB
Go
//+build linux
|
|
|
|
package local
|
|
|
|
import (
|
|
"io"
|
|
"os"
|
|
|
|
"github.com/rclone/rclone/fs"
|
|
"golang.org/x/sys/unix"
|
|
)
|
|
|
|
// fadvise provides means to automate freeing pages in kernel page cache for
|
|
// a given file descriptor as the file is sequentially processed (read or
|
|
// written).
|
|
//
|
|
// When copying a file to a remote backend all the file content is read by
|
|
// kernel and put to page cache to make future reads faster.
|
|
// This causes memory pressure visible in both memory usage and CPU consumption
|
|
// and can even cause OOM errors in applications consuming large amounts memory.
|
|
//
|
|
// In case of an upload to a remote backend, there is no benefits from caching.
|
|
//
|
|
// fadvise would orchestrate calling POSIX_FADV_DONTNEED
|
|
//
|
|
// POSIX_FADV_DONTNEED attempts to free cached pages associated
|
|
// with the specified region. This is useful, for example, while
|
|
// streaming large files. A program may periodically request the
|
|
// kernel to free cached data that has already been used, so that
|
|
// more useful cached pages are not discarded instead.
|
|
//
|
|
// Requests to discard partial pages are ignored. It is
|
|
// preferable to preserve needed data than discard unneeded data.
|
|
// If the application requires that data be considered for
|
|
// discarding, then offset and len must be page-aligned.
|
|
//
|
|
// The implementation may attempt to write back dirty pages in
|
|
// the specified region, but this is not guaranteed. Any
|
|
// unwritten dirty pages will not be freed. If the application
|
|
// wishes to ensure that dirty pages will be released, it should
|
|
// call fsync(2) or fdatasync(2) first.
|
|
type fadvise struct {
|
|
o *Object
|
|
fd int
|
|
lastPos int64
|
|
curPos int64
|
|
windowSize int64
|
|
}
|
|
|
|
// sequential configures readahead strategy in Linux kernel.
|
|
//
|
|
// Under Linux, POSIX_FADV_NORMAL sets the readahead window to the
|
|
// default size for the backing device; POSIX_FADV_SEQUENTIAL doubles
|
|
// this size, and POSIX_FADV_RANDOM disables file readahead entirely.
|
|
func (f *fadvise) sequential(limit int64) bool {
|
|
l := int64(0)
|
|
if limit > 0 {
|
|
l = limit
|
|
}
|
|
if err := unix.Fadvise(f.fd, f.curPos, l, unix.FADV_SEQUENTIAL); err != nil {
|
|
fs.Debugf(f.o, "fadvise sequential failed on file descriptor %d: %s", f.fd, err)
|
|
return false
|
|
}
|
|
|
|
return true
|
|
}
|
|
|
|
func (f *fadvise) next(n int) {
|
|
f.curPos += int64(n)
|
|
f.freePagesIfNeeded()
|
|
}
|
|
|
|
func (f *fadvise) freePagesIfNeeded() {
|
|
if f.curPos >= f.lastPos+f.windowSize {
|
|
f.freePages()
|
|
}
|
|
}
|
|
|
|
func (f *fadvise) freePages() {
|
|
if err := unix.Fadvise(f.fd, f.lastPos, f.curPos-f.lastPos, unix.FADV_DONTNEED); err != nil {
|
|
fs.Debugf(f.o, "fadvise dontneed failed on file descriptor %d: %s", f.fd, err)
|
|
}
|
|
f.lastPos = f.curPos
|
|
}
|
|
|
|
type fadviseReadCloser struct {
|
|
*fadvise
|
|
inner io.ReadCloser
|
|
}
|
|
|
|
// defaultWindowSize is the number of bytes that are consumed between two
// page cache releases: 32 memory pages (128 KiB with 4 KiB pages).
var defaultWindowSize = 32 * int64(os.Getpagesize())
|
|
|
|
// newFadviseReadCloser wraps os.File so that reading from that file would
|
|
// remove already consumed pages from kernel page cache.
|
|
// In addition to that it instructs kernel to double the readahead window to
|
|
// make sequential reads faster.
|
|
// See also fadvise.
|
|
func newFadviseReadCloser(o *Object, f *os.File, offset, limit int64) io.ReadCloser {
|
|
r := fadviseReadCloser{
|
|
fadvise: &fadvise{
|
|
o: o,
|
|
fd: int(f.Fd()),
|
|
lastPos: offset,
|
|
curPos: offset,
|
|
windowSize: defaultWindowSize,
|
|
},
|
|
inner: f,
|
|
}
|
|
|
|
// If syscall failed it's likely that the subsequent syscalls to that
|
|
// file descriptor would also fail. In that case return the provided os.File
|
|
// pointer.
|
|
if !r.sequential(limit) {
|
|
return f
|
|
}
|
|
|
|
return r
|
|
}
|
|
|
|
func (f fadviseReadCloser) Read(p []byte) (n int, err error) {
|
|
n, err = f.inner.Read(p)
|
|
f.next(n)
|
|
return
|
|
}
|
|
|
|
func (f fadviseReadCloser) Close() error {
|
|
f.freePages()
|
|
return f.inner.Close()
|
|
}
|