backend/internetarchive: fix uploads that can take a very long time

* fill in empty values for non-wait mode
* add tracking metadata to observe file change
* completely remove getHashes
* remove unreliable update tests

Closes #6150
This commit is contained in:
Lesmiscore 2022-05-08 05:54:14 +09:00 committed by GitHub
parent a3d4307892
commit 1d6d41fb91
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23

View file

@ -28,6 +28,7 @@ import (
"github.com/rclone/rclone/lib/bucket" "github.com/rclone/rclone/lib/bucket"
"github.com/rclone/rclone/lib/encoder" "github.com/rclone/rclone/lib/encoder"
"github.com/rclone/rclone/lib/pacer" "github.com/rclone/rclone/lib/pacer"
"github.com/rclone/rclone/lib/random"
"github.com/rclone/rclone/lib/rest" "github.com/rclone/rclone/lib/rest"
) )
@ -129,6 +130,7 @@ type IAFile struct {
// Source string `json:"source"` // Source string `json:"source"`
Mtime string `json:"mtime"` Mtime string `json:"mtime"`
RcloneMtime json.RawMessage `json:"rclone-mtime"` RcloneMtime json.RawMessage `json:"rclone-mtime"`
UpdateTrack json.RawMessage `json:"rclone-update-track"`
Size string `json:"size"` Size string `json:"size"`
Md5 string `json:"md5"` Md5 string `json:"md5"`
Crc32 string `json:"crc32"` Crc32 string `json:"crc32"`
@ -294,7 +296,7 @@ func (o *Object) Storable() bool {
return true return true
} }
// SetModTime is not supported // SetModTime sets modTime on a particular file
func (o *Object) SetModTime(ctx context.Context, t time.Time) (err error) { func (o *Object) SetModTime(ctx context.Context, t time.Time) (err error) {
bucket, reqDir := o.split() bucket, reqDir := o.split()
if bucket == "" { if bucket == "" {
@ -483,6 +485,7 @@ func (f *Fs) Copy(ctx context.Context, src fs.Object, remote string) (_ fs.Objec
return nil, fs.ErrorCantCopy return nil, fs.ErrorCantCopy
} }
updateTracker := random.String(32)
headers := map[string]string{ headers := map[string]string{
"x-archive-auto-make-bucket": "1", "x-archive-auto-make-bucket": "1",
"x-archive-queue-derive": "0", "x-archive-queue-derive": "0",
@ -495,6 +498,7 @@ func (f *Fs) Copy(ctx context.Context, src fs.Object, remote string) (_ fs.Objec
"x-archive-filemeta-size": fmt.Sprint(srcObj.size), "x-archive-filemeta-size": fmt.Sprint(srcObj.size),
// add this too for sure // add this too for sure
"x-archive-filemeta-rclone-mtime": srcObj.modTime.Format(time.RFC3339Nano), "x-archive-filemeta-rclone-mtime": srcObj.modTime.Format(time.RFC3339Nano),
"x-archive-filemeta-rclone-update-track": updateTracker,
} }
// make a PUT request at (IAS3)/:item/:path without body // make a PUT request at (IAS3)/:item/:path without body
@ -515,7 +519,7 @@ func (f *Fs) Copy(ctx context.Context, src fs.Object, remote string) (_ fs.Objec
// we can't update/find metadata here as IA will also // we can't update/find metadata here as IA will also
// queue server-side copy as well as upload/delete. // queue server-side copy as well as upload/delete.
return f.waitFileUpload(ctx, trimPathPrefix(path.Join(dstBucket, dstPath), f.root, f.opt.Enc), f.getHashes(ctx, src), srcObj.size) return f.waitFileUpload(ctx, trimPathPrefix(path.Join(dstBucket, dstPath), f.root, f.opt.Enc), updateTracker, srcObj.size)
} }
// ListR lists the objects and directories of the Fs starting // ListR lists the objects and directories of the Fs starting
@ -660,12 +664,14 @@ func (o *Object) Update(ctx context.Context, in io.Reader, src fs.ObjectInfo, op
bucket, bucketPath := o.split() bucket, bucketPath := o.split()
modTime := src.ModTime(ctx) modTime := src.ModTime(ctx)
size := src.Size() size := src.Size()
updateTracker := random.String(32)
// Set the mtime in the metadata // Set the mtime in the metadata
// internetarchive backend builds at header level as IAS3 has extension outside X-Amz- // internetarchive backend builds at header level as IAS3 has extension outside X-Amz-
headers := map[string]string{ headers := map[string]string{
// https://github.com/jjjake/internetarchive/blob/2456376533251df9d05e0a14d796ec1ced4959f5/internetarchive/iarequest.py#L158 // https://github.com/jjjake/internetarchive/blob/2456376533251df9d05e0a14d796ec1ced4959f5/internetarchive/iarequest.py#L158
"x-amz-filemeta-rclone-mtime": modTime.Format(time.RFC3339Nano), "x-amz-filemeta-rclone-mtime": modTime.Format(time.RFC3339Nano),
"x-amz-filemeta-rclone-update-track": updateTracker,
// we add some more headers for intuitive actions // we add some more headers for intuitive actions
"x-amz-auto-make-bucket": "1", // create an item if does not exist, do nothing if already "x-amz-auto-make-bucket": "1", // create an item if does not exist, do nothing if already
@ -712,7 +718,7 @@ func (o *Object) Update(ctx context.Context, in io.Reader, src fs.ObjectInfo, op
// or we have to wait for finish? (needs polling (frontend)/metadata/:item or scraping (frontend)/history/:item) // or we have to wait for finish? (needs polling (frontend)/metadata/:item or scraping (frontend)/history/:item)
var newObj *Object var newObj *Object
if err == nil { if err == nil {
newObj, err = o.fs.waitFileUpload(ctx, o.remote, o.fs.getHashes(ctx, src), size) newObj, err = o.fs.waitFileUpload(ctx, o.remote, updateTracker, size)
} else { } else {
newObj = &Object{} newObj = &Object{}
} }
@ -782,18 +788,6 @@ func (o *Object) split() (bucket, bucketPath string) {
return o.fs.split(o.remote) return o.fs.split(o.remote)
} }
// getHashes collects the checksums that src can provide for each hash
// type supported by this Fs. Types whose Hash call errors or returns an
// empty sum are skipped, so the map may contain fewer entries than
// f.Hashes() advertises.
func (f *Fs) getHashes(ctx context.Context, src fs.ObjectInfo) map[hash.Type]string {
hashMap := map[hash.Type]string{}
for _, ty := range f.Hashes().Array() {
sum, err := src.Hash(ctx, ty)
// skip hash types the source object cannot supply
if err != nil || sum == "" {
continue
}
hashMap[ty] = sum
}
return hashMap
}
func (f *Fs) requestMetadata(ctx context.Context, bucket string) (result MetadataResponse, err error) { func (f *Fs) requestMetadata(ctx context.Context, bucket string) (result MetadataResponse, err error) {
var resp *http.Response var resp *http.Response
// make a GET request to (frontend)/metadata/:item/ // make a GET request to (frontend)/metadata/:item/
@ -852,7 +846,7 @@ func (f *Fs) listAllUnconstrained(ctx context.Context, bucket string) (entries f
return entries, nil return entries, nil
} }
func (f *Fs) waitFileUpload(ctx context.Context, reqPath string, newHashes map[hash.Type]string, newSize int64) (ret *Object, err error) { func (f *Fs) waitFileUpload(ctx context.Context, reqPath, tracker string, newSize int64) (ret *Object, err error) {
bucket, bucketPath := f.split(reqPath) bucket, bucketPath := f.split(reqPath)
ret = &Object{ ret = &Object{
@ -869,6 +863,10 @@ func (f *Fs) waitFileUpload(ctx context.Context, reqPath string, newHashes map[h
ret2, ok := ret2.(*Object) ret2, ok := ret2.(*Object)
if ok { if ok {
ret = ret2 ret = ret2
ret.crc32 = ""
ret.md5 = ""
ret.sha1 = ""
ret.size = -1
} }
} }
return ret, nil return ret, nil
@ -881,9 +879,6 @@ func (f *Fs) waitFileUpload(ctx context.Context, reqPath string, newHashes map[h
go func() { go func() {
isFirstTime := true isFirstTime := true
existed := false existed := false
oldMtime := ""
oldCrc32 := ""
unreliablePassCount := 0
for { for {
if !isFirstTime { if !isFirstTime {
// depending on the queue, it takes time // depending on the queue, it takes time
@ -908,10 +903,6 @@ func (f *Fs) waitFileUpload(ctx context.Context, reqPath string, newHashes map[h
if isFirstTime { if isFirstTime {
isFirstTime = false isFirstTime = false
existed = iaFile != nil existed = iaFile != nil
if iaFile != nil {
oldMtime = iaFile.Mtime
oldCrc32 = iaFile.Crc32
}
} }
if iaFile == nil { if iaFile == nil {
continue continue
@ -925,38 +916,20 @@ func (f *Fs) waitFileUpload(ctx context.Context, reqPath string, newHashes map[h
return return
} }
hashMatched := true fileTrackers, _ := listOrString(iaFile.UpdateTrack)
for tt, sum := range newHashes { trackerMatch := false
if tt == hash.MD5 && !hash.Equals(iaFile.Md5, sum) { for _, v := range fileTrackers {
hashMatched = false if v == tracker {
break trackerMatch = true
}
if tt == hash.SHA1 && !hash.Equals(iaFile.Sha1, sum) {
hashMatched = false
break
}
if tt == hash.CRC32 && !hash.Equals(iaFile.Crc32, sum) {
hashMatched = false
break break
} }
} }
if !hashMatched { if !trackerMatch {
continue continue
} }
if !compareSize(parseSize(iaFile.Size), newSize) { if !compareSize(parseSize(iaFile.Size), newSize) {
continue continue
} }
if hash.Equals(oldCrc32, iaFile.Crc32) && unreliablePassCount < 60 {
// the following two are based on a sort of "bad" assumption;
// what if the file is updated immediately, before polling?
// by limiting hits of these tests, avoid infinite loop
unreliablePassCount++
continue
}
if hash.Equals(iaFile.Mtime, oldMtime) && unreliablePassCount < 60 {
unreliablePassCount++
continue
}
// voila! // voila!
retC <- struct { retC <- struct {
@ -1036,20 +1009,24 @@ func makeValidObject2(f *Fs, file IAFile, bucket string) *Object {
return makeValidObject(f, trimPathPrefix(path.Join(bucket, file.Name), f.root, f.opt.Enc), file, mtimeTime, size) return makeValidObject(f, trimPathPrefix(path.Join(bucket, file.Name), f.root, f.opt.Enc), file, mtimeTime, size)
} }
func (file IAFile) parseMtime() (mtime time.Time) { func listOrString(jm json.RawMessage) (rmArray []string, err error) {
// method 1: use metadata added by rclone
var rmArray []string
// rclone-metadata can be an array or string // rclone-metadata can be an array or string
// try to deserialize it as array first // try to deserialize it as array first
err := json.Unmarshal(file.RcloneMtime, &rmArray) err = json.Unmarshal(jm, &rmArray)
if err != nil { if err != nil {
// if not, it's a string // if not, it's a string
dst := new(string) dst := new(string)
err = json.Unmarshal(file.RcloneMtime, dst) err = json.Unmarshal(jm, dst)
if err == nil { if err == nil {
rmArray = []string{*dst} rmArray = []string{*dst}
} }
} }
return
}
func (file IAFile) parseMtime() (mtime time.Time) {
// method 1: use metadata added by rclone
rmArray, err := listOrString(file.RcloneMtime)
// let's take the first value we can deserialize // let's take the first value we can deserialize
for _, value := range rmArray { for _, value := range rmArray {
mtime, err = time.Parse(time.RFC3339Nano, value) mtime, err = time.Parse(time.RFC3339Nano, value)