From 42dfadfa1b7c1be6f167b74539757dc9dc7ba62a Mon Sep 17 00:00:00 2001 From: "Lesmiscore (Naoya Ozaki)" Date: Sat, 9 Jul 2022 07:47:50 +0900 Subject: [PATCH] internetarchive: add support for Metadata --- backend/internetarchive/internetarchive.go | 196 +++++++++++++++++++-- docs/content/internetarchive.md | 27 +++ docs/content/overview.md | 2 +- 3 files changed, 208 insertions(+), 17 deletions(-) diff --git a/backend/internetarchive/internetarchive.go b/backend/internetarchive/internetarchive.go index ce021e35f..9d4dd36db 100644 --- a/backend/internetarchive/internetarchive.go +++ b/backend/internetarchive/internetarchive.go @@ -38,6 +38,84 @@ func init() { Name: "internetarchive", Description: "Internet Archive", NewFs: NewFs, + + MetadataInfo: &fs.MetadataInfo{ + System: map[string]fs.MetadataHelp{ + "name": { + Help: "Full file path, without the bucket part", + Type: "filename", + Example: "backend/internetarchive/internetarchive.go", + }, + "source": { + Help: "The source of the file", + Type: "string", + Example: "original", + }, + "mtime": { + Help: "Time of last modification, managed by Rclone", + Type: "RFC 3339", + Example: "2006-01-02T15:04:05.999999999Z", + }, + "size": { + Help: "File size in bytes", + Type: "decimal number", + Example: "123456", + }, + "md5": { + Help: "MD5 hash calculated by Internet Archive", + Type: "string", + Example: "01234567012345670123456701234567", + }, + "crc32": { + Help: "CRC32 calculated by Internet Archive", + Type: "string", + Example: "01234567", + }, + "sha1": { + Help: "SHA1 hash calculated by Internet Archive", + Type: "string", + Example: "0123456701234567012345670123456701234567", + }, + "format": { + Help: "Name of format identified by Internet Archive", + Type: "string", + Example: "Comma-Separated Values", + }, + "old_version": { + Help: "Whether the file was replaced and moved by keep-old-version flag", + Type: "boolean", + Example: "true", + }, + "viruscheck": { + Help: "The last time viruscheck process was 
run for the file (?)", + Type: "unixtime", + Example: "1654191352", + }, + + "rclone-ia-mtime": { + Help: "Time of last modification, managed by Internet Archive", + Type: "RFC 3339", + Example: "2006-01-02T15:04:05.999999999Z", + }, + "rclone-mtime": { + Help: "Time of last modification, managed by Rclone", + Type: "RFC 3339", + Example: "2006-01-02T15:04:05.999999999Z", + }, + "rclone-update-track": { + Help: "Random value used by Rclone for tracking changes inside Internet Archive", + Type: "string", + Example: "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + }, + }, + Help: `Metadata fields provided by Internet Archive. +If there are multiple values for a key, only the first one is returned. +This is a limitation of Rclone, that supports one value per one key. + +Owner is able to add custom keys. Metadata feature grabs all the keys including them. +`, + }, + Options: []fs.Option{{ Name: "access_key_id", Help: "IAS3 Access Key.\n\nLeave blank for anonymous access.\nYou can find one here: https://archive.org/account/s3.php", @@ -90,6 +168,14 @@ Only enable if you need to be guaranteed to be reflected after write operations. // maximum size of an item. 
this is constant across all items const iaItemMaxSize int64 = 1099511627776 +// metadata keys that are not writeable +var roMetadataKey = map[string]interface{}{ + // do not add mtime here, it's a documented exception + "name": nil, "source": nil, "size": nil, "md5": nil, + "crc32": nil, "sha1": nil, "format": nil, "old_version": nil, + "viruscheck": nil, +} + // Options defines the configuration for this backend type Options struct { AccessKeyID string `config:"access_key_id"` @@ -122,6 +208,7 @@ type Object struct { md5 string // md5 hash of the file presented by the server sha1 string // sha1 hash of the file presented by the server crc32 string // crc32 of the file presented by the server + rawData json.RawMessage } // IAFile reprensents a subset of object in MetadataResponse.Files @@ -135,6 +222,8 @@ type IAFile struct { Md5 string `json:"md5"` Crc32 string `json:"crc32"` Sha1 string `json:"sha1"` + + rawData json.RawMessage } // MetadataResponse reprensents subset of the JSON object returned by (frontend)/metadata/ @@ -143,6 +232,12 @@ type MetadataResponse struct { ItemSize int64 `json:"item_size"` } +// MetadataResponseRaw is the form of MetadataResponse to deal with metadata +type MetadataResponseRaw struct { + Files []json.RawMessage `json:"files"` + ItemSize int64 `json:"item_size"` +} + // ModMetadataResponse represents response for amending metadata type ModMetadataResponse struct { // https://archive.org/services/docs/api/md-write.html#example @@ -226,7 +321,10 @@ func NewFs(ctx context.Context, name, root string, m configmap.Mapper) (fs.Fs, e } f.setRoot(root) f.features = (&fs.Features{ - BucketBased: true, + BucketBased: true, + ReadMetadata: true, + WriteMetadata: true, + UserMetadata: true, }).Fill(ctx, f) f.srv = rest.NewClient(fshttp.NewClient(ctx)) @@ -307,18 +405,17 @@ func (o *Object) SetModTime(ctx context.Context, t time.Time) (err error) { } // https://archive.org/services/docs/api/md-write.html - var patch = []interface{}{ + // the 
following code might be useful for modifying metadata of an uploaded file + patch := []map[string]string{ // we should drop it first to clear all rclone-provided mtimes - struct { - Op string `json:"op"` - Path string `json:"path"` - }{"remove", "/rclone-mtime"}, - struct { - Op string `json:"op"` - Path string `json:"path"` - Value string `json:"value"` - }{"add", "/rclone-mtime", t.Format(time.RFC3339Nano)}, - } + { + "op": "remove", + "path": "/rclone-mtime", + }, { + "op": "add", + "path": "/rclone-mtime", + "value": t.Format(time.RFC3339Nano), + }} res, err := json.Marshal(patch) if err != nil { return err @@ -685,6 +782,23 @@ func (o *Object) Update(ctx context.Context, in io.Reader, src fs.ObjectInfo, op headers["Content-Length"] = fmt.Sprintf("%d", size) headers["x-archive-size-hint"] = fmt.Sprintf("%d", size) } + var mdata fs.Metadata + mdata, err = fs.GetMetadataOptions(ctx, src, options) + if err == nil && mdata != nil { + for mk, mv := range mdata { + mk = strings.ToLower(mk) + if strings.HasPrefix(mk, "rclone-") { + fs.LogPrintf(fs.LogLevelWarning, o, "reserved metadata key %s is about to set", mk) + } else if _, ok := roMetadataKey[mk]; ok { + fs.LogPrintf(fs.LogLevelWarning, o, "setting or modifying read-only key %s is requested, skipping", mk) + continue + } else if mk == "mtime" { + // redirect to make it work + mk = "rclone-mtime" + } + headers[fmt.Sprintf("x-amz-filemeta-%s", mk)] = mv + } + } // read the md5sum if available var md5sumHex string @@ -762,6 +876,34 @@ func (o *Object) String() string { return o.remote } +// Metadata returns all file metadata provided by Internet Archive +func (o *Object) Metadata(ctx context.Context) (m fs.Metadata, err error) { + if o.rawData == nil { + return nil, nil + } + raw := make(map[string]json.RawMessage) + err = json.Unmarshal(o.rawData, &raw) + if err != nil { + // fatal: json parsing failed + return + } + for k, v := range raw { + items, err := listOrString(v) + if len(items) == 0 || err != nil { + // 
skip: an entry failed to parse + continue + } + m.Set(k, items[0]) + } + // move the old mtime to an another key + if v, ok := m["mtime"]; ok { + m["rclone-ia-mtime"] = v + } + // overwrite with a correct mtime + m["mtime"] = o.modTime.Format(time.RFC3339Nano) + return +} + func (f *Fs) shouldRetry(resp *http.Response, err error) (bool, error) { if resp != nil { for _, e := range retryErrorCodes { @@ -788,7 +930,7 @@ func (o *Object) split() (bucket, bucketPath string) { return o.fs.split(o.remote) } -func (f *Fs) requestMetadata(ctx context.Context, bucket string) (result MetadataResponse, err error) { +func (f *Fs) requestMetadata(ctx context.Context, bucket string) (result *MetadataResponse, err error) { var resp *http.Response // make a GET request to (frontend)/metadata/:item/ opts := rest.Opts{ @@ -796,12 +938,15 @@ func (f *Fs) requestMetadata(ctx context.Context, bucket string) (result Metadat Path: path.Join("/metadata/", bucket), } + var temp MetadataResponseRaw err = f.pacer.Call(func() (bool, error) { - resp, err = f.front.CallJSON(ctx, &opts, nil, &result) + resp, err = f.front.CallJSON(ctx, &opts, nil, &temp) return f.shouldRetry(resp, err) }) - - return result, err + if err != nil { + return + } + return temp.unraw() } // list up all files/directories without any filters @@ -998,6 +1143,7 @@ func makeValidObject(f *Fs, remote string, file IAFile, mtime time.Time, size in md5: file.Md5, crc32: file.Crc32, sha1: file.Sha1, + rawData: file.rawData, } } @@ -1045,6 +1191,23 @@ func (file IAFile) parseMtime() (mtime time.Time) { return mtime } +func (mrr *MetadataResponseRaw) unraw() (_ *MetadataResponse, err error) { + var files []IAFile + for _, raw := range mrr.Files { + var parsed IAFile + err = json.Unmarshal(raw, &parsed) + if err != nil { + return nil, err + } + parsed.rawData = raw + files = append(files, parsed) + } + return &MetadataResponse{ + Files: files, + ItemSize: mrr.ItemSize, + }, nil +} + func compareSize(a, b int64) bool { if a < 0 || b 
< 0 { // we won't compare if any of them is not known @@ -1106,4 +1269,5 @@ var ( _ fs.PublicLinker = &Fs{} _ fs.Abouter = &Fs{} _ fs.Object = &Object{} + _ fs.Metadataer = &Object{} ) diff --git a/docs/content/internetarchive.md b/docs/content/internetarchive.md index d0e937fdf..622db4d60 100644 --- a/docs/content/internetarchive.md +++ b/docs/content/internetarchive.md @@ -38,6 +38,33 @@ You can optionally wait for the server's processing to finish, by setting non-ze By making it wait, rclone can do normal file comparison. Make sure to set a large enough value (e.g. `30m0s` for smaller files) as it can take a long time depending on server's queue. +## About metadata +This backend supports setting, updating and reading metadata of each file. +The metadata will appear as file metadata on Internet Archive. +However, some keys are reserved, either by Internet Archive or by rclone. + +The following are reserved by Internet Archive: +- `name` +- `source` +- `size` +- `md5` +- `crc32` +- `sha1` +- `format` +- `old_version` +- `viruscheck` + +Attempts to set values for these keys are ignored with a warning. +Setting `mtime` is the only exception: it behaves identically to setting ModTime. + +rclone reserves all the keys starting with `rclone-`. Setting values for these keys produces a warning, but the values are still set as requested. + +If there are multiple values for a key, only the first one is returned. +This is a limitation of rclone, which supports only one value per key. +Multiple values can appear after a server-side copy. + +Reading metadata also returns custom keys (those that are neither standard nor reserved). + ## Configuration Here is an example of making an internetarchive configuration. diff --git a/docs/content/overview.md b/docs/content/overview.md index 3f298c1ba..2a9301acf 100644 --- a/docs/content/overview.md +++ b/docs/content/overview.md @@ -33,7 +33,7 @@ Here is an overview of the major features of each cloud storage system. 
| HiDrive | HiDrive ¹² | R/W | No | No | - | - | | HTTP | - | R | No | No | R | - | | Hubic | MD5 | R/W | No | No | R/W | - | -| Internet Archive | MD5, SHA1, CRC32 | R/W ¹¹ | No | No | - | - | +| Internet Archive | MD5, SHA1, CRC32 | R/W ¹¹ | No | No | - | RWU | | Jottacloud | MD5 | R/W | Yes | No | R | - | | Koofr | MD5 | - | Yes | No | - | - | | Mail.ru Cloud | Mailru ⁶ | R/W | Yes | No | - | - |