internetarchive: add support for Metadata

This commit is contained in:
Lesmiscore (Naoya Ozaki) 2022-07-09 07:47:50 +09:00 committed by GitHub
parent b4d847cadd
commit 42dfadfa1b
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 208 additions and 17 deletions

View file

@ -38,6 +38,84 @@ func init() {
Name: "internetarchive",
Description: "Internet Archive",
NewFs: NewFs,
MetadataInfo: &fs.MetadataInfo{
System: map[string]fs.MetadataHelp{
"name": {
Help: "Full file path, without the bucket part",
Type: "filename",
Example: "backend/internetarchive/internetarchive.go",
},
"source": {
Help: "The source of the file",
Type: "string",
Example: "original",
},
"mtime": {
Help: "Time of last modification, managed by Rclone",
Type: "RFC 3339",
Example: "2006-01-02T15:04:05.999999999Z",
},
"size": {
Help: "File size in bytes",
Type: "decimal number",
Example: "123456",
},
"md5": {
Help: "MD5 hash calculated by Internet Archive",
Type: "string",
Example: "01234567012345670123456701234567",
},
"crc32": {
Help: "CRC32 calculated by Internet Archive",
Type: "string",
Example: "01234567",
},
"sha1": {
Help: "SHA1 hash calculated by Internet Archive",
Type: "string",
Example: "0123456701234567012345670123456701234567",
},
"format": {
Help: "Name of format identified by Internet Archive",
Type: "string",
Example: "Comma-Separated Values",
},
"old_version": {
Help: "Whether the file was replaced and moved by keep-old-version flag",
Type: "boolean",
Example: "true",
},
"viruscheck": {
Help: "The last time viruscheck process was run for the file (?)",
Type: "unixtime",
Example: "1654191352",
},
"rclone-ia-mtime": {
Help: "Time of last modification, managed by Internet Archive",
Type: "RFC 3339",
Example: "2006-01-02T15:04:05.999999999Z",
},
"rclone-mtime": {
Help: "Time of last modification, managed by Rclone",
Type: "RFC 3339",
Example: "2006-01-02T15:04:05.999999999Z",
},
"rclone-update-track": {
Help: "Random value used by Rclone for tracking changes inside Internet Archive",
Type: "string",
Example: "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
},
},
Help: `Metadata fields provided by Internet Archive.
If there are multiple values for a key, only the first one is returned.
This is a limitation of Rclone, that supports one value per one key.
Owner is able to add custom keys. Metadata feature grabs all the keys including them.
`,
},
Options: []fs.Option{{
Name: "access_key_id",
Help: "IAS3 Access Key.\n\nLeave blank for anonymous access.\nYou can find one here: https://archive.org/account/s3.php",
@ -90,6 +168,14 @@ Only enable if you need to be guaranteed to be reflected after write operations.
// maximum size of an item. this is constant across all items
const iaItemMaxSize int64 = 1099511627776
// metadata keys that are not writeable
var roMetadataKey = map[string]interface{}{
// do not add mtime here, it's a documented exception
"name": nil, "source": nil, "size": nil, "md5": nil,
"crc32": nil, "sha1": nil, "format": nil, "old_version": nil,
"viruscheck": nil,
}
// Options defines the configuration for this backend
type Options struct {
AccessKeyID string `config:"access_key_id"`
@ -122,6 +208,7 @@ type Object struct {
md5 string // md5 hash of the file presented by the server
sha1 string // sha1 hash of the file presented by the server
crc32 string // crc32 of the file presented by the server
rawData json.RawMessage
}
// IAFile reprensents a subset of object in MetadataResponse.Files
@ -135,6 +222,8 @@ type IAFile struct {
Md5 string `json:"md5"`
Crc32 string `json:"crc32"`
Sha1 string `json:"sha1"`
rawData json.RawMessage
}
// MetadataResponse reprensents subset of the JSON object returned by (frontend)/metadata/
@ -143,6 +232,12 @@ type MetadataResponse struct {
ItemSize int64 `json:"item_size"`
}
// MetadataResponseRaw is the form of MetadataResponse to deal with metadata
type MetadataResponseRaw struct {
Files []json.RawMessage `json:"files"`
ItemSize int64 `json:"item_size"`
}
// ModMetadataResponse represents response for amending metadata
type ModMetadataResponse struct {
// https://archive.org/services/docs/api/md-write.html#example
@ -227,6 +322,9 @@ func NewFs(ctx context.Context, name, root string, m configmap.Mapper) (fs.Fs, e
f.setRoot(root)
f.features = (&fs.Features{
BucketBased: true,
ReadMetadata: true,
WriteMetadata: true,
UserMetadata: true,
}).Fill(ctx, f)
f.srv = rest.NewClient(fshttp.NewClient(ctx))
@ -307,18 +405,17 @@ func (o *Object) SetModTime(ctx context.Context, t time.Time) (err error) {
}
// https://archive.org/services/docs/api/md-write.html
var patch = []interface{}{
// the following code might be useful for modifying metadata of an uploaded file
patch := []map[string]string{
// we should drop it first to clear all rclone-provided mtimes
struct {
Op string `json:"op"`
Path string `json:"path"`
}{"remove", "/rclone-mtime"},
struct {
Op string `json:"op"`
Path string `json:"path"`
Value string `json:"value"`
}{"add", "/rclone-mtime", t.Format(time.RFC3339Nano)},
}
{
"op": "remove",
"path": "/rclone-mtime",
}, {
"op": "add",
"path": "/rclone-mtime",
"value": t.Format(time.RFC3339Nano),
}}
res, err := json.Marshal(patch)
if err != nil {
return err
@ -685,6 +782,23 @@ func (o *Object) Update(ctx context.Context, in io.Reader, src fs.ObjectInfo, op
headers["Content-Length"] = fmt.Sprintf("%d", size)
headers["x-archive-size-hint"] = fmt.Sprintf("%d", size)
}
var mdata fs.Metadata
mdata, err = fs.GetMetadataOptions(ctx, src, options)
if err == nil && mdata != nil {
for mk, mv := range mdata {
mk = strings.ToLower(mk)
if strings.HasPrefix(mk, "rclone-") {
fs.LogPrintf(fs.LogLevelWarning, o, "reserved metadata key %s is about to set", mk)
} else if _, ok := roMetadataKey[mk]; ok {
fs.LogPrintf(fs.LogLevelWarning, o, "setting or modifying read-only key %s is requested, skipping", mk)
continue
} else if mk == "mtime" {
// redirect to make it work
mk = "rclone-mtime"
}
headers[fmt.Sprintf("x-amz-filemeta-%s", mk)] = mv
}
}
// read the md5sum if available
var md5sumHex string
@ -762,6 +876,34 @@ func (o *Object) String() string {
return o.remote
}
// Metadata returns all file metadata provided by Internet Archive
func (o *Object) Metadata(ctx context.Context) (m fs.Metadata, err error) {
if o.rawData == nil {
return nil, nil
}
raw := make(map[string]json.RawMessage)
err = json.Unmarshal(o.rawData, &raw)
if err != nil {
// fatal: json parsing failed
return
}
for k, v := range raw {
items, err := listOrString(v)
if len(items) == 0 || err != nil {
// skip: an entry failed to parse
continue
}
m.Set(k, items[0])
}
// move the old mtime to an another key
if v, ok := m["mtime"]; ok {
m["rclone-ia-mtime"] = v
}
// overwrite with a correct mtime
m["mtime"] = o.modTime.Format(time.RFC3339Nano)
return
}
func (f *Fs) shouldRetry(resp *http.Response, err error) (bool, error) {
if resp != nil {
for _, e := range retryErrorCodes {
@ -788,7 +930,7 @@ func (o *Object) split() (bucket, bucketPath string) {
return o.fs.split(o.remote)
}
func (f *Fs) requestMetadata(ctx context.Context, bucket string) (result MetadataResponse, err error) {
func (f *Fs) requestMetadata(ctx context.Context, bucket string) (result *MetadataResponse, err error) {
var resp *http.Response
// make a GET request to (frontend)/metadata/:item/
opts := rest.Opts{
@ -796,12 +938,15 @@ func (f *Fs) requestMetadata(ctx context.Context, bucket string) (result Metadat
Path: path.Join("/metadata/", bucket),
}
var temp MetadataResponseRaw
err = f.pacer.Call(func() (bool, error) {
resp, err = f.front.CallJSON(ctx, &opts, nil, &result)
resp, err = f.front.CallJSON(ctx, &opts, nil, &temp)
return f.shouldRetry(resp, err)
})
return result, err
if err != nil {
return
}
return temp.unraw()
}
// list up all files/directories without any filters
@ -998,6 +1143,7 @@ func makeValidObject(f *Fs, remote string, file IAFile, mtime time.Time, size in
md5: file.Md5,
crc32: file.Crc32,
sha1: file.Sha1,
rawData: file.rawData,
}
}
@ -1045,6 +1191,23 @@ func (file IAFile) parseMtime() (mtime time.Time) {
return mtime
}
func (mrr *MetadataResponseRaw) unraw() (_ *MetadataResponse, err error) {
var files []IAFile
for _, raw := range mrr.Files {
var parsed IAFile
err = json.Unmarshal(raw, &parsed)
if err != nil {
return nil, err
}
parsed.rawData = raw
files = append(files, parsed)
}
return &MetadataResponse{
Files: files,
ItemSize: mrr.ItemSize,
}, nil
}
func compareSize(a, b int64) bool {
if a < 0 || b < 0 {
// we won't compare if any of them is not known
@ -1106,4 +1269,5 @@ var (
_ fs.PublicLinker = &Fs{}
_ fs.Abouter = &Fs{}
_ fs.Object = &Object{}
_ fs.Metadataer = &Object{}
)

View file

@ -38,6 +38,33 @@ You can optionally wait for the server's processing to finish, by setting non-ze
By making it wait, rclone can do normal file comparison.
Make sure to set a large enough value (e.g. `30m0s` for smaller files) as it can take a long time depending on server's queue.
## About metadata
This backend supports setting, updating and reading metadata of each file.
The metadata will appear as file metadata on Internet Archive.
However, some fields are reserved by both Internet Archive and rclone.
The following are reserved by Internet Archive:
- `name`
- `source`
- `size`
- `md5`
- `crc32`
- `sha1`
- `format`
- `old_version`
- `viruscheck`
Trying to set values to these keys is ignored with a warning.
Only setting `mtime` is an exception. Doing so make it the identical behavior as setting ModTime.
rclone reserves all the keys starting with `rclone-`. Setting value for these keys will give you warnings, but values are set according to request.
If there are multiple values for a key, only the first one is returned.
This is a limitation of rclone, that supports one value per one key.
It can be triggered when you did a server-side copy.
Reading metadata will also provide custom (non-standard nor reserved) ones.
## Configuration
Here is an example of making an internetarchive configuration.

View file

@ -33,7 +33,7 @@ Here is an overview of the major features of each cloud storage system.
| HiDrive | HiDrive ¹² | R/W | No | No | - | - |
| HTTP | - | R | No | No | R | - |
| Hubic | MD5 | R/W | No | No | R/W | - |
| Internet Archive | MD5, SHA1, CRC32 | R/W ¹¹ | No | No | - | - |
| Internet Archive | MD5, SHA1, CRC32 | R/W ¹¹ | No | No | - | RWU |
| Jottacloud | MD5 | R/W | Yes | No | R | - |
| Koofr | MD5 | - | Yes | No | - | - |
| Mail.ru Cloud | Mailru ⁶ | R/W | Yes | No | - | - |