From bc279ec4b07332c2b103221640e730f0b10f0ca6 Mon Sep 17 00:00:00 2001 From: Nick Craig-Wood Date: Sun, 12 Jul 2020 16:52:33 +0100 Subject: [PATCH] zip: backend to read or create zip files FIXME WIP Use as `:zip:remote:path/to/file.zip` for reading or writing. - reading zip files works - can mount zip files - writing works - unknowns in writing like end? - lots of bodges --- backend/all/all.go | 1 + backend/zip/zip.go | 572 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 573 insertions(+) create mode 100644 backend/zip/zip.go diff --git a/backend/all/all.go b/backend/all/all.go index 107975cd6..b0451eb51 100644 --- a/backend/all/all.go +++ b/backend/all/all.go @@ -54,5 +54,6 @@ import ( _ "github.com/rclone/rclone/backend/uptobox" _ "github.com/rclone/rclone/backend/webdav" _ "github.com/rclone/rclone/backend/yandex" + _ "github.com/rclone/rclone/backend/zip" _ "github.com/rclone/rclone/backend/zoho" ) diff --git a/backend/zip/zip.go b/backend/zip/zip.go new file mode 100644 index 000000000..ef540bf65 --- /dev/null +++ b/backend/zip/zip.go @@ -0,0 +1,572 @@ +// Package zip provides wrappers for Fs and Object which read and +// write a zip file. +// +// Use like :zip:remote:path/to/file.zip +package zip + +/* +FIXME could maybe make an append to zip file with specially named zip +files having more info in? + +So have base zip file file.zip then file.extra000.zip etc. + +We read the objects sequentially but can transfer them in parallel. + +FIXME what happens when we want to copy a file out of the zip? + +So rclone copy remote:file.zip/file/inside.zip? +Maybe rclone copy remote:file.zip#file/inside.zip? +Or just look for first .zip file? + +FIXME what happens when writing - need to know the end... Don't have a +backend Shutdown call. + +Could have a read only Feature flag so we turn that on when we have a +zip. Then if placed in dest position could error. Might be useful for +http also. + +FIXME this will perform poorly for unpacking as the VFS Reader is bad +at multiple streams. + +FIXME not writing directories + +cf zipinfo +drwxr-xr-x 3.0 unx 0 bx stor 19-Oct-05 12:14 rclone-v1.49.5-linux-amd64/ + +FIXME can probably check directories better than trailing / + +FIXME enormous atexit bodge + +FIXME crc32 skip on write bodge + +Bodge bodge bodge + +*/ + +import ( + "archive/zip" + "context" + "errors" + "fmt" + "io" + "io/ioutil" + "os" + "path" + "strings" + "sync" + "time" + + "github.com/rclone/rclone/fs" + "github.com/rclone/rclone/fs/cache" + "github.com/rclone/rclone/fs/config/configmap" + "github.com/rclone/rclone/fs/config/configstruct" + "github.com/rclone/rclone/fs/dirtree" + "github.com/rclone/rclone/fs/fspath" + "github.com/rclone/rclone/fs/hash" + "github.com/rclone/rclone/fs/log" + "github.com/rclone/rclone/lib/atexit" + "github.com/rclone/rclone/lib/readers" + "github.com/rclone/rclone/vfs" +) + +// Globals +// Register with Fs +func init() { + fs.Register(&fs.RegInfo{ + Name: "zip", + Description: "Read or write a zip file", + NewFs: NewFs, + Options: []fs.Option{}, + }) +} + +// NewFs constructs an Fs from the path, container:path +func NewFs(ctx context.Context, name, root string, m configmap.Mapper) (fs.Fs, error) { + // Parse config into Options struct + opt := new(Options) + err := configstruct.Set(m, opt) + if err != nil { + return nil, err + } + + // Parse the root which should be remote:path/to/file.zip + parent, leaf, err := fspath.Split(root) + if err != nil { + return nil, fmt.Errorf("failed to parse root %q: %w", root, err) + } + if leaf == "" { + return nil, fmt.Errorf("failed to parse root %q: not pointing to a file", root) + } + root = strings.TrimRight(parent, "/") + + // Create the remote from the cache + wrappedFs, err := cache.Get(ctx, root) + if err != nil { + return nil, fmt.Errorf("failed to make %q to wrap: %w", root, err) + } + // FIXME vfs cache? + // FIXME could factor out ReadFileHandle and just use that rather than the full VFS + VFS := vfs.New(wrappedFs, nil) + if err != nil { + return nil, fmt.Errorf("failed to create VFS for %q: %w", root, err) + } + node, err := VFS.Stat(leaf) + if err != nil && !os.IsNotExist(err) { + return nil, fmt.Errorf("failed to find %q object: %w", leaf, err) + } + if node != nil && node.IsDir() { + return nil, fmt.Errorf("can't unzip a directory %q", leaf) + } + f := &Fs{ + f: wrappedFs, + name: name, + root: root, + opt: *opt, + vfs: VFS, + node: node, + leaf: leaf, + } + // FIXME Maybe delay this until first read size if writing we don't need it? + if node != nil { + err = f.readZip() + if err != nil { + return nil, fmt.Errorf("failed to open zip file: %w", err) + } + } + + // FIXME + // the features here are ones we could support, and they are + // ANDed with the ones from wrappedFs + // + // FIXME some of these need to be forced on - CanHaveEmptyDirectories + f.features = (&fs.Features{ + CaseInsensitive: false, + DuplicateFiles: false, + ReadMimeType: false, // MimeTypes not supported with gzip + WriteMimeType: false, + BucketBased: false, + CanHaveEmptyDirectories: true, + }).Fill(ctx, f).Mask(ctx, wrappedFs).WrapsFs(f, wrappedFs) + + return f, nil +} + +// Options defines the configuration for this backend +type Options struct { +} + +// Fs represents a wrapped fs.Fs +type Fs struct { + f fs.Fs + wrapper fs.Fs + name string + root string + opt Options + features *fs.Features // optional features + vfs *vfs.VFS + node vfs.Node // zip file object - set if reading + leaf string // leaf name of the zip file object + dt dirtree.DirTree // read from zipfile + + wrmu sync.Mutex // writing mutex protects the below + wh vfs.Handle // write handle + zw *zip.Writer +} + +// Name of the remote (as passed into NewFs) +func (f *Fs) Name() string { + return f.name +} + +// Root of the remote (as passed into NewFs) +func (f *Fs) Root() string { + return f.root +} + +// Features returns the optional features of this Fs +func (f *Fs) Features() *fs.Features { + return f.features +} + +// String returns a description of the FS +func (f *Fs) String() string { + return fmt.Sprintf("Zip '%s:%s'", f.name, f.root) +} + +// readZip the zip file into f +func (f *Fs) readZip() (err error) { + if f.node == nil { + return fs.ErrorDirNotFound + } + size := f.node.Size() + if size < 0 { + return errors.New("can't read from zip file with unknown size") + } + r, err := f.node.Open(os.O_RDONLY) + if err != nil { + return fmt.Errorf("failed to open zip file: %w", err) + } + zr, err := zip.NewReader(r, size) + if err != nil { + return fmt.Errorf("failed to read zip file: %w", err) + } + dt := dirtree.New() + for _, file := range zr.File { + remote := strings.Trim(path.Clean(file.Name), "/") + if remote == "." { + remote = "" + } + if strings.HasSuffix(file.Name, "/") { + dir := fs.NewDir(remote, file.Modified) + dt.AddDir(dir) + } else { + o := &Object{ + f: f, + remote: remote, + fh: &file.FileHeader, + file: file, + } + dt.Add(o) + } + } + dt.CheckParents("") + dt.Sort() + f.dt = dt + fs.Debugf(nil, "dt = %#v", dt) + return nil +} + +// List the objects and directories in dir into entries. The +// entries can be returned in any order but should be for a +// complete directory. +// +// dir should be "" to list the root, and should not have +// trailing slashes. +// +// This should return ErrDirNotFound if the directory isn't +// found. +func (f *Fs) List(ctx context.Context, dir string) (entries fs.DirEntries, err error) { + defer log.Trace(f, "dir=%q", dir)("entries=%v, err=%v", &entries, &err) + entries, ok := f.dt[dir] + if !ok { + return nil, fs.ErrorDirNotFound + } + fs.Debugf(f, "dir=%q, entries=%v", dir, entries) + return entries, nil +} + +// ListR lists the objects and directories of the Fs starting +// from dir recursively into out. +// +// dir should be "" to start from the root, and should not +// have trailing slashes. +// +// This should return ErrDirNotFound if the directory isn't +// found. +// +// It should call callback for each tranche of entries read. +// These need not be returned in any particular order. If +// callback returns an error then the listing will stop +// immediately. +// +// Don't implement this unless you have a more efficient way +// of listing recursively that doing a directory traversal. +func (f *Fs) ListR(ctx context.Context, dir string, callback fs.ListRCallback) (err error) { + for _, entries := range f.dt { + err = callback(entries) + if err != nil { + return err + } + } + return nil +} + +// NewObject finds the Object at remote. +func (f *Fs) NewObject(ctx context.Context, remote string) (o fs.Object, err error) { + defer log.Trace(f, "remote=%q", remote)("obj=%v, err=%v", &o, &err) + if f.dt == nil { + return nil, fs.ErrorObjectNotFound + } + _, entry := f.dt.Find(remote) + if entry == nil { + return nil, fs.ErrorObjectNotFound + } + o, ok := entry.(*Object) + if !ok { + return nil, fs.ErrorNotAFile + } + return o, nil +} + +// Precision of the ModTimes in this Fs +func (f *Fs) Precision() time.Duration { + return time.Second +} + +// Mkdir makes the directory (container, bucket) +// +// Shouldn't return an error if it already exists +func (f *Fs) Mkdir(ctx context.Context, dir string) error { + // FIXME should this create a "directory" entry if not already created? + return nil +} + +// Rmdir removes the directory (container, bucket) if empty +// +// Return an error if it doesn't exist or isn't empty +func (f *Fs) Rmdir(ctx context.Context, dir string) error { + return errors.New("can't remove directories from zip file") +} + +// Wrapper for src to make it into os.FileInfo +type fileInfo struct { + src fs.ObjectInfo +} + +// Name - base name of the file (actually full name for zip) +func (fi fileInfo) Name() string { + return fi.src.Remote() +} + +// Size - length in bytes for regular files; system-dependent for others +func (fi fileInfo) Size() int64 { + return fi.src.Size() +} + +// Mode - file mode bits +func (fi fileInfo) Mode() os.FileMode { + return 0777 +} + +// ModTime - modification time +func (fi fileInfo) ModTime() time.Time { + return fi.src.ModTime(context.Background()) +} + +// IsDir - abbreviation for Mode().IsDir() +func (fi fileInfo) IsDir() bool { + return false +} + +// Sys - underlying data source (can return nil) +func (fi fileInfo) Sys() interface{} { + return nil +} + +// check type +var _ os.FileInfo = fileInfo{} + +// Put in to the remote path with the modTime given of the given size +// +// May create the object even if it returns an error - if so +// will return the object and the error, otherwise will return +// nil and the error +func (f *Fs) Put(ctx context.Context, in io.Reader, src fs.ObjectInfo, options ...fs.OpenOption) (o fs.Object, err error) { + defer log.Trace(f, "src=%v", src)("obj=%v, err=%v", &o, &err) + f.wrmu.Lock() + defer f.wrmu.Unlock() + + size := src.Size() + if size < 0 { + return nil, errors.New("can't zip unknown sized objects") + } + + if f.zw == nil { + wh, err := f.vfs.OpenFile(f.leaf, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0777) + if err != nil { + return nil, fmt.Errorf("zip put: failed to open output file: %w", err) + } + f.wh = wh + f.zw = zip.NewWriter(f.wh) + if err != nil { + return nil, fmt.Errorf("zip put: failed to create zip writer: %w", err) + } + + // FIXME enormous bodge to work around no lifecycle for backends + atexit.Register(func() { + err = f.zw.Close() + if err != nil { + fs.Errorf(f, "Error closing zip writer: %v", err) + } + err = f.wh.Close() + if err != nil { + fs.Errorf(f, "Error closing zip file: %v", err) + } + }) + } + + // FIXME not putting in directories? + + fh, err := zip.FileInfoHeader(fileInfo{src}) + if err != nil { + return nil, fmt.Errorf("zip put: failed to create file header: %w", err) + } + w, err := f.zw.CreateHeader(fh) + if err != nil { + return nil, fmt.Errorf("zip put: failed to open file: %w", err) + } + // FIXME just a guess! + // should probably try a better heuristic? + if size < 10 { + fh.Method = zip.Store + } else { + fh.Method = zip.Deflate + } + _, err = io.CopyN(w, in, size) + if err != nil { + return nil, fmt.Errorf("zip put: failed to copy file: %w", err) + } + + // err = w.Close() + // if err != nil { + // return nil,fmt.Errorf("zip put: failed to close file: %w", err) + // } + + o = &Object{ + f: f, + fh: fh, + remote: src.Remote(), + } + + return o, nil + +} + +// Hashes returns the supported hash sets. +func (f *Fs) Hashes() hash.Set { + return hash.Set(hash.CRC32) +} + +// UnWrap returns the Fs that this Fs is wrapping +func (f *Fs) UnWrap() fs.Fs { + return f.f +} + +// WrapFs returns the Fs that is wrapping this Fs +func (f *Fs) WrapFs() fs.Fs { + return f.wrapper +} + +// SetWrapper sets the Fs that is wrapping this Fs +func (f *Fs) SetWrapper(wrapper fs.Fs) { + f.wrapper = wrapper +} + +// Object describes an object to be read from the raw zip file +type Object struct { + f *Fs + remote string + fh *zip.FileHeader + file *zip.File +} + +// Fs returns read only access to the Fs that this object is part of +func (o *Object) Fs() fs.Info { + return o.f +} + +// Return a string version +func (o *Object) String() string { + if o == nil { + return "" + } + return o.Remote() +} + +// Remote returns the remote path +func (o *Object) Remote() string { + return o.remote +} + +// Size returns the size of the file +func (o *Object) Size() int64 { + return int64(o.fh.UncompressedSize64) +} + +// ModTime returns the modification time of the object +// +// It attempts to read the objects mtime and if that isn't present the +// LastModified returned in the http headers +func (o *Object) ModTime(ctx context.Context) time.Time { + return o.fh.Modified +} + +// SetModTime sets the modification time of the local fs object +func (o *Object) SetModTime(ctx context.Context, modTime time.Time) error { + fs.Debugf(o, "Can't set mod time - ignoring") + return nil +} + +// Storable raturns a boolean indicating if this object is storable +func (o *Object) Storable() bool { + return true +} + +// Hash returns the selected checksum of the file +// If no checksum is available it returns "" +func (o *Object) Hash(ctx context.Context, ht hash.Type) (string, error) { + if ht == hash.CRC32 { + // FIXME return empty CRC if writing + if o.f.dt == nil { + return "", nil + } + return fmt.Sprintf("%08x", o.fh.CRC32), nil + } + return "", hash.ErrUnsupported +} + +// Open opens the file for read. Call Close() on the returned io.ReadCloser +func (o *Object) Open(ctx context.Context, options ...fs.OpenOption) (rc io.ReadCloser, err error) { + var offset, limit int64 = 0, -1 + for _, option := range options { + switch x := option.(type) { + case *fs.SeekOption: + offset = x.Offset + case *fs.RangeOption: + offset, limit = x.Decode(o.Size()) + default: + if option.Mandatory() { + fs.Logf(o, "Unsupported mandatory option: %v", option) + } + } + } + + rc, err = o.file.Open() + if err != nil { + return nil, err + } + + // discard data from start as necessary + if offset > 0 { + _, err = io.CopyN(ioutil.Discard, rc, offset) + if err != nil { + return nil, err + } + } + // If limited then don't return everything + if limit >= 0 { + return readers.NewLimitedReadCloser(rc, limit-offset), nil + } + + return rc, nil +} + +// Update in to the object with the modTime given of the given size +func (o *Object) Update(ctx context.Context, in io.Reader, src fs.ObjectInfo, options ...fs.OpenOption) error { + return errors.New("can't Update zip files") +} + +// Remove an object +func (o *Object) Remove(ctx context.Context) error { + return errors.New("can't Remove zip file objects") +} + +// Check the interfaces are satisfied +var ( + _ fs.Fs = (*Fs)(nil) + _ fs.UnWrapper = (*Fs)(nil) + _ fs.ListRer = (*Fs)(nil) + // _ fs.Abouter = (*Fs)(nil) - FIXME can implemnet + _ fs.Wrapper = (*Fs)(nil) + _ fs.Object = (*Object)(nil) +)