HDFS (Hadoop Distributed File System) implementation - #42

This includes an HDFS docker image to use with the integration tests.

Co-authored-by: Ivan Andreev <ivandeex@gmail.com>
Co-authored-by: Nick Craig-Wood <nick@craig-wood.com>
This commit is contained in:
Yury Stankevich 2020-09-28 20:29:44 +03:00 committed by Nick Craig-Wood
parent 768e4c4735
commit 71edc75ca6
26 changed files with 906 additions and 0 deletions

257
backend/hdfs/fs.go Normal file
View file

@ -0,0 +1,257 @@
// +build !plan9
package hdfs
import (
"context"
"fmt"
"io"
"os"
"path"
"time"
"github.com/colinmarc/hdfs/v2"
"github.com/rclone/rclone/fs"
"github.com/rclone/rclone/fs/config/configmap"
"github.com/rclone/rclone/fs/config/configstruct"
"github.com/rclone/rclone/fs/hash"
)
// Fs represents a HDFS server
type Fs struct {
name string
root string
features *fs.Features // optional features
opt Options // options for this backend
ci *fs.ConfigInfo // global config
client *hdfs.Client
}
// NewFs constructs an Fs from the path
func NewFs(ctx context.Context, name, root string, m configmap.Mapper) (fs.Fs, error) {
opt := new(Options)
err := configstruct.Set(m, opt)
if err != nil {
return nil, err
}
client, err := hdfs.NewClient(hdfs.ClientOptions{
Addresses: []string{opt.Namenode},
User: opt.Username,
UseDatanodeHostname: false,
})
if err != nil {
return nil, err
}
f := &Fs{
name: name,
root: root,
opt: *opt,
ci: fs.GetConfig(ctx),
client: client,
}
f.features = (&fs.Features{
CanHaveEmptyDirectories: true,
}).Fill(ctx, f)
info, err := f.client.Stat(f.realpath(""))
if err == nil && !info.IsDir() {
f.root = path.Dir(f.root)
return f, fs.ErrorIsFile
}
return f, nil
}
// Name of this fs
func (f *Fs) Name() string {
return f.name
}
// Root of the remote (as passed into NewFs)
func (f *Fs) Root() string {
return f.root
}
// String returns a description of the FS
func (f *Fs) String() string {
return fmt.Sprintf("hdfs://%s", f.opt.Namenode)
}
// Features returns the optional features of this Fs
func (f *Fs) Features() *fs.Features {
return f.features
}
// Precision return the precision of this Fs
func (f *Fs) Precision() time.Duration {
return time.Second
}
// Hashes are not supported
func (f *Fs) Hashes() hash.Set {
return hash.Set(hash.None)
}
// NewObject finds file at remote or return fs.ErrorObjectNotFound
func (f *Fs) NewObject(ctx context.Context, remote string) (fs.Object, error) {
realpath := f.realpath(remote)
fs.Debugf(f, "new [%s]", realpath)
info, err := f.ensureFile(realpath)
if err != nil {
return nil, err
}
return &Object{
fs: f,
remote: remote,
size: info.Size(),
modTime: info.ModTime(),
}, nil
}
// List the objects and directories in dir into entries.
func (f *Fs) List(ctx context.Context, dir string) (entries fs.DirEntries, err error) {
realpath := f.realpath(dir)
fs.Debugf(f, "list [%s]", realpath)
err = f.ensureDirectory(realpath)
if err != nil {
return nil, err
}
list, err := f.client.ReadDir(realpath)
if err != nil {
return nil, err
}
for _, x := range list {
stdName := f.opt.Enc.ToStandardName(x.Name())
remote := path.Join(dir, stdName)
if x.IsDir() {
entries = append(entries, fs.NewDir(remote, x.ModTime()))
} else {
entries = append(entries, &Object{
fs: f,
remote: remote,
size: x.Size(),
modTime: x.ModTime()})
}
}
return entries, nil
}
// Put the object
func (f *Fs) Put(ctx context.Context, in io.Reader, src fs.ObjectInfo, options ...fs.OpenOption) (fs.Object, error) {
o := &Object{
fs: f,
remote: src.Remote(),
}
err := o.Update(ctx, in, src, options...)
return o, err
}
// PutStream uploads to the remote path with the modTime given of indeterminate size
func (f *Fs) PutStream(ctx context.Context, in io.Reader, src fs.ObjectInfo, options ...fs.OpenOption) (fs.Object, error) {
return f.Put(ctx, in, src, options...)
}
// Mkdir makes a directory
func (f *Fs) Mkdir(ctx context.Context, dir string) error {
fs.Debugf(f, "mkdir [%s]", f.realpath(dir))
return f.client.MkdirAll(f.realpath(dir), 0755)
}
// Rmdir deletes the directory
func (f *Fs) Rmdir(ctx context.Context, dir string) error {
realpath := f.realpath(dir)
fs.Debugf(f, "rmdir [%s]", realpath)
err := f.ensureDirectory(realpath)
if err != nil {
return err
}
// do not remove empty directory
list, err := f.client.ReadDir(realpath)
if err != nil {
return err
}
if len(list) > 0 {
return fs.ErrorDirectoryNotEmpty
}
return f.client.Remove(realpath)
}
// Purge deletes all the files in the directory
func (f *Fs) Purge(ctx context.Context, dir string) error {
realpath := f.realpath(dir)
fs.Debugf(f, "purge [%s]", realpath)
err := f.ensureDirectory(realpath)
if err != nil {
return err
}
return f.client.RemoveAll(realpath)
}
// About gets quota information from the Fs
func (f *Fs) About(ctx context.Context) (*fs.Usage, error) {
info, err := f.client.StatFs()
if err != nil {
return nil, err
}
return &fs.Usage{
Total: fs.NewUsageValue(int64(info.Capacity)),
Used: fs.NewUsageValue(int64(info.Used)),
Free: fs.NewUsageValue(int64(info.Remaining)),
}, nil
}
func (f *Fs) ensureDirectory(realpath string) error {
info, err := f.client.Stat(realpath)
if e, ok := err.(*os.PathError); ok && e.Err == os.ErrNotExist {
return fs.ErrorDirNotFound
}
if err != nil {
return err
}
if !info.IsDir() {
return fs.ErrorDirNotFound
}
return nil
}
func (f *Fs) ensureFile(realpath string) (os.FileInfo, error) {
info, err := f.client.Stat(realpath)
if e, ok := err.(*os.PathError); ok && e.Err == os.ErrNotExist {
return nil, fs.ErrorObjectNotFound
}
if err != nil {
return nil, err
}
if info.IsDir() {
return nil, fs.ErrorObjectNotFound
}
return info, nil
}
func (f *Fs) realpath(dir string) string {
return f.opt.Enc.FromStandardPath(xPath(f.Root(), dir))
}
// Check the interfaces are satisfied
var (
_ fs.Fs = (*Fs)(nil)
_ fs.Purger = (*Fs)(nil)
_ fs.PutStreamer = (*Fs)(nil)
_ fs.Abouter = (*Fs)(nil)
)

58
backend/hdfs/hdfs.go Normal file
View file

@ -0,0 +1,58 @@
// +build !plan9
package hdfs
import (
"path"
"strings"
"github.com/rclone/rclone/fs"
"github.com/rclone/rclone/fs/config"
"github.com/rclone/rclone/lib/encoder"
)
func init() {
fsi := &fs.RegInfo{
Name: "hdfs",
Description: "Hadoop distributed file system",
NewFs: NewFs,
Options: []fs.Option{{
Name: "namenode",
Help: "hadoop name node and port",
Required: true,
Examples: []fs.OptionExample{{
Value: "namenode:8020",
Help: "Connect to host namenode at port 8020",
}},
}, {
Name: "username",
Help: "hadoop user name",
Required: false,
Examples: []fs.OptionExample{{
Value: "root",
Help: "Connect to hdfs as root",
}},
}, {
Name: config.ConfigEncoding,
Help: config.ConfigEncodingHelp,
Advanced: true,
Default: (encoder.Display | encoder.EncodeInvalidUtf8 | encoder.EncodeColon),
}},
}
fs.Register(fsi)
}
// Options for this backend
type Options struct {
Namenode string `config:"namenode"`
Username string `config:"username"`
Enc encoder.MultiEncoder `config:"encoding"`
}
// xPath make correct file path with leading '/'
func xPath(root string, tail string) string {
if !strings.HasPrefix(root, "/") {
root = "/" + root
}
return path.Join(root, tail)
}

20
backend/hdfs/hdfs_test.go Normal file
View file

@ -0,0 +1,20 @@
// Test HDFS filesystem interface
// +build !plan9
package hdfs_test
import (
"testing"
"github.com/rclone/rclone/backend/hdfs"
"github.com/rclone/rclone/fstest/fstests"
)
// TestIntegration runs integration tests against the remote
func TestIntegration(t *testing.T) {
fstests.Run(t, &fstests.Opt{
RemoteName: "TestHdfs:",
NilObject: (*hdfs.Object)(nil),
})
}

View file

@ -0,0 +1,6 @@
// Build for hdfs for unsupported platforms to stop go complaining
// about "no buildable Go source files "
// +build plan9
package hdfs

177
backend/hdfs/object.go Normal file
View file

@ -0,0 +1,177 @@
// +build !plan9
package hdfs
import (
"context"
"io"
"path"
"time"
"github.com/rclone/rclone/fs"
"github.com/rclone/rclone/fs/hash"
"github.com/rclone/rclone/lib/readers"
)
// Object describes an HDFS file
type Object struct {
fs *Fs
remote string
size int64
modTime time.Time
}
// Fs returns the parent Fs
func (o *Object) Fs() fs.Info {
return o.fs
}
// Remote returns the remote path
func (o *Object) Remote() string {
return o.remote
}
// Size returns the size of an object in bytes
func (o *Object) Size() int64 {
return o.size
}
// ModTime returns the modification time of the object
func (o *Object) ModTime(ctx context.Context) time.Time {
return o.modTime
}
// SetModTime sets the modification time of the local fs object
func (o *Object) SetModTime(ctx context.Context, modTime time.Time) error {
realpath := o.fs.realpath(o.Remote())
err := o.fs.client.Chtimes(realpath, modTime, modTime)
if err != nil {
return err
}
o.modTime = modTime
return nil
}
// Storable returns whether this object is storable
func (o *Object) Storable() bool {
return true
}
// Return a string version
func (o *Object) String() string {
if o == nil {
return "<nil>"
}
return o.Remote()
}
// Hash is not supported
func (o *Object) Hash(ctx context.Context, r hash.Type) (string, error) {
return "", hash.ErrUnsupported
}
// Open an object for read
func (o *Object) Open(ctx context.Context, options ...fs.OpenOption) (in io.ReadCloser, err error) {
realpath := o.realpath()
fs.Debugf(o.fs, "open [%s]", realpath)
f, err := o.fs.client.Open(realpath)
if err != nil {
return nil, err
}
var offset, limit int64 = 0, -1
for _, option := range options {
switch x := option.(type) {
case *fs.SeekOption:
offset = x.Offset
case *fs.RangeOption:
offset, limit = x.Decode(o.Size())
}
}
_, err = f.Seek(offset, io.SeekStart)
if err != nil {
return nil, err
}
if limit != -1 {
in = readers.NewLimitedReadCloser(f, limit)
} else {
in = f
}
return in, err
}
// Update object
func (o *Object) Update(ctx context.Context, in io.Reader, src fs.ObjectInfo, options ...fs.OpenOption) error {
realpath := o.fs.realpath(src.Remote())
dirname := path.Dir(realpath)
fs.Debugf(o.fs, "update [%s]", realpath)
err := o.fs.client.MkdirAll(dirname, 755)
if err != nil {
return err
}
info, err := o.fs.client.Stat(realpath)
if err == nil {
err = o.fs.client.Remove(realpath)
if err != nil {
return err
}
}
out, err := o.fs.client.Create(realpath)
if err != nil {
return err
}
cleanup := func() {
rerr := o.fs.client.Remove(realpath)
if rerr != nil {
fs.Errorf(o.fs, "failed to remove [%v]: %v", realpath, rerr)
}
}
_, err = io.Copy(out, in)
if err != nil {
cleanup()
return err
}
err = out.Close()
if err != nil {
cleanup()
return err
}
info, err = o.fs.client.Stat(realpath)
if err != nil {
return err
}
err = o.SetModTime(ctx, src.ModTime(ctx))
if err != nil {
return err
}
o.size = info.Size()
return nil
}
// Remove an object
func (o *Object) Remove(ctx context.Context) error {
realpath := o.fs.realpath(o.remote)
fs.Debugf(o.fs, "remove [%s]", realpath)
return o.fs.client.Remove(realpath)
}
func (o *Object) realpath() string {
return o.fs.opt.Enc.FromStandardPath(xPath(o.Fs().Root(), o.remote))
}
// Check the interfaces are satisfied
var (
_ fs.Object = (*Object)(nil)
)