// Package http provides a filesystem interface using the standard
// library net/http client.
//
// It treats HTML pages served from the endpoint as directory
// listings, and includes any links found as files.
package http

import (
	"context"
	"io"
	"mime"
	"net/http"
	"net/url"
	"path"
	"strconv"
	"strings"
	"sync"
	"time"

	"github.com/pkg/errors"
	"github.com/rclone/rclone/fs"
	"github.com/rclone/rclone/fs/config/configmap"
	"github.com/rclone/rclone/fs/config/configstruct"
	"github.com/rclone/rclone/fs/fshttp"
	"github.com/rclone/rclone/fs/hash"
	"github.com/rclone/rclone/lib/rest"
	"golang.org/x/net/html"
)

var (
	// errorReadOnly is returned by every operation which would modify
	// the remote (Put, Mkdir, Remove, ...).
	errorReadOnly = errors.New("http remotes are read only")
	// timeUnset is the modification time used when the real one is not
	// known, e.g. for directories or a missing Last-Modified header.
	timeUnset = time.Unix(0, 0)
)

// init registers the "http" backend and its configuration options
// with the fs package.
func init() {
	fsi := &fs.RegInfo{
		Name:        "http",
		Description: "http Connection",
		NewFs:       NewFs,
		Options: []fs.Option{{
			Name:     "url",
			Help:     "URL of http host to connect to.\n\nE.g. \"https://example.com\", or \"https://user:pass@example.com\" to use a username and password.",
			Required: true,
		}, {
			Name: "headers",
			Help: `Set HTTP headers for all transactions. Use this to set additional HTTP headers for all transactions. The input format is comma separated list of key,value pairs. Standard [CSV encoding](https://godoc.org/encoding/csv) may be used. For example, to set a Cookie use 'Cookie,name=value', or '"Cookie","name=value"'. You can set multiple headers, e.g. '"Cookie","name=value","Authorization","xxx"'. `,
			Default:  fs.CommaSepList{},
			Advanced: true,
		}, {
			Name: "no_slash",
			Help: `Set this if the site doesn't end directories with /. Use this if your target website does not use / on the end of directories. A / on the end of a path is how rclone normally tells the difference between files and directories. If this flag is set, then rclone will treat all files with Content-Type: text/html as directories and read URLs from them rather than downloading them. Note that this may cause rclone to confuse genuine HTML files with directories.`,
			Default:  false,
			Advanced: true,
		}, {
			Name: "no_head",
			Help: `Don't use HEAD requests to find file sizes in dir listing. If your site is being very slow to load then you can try this option. Normally rclone does a HEAD request for each potential file in a directory listing to: - find its size - check it really exists - check to see if it is a directory If you set this option, rclone will not do the HEAD request. This will mean - directory listings are much quicker - rclone won't have the times or sizes of any files - some files that don't exist may be in the listing `,
			Default:  false,
			Advanced: true,
		}},
	}
	fs.Register(fsi)
}

// Options defines the configuration for this backend
type Options struct {
	Endpoint string          `config:"url"`      // base URL of the remote; NewFs appends a trailing / if missing
	NoSlash  bool            `config:"no_slash"` // treat text/html responses as directories when stat'ing
	NoHead   bool            `config:"no_head"`  // skip the HEAD request when stat'ing objects
	Headers  fs.CommaSepList `config:"headers"`  // flat list of key,value pairs added to every request
}

// Fs stores the interface to the remote HTTP files
type Fs struct {
	name        string         // name of the remote as passed to NewFs
	root        string         // root path as passed to NewFs
	features    *fs.Features   // optional features
	opt         Options        // options for this backend
	ci          *fs.ConfigInfo // global config
	endpoint    *url.URL       // URL that remotes are resolved against
	endpointURL string         // endpoint as a string
	httpClient  *http.Client   // client used for all requests made by this Fs
}

// Object is a remote object that has been stat'd (so it exists, but is not necessarily open for reading)
type Object struct {
	fs          *Fs       // the Fs this object belongs to
	remote      string    // path of the object relative to the Fs root
	size        int64     // size in bytes, -1 if unknown
	modTime     time.Time // modification time, timeUnset if unknown
	contentType string    // MIME type of the object, may be empty
}

// statusError returns an error if the res contained an error
//
// A non-nil err is returned unchanged. Otherwise, if the status code
// is outside the range 200-299, the response body is closed and an
// error describing the status is returned. For a 2xx response nil is
// returned and the body is left open for the caller to use and close.
func statusError(res *http.Response, err error) error {
	if err != nil {
		return err
	}
	if res.StatusCode < 200 || res.StatusCode > 299 {
		// the caller won't read the body of a failed request
		_ = res.Body.Close()
		return errors.Errorf("HTTP Error %d: %s", res.StatusCode, res.Status)
	}
	return nil
}

// NewFs creates a new Fs object from the name and root. It connects to
// the host specified in the config file.
func NewFs(ctx context.Context, name, root string, m configmap.Mapper) (fs.Fs, error) { // Parse config into Options struct opt := new(Options) err := configstruct.Set(m, opt) if err != nil { return nil, err } if len(opt.Headers)%2 != 0 { return nil, errors.New("odd number of headers supplied") } if !strings.HasSuffix(opt.Endpoint, "/") { opt.Endpoint += "/" } // Parse the endpoint and stick the root onto it base, err := url.Parse(opt.Endpoint) if err != nil { return nil, err } u, err := rest.URLJoin(base, rest.URLPathEscape(root)) if err != nil { return nil, err } client := fshttp.NewClient(ctx) var isFile = false if !strings.HasSuffix(u.String(), "/") { // Make a client which doesn't follow redirects so the server // doesn't redirect http://host/dir to http://host/dir/ noRedir := *client noRedir.CheckRedirect = func(req *http.Request, via []*http.Request) error { return http.ErrUseLastResponse } // check to see if points to a file req, err := http.NewRequestWithContext(ctx, "HEAD", u.String(), nil) if err == nil { addHeaders(req, opt) res, err := noRedir.Do(req) err = statusError(res, err) if err == nil { isFile = true } } } newRoot := u.String() if isFile { // Point to the parent if this is a file newRoot, _ = path.Split(u.String()) } else { if !strings.HasSuffix(newRoot, "/") { newRoot += "/" } } u, err = url.Parse(newRoot) if err != nil { return nil, err } ci := fs.GetConfig(ctx) f := &Fs{ name: name, root: root, opt: *opt, ci: ci, httpClient: client, endpoint: u, endpointURL: u.String(), } f.features = (&fs.Features{ CanHaveEmptyDirectories: true, }).Fill(ctx, f) if isFile { return f, fs.ErrorIsFile } if !strings.HasSuffix(f.endpointURL, "/") { return nil, errors.New("internal error: url doesn't end with /") } return f, nil } // Name returns the configured name of the file system func (f *Fs) Name() string { return f.name } // Root returns the root for the filesystem func (f *Fs) Root() string { return f.root } // String returns the URL for the filesystem 
func (f *Fs) String() string { return f.endpointURL } // Features returns the optional features of this Fs func (f *Fs) Features() *fs.Features { return f.features } // Precision is the remote http file system's modtime precision, which we have no way of knowing. We estimate at 1s func (f *Fs) Precision() time.Duration { return time.Second } // NewObject creates a new remote http file object func (f *Fs) NewObject(ctx context.Context, remote string) (fs.Object, error) { o := &Object{ fs: f, remote: remote, } err := o.stat(ctx) if err != nil { return nil, err } return o, nil } // Join's the remote onto the base URL func (f *Fs) url(remote string) string { return f.endpointURL + rest.URLPathEscape(remote) } // parse s into an int64, on failure return def func parseInt64(s string, def int64) int64 { n, e := strconv.ParseInt(s, 10, 64) if e != nil { return def } return n } // Errors returned by parseName var ( errURLJoinFailed = errors.New("URLJoin failed") errFoundQuestionMark = errors.New("found ? 
in URL") errHostMismatch = errors.New("host mismatch") errSchemeMismatch = errors.New("scheme mismatch") errNotUnderRoot = errors.New("not under root") errNameIsEmpty = errors.New("name is empty") errNameContainsSlash = errors.New("name contains /") ) // parseName turns a name as found in the page into a remote path or returns an error func parseName(base *url.URL, name string) (string, error) { // make URL absolute u, err := rest.URLJoin(base, name) if err != nil { return "", errURLJoinFailed } // check it doesn't have URL parameters uStr := u.String() if strings.Index(uStr, "?") >= 0 { return "", errFoundQuestionMark } // check that this is going back to the same host and scheme if base.Host != u.Host { return "", errHostMismatch } if base.Scheme != u.Scheme { return "", errSchemeMismatch } // check has path prefix if !strings.HasPrefix(u.Path, base.Path) { return "", errNotUnderRoot } // calculate the name relative to the base name = u.Path[len(base.Path):] // mustn't be empty if name == "" { return "", errNameIsEmpty } // mustn't contain a / - we are looking for a single level directory slash := strings.Index(name, "/") if slash >= 0 && slash != len(name)-1 { return "", errNameContainsSlash } return name, nil } // Parse turns HTML for a directory into names // base should be the base URL to resolve any relative names from func parse(base *url.URL, in io.Reader) (names []string, err error) { doc, err := html.Parse(in) if err != nil { return nil, err } var ( walk func(*html.Node) seen = make(map[string]struct{}) ) walk = func(n *html.Node) { if n.Type == html.ElementNode && n.Data == "a" { for _, a := range n.Attr { if a.Key == "href" { name, err := parseName(base, a.Val) if err == nil { if _, found := seen[name]; !found { names = append(names, name) seen[name] = struct{}{} } } break } } } for c := n.FirstChild; c != nil; c = c.NextSibling { walk(c) } } walk(doc) return names, nil } // Adds the configured headers to the request if any func addHeaders(req 
*http.Request, opt *Options) { for i := 0; i < len(opt.Headers); i += 2 { key := opt.Headers[i] value := opt.Headers[i+1] req.Header.Add(key, value) } } // Adds the configured headers to the request if any func (f *Fs) addHeaders(req *http.Request) { addHeaders(req, &f.opt) } // Read the directory passed in func (f *Fs) readDir(ctx context.Context, dir string) (names []string, err error) { URL := f.url(dir) u, err := url.Parse(URL) if err != nil { return nil, errors.Wrap(err, "failed to readDir") } if !strings.HasSuffix(URL, "/") { return nil, errors.Errorf("internal error: readDir URL %q didn't end in /", URL) } // Do the request req, err := http.NewRequestWithContext(ctx, "GET", URL, nil) if err != nil { return nil, errors.Wrap(err, "readDir failed") } f.addHeaders(req) res, err := f.httpClient.Do(req) if err == nil { defer fs.CheckClose(res.Body, &err) if res.StatusCode == http.StatusNotFound { return nil, fs.ErrorDirNotFound } } err = statusError(res, err) if err != nil { return nil, errors.Wrap(err, "failed to readDir") } contentType := strings.SplitN(res.Header.Get("Content-Type"), ";", 2)[0] switch contentType { case "text/html": names, err = parse(u, res.Body) if err != nil { return nil, errors.Wrap(err, "readDir") } default: return nil, errors.Errorf("Can't parse content type %q", contentType) } return names, nil } // List the objects and directories in dir into entries. The // entries can be returned in any order but should be for a // complete directory. // // dir should be "" to list the root, and should not have // trailing slashes. // // This should return ErrDirNotFound if the directory isn't // found. 
func (f *Fs) List(ctx context.Context, dir string) (entries fs.DirEntries, err error) { if !strings.HasSuffix(dir, "/") && dir != "" { dir += "/" } names, err := f.readDir(ctx, dir) if err != nil { return nil, errors.Wrapf(err, "error listing %q", dir) } var ( entriesMu sync.Mutex // to protect entries wg sync.WaitGroup checkers = f.ci.Checkers in = make(chan string, checkers) ) add := func(entry fs.DirEntry) { entriesMu.Lock() entries = append(entries, entry) entriesMu.Unlock() } for i := 0; i < checkers; i++ { wg.Add(1) go func() { defer wg.Done() for remote := range in { file := &Object{ fs: f, remote: remote, } switch err := file.stat(ctx); err { case nil: add(file) case fs.ErrorNotAFile: // ...found a directory not a file add(fs.NewDir(remote, timeUnset)) default: fs.Debugf(remote, "skipping because of error: %v", err) } } }() } for _, name := range names { isDir := name[len(name)-1] == '/' name = strings.TrimRight(name, "/") remote := path.Join(dir, name) if isDir { add(fs.NewDir(remote, timeUnset)) } else { in <- remote } } close(in) wg.Wait() return entries, nil } // Put in to the remote path with the modTime given of the given size // // May create the object even if it returns an error - if so // will return the object and the error, otherwise will return // nil and the error func (f *Fs) Put(ctx context.Context, in io.Reader, src fs.ObjectInfo, options ...fs.OpenOption) (fs.Object, error) { return nil, errorReadOnly } // PutStream uploads to the remote path with the modTime given of indeterminate size func (f *Fs) PutStream(ctx context.Context, in io.Reader, src fs.ObjectInfo, options ...fs.OpenOption) (fs.Object, error) { return nil, errorReadOnly } // Fs is the filesystem this remote http file object is located within func (o *Object) Fs() fs.Info { return o.fs } // String returns the URL to the remote HTTP file func (o *Object) String() string { if o == nil { return "" } return o.remote } // Remote the name of the remote HTTP file, relative to the fs 
root func (o *Object) Remote() string { return o.remote } // Hash returns "" since HTTP (in Go or OpenSSH) doesn't support remote calculation of hashes func (o *Object) Hash(ctx context.Context, r hash.Type) (string, error) { return "", hash.ErrUnsupported } // Size returns the size in bytes of the remote http file func (o *Object) Size() int64 { return o.size } // ModTime returns the modification time of the remote http file func (o *Object) ModTime(ctx context.Context) time.Time { return o.modTime } // url returns the native url of the object func (o *Object) url() string { return o.fs.url(o.remote) } // stat updates the info field in the Object func (o *Object) stat(ctx context.Context) error { if o.fs.opt.NoHead { o.size = -1 o.modTime = timeUnset o.contentType = fs.MimeType(ctx, o) return nil } url := o.url() req, err := http.NewRequestWithContext(ctx, "HEAD", url, nil) if err != nil { return errors.Wrap(err, "stat failed") } o.fs.addHeaders(req) res, err := o.fs.httpClient.Do(req) if err == nil && res.StatusCode == http.StatusNotFound { return fs.ErrorObjectNotFound } err = statusError(res, err) if err != nil { return errors.Wrap(err, "failed to stat") } t, err := http.ParseTime(res.Header.Get("Last-Modified")) if err != nil { t = timeUnset } o.size = parseInt64(res.Header.Get("Content-Length"), -1) o.modTime = t o.contentType = res.Header.Get("Content-Type") // If NoSlash is set then check ContentType to see if it is a directory if o.fs.opt.NoSlash { mediaType, _, err := mime.ParseMediaType(o.contentType) if err != nil { return errors.Wrapf(err, "failed to parse Content-Type: %q", o.contentType) } if mediaType == "text/html" { return fs.ErrorNotAFile } } return nil } // SetModTime sets the modification and access time to the specified time // // it also updates the info field func (o *Object) SetModTime(ctx context.Context, modTime time.Time) error { return errorReadOnly } // Storable returns whether the remote http file is a regular file (not a directory, 
symbolic link, block device, character device, named pipe, etc.) func (o *Object) Storable() bool { return true } // Open a remote http file object for reading. Seek is supported func (o *Object) Open(ctx context.Context, options ...fs.OpenOption) (in io.ReadCloser, err error) { url := o.url() req, err := http.NewRequestWithContext(ctx, "GET", url, nil) if err != nil { return nil, errors.Wrap(err, "Open failed") } // Add optional headers for k, v := range fs.OpenOptionHeaders(options) { req.Header.Add(k, v) } o.fs.addHeaders(req) // Do the request res, err := o.fs.httpClient.Do(req) err = statusError(res, err) if err != nil { return nil, errors.Wrap(err, "Open failed") } return res.Body, nil } // Hashes returns hash.HashNone to indicate remote hashing is unavailable func (f *Fs) Hashes() hash.Set { return hash.Set(hash.None) } // Mkdir makes the root directory of the Fs object func (f *Fs) Mkdir(ctx context.Context, dir string) error { return errorReadOnly } // Remove a remote http file object func (o *Object) Remove(ctx context.Context) error { return errorReadOnly } // Rmdir removes the root directory of the Fs object func (f *Fs) Rmdir(ctx context.Context, dir string) error { return errorReadOnly } // Update in to the object with the modTime given of the given size func (o *Object) Update(ctx context.Context, in io.Reader, src fs.ObjectInfo, options ...fs.OpenOption) error { return errorReadOnly } // MimeType of an Object if known, "" otherwise func (o *Object) MimeType(ctx context.Context) string { return o.contentType } // Check the interfaces are satisfied var ( _ fs.Fs = &Fs{} _ fs.PutStreamer = &Fs{} _ fs.Object = &Object{} _ fs.MimeTyper = &Object{} )