// Package http provides a filesystem interface using golang.org/net/http // // It treats HTML pages served from the endpoint as directory // listings, and includes any links found as files. package http import ( "context" "errors" "fmt" "io" "mime" "net/http" "net/url" "path" "strconv" "strings" "sync" "time" "github.com/rclone/rclone/fs" "github.com/rclone/rclone/fs/config/configmap" "github.com/rclone/rclone/fs/config/configstruct" "github.com/rclone/rclone/fs/fshttp" "github.com/rclone/rclone/fs/hash" "github.com/rclone/rclone/lib/rest" "golang.org/x/net/html" ) var ( errorReadOnly = errors.New("http remotes are read only") timeUnset = time.Unix(0, 0) ) func init() { fsi := &fs.RegInfo{ Name: "http", Description: "HTTP", NewFs: NewFs, Options: []fs.Option{{ Name: "url", Help: "URL of HTTP host to connect to.\n\nE.g. \"https://example.com\", or \"https://user:pass@example.com\" to use a username and password.", Required: true, }, { Name: "headers", Help: `Set HTTP headers for all transactions. Use this to set additional HTTP headers for all transactions. The input format is comma separated list of key,value pairs. Standard [CSV encoding](https://godoc.org/encoding/csv) may be used. For example, to set a Cookie use 'Cookie,name=value', or '"Cookie","name=value"'. You can set multiple headers, e.g. '"Cookie","name=value","Authorization","xxx"'.`, Default: fs.CommaSepList{}, Advanced: true, }, { Name: "no_slash", Help: `Set this if the site doesn't end directories with /. Use this if your target website does not use / on the end of directories. A / on the end of a path is how rclone normally tells the difference between files and directories. If this flag is set, then rclone will treat all files with Content-Type: text/html as directories and read URLs from them rather than downloading them. Note that this may cause rclone to confuse genuine HTML files with directories.`, Default: false, Advanced: true, }, { Name: "no_head", Help: `Don't use HEAD requests. HEAD requests are mainly used to find file sizes in dir listing. If your site is being very slow to load then you can try this option. Normally rclone does a HEAD request for each potential file in a directory listing to: - find its size - check it really exists - check to see if it is a directory If you set this option, rclone will not do the HEAD request. This will mean that directory listings are much quicker, but rclone won't have the times or sizes of any files, and some files that don't exist may be in the listing.`, Default: false, Advanced: true, }}, } fs.Register(fsi) } // Options defines the configuration for this backend type Options struct { Endpoint string `config:"url"` NoSlash bool `config:"no_slash"` NoHead bool `config:"no_head"` Headers fs.CommaSepList `config:"headers"` } // Fs stores the interface to the remote HTTP files type Fs struct { name string root string features *fs.Features // optional features opt Options // options for this backend ci *fs.ConfigInfo // global config endpoint *url.URL endpointURL string // endpoint as a string httpClient *http.Client } // Object is a remote object that has been stat'd (so it exists, but is not necessarily open for reading) type Object struct { fs *Fs remote string size int64 modTime time.Time contentType string } // statusError returns an error if the res contained an error func statusError(res *http.Response, err error) error { if err != nil { return err } if res.StatusCode < 200 || res.StatusCode > 299 { _ = res.Body.Close() return fmt.Errorf("HTTP Error: %s", res.Status) } return nil } // getFsEndpoint decides if url is to be considered a file or directory, // and returns a proper endpoint url to use for the fs. func getFsEndpoint(ctx context.Context, client *http.Client, url string, opt *Options) (string, bool) { // If url ends with '/' it is already a proper url always assumed to be a directory. if url[len(url)-1] == '/' { return url, false } // If url does not end with '/' we send a HEAD request to decide // if it is directory or file, and if directory appends the missing // '/', or if file returns the directory url to parent instead. createFileResult := func() (string, bool) { fs.Debugf(nil, "If path is a directory you must add a trailing '/'") parent, _ := path.Split(url) return parent, true } createDirResult := func() (string, bool) { fs.Debugf(nil, "To avoid the initial HEAD request add a trailing '/' to the path") return url + "/", false } // If HEAD requests are not allowed we just have to assume it is a file. if opt.NoHead { fs.Debugf(nil, "Assuming path is a file as --http-no-head is set") return createFileResult() } // Use a client which doesn't follow redirects so the server // doesn't redirect http://host/dir to http://host/dir/ noRedir := *client noRedir.CheckRedirect = func(req *http.Request, via []*http.Request) error { return http.ErrUseLastResponse } req, err := http.NewRequestWithContext(ctx, "HEAD", url, nil) if err != nil { fs.Debugf(nil, "Assuming path is a file as HEAD request could not be created: %v", err) return createFileResult() } addHeaders(req, opt) res, err := noRedir.Do(req) if err != nil { fs.Debugf(nil, "Assuming path is a file as HEAD request could not be sent: %v", err) return createFileResult() } if res.StatusCode == http.StatusNotFound { fs.Debugf(nil, "Assuming path is a directory as HEAD response is it does not exist as a file (%s)", res.Status) return createDirResult() } if res.StatusCode == http.StatusMovedPermanently || res.StatusCode == http.StatusFound || res.StatusCode == http.StatusSeeOther || res.StatusCode == http.StatusTemporaryRedirect || res.StatusCode == http.StatusPermanentRedirect { redir := res.Header.Get("Location") if redir != "" { if redir[len(redir)-1] == '/' { fs.Debugf(nil, "Assuming path is a directory as HEAD response is redirect (%s) to a path that ends with '/': %s", res.Status, redir) return createDirResult() } fs.Debugf(nil, "Assuming path is a file as HEAD response is redirect (%s) to a path that does not end with '/': %s", res.Status, redir) return createFileResult() } fs.Debugf(nil, "Assuming path is a file as HEAD response is redirect (%s) but no location header", res.Status) return createFileResult() } if res.StatusCode < 200 || res.StatusCode > 299 { // Example is 403 (http.StatusForbidden) for servers not allowing HEAD requests. fs.Debugf(nil, "Assuming path is a file as HEAD response is an error (%s)", res.Status) return createFileResult() } fs.Debugf(nil, "Assuming path is a file as HEAD response is success (%s)", res.Status) return createFileResult() } // NewFs creates a new Fs object from the name and root. It connects to // the host specified in the config file. func NewFs(ctx context.Context, name, root string, m configmap.Mapper) (fs.Fs, error) { // Parse config into Options struct opt := new(Options) err := configstruct.Set(m, opt) if err != nil { return nil, err } if len(opt.Headers)%2 != 0 { return nil, errors.New("odd number of headers supplied") } if !strings.HasSuffix(opt.Endpoint, "/") { opt.Endpoint += "/" } // Parse the endpoint and stick the root onto it base, err := url.Parse(opt.Endpoint) if err != nil { return nil, err } u, err := rest.URLJoin(base, rest.URLPathEscape(root)) if err != nil { return nil, err } client := fshttp.NewClient(ctx) endpoint, isFile := getFsEndpoint(ctx, client, u.String(), opt) fs.Debugf(nil, "Root: %s", endpoint) u, err = url.Parse(endpoint) if err != nil { return nil, err } ci := fs.GetConfig(ctx) f := &Fs{ name: name, root: root, opt: *opt, ci: ci, httpClient: client, endpoint: u, endpointURL: u.String(), } f.features = (&fs.Features{ CanHaveEmptyDirectories: true, }).Fill(ctx, f) if isFile { // return an error with an fs which points to the parent return f, fs.ErrorIsFile } if !strings.HasSuffix(f.endpointURL, "/") { return nil, errors.New("internal error: url doesn't end with /") } return f, nil } // Name returns the configured name of the file system func (f *Fs) Name() string { return f.name } // Root returns the root for the filesystem func (f *Fs) Root() string { return f.root } // String returns the URL for the filesystem func (f *Fs) String() string { return f.endpointURL } // Features returns the optional features of this Fs func (f *Fs) Features() *fs.Features { return f.features } // Precision is the remote http file system's modtime precision, which we have no way of knowing. We estimate at 1s func (f *Fs) Precision() time.Duration { return time.Second } // NewObject creates a new remote http file object func (f *Fs) NewObject(ctx context.Context, remote string) (fs.Object, error) { o := &Object{ fs: f, remote: remote, } err := o.head(ctx) if err != nil { return nil, err } return o, nil } // Join's the remote onto the base URL func (f *Fs) url(remote string) string { return f.endpointURL + rest.URLPathEscape(remote) } // parse s into an int64, on failure return def func parseInt64(s string, def int64) int64 { n, e := strconv.ParseInt(s, 10, 64) if e != nil { return def } return n } // Errors returned by parseName var ( errURLJoinFailed = errors.New("URLJoin failed") errFoundQuestionMark = errors.New("found ? in URL") errHostMismatch = errors.New("host mismatch") errSchemeMismatch = errors.New("scheme mismatch") errNotUnderRoot = errors.New("not under root") errNameIsEmpty = errors.New("name is empty") errNameContainsSlash = errors.New("name contains /") ) // parseName turns a name as found in the page into a remote path or returns an error func parseName(base *url.URL, name string) (string, error) { // make URL absolute u, err := rest.URLJoin(base, name) if err != nil { return "", errURLJoinFailed } // check it doesn't have URL parameters uStr := u.String() if strings.Contains(uStr, "?") { return "", errFoundQuestionMark } // check that this is going back to the same host and scheme if base.Host != u.Host { return "", errHostMismatch } if base.Scheme != u.Scheme { return "", errSchemeMismatch } // check has path prefix if !strings.HasPrefix(u.Path, base.Path) { return "", errNotUnderRoot } // calculate the name relative to the base name = u.Path[len(base.Path):] // mustn't be empty if name == "" { return "", errNameIsEmpty } // mustn't contain a / - we are looking for a single level directory slash := strings.Index(name, "/") if slash >= 0 && slash != len(name)-1 { return "", errNameContainsSlash } return name, nil } // Parse turns HTML for a directory into names // base should be the base URL to resolve any relative names from func parse(base *url.URL, in io.Reader) (names []string, err error) { doc, err := html.Parse(in) if err != nil { return nil, err } var ( walk func(*html.Node) seen = make(map[string]struct{}) ) walk = func(n *html.Node) { if n.Type == html.ElementNode && n.Data == "a" { for _, a := range n.Attr { if a.Key == "href" { name, err := parseName(base, a.Val) if err == nil { if _, found := seen[name]; !found { names = append(names, name) seen[name] = struct{}{} } } break } } } for c := n.FirstChild; c != nil; c = c.NextSibling { walk(c) } } walk(doc) return names, nil } // Adds the configured headers to the request if any func addHeaders(req *http.Request, opt *Options) { for i := 0; i < len(opt.Headers); i += 2 { key := opt.Headers[i] value := opt.Headers[i+1] req.Header.Add(key, value) } } // Adds the configured headers to the request if any func (f *Fs) addHeaders(req *http.Request) { addHeaders(req, &f.opt) } // Read the directory passed in func (f *Fs) readDir(ctx context.Context, dir string) (names []string, err error) { URL := f.url(dir) u, err := url.Parse(URL) if err != nil { return nil, fmt.Errorf("failed to readDir: %w", err) } if !strings.HasSuffix(URL, "/") { return nil, fmt.Errorf("internal error: readDir URL %q didn't end in /", URL) } // Do the request req, err := http.NewRequestWithContext(ctx, "GET", URL, nil) if err != nil { return nil, fmt.Errorf("readDir failed: %w", err) } f.addHeaders(req) res, err := f.httpClient.Do(req) if err == nil { defer fs.CheckClose(res.Body, &err) if res.StatusCode == http.StatusNotFound { return nil, fs.ErrorDirNotFound } } err = statusError(res, err) if err != nil { return nil, fmt.Errorf("failed to readDir: %w", err) } contentType := strings.SplitN(res.Header.Get("Content-Type"), ";", 2)[0] switch contentType { case "text/html": names, err = parse(u, res.Body) if err != nil { return nil, fmt.Errorf("readDir: %w", err) } default: return nil, fmt.Errorf("can't parse content type %q", contentType) } return names, nil } // List the objects and directories in dir into entries. The // entries can be returned in any order but should be for a // complete directory. // // dir should be "" to list the root, and should not have // trailing slashes. // // This should return ErrDirNotFound if the directory isn't // found. func (f *Fs) List(ctx context.Context, dir string) (entries fs.DirEntries, err error) { if !strings.HasSuffix(dir, "/") && dir != "" { dir += "/" } names, err := f.readDir(ctx, dir) if err != nil { return nil, fmt.Errorf("error listing %q: %w", dir, err) } var ( entriesMu sync.Mutex // to protect entries wg sync.WaitGroup checkers = f.ci.Checkers in = make(chan string, checkers) ) add := func(entry fs.DirEntry) { entriesMu.Lock() entries = append(entries, entry) entriesMu.Unlock() } for i := 0; i < checkers; i++ { wg.Add(1) go func() { defer wg.Done() for remote := range in { file := &Object{ fs: f, remote: remote, } switch err := file.head(ctx); err { case nil: add(file) case fs.ErrorNotAFile: // ...found a directory not a file add(fs.NewDir(remote, timeUnset)) default: fs.Debugf(remote, "skipping because of error: %v", err) } } }() } for _, name := range names { isDir := name[len(name)-1] == '/' name = strings.TrimRight(name, "/") remote := path.Join(dir, name) if isDir { add(fs.NewDir(remote, timeUnset)) } else { in <- remote } } close(in) wg.Wait() return entries, nil } // Put in to the remote path with the modTime given of the given size // // May create the object even if it returns an error - if so // will return the object and the error, otherwise will return // nil and the error func (f *Fs) Put(ctx context.Context, in io.Reader, src fs.ObjectInfo, options ...fs.OpenOption) (fs.Object, error) { return nil, errorReadOnly } // PutStream uploads to the remote path with the modTime given of indeterminate size func (f *Fs) PutStream(ctx context.Context, in io.Reader, src fs.ObjectInfo, options ...fs.OpenOption) (fs.Object, error) { return nil, errorReadOnly } // Fs is the filesystem this remote http file object is located within func (o *Object) Fs() fs.Info { return o.fs } // String returns the URL to the remote HTTP file func (o *Object) String() string { if o == nil { return "" } return o.remote } // Remote the name of the remote HTTP file, relative to the fs root func (o *Object) Remote() string { return o.remote } // Hash returns "" since HTTP (in Go or OpenSSH) doesn't support remote calculation of hashes func (o *Object) Hash(ctx context.Context, r hash.Type) (string, error) { return "", hash.ErrUnsupported } // Size returns the size in bytes of the remote http file func (o *Object) Size() int64 { return o.size } // ModTime returns the modification time of the remote http file func (o *Object) ModTime(ctx context.Context) time.Time { return o.modTime } // url returns the native url of the object func (o *Object) url() string { return o.fs.url(o.remote) } // head sends a HEAD request to update info fields in the Object func (o *Object) head(ctx context.Context) error { if o.fs.opt.NoHead { o.size = -1 o.modTime = timeUnset o.contentType = fs.MimeType(ctx, o) return nil } url := o.url() req, err := http.NewRequestWithContext(ctx, "HEAD", url, nil) if err != nil { return fmt.Errorf("stat failed: %w", err) } o.fs.addHeaders(req) res, err := o.fs.httpClient.Do(req) if err == nil && res.StatusCode == http.StatusNotFound { return fs.ErrorObjectNotFound } err = statusError(res, err) if err != nil { return fmt.Errorf("failed to stat: %w", err) } return o.stat(ctx, res, true) } // stat updates info fields in the Object according to HTTP response headers func (o *Object) stat(ctx context.Context, res *http.Response, isRangeRequest bool) error { t, err := http.ParseTime(res.Header.Get("Last-Modified")) if err != nil { t = timeUnset } o.modTime = t // TODO: parse Content-Range for total size // https://developer.mozilla.org/en-US/docs/Web/HTTP/Range_requests if !isRangeRequest { o.size = parseInt64(res.Header.Get("Content-Length"), -1) o.contentType = res.Header.Get("Content-Type") } // If NoSlash is set then check ContentType to see if it is a directory if o.fs.opt.NoSlash { mediaType, _, err := mime.ParseMediaType(o.contentType) if err != nil { return fmt.Errorf("failed to parse Content-Type: %q: %w", o.contentType, err) } if mediaType == "text/html" { return fs.ErrorNotAFile } } return nil } // SetModTime sets the modification and access time to the specified time // // it also updates the info field func (o *Object) SetModTime(ctx context.Context, modTime time.Time) error { return errorReadOnly } // Storable returns whether the remote http file is a regular file (not a directory, symbolic link, block device, character device, named pipe, etc.) func (o *Object) Storable() bool { return true } // Open a remote http file object for reading. Seek is supported func (o *Object) Open(ctx context.Context, options ...fs.OpenOption) (in io.ReadCloser, err error) { url := o.url() req, err := http.NewRequestWithContext(ctx, "GET", url, nil) if err != nil { return nil, fmt.Errorf("Open failed: %w", err) } // Add optional headers for k, v := range fs.OpenOptionHeaders(options) { req.Header.Add(k, v) } o.fs.addHeaders(req) // Do the request res, err := o.fs.httpClient.Do(req) err = statusError(res, err) if err != nil { return nil, fmt.Errorf("Open failed: %w", err) } if o.fs.opt.NoHead { isRangeRequest := len(req.Header.Get("Range")) > 0 if err = o.stat(ctx, res, isRangeRequest); err != nil { return nil, fmt.Errorf("Stat failed: %w", err) } } return res.Body, nil } // Hashes returns hash.HashNone to indicate remote hashing is unavailable func (f *Fs) Hashes() hash.Set { return hash.Set(hash.None) } // Mkdir makes the root directory of the Fs object func (f *Fs) Mkdir(ctx context.Context, dir string) error { return errorReadOnly } // Remove a remote http file object func (o *Object) Remove(ctx context.Context) error { return errorReadOnly } // Rmdir removes the root directory of the Fs object func (f *Fs) Rmdir(ctx context.Context, dir string) error { return errorReadOnly } // Update in to the object with the modTime given of the given size func (o *Object) Update(ctx context.Context, in io.Reader, src fs.ObjectInfo, options ...fs.OpenOption) error { return errorReadOnly } // MimeType of an Object if known, "" otherwise func (o *Object) MimeType(ctx context.Context) string { return o.contentType } // Check the interfaces are satisfied var ( _ fs.Fs = &Fs{} _ fs.PutStreamer = &Fs{} _ fs.Object = &Object{} _ fs.MimeTyper = &Object{} )