drive: Export Google documents - fixes #49

Rclone will download one format of a google doc. The choice of which
export format is controlled by the `--drive-formats` flag.
This commit is contained in:
Nick Craig-Wood 2016-01-26 16:52:53 +00:00
parent 0f73129ab7
commit 558bc2e132
2 changed files with 216 additions and 14 deletions

View file

@ -132,6 +132,50 @@ off, namely deleting files permanently.
Only consider files owned by the authenticated user. Requires Only consider files owned by the authenticated user. Requires
that --drive-full-list=true (default). that --drive-full-list=true (default).
#### --drive-formats ####
Google documents can only be exported from Google drive. When rclone
downloads a Google doc it chooses a format to download depending upon
this setting.
By default the formats are `docx,xlsx,pptx,svg` which are a sensible
default for an editable document.
When choosing a format, rclone runs down the list provided in order
and chooses the first file format the doc can be exported as from the
list. If the file can't be exported to a format on the formats list,
then rclone will choose a format from the default list.
If you prefer an archive copy then you might use `--drive-formats
pdf`, or if you prefer openoffice/libreoffice formats you might use
`--drive-formats ods,odt`.
Note that rclone adds the extension to the google doc, so if it is
called `My Spreadsheet` on google docs, it will be exported as `My
Spreadsheet.xlsx` or `My Spreadsheet.pdf` etc.
Here are the possible extensions with their corresponding mime types.
| Extension | Mime Type | Description |
| --------- |-----------| ------------|
| csv | text/csv | Standard CSV format for Spreadsheets |
| doc | application/msword | Microsoft Office Document |
| docx | application/vnd.openxmlformats-officedocument.wordprocessingml.document | Microsoft Office Document |
| html | text/html | An HTML Document |
| jpg | image/jpeg | A JPEG Image File |
| ods | application/vnd.oasis.opendocument.spreadsheet | Openoffice Spreadsheet |
| ods | application/x-vnd.oasis.opendocument.spreadsheet | Openoffice Spreadsheet |
| odt | application/vnd.oasis.opendocument.text | Openoffice Document |
| pdf | application/pdf | Adobe PDF Format |
| png | image/png | PNG Image Format |
| pptx | application/vnd.openxmlformats-officedocument.presentationml.presentation | Microsoft Office Powerpoint |
| rtf | application/rtf | Rich Text Format |
| svg | image/svg+xml | Scalable Vector Graphics Format |
| txt | text/plain | Plain Text |
| xls | application/vnd.ms-excel | Microsoft Office Spreadsheet |
| xlsx | application/vnd.openxmlformats-officedocument.spreadsheetml.sheet | Microsoft Office Spreadsheet |
| zip | application/zip | A ZIP file of HTML, Images and CSS |
### Limitations ### ### Limitations ###
Drive has quite a lot of rate limiting. This causes rclone to be Drive has quite a lot of rate limiting. This causes rclone to be

View file

@ -38,6 +38,7 @@ const (
minSleep = 10 * time.Millisecond minSleep = 10 * time.Millisecond
maxSleep = 2 * time.Second maxSleep = 2 * time.Second
decayConstant = 2 // bigger for slower decay, exponential decayConstant = 2 // bigger for slower decay, exponential
defaultExtensions = "docx,xlsx,pptx,svg"
) )
// Globals // Globals
@ -46,6 +47,7 @@ var (
driveFullList = pflag.BoolP("drive-full-list", "", true, "Use a full listing for directory list. More data but usually quicker.") driveFullList = pflag.BoolP("drive-full-list", "", true, "Use a full listing for directory list. More data but usually quicker.")
driveAuthOwnerOnly = pflag.BoolP("drive-auth-owner-only", "", false, "Only consider files owned by the authenticated user. Requires drive-full-list.") driveAuthOwnerOnly = pflag.BoolP("drive-auth-owner-only", "", false, "Only consider files owned by the authenticated user. Requires drive-full-list.")
driveUseTrash = pflag.BoolP("drive-use-trash", "", false, "Send files to the trash instead of deleting permanently.") driveUseTrash = pflag.BoolP("drive-use-trash", "", false, "Send files to the trash instead of deleting permanently.")
driveExtensions = pflag.StringP("drive-formats", "", defaultExtensions, "Comma separated list of preferred formats for downloading Google docs.")
// chunkSize is the size of the chunks created during a resumable upload and should be a power of two. // chunkSize is the size of the chunks created during a resumable upload and should be a power of two.
// 1<<18 is the minimum size supported by the Google uploader, and there is no maximum. // 1<<18 is the minimum size supported by the Google uploader, and there is no maximum.
chunkSize = fs.SizeSuffix(256 * 1024) chunkSize = fs.SizeSuffix(256 * 1024)
@ -58,6 +60,25 @@ var (
ClientSecret: fs.Reveal(rcloneClientSecret), ClientSecret: fs.Reveal(rcloneClientSecret),
RedirectURL: oauthutil.TitleBarRedirectURL, RedirectURL: oauthutil.TitleBarRedirectURL,
} }
// mimeTypeToExtension maps the mime type of a Google doc export link
// to the file extension given to the exported file. Its values are
// also the set of extensions accepted by parseExtensions.
//
// Note that two different mime types map to "ods", so the mapping is
// not one-to-one.
mimeTypeToExtension = map[string]string{
"application/msword": "doc",
"application/pdf": "pdf",
"application/rtf": "rtf",
"application/vnd.ms-excel": "xls",
"application/vnd.oasis.opendocument.spreadsheet": "ods",
"application/vnd.oasis.opendocument.text": "odt",
"application/vnd.openxmlformats-officedocument.presentationml.presentation": "pptx",
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": "xlsx",
"application/vnd.openxmlformats-officedocument.wordprocessingml.document": "docx",
"application/x-vnd.oasis.opendocument.spreadsheet": "ods",
"application/zip": "zip",
"image/jpeg": "jpg",
"image/png": "png",
"image/svg+xml": "svg",
"text/csv": "csv",
"text/html": "html",
"text/plain": "txt",
}
) )
// Register with Fs // Register with Fs
@ -85,13 +106,14 @@ func init() {
// Fs represents a remote drive server // Fs represents a remote drive server
type Fs struct { type Fs struct {
name string // name of this remote name string // name of this remote
svc *drive.Service // the connection to the drive server svc *drive.Service // the connection to the drive server
root string // the path we are working on root string // the path we are working on
client *http.Client // authorized client client *http.Client // authorized client
about *drive.About // information about the drive, including the root about *drive.About // information about the drive, including the root
dirCache *dircache.DirCache // Map of directory path to directory id dirCache *dircache.DirCache // Map of directory path to directory id
pacer *pacer.Pacer // To pace the API calls pacer *pacer.Pacer // To pace the API calls
// extensions is filled in by parseExtensions: entries are lower
// cased and de-duplicated, and kept in the order they were parsed
// (NewFs parses the --drive-formats value first, then the defaults).
extensions []string // preferred extensions to download docs
} }
// Object describes a drive object // Object describes a drive object
@ -103,6 +125,7 @@ type Object struct {
md5sum string // md5sum of the object md5sum string // md5sum of the object
bytes int64 // size of the object bytes int64 // size of the object
modifiedDate string // RFC3339 time it was last modified modifiedDate string // RFC3339 time it was last modified
isDocument bool // if set this is a Google doc
} }
// ------------------------------------------------------------ // ------------------------------------------------------------
@ -217,6 +240,31 @@ func isPowerOfTwo(x int64) bool {
} }
} }
// parseExtensions parses a comma separated list of drive export
// extensions and appends each one to f.extensions.
//
// Each entry is trimmed of surrounding white space and lower cased.
// Empty entries (an empty string, a trailing or doubled comma) are
// skipped rather than being reported as an unknown extension. Entries
// already present in f.extensions are not added again, so the method
// can be called more than once to merge lists while keeping the
// first-seen order.
//
// An extension which is not one of the values of mimeTypeToExtension
// is fatal: the process exits via log.Fatalf.
func (f *Fs) parseExtensions(extensions string) {
	// Build the set of known extensions. Several mime types can share
	// an extension (eg "ods") so this is only good for membership tests.
	known := make(map[string]struct{}, len(mimeTypeToExtension))
	for _, extension := range mimeTypeToExtension {
		known[extension] = struct{}{}
	}
next:
	for _, extension := range strings.Split(extensions, ",") {
		extension = strings.ToLower(strings.TrimSpace(extension))
		if extension == "" {
			// strings.Split yields "" for an empty input or a stray
			// comma - that isn't a format the user asked for
			continue
		}
		if _, found := known[extension]; !found {
			log.Fatalf("Couldn't find mime type for extension %q", extension)
		}
		for _, existing := range f.extensions {
			if extension == existing {
				continue next // already in the list
			}
		}
		f.extensions = append(f.extensions, extension)
	}
}
// NewFs constructs an Fs from the path, container:path // NewFs constructs an Fs from the path, container:path
func NewFs(name, path string) (fs.Fs, error) { func NewFs(name, path string) (fs.Fs, error) {
if !isPowerOfTwo(int64(chunkSize)) { if !isPowerOfTwo(int64(chunkSize)) {
@ -260,6 +308,10 @@ func NewFs(name, path string) (fs.Fs, error) {
f.dirCache = dircache.New(root, f.about.RootFolderId, f) f.dirCache = dircache.New(root, f.about.RootFolderId, f)
// Parse extensions
f.parseExtensions(*driveExtensions)
f.parseExtensions(defaultExtensions) // make sure there are some sensible ones on there
// Find the current root // Find the current root
err = f.dirCache.FindRoot(false) err = f.dirCache.FindRoot(false)
if err != nil { if err != nil {
@ -381,11 +433,52 @@ func (f *Fs) listDirRecursive(dirID string, path string, out fs.ObjectsChan) err
}() }()
} else { } else {
// If item has no MD5 sum it isn't stored on drive, so ignore it filepath := path + item.Title
if item.Md5Checksum != "" { if item.Md5Checksum != "" {
if fs := f.newFsObjectWithInfo(path+item.Title, item); fs != nil { // If item has MD5 sum it is a file stored on drive
out <- fs if o := f.newFsObjectWithInfo(filepath, item); o != nil {
out <- o
} }
} else if len(item.ExportLinks) != 0 {
// If item has export links then it is a google doc
var firstExtension, firstLink string
var extension, link string
outer:
for exportMimeType, exportLink := range item.ExportLinks {
exportExtension, ok := mimeTypeToExtension[exportMimeType]
if !ok {
fs.Debug(filepath, "Unknown export type %q - ignoring", exportMimeType)
continue
}
if firstExtension == "" {
firstExtension = exportExtension
firstLink = exportLink
}
for _, preferredExtension := range f.extensions {
if exportExtension == preferredExtension {
extension = exportExtension
link = exportLink
break outer
}
}
}
if extension == "" {
extension = firstExtension
link = firstLink
}
if extension == "" {
fs.Debug(filepath, "No export formats found")
} else {
if o := f.newFsObjectWithInfo(filepath+"."+extension, item); o != nil {
obj := o.(*Object)
obj.isDocument = true
obj.url = link
obj.bytes = -1
out <- o
}
}
} else {
fs.Debug(filepath, "Ignoring unknown object")
} }
} }
return false return false
@ -817,6 +910,18 @@ func (o *Object) Hash(t fs.HashType) (string, error) {
// Size returns the size of an object in bytes // Size returns the size of an object in bytes
//
// For a Google doc whose size is not known yet (bytes < 0) this makes
// a HEAD request for the object's url and stores the Content-Length
// of the response as the size. If the request fails the error is
// logged and 0 is returned without storing anything.
func (o *Object) Size() int64 { func (o *Object) Size() int64 {
if o.isDocument && o.bytes < 0 {
// If it is a google doc then we must HEAD it to see
// how big it is
res, err := o.httpResponse("HEAD")
if err != nil {
fs.ErrorLog(o, "Error reading size: %v", err)
return 0
}
// Only the headers are wanted so the error from Close is ignored
_ = res.Body.Close()
// NOTE(review): the status code of the response is not checked
// here, and net/http reports an unknown length as -1, in which
// case the next call would HEAD again - confirm this is intended
o.bytes = res.ContentLength
// fs.Debug(o, "Read size of document: %v", o.bytes)
}
return o.bytes return o.bytes
} }
@ -908,17 +1013,17 @@ func (o *Object) Storable() bool {
return true return true
} }
// Open an object for read // httpResponse gets an http.Response object for the object o.url
func (o *Object) Open() (in io.ReadCloser, err error) { // using the method passed in
func (o *Object) httpResponse(method string) (res *http.Response, err error) {
if o.url == "" { if o.url == "" {
return nil, fmt.Errorf("Forbidden to download - check sharing permission") return nil, fmt.Errorf("Forbidden to download - check sharing permission")
} }
req, err := http.NewRequest("GET", o.url, nil) req, err := http.NewRequest(method, o.url, nil)
if err != nil { if err != nil {
return nil, err return nil, err
} }
req.Header.Set("User-Agent", fs.UserAgent) req.Header.Set("User-Agent", fs.UserAgent)
var res *http.Response
err = o.fs.pacer.Call(func() (bool, error) { err = o.fs.pacer.Call(func() (bool, error) {
res, err = o.fs.client.Do(req) res, err = o.fs.client.Do(req)
return shouldRetry(err) return shouldRetry(err)
@ -926,10 +1031,57 @@ func (o *Object) Open() (in io.ReadCloser, err error) {
if err != nil { if err != nil {
return nil, err return nil, err
} }
return res, nil
}
// openFile represents an Object open for reading
//
// It wraps the reader being downloaded from, counting the bytes read
// and noting when io.EOF has been seen, so that Close can write the
// byte count back to the Object.
type openFile struct {
o *Object // Object we are reading for
in io.ReadCloser // reading from here
bytes int64 // number of bytes read on this connection
eof bool // whether we have read end of file
}
// Read implements io.Reader by delegating to the wrapped reader.
//
// The count returned by every call is added to the running total for
// this connection, and an io.EOF from the wrapped reader is remembered
// so that Close knows the object was read to the end.
func (file *openFile) Read(p []byte) (n int, err error) {
	n, err = file.in.Read(p)
	file.bytes += int64(n)
	file.eof = file.eof || err == io.EOF
	return n, err
}
// Close closes the wrapped reader and returns its error.
//
// If the reader was read through to io.EOF, the number of bytes read
// on this connection is first stored as the size of the Object.
func (file *openFile) Close() (err error) {
	if !file.eof {
		// Not read to the end - leave the size of the Object alone
		return file.in.Close()
	}
	// fs.Debug(file.o, "Updating size of doc after download to %v", file.bytes)
	file.o.bytes = file.bytes
	return file.in.Close()
}
// Compile time check that *openFile implements io.ReadCloser
var _ io.ReadCloser = (*openFile)(nil)
// Open an object for read
//
// This does a GET on the object and returns the body of the response
// for the caller to read and close. Any status other than 200 is
// returned as an error after the body has been closed.
func (o *Object) Open() (in io.ReadCloser, err error) {
res, err := o.httpResponse("GET")
if err != nil {
return nil, err
}
if res.StatusCode != 200 { if res.StatusCode != 200 {
_ = res.Body.Close() // ignore error _ = res.Body.Close() // ignore error
return nil, fmt.Errorf("Bad response: %d: %s", res.StatusCode, res.Status) return nil, fmt.Errorf("Bad response: %d: %s", res.StatusCode, res.Status)
} }
// If it is a document, update the size with what we are
// reading as it can change from the HEAD in the listing to
// this GET. This stops rclone marking the transfer as
// corrupted.
//
// The openFile wrapper counts the bytes as they are read and its
// Close stores the count in o.bytes once end of file was reached.
if o.isDocument {
return &openFile{o: o, in: res.Body}, nil
}
return res.Body, nil return res.Body, nil
} }
@ -939,6 +1091,9 @@ func (o *Object) Open() (in io.ReadCloser, err error) {
// //
// The new object may have been created if an error is returned // The new object may have been created if an error is returned
func (o *Object) Update(in io.Reader, modTime time.Time, size int64) error { func (o *Object) Update(in io.Reader, modTime time.Time, size int64) error {
if o.isDocument {
return fmt.Errorf("Can't update a google document")
}
updateInfo := &drive.File{ updateInfo := &drive.File{
Id: o.id, Id: o.id,
ModifiedDate: modTime.Format(timeFormatOut), ModifiedDate: modTime.Format(timeFormatOut),
@ -969,6 +1124,9 @@ func (o *Object) Update(in io.Reader, modTime time.Time, size int64) error {
// Remove an object // Remove an object
func (o *Object) Remove() error { func (o *Object) Remove() error {
if o.isDocument {
return fmt.Errorf("Can't delete a google document")
}
var err error var err error
err = o.fs.pacer.Call(func() (bool, error) { err = o.fs.pacer.Call(func() (bool, error) {
if *driveUseTrash { if *driveUseTrash {