Vendor dependencies for GCS
This commit is contained in:
parent
ba75a3884c
commit
8ca6a9a240
1228 changed files with 1769186 additions and 1 deletion
368
vendor/google.golang.org/api/examples/bigquery.go
generated
vendored
Normal file
@@ -0,0 +1,368 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package main

import (
	"container/list"
	"encoding/json"
	"fmt"
	"io/ioutil"
	"log"
	"math"
	"math/rand"
	"net/http"
	"os"
	"strconv"
	"strings"
	"time"

	bigquery "google.golang.org/api/bigquery/v2"
	storage "google.golang.org/api/storage/v1"
)

const (
	GB                         = 1 << 30
	MaxBackoff                 = 30000
	BaseBackoff                = 250
	BackoffGrowthFactor        = 1.8
	BackoffGrowthDamper        = 0.25
	JobStatusDone              = "DONE"
	DatasetAlreadyExists       = "Already Exists: Dataset"
	TableWriteEmptyDisposition = "WRITE_EMPTY"
)
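
// Note: the backoff values above are in milliseconds. monitor grows the delay
// by BackoffGrowthFactor, subtracts random jitter of up to BackoffGrowthDamper
// of the value, and caps the result at MaxBackoff between polling passes.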

func init() {
	scope := fmt.Sprintf("%s %s %s", bigquery.BigqueryScope,
		storage.DevstorageReadOnlyScope,
		"https://www.googleapis.com/auth/userinfo.profile")
	registerDemo("bigquery", scope, bqMain)
}
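
// registerDemo is assumed to be defined elsewhere in this examples package;
// judging from the call above, it registers the demo's name, OAuth scope,
// and entry point with a shared command-line runner.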

// This example demonstrates loading objects from Google Cloud Storage into
// BigQuery. Objects are specified by their bucket and a name prefix. Each
// object will be loaded into a new table identified by the object name minus
// any file extension. All tables are added to the specified dataset (one will
// be created if necessary). Currently, tables will not be overwritten and an
// attempt to load an object into a dataset that already contains its table
// will emit an error message indicating the table already exists.
// A schema file must be provided and it will be applied to every object/table.
// Example usage:
//   go-api-demo -clientid="my-clientid" -secret="my-secret" bq myProject
//     myDataBucket datafile2013070 DataFiles2013
//     ./datafile_schema.json 100
//
// This will load all objects (e.g. all data files from July 2013) from
// gs://myDataBucket into a (possibly new) BigQuery dataset named DataFiles2013
// using the schema file provided and allowing up to 100 bad records. Assuming
// each object is named like datafileYYYYMMDD.csv.gz and all of July's files are
// stored in the bucket, 9 tables will be created named like datafile201307DD
// where DD ranges from 01 to 09, inclusive.
// When the program completes, it will emit a results line similar to:
//
//   9 files loaded in 3m58s (18m2.708s). Size: 7.18GB Rows: 7130725
//
// The total elapsed time from the start of the first job to the end of the
// last job (effectively wall-clock time) is shown. In parentheses is the
// aggregate time taken to load all tables.
func bqMain(client *http.Client, argv []string) {
	if len(argv) != 6 {
		fmt.Fprintln(os.Stderr,
			"Usage: bq project_id bucket prefix dataset schema max_bad_records")
		return
	}

	var (
		project    = argv[0]
		bucket     = argv[1]
		objPrefix  = argv[2]
		datasetId  = argv[3]
		schemaFile = argv[4]
	)
	badRecords, err := strconv.ParseInt(argv[5], 10, 64)
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		return
	}

	rand.Seed(time.Now().UnixNano())

	service, err := storage.New(client)
	if err != nil {
		log.Fatalf("Unable to create Storage service: %v", err)
	}

	// Get the list of objects in the bucket matching the specified prefix.
	list := service.Objects.List(bucket)
	list.Prefix(objPrefix)
	objects, err := list.Do()
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		return
	}

	// Create the wrapper and insert the (new) dataset.
	dataset, err := newBQDataset(client, project, datasetId)
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		return
	}
	if err = dataset.insert(true); err != nil {
		fmt.Fprintln(os.Stderr, err)
		return
	}

	objectSource := &tableSource{
		maxBadRecords: badRecords,
		disposition:   TableWriteEmptyDisposition,
	}

	// Load the schema from disk.
	f, err := ioutil.ReadFile(schemaFile)
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		return
	}
	if err = json.Unmarshal(f, &objectSource.schema); err != nil {
		fmt.Fprintln(os.Stderr, err)
		return
	}

	// Assumes all objects have .csv, .csv.gz (or no) extension.
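	// e.g. "datafile20130701.csv.gz" becomes table id "datafile20130701".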
	tableIdFromObject := func(name string) string {
		return strings.TrimSuffix(strings.TrimSuffix(name, ".gz"), ".csv")
	}

	// A jobset is a way to group a collection of jobs together for monitoring.
	// For this example, we just use the name of the bucket and object prefix.
	jobset := fmt.Sprintf("%s:%s", bucket, objPrefix)
	fmt.Fprintf(os.Stderr, "\nLoading %d objects.\n", len(objects.Items))

	// Load each object into a table of the same name (minus any extension).
	// A successful insert call will inject the job into our queue for monitoring.
	for _, o := range objects.Items {
		objectSource.id = tableIdFromObject(o.Name)
		objectSource.uri = fmt.Sprintf("gs://%s/%s", o.Bucket, o.Name)
		if err = dataset.load(jobset, objectSource); err != nil {
			fmt.Fprintln(os.Stderr, err)
		}
	}

	dataset.monitor(jobset)
}

// Wraps the BigQuery service and dataset and provides some helper functions.
type bqDataset struct {
	project string
	id      string
	bq      *bigquery.Service
	dataset *bigquery.Dataset
	jobsets map[string]*list.List
}

func newBQDataset(client *http.Client, dsProj string, dsId string) (*bqDataset,
	error) {

	service, err := bigquery.New(client)
	if err != nil {
		log.Fatalf("Unable to create BigQuery service: %v", err)
	}

	return &bqDataset{
		project: dsProj,
		id:      dsId,
		bq:      service,
		dataset: &bigquery.Dataset{
			DatasetReference: &bigquery.DatasetReference{
				DatasetId: dsId,
				ProjectId: dsProj,
			},
		},
		jobsets: make(map[string]*list.List),
	}, nil
}
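
// insert creates the dataset in BigQuery. When existsOK is true, an
// "Already Exists" error from the API is treated as success so an existing
// dataset can be reused.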
func (ds *bqDataset) insert(existsOK bool) error {
	call := ds.bq.Datasets.Insert(ds.project, ds.dataset)
	_, err := call.Do()
	if err != nil && (!existsOK || !strings.Contains(err.Error(),
		DatasetAlreadyExists)) {
		return err
	}

	return nil
}

type tableSource struct {
	id            string
	uri           string
	schema        bigquery.TableSchema
	maxBadRecords int64
	disposition   string
}
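
// load starts a BigQuery load job for the given source and records the
// returned job under jobset so that monitor can poll it later.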
func (ds *bqDataset) load(jobset string, source *tableSource) error {
	job := &bigquery.Job{
		Configuration: &bigquery.JobConfiguration{
			Load: &bigquery.JobConfigurationLoad{
				DestinationTable: &bigquery.TableReference{
					DatasetId: ds.dataset.DatasetReference.DatasetId,
					ProjectId: ds.project,
					TableId:   source.id,
				},
				MaxBadRecords:    source.maxBadRecords,
				Schema:           &source.schema,
				SourceUris:       []string{source.uri},
				WriteDisposition: source.disposition,
			},
		},
	}

	call := ds.bq.Jobs.Insert(ds.project, job)
	job, err := call.Do()
	if err != nil {
		return err
	}

	_, ok := ds.jobsets[jobset]
	if !ok {
		ds.jobsets[jobset] = list.New()
	}
	ds.jobsets[jobset].PushBack(job)

	return nil
}

func (ds *bqDataset) getJob(id string) (*bigquery.Job, error) {
	return ds.bq.Jobs.Get(ds.project, id).Do()
}
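
// monitor polls every job queued under jobset until each completes, backing
// off between polling passes, and prints aggregate load statistics once the
// queue is empty.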
func (ds *bqDataset) monitor(jobset string) {
	jobq, ok := ds.jobsets[jobset]
	if !ok {
		return
	}

	var backoff float64 = BaseBackoff
	pause := func(grow bool) {
		if grow {
			backoff *= BackoffGrowthFactor
			backoff -= (backoff * rand.Float64() * BackoffGrowthDamper)
			backoff = math.Min(backoff, MaxBackoff)
			fmt.Fprintf(os.Stderr, "[%s] Checking remaining %d jobs...\n", jobset,
				1+jobq.Len())
		}
		time.Sleep(time.Duration(backoff) * time.Millisecond)
	}
	var stats jobStats

	// Track a 'head' pending job in queue for detecting cycling.
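	// When the head job reaches the front again, every pending job has been
	// polled once without completing, so pause is told to grow the backoff
	// delay before the next pass.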
	head := ""
	// Loop until all jobs are done - with either success or error.
	for jobq.Len() > 0 {
		jel := jobq.Front()
		job := jel.Value.(*bigquery.Job)
		jobq.Remove(jel)
		jid := job.JobReference.JobId
		loop := false

		// Check and possibly pick a new head job id.
		if len(head) == 0 {
			head = jid
		} else {
			if jid == head {
				loop = true
			}
		}

		// Retrieve the job's current status.
		pause(loop)
		j, err := ds.getJob(jid)
		if err != nil {
			fmt.Fprintln(os.Stderr, err)
			// In the case of a transient API error, we want to keep the job.
			if j == nil {
				jobq.PushBack(job)
			} else {
				// Must reset head tracker if job is discarded.
				if loop {
					head = ""
					backoff = BaseBackoff
				}
			}
			continue
		}

		// Reassign with the updated job data (from Get); from here on we use
		// job rather than j, since j may be nil when Get fails.
		job = j

		if job.Status.State != JobStatusDone {
			jobq.PushBack(job)
			continue
		}

		if res := job.Status.ErrorResult; res != nil {
			fmt.Fprintln(os.Stderr, res.Message)
		} else {
			stat := job.Statistics
			lstat := stat.Load
			stats.files += 1
			stats.bytesIn += lstat.InputFileBytes
			stats.bytesOut += lstat.OutputBytes
			stats.rows += lstat.OutputRows
			stats.elapsed +=
				time.Duration(stat.EndTime-stat.StartTime) * time.Millisecond
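
			// Track the earliest start and latest finish across all jobs;
			// their difference is the wall-clock time reported by GoString.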
			if stats.start.IsZero() {
				stats.start = time.Unix(stat.StartTime/1000, 0)
			} else {
				t := time.Unix(stat.StartTime/1000, 0)
				if stats.start.Sub(t) > 0 {
					stats.start = t
				}
			}

			if stats.finish.IsZero() {
				stats.finish = time.Unix(stat.EndTime/1000, 0)
			} else {
				t := time.Unix(stat.EndTime/1000, 0)
				if t.Sub(stats.finish) > 0 {
					stats.finish = t
				}
			}
		}
		// When the head job is processed, reset the backoff since the loads
		// run in BQ in parallel.
		if loop {
			head = ""
			backoff = BaseBackoff
		}
	}

	fmt.Fprintf(os.Stderr, "%#v\n", stats)
}

type jobStats struct {
	// Number of files (sources) loaded.
	files int64
	// Bytes read from source (possibly compressed).
	bytesIn int64
	// Bytes loaded into BigQuery (uncompressed).
	bytesOut int64
	// Rows loaded into BigQuery.
	rows int64
	// Aggregate time taken to load all sources into tables.
	elapsed time.Duration
	// Start time of the earliest job.
	start time.Time
	// End time of the latest job.
	finish time.Time
}

func (s jobStats) GoString() string {
	return fmt.Sprintf("\n%d files loaded in %v (%v). Size: %.2fGB Rows: %d\n",
		s.files, s.finish.Sub(s.start), s.elapsed, float64(s.bytesOut)/GB,
		s.rows)
}