// Copyright 2017 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
| |
| package bigquery |
| |
| import ( |
| "encoding/base64" |
| "unicode/utf8" |
| |
| bq "google.golang.org/api/bigquery/v2" |
| ) |
| |
// DataFormat names the format in which BigQuery table data is stored.
// It is a plain string type.
type DataFormat string
| |
| // Constants describing the format of BigQuery table data. |
| const ( |
| CSV DataFormat = "CSV" |
| Avro DataFormat = "AVRO" |
| JSON DataFormat = "NEWLINE_DELIMITED_JSON" |
| DatastoreBackup DataFormat = "DATASTORE_BACKUP" |
| GoogleSheets DataFormat = "GOOGLE_SHEETS" |
| Bigtable DataFormat = "BIGTABLE" |
| Parquet DataFormat = "PARQUET" |
| ORC DataFormat = "ORC" |
| ) |
| |
| // ExternalData is a table which is stored outside of BigQuery. It is implemented by |
| // *ExternalDataConfig. |
| // GCSReference also implements it, for backwards compatibility. |
| type ExternalData interface { |
| toBQ() bq.ExternalDataConfiguration |
| } |
| |
| // ExternalDataConfig describes data external to BigQuery that can be used |
| // in queries and to create external tables. |
| type ExternalDataConfig struct { |
| // The format of the data. Required. |
| SourceFormat DataFormat |
| |
| // The fully-qualified URIs that point to your |
| // data in Google Cloud. Required. |
| // |
| // For Google Cloud Storage URIs, each URI can contain one '*' wildcard character |
| // and it must come after the 'bucket' name. Size limits related to load jobs |
| // apply to external data sources. |
| // |
| // For Google Cloud Bigtable URIs, exactly one URI can be specified and it has be |
| // a fully specified and valid HTTPS URL for a Google Cloud Bigtable table. |
| // |
| // For Google Cloud Datastore backups, exactly one URI can be specified. Also, |
| // the '*' wildcard character is not allowed. |
| SourceURIs []string |
| |
| // The schema of the data. Required for CSV and JSON; disallowed for the |
| // other formats. |
| Schema Schema |
| |
| // Try to detect schema and format options automatically. |
| // Any option specified explicitly will be honored. |
| AutoDetect bool |
| |
| // The compression type of the data. |
| Compression Compression |
| |
| // IgnoreUnknownValues causes values not matching the schema to be |
| // tolerated. Unknown values are ignored. For CSV this ignores extra values |
| // at the end of a line. For JSON this ignores named values that do not |
| // match any column name. If this field is not set, records containing |
| // unknown values are treated as bad records. The MaxBadRecords field can |
| // be used to customize how bad records are handled. |
| IgnoreUnknownValues bool |
| |
| // MaxBadRecords is the maximum number of bad records that will be ignored |
| // when reading data. |
| MaxBadRecords int64 |
| |
| // Additional options for CSV, GoogleSheets and Bigtable formats. |
| Options ExternalDataConfigOptions |
| } |
| |
| func (e *ExternalDataConfig) toBQ() bq.ExternalDataConfiguration { |
| q := bq.ExternalDataConfiguration{ |
| SourceFormat: string(e.SourceFormat), |
| SourceUris: e.SourceURIs, |
| Autodetect: e.AutoDetect, |
| Compression: string(e.Compression), |
| IgnoreUnknownValues: e.IgnoreUnknownValues, |
| MaxBadRecords: e.MaxBadRecords, |
| } |
| if e.Schema != nil { |
| q.Schema = e.Schema.toBQ() |
| } |
| if e.Options != nil { |
| e.Options.populateExternalDataConfig(&q) |
| } |
| return q |
| } |
| |
| func bqToExternalDataConfig(q *bq.ExternalDataConfiguration) (*ExternalDataConfig, error) { |
| e := &ExternalDataConfig{ |
| SourceFormat: DataFormat(q.SourceFormat), |
| SourceURIs: q.SourceUris, |
| AutoDetect: q.Autodetect, |
| Compression: Compression(q.Compression), |
| IgnoreUnknownValues: q.IgnoreUnknownValues, |
| MaxBadRecords: q.MaxBadRecords, |
| Schema: bqToSchema(q.Schema), |
| } |
| switch { |
| case q.CsvOptions != nil: |
| e.Options = bqToCSVOptions(q.CsvOptions) |
| case q.GoogleSheetsOptions != nil: |
| e.Options = bqToGoogleSheetsOptions(q.GoogleSheetsOptions) |
| case q.BigtableOptions != nil: |
| var err error |
| e.Options, err = bqToBigtableOptions(q.BigtableOptions) |
| if err != nil { |
| return nil, err |
| } |
| } |
| return e, nil |
| } |
| |
| // ExternalDataConfigOptions are additional options for external data configurations. |
| // This interface is implemented by CSVOptions, GoogleSheetsOptions and BigtableOptions. |
| type ExternalDataConfigOptions interface { |
| populateExternalDataConfig(*bq.ExternalDataConfiguration) |
| } |
| |
| // CSVOptions are additional options for CSV external data sources. |
| type CSVOptions struct { |
| // AllowJaggedRows causes missing trailing optional columns to be tolerated |
| // when reading CSV data. Missing values are treated as nulls. |
| AllowJaggedRows bool |
| |
| // AllowQuotedNewlines sets whether quoted data sections containing |
| // newlines are allowed when reading CSV data. |
| AllowQuotedNewlines bool |
| |
| // Encoding is the character encoding of data to be read. |
| Encoding Encoding |
| |
| // FieldDelimiter is the separator for fields in a CSV file, used when |
| // reading or exporting data. The default is ",". |
| FieldDelimiter string |
| |
| // Quote is the value used to quote data sections in a CSV file. The |
| // default quotation character is the double quote ("), which is used if |
| // both Quote and ForceZeroQuote are unset. |
| // To specify that no character should be interpreted as a quotation |
| // character, set ForceZeroQuote to true. |
| // Only used when reading data. |
| Quote string |
| ForceZeroQuote bool |
| |
| // The number of rows at the top of a CSV file that BigQuery will skip when |
| // reading data. |
| SkipLeadingRows int64 |
| } |
| |
| func (o *CSVOptions) populateExternalDataConfig(c *bq.ExternalDataConfiguration) { |
| c.CsvOptions = &bq.CsvOptions{ |
| AllowJaggedRows: o.AllowJaggedRows, |
| AllowQuotedNewlines: o.AllowQuotedNewlines, |
| Encoding: string(o.Encoding), |
| FieldDelimiter: o.FieldDelimiter, |
| Quote: o.quote(), |
| SkipLeadingRows: o.SkipLeadingRows, |
| } |
| } |
| |
| // quote returns the CSV quote character, or nil if unset. |
| func (o *CSVOptions) quote() *string { |
| if o.ForceZeroQuote { |
| quote := "" |
| return "e |
| } |
| if o.Quote == "" { |
| return nil |
| } |
| return &o.Quote |
| } |
| |
| func (o *CSVOptions) setQuote(ps *string) { |
| if ps != nil { |
| o.Quote = *ps |
| if o.Quote == "" { |
| o.ForceZeroQuote = true |
| } |
| } |
| } |
| |
| func bqToCSVOptions(q *bq.CsvOptions) *CSVOptions { |
| o := &CSVOptions{ |
| AllowJaggedRows: q.AllowJaggedRows, |
| AllowQuotedNewlines: q.AllowQuotedNewlines, |
| Encoding: Encoding(q.Encoding), |
| FieldDelimiter: q.FieldDelimiter, |
| SkipLeadingRows: q.SkipLeadingRows, |
| } |
| o.setQuote(q.Quote) |
| return o |
| } |
| |
// GoogleSheetsOptions holds the extra options that apply to GoogleSheets
// external data sources.
type GoogleSheetsOptions struct {
	// SkipLeadingRows is the number of rows at the top of a sheet that
	// BigQuery skips when reading data.
	SkipLeadingRows int64
}
| |
| func (o *GoogleSheetsOptions) populateExternalDataConfig(c *bq.ExternalDataConfiguration) { |
| c.GoogleSheetsOptions = &bq.GoogleSheetsOptions{ |
| SkipLeadingRows: o.SkipLeadingRows, |
| } |
| } |
| |
| func bqToGoogleSheetsOptions(q *bq.GoogleSheetsOptions) *GoogleSheetsOptions { |
| return &GoogleSheetsOptions{ |
| SkipLeadingRows: q.SkipLeadingRows, |
| } |
| } |
| |
| // BigtableOptions are additional options for Bigtable external data sources. |
| type BigtableOptions struct { |
| // A list of column families to expose in the table schema along with their |
| // types. If omitted, all column families are present in the table schema and |
| // their values are read as BYTES. |
| ColumnFamilies []*BigtableColumnFamily |
| |
| // If true, then the column families that are not specified in columnFamilies |
| // list are not exposed in the table schema. Otherwise, they are read with BYTES |
| // type values. The default is false. |
| IgnoreUnspecifiedColumnFamilies bool |
| |
| // If true, then the rowkey column families will be read and converted to string. |
| // Otherwise they are read with BYTES type values and users need to manually cast |
| // them with CAST if necessary. The default is false. |
| ReadRowkeyAsString bool |
| } |
| |
| func (o *BigtableOptions) populateExternalDataConfig(c *bq.ExternalDataConfiguration) { |
| q := &bq.BigtableOptions{ |
| IgnoreUnspecifiedColumnFamilies: o.IgnoreUnspecifiedColumnFamilies, |
| ReadRowkeyAsString: o.ReadRowkeyAsString, |
| } |
| for _, f := range o.ColumnFamilies { |
| q.ColumnFamilies = append(q.ColumnFamilies, f.toBQ()) |
| } |
| c.BigtableOptions = q |
| } |
| |
| func bqToBigtableOptions(q *bq.BigtableOptions) (*BigtableOptions, error) { |
| b := &BigtableOptions{ |
| IgnoreUnspecifiedColumnFamilies: q.IgnoreUnspecifiedColumnFamilies, |
| ReadRowkeyAsString: q.ReadRowkeyAsString, |
| } |
| for _, f := range q.ColumnFamilies { |
| f2, err := bqToBigtableColumnFamily(f) |
| if err != nil { |
| return nil, err |
| } |
| b.ColumnFamilies = append(b.ColumnFamilies, f2) |
| } |
| return b, nil |
| } |
| |
| // BigtableColumnFamily describes how BigQuery should access a Bigtable column family. |
| type BigtableColumnFamily struct { |
| // Identifier of the column family. |
| FamilyID string |
| |
| // Lists of columns that should be exposed as individual fields as opposed to a |
| // list of (column name, value) pairs. All columns whose qualifier matches a |
| // qualifier in this list can be accessed as .. Other columns can be accessed as |
| // a list through .Column field. |
| Columns []*BigtableColumn |
| |
| // The encoding of the values when the type is not STRING. Acceptable encoding values are: |
| // - TEXT - indicates values are alphanumeric text strings. |
| // - BINARY - indicates values are encoded using HBase Bytes.toBytes family of functions. |
| // This can be overridden for a specific column by listing that column in 'columns' and |
| // specifying an encoding for it. |
| Encoding string |
| |
| // If true, only the latest version of values are exposed for all columns in this |
| // column family. This can be overridden for a specific column by listing that |
| // column in 'columns' and specifying a different setting for that column. |
| OnlyReadLatest bool |
| |
| // The type to convert the value in cells of this |
| // column family. The values are expected to be encoded using HBase |
| // Bytes.toBytes function when using the BINARY encoding value. |
| // Following BigQuery types are allowed (case-sensitive): |
| // BYTES STRING INTEGER FLOAT BOOLEAN. |
| // The default type is BYTES. This can be overridden for a specific column by |
| // listing that column in 'columns' and specifying a type for it. |
| Type string |
| } |
| |
| func (b *BigtableColumnFamily) toBQ() *bq.BigtableColumnFamily { |
| q := &bq.BigtableColumnFamily{ |
| FamilyId: b.FamilyID, |
| Encoding: b.Encoding, |
| OnlyReadLatest: b.OnlyReadLatest, |
| Type: b.Type, |
| } |
| for _, col := range b.Columns { |
| q.Columns = append(q.Columns, col.toBQ()) |
| } |
| return q |
| } |
| |
| func bqToBigtableColumnFamily(q *bq.BigtableColumnFamily) (*BigtableColumnFamily, error) { |
| b := &BigtableColumnFamily{ |
| FamilyID: q.FamilyId, |
| Encoding: q.Encoding, |
| OnlyReadLatest: q.OnlyReadLatest, |
| Type: q.Type, |
| } |
| for _, col := range q.Columns { |
| c, err := bqToBigtableColumn(col) |
| if err != nil { |
| return nil, err |
| } |
| b.Columns = append(b.Columns, c) |
| } |
| return b, nil |
| } |
| |
// BigtableColumn describes how BigQuery should access a Bigtable column.
type BigtableColumn struct {
	// Qualifier of the column. Columns in the parent column family that have this
	// exact qualifier are exposed as the <family field name>.<column field name>
	// field. The column field name is the same as the column qualifier.
	Qualifier string

	// If the qualifier is not a valid BigQuery field identifier i.e. does not match
	// [a-zA-Z][a-zA-Z0-9_]*, a valid identifier must be provided as the column field
	// name and is used as field name in queries.
	FieldName string

	// If true, only the latest version of values are exposed for this column.
	// See BigtableColumnFamily.OnlyReadLatest.
	OnlyReadLatest bool

	// The encoding of the values when the type is not STRING.
	// See BigtableColumnFamily.Encoding
	Encoding string

	// The type to convert the value in cells of this column.
	// See BigtableColumnFamily.Type
	Type string
}
| |
| func (b *BigtableColumn) toBQ() *bq.BigtableColumn { |
| q := &bq.BigtableColumn{ |
| FieldName: b.FieldName, |
| OnlyReadLatest: b.OnlyReadLatest, |
| Encoding: b.Encoding, |
| Type: b.Type, |
| } |
| if utf8.ValidString(b.Qualifier) { |
| q.QualifierString = b.Qualifier |
| } else { |
| q.QualifierEncoded = base64.RawStdEncoding.EncodeToString([]byte(b.Qualifier)) |
| } |
| return q |
| } |
| |
| func bqToBigtableColumn(q *bq.BigtableColumn) (*BigtableColumn, error) { |
| b := &BigtableColumn{ |
| FieldName: q.FieldName, |
| OnlyReadLatest: q.OnlyReadLatest, |
| Encoding: q.Encoding, |
| Type: q.Type, |
| } |
| if q.QualifierString != "" { |
| b.Qualifier = q.QualifierString |
| } else { |
| bytes, err := base64.RawStdEncoding.DecodeString(q.QualifierEncoded) |
| if err != nil { |
| return nil, err |
| } |
| b.Qualifier = string(bytes) |
| } |
| return b, nil |
| } |