#!/usr/bin/env python
"""The BigQuery CLI job client library."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import itertools
import logging
import os
import sys
import time
from typing import Any, Callable, Dict, List, Optional, Union
import uuid

# To configure apiclient logging.
from absl import flags
from googleapiclient import http as http_request

import bq_flags
from clients import bigquery_client
from clients import table_reader as bq_table_reader
from clients import utils as bq_client_utils
from clients import wait_printer
from utils import bq_error
from utils import bq_id_utils
from utils import bq_processor_utils


def ReadSchemaAndJobRows(
    bqclient: bigquery_client.BigqueryClient,
    job_dict: Dict[str, str],  # Can be stricter.
    start_row: Optional[int],
    max_rows: Optional[int],
    result_first_page=None,
):
  """Convenience method to get the schema and rows from a query job's result.

  Arguments:
    bqclient: A BigqueryClient to get state and request clients from.
    job_dict: Job reference dictionary.
    start_row: First row to read.
    max_rows: Number of rows to read.
    result_first_page: The first page of the result of a query job.

  Returns:
    A tuple where the first item is the list of fields and the
    second item a list of rows.

  Raises:
    ValueError: will be raised if start_row is not explicitly provided.
    ValueError: will be raised if max_rows is not explicitly provided.
  """
  if start_row is None:
    raise ValueError('start_row is required')
  if max_rows is None:
    raise ValueError('max_rows is required')
  if not job_dict:
    job_ref: bq_id_utils.ApiClientHelper.JobReference = None
  else:
    job_ref = bq_id_utils.ApiClientHelper.JobReference.Create(**job_dict)
  if flags.FLAGS.jobs_query_use_results_from_response and result_first_page:
    reader = bq_table_reader.QueryTableReader(
        bqclient.apiclient,
        bqclient.max_rows_per_request,
        job_ref,
        result_first_page,
    )
  else:
    reader = bq_table_reader.JobTableReader(
        bqclient.apiclient, bqclient.max_rows_per_request, job_ref
    )
  return reader.ReadSchemaAndRows(
      start_row, max_rows,
  )


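# Example (illustrative sketch, not part of the library): reading the first
# page of a finished query job's results. `client` is assumed to be an
# initialized bigquery_client.BigqueryClient and the job dict is hypothetical.
#
#   fields, rows = ReadSchemaAndJobRows(
#       client,
#       {'projectId': 'my-project', 'jobId': 'job_123'},
#       start_row=0,
#       max_rows=100,
#   )
#   for row in rows:
#     print(row)

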
def ListJobRefs(bqclient: bigquery_client.BigqueryClient, **kwds):
  return list(
      map(  # pylint: disable=g-long-lambda
          bq_processor_utils.ConstructObjectReference,
          ListJobs(bqclient, **kwds),
      )
  )


def ListJobs(
    bqclient: bigquery_client.BigqueryClient,
    reference: Optional[bq_id_utils.ApiClientHelper.ProjectReference] = None,
    max_results: Optional[int] = None,
    page_token: Optional[str] = None,
    state_filter: Optional[Union[List[str], str]] = None,  # Actually an enum.
    min_creation_time: Optional[str] = None,
    max_creation_time: Optional[str] = None,
    all_users: Optional[bool] = None,
    parent_job_id: Optional[str] = None,
):
  # pylint: disable=g-doc-args
  """Return a list of jobs.

  Args:
    bqclient: A BigqueryClient to get state and request clients from.
    reference: The ProjectReference to list jobs for.
    max_results: The maximum number of jobs to return.
    page_token: Current page token (optional).
    state_filter: A single state filter or a list of filters to apply. If not
      specified, no filtering is applied.
    min_creation_time: Timestamp in milliseconds. Only return jobs created
      after or at this timestamp.
    max_creation_time: Timestamp in milliseconds. Only return jobs created
      before or at this timestamp.
    all_users: Whether to list jobs for all users of the project. Requesting
      user must be an owner of the project to list all jobs.
    parent_job_id: Retrieve only child jobs belonging to this parent; None to
      retrieve top-level jobs.

  Returns:
    A list of jobs.
  """
  return ListJobsWithTokenAndUnreachable(
      bqclient,
      reference,
      max_results,
      page_token,
      state_filter,
      min_creation_time,
      max_creation_time,
      all_users,
      parent_job_id,
  )['results']


def ListJobsWithTokenAndUnreachable(
    bqclient: bigquery_client.BigqueryClient,
    reference: Optional[bq_id_utils.ApiClientHelper.ProjectReference] = None,
    max_results: Optional[int] = None,
    page_token: Optional[str] = None,
    state_filter: Optional[Union[List[str], str]] = None,
    min_creation_time: Optional[str] = None,
    max_creation_time: Optional[str] = None,
    all_users: Optional[bool] = None,
    parent_job_id: Optional[str] = None,
):
  # pylint: disable=g-doc-args
  """Return a list of jobs.

  Args:
    bqclient: A BigqueryClient to get state and request clients from.
    reference: The ProjectReference to list jobs for.
    max_results: The maximum number of jobs to return.
    page_token: Current page token (optional).
    state_filter: A single state filter or a list of filters to apply. If not
      specified, no filtering is applied.
    min_creation_time: Timestamp in milliseconds. Only return jobs created
      after or at this timestamp.
    max_creation_time: Timestamp in milliseconds. Only return jobs created
      before or at this timestamp.
    all_users: Whether to list jobs for all users of the project. Requesting
      user must be an owner of the project to list all jobs.
    parent_job_id: Retrieve only child jobs belonging to this parent; None to
      retrieve top-level jobs.

  Returns:
    A dict that contains the entries:
      'results': a list of jobs.
      'token': nextPageToken for the last page, if present.
  """
  reference = bq_client_utils.NormalizeProjectReference(
      id_fallbacks=bqclient, reference=reference
  )
  bq_id_utils.typecheck(
      reference,
      bq_id_utils.ApiClientHelper.ProjectReference,
      method='ListJobs',
  )
  if max_results is not None:
    if max_results > bq_processor_utils.MAX_RESULTS:
      max_results = bq_processor_utils.MAX_RESULTS
  request = bq_processor_utils.PrepareListRequest(
      reference, max_results, page_token
  )
  if state_filter is not None:
    # The apiclient wants enum values as lowercase strings.
    if isinstance(state_filter, str):
      state_filter = state_filter.lower()
    else:
      state_filter = [s.lower() for s in state_filter]
  bq_processor_utils.ApplyParameters(
      request,
      projection='full',
      state_filter=state_filter,
      all_users=all_users,
      parent_job_id=parent_job_id,
  )
  if min_creation_time is not None:
    request['minCreationTime'] = min_creation_time
  if max_creation_time is not None:
    request['maxCreationTime'] = max_creation_time
  result = bqclient.apiclient.jobs().list(**request).execute()
  results = result.get('jobs', [])
  if max_results is not None:
    while 'nextPageToken' in result and len(results) < max_results:
      request['maxResults'] = max_results - len(results)
      request['pageToken'] = result['nextPageToken']
      result = bqclient.apiclient.jobs().list(**request).execute()
      results.extend(result.get('jobs', []))
  response = dict(results=results)
  if 'nextPageToken' in result:
    response['token'] = result['nextPageToken']
  # The 'unreachable' field is a list of skipped locations that were
  # unreachable. The field definition is
  # google3/google/cloud/bigquery/v2/job.proto;rcl=622304818;l=593
  if 'unreachable' in result:
    response['unreachable'] = result['unreachable']
  return response


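# Example (illustrative sketch, not part of the library): paging through all
# pending jobs in a project using the 'token' entry returned above. `client`
# is assumed to be an initialized bigquery_client.BigqueryClient.
#
#   page = ListJobsWithTokenAndUnreachable(
#       client, max_results=100, state_filter='PENDING'
#   )
#   jobs = page['results']
#   while 'token' in page:
#     page = ListJobsWithTokenAndUnreachable(
#         client, max_results=100, state_filter='PENDING',
#         page_token=page['token'],
#     )
#     jobs.extend(page['results'])

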
def CopyTable(
    bqclient: bigquery_client.BigqueryClient,
    source_references: List[bq_id_utils.ApiClientHelper.TableReference],
    dest_reference: bq_id_utils.ApiClientHelper.TableReference,
    create_disposition: Optional[str] = None,
    write_disposition: Optional[str] = None,  # Actually an enum.
    ignore_already_exists: Optional[bool] = False,
    encryption_configuration=None,
    operation_type: Optional[str] = 'COPY',  # Actually an enum.
    destination_expiration_time: Optional[str] = None,
    **kwds,
):
  """Copies a table.

  Args:
    bqclient: A BigqueryClient to get state and request clients from.
    source_references: TableReferences of source tables.
    dest_reference: TableReference of destination table.
    create_disposition: Optional. Specifies the create_disposition for the
      dest_reference.
    write_disposition: Optional. Specifies the write_disposition for the
      dest_reference.
    ignore_already_exists: Whether to ignore "already exists" errors.
    encryption_configuration: Optional. Allows user to encrypt the table from
      the copy table command with Cloud KMS key. Passed as a dictionary in the
      following format: {'kmsKeyName': 'destination_kms_key'}
    operation_type: Optional. The copy operation type; defaults to 'COPY'.
    destination_expiration_time: Optional. Expiration time to apply to the
      destination table.
    **kwds: Passed on to ExecuteJob.

  Returns:
    The job description, or None for ignored errors.

  Raises:
    BigqueryDuplicateError: when write_disposition 'WRITE_EMPTY' is
      specified and the dest_reference table already exists.
  """
  for src_ref in source_references:
    bq_id_utils.typecheck(
        src_ref,
        bq_id_utils.ApiClientHelper.TableReference,
        method='CopyTable',
    )
  bq_id_utils.typecheck(
      dest_reference,
      bq_id_utils.ApiClientHelper.TableReference,
      method='CopyTable',
  )
  copy_config = {
      'destinationTable': dict(dest_reference),
      'sourceTables': [dict(src_ref) for src_ref in source_references],
  }
  if encryption_configuration:
    copy_config['destinationEncryptionConfiguration'] = encryption_configuration

  if operation_type:
    copy_config['operationType'] = operation_type

  if destination_expiration_time:
    copy_config['destinationExpirationTime'] = destination_expiration_time

  bq_processor_utils.ApplyParameters(
      copy_config,
      create_disposition=create_disposition,
      write_disposition=write_disposition,
  )

  try:
    return ExecuteJob(bqclient, {'copy': copy_config}, **kwds)
  except bq_error.BigqueryDuplicateError as e:
    if ignore_already_exists:
      return None
    raise e


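# Example (illustrative sketch, not part of the library): copying one table
# over another. `client`, `src`, and `dst` are assumed to be an initialized
# BigqueryClient and two ApiClientHelper.TableReference values.
#
#   job = CopyTable(
#       client,
#       [src],
#       dst,
#       write_disposition='WRITE_TRUNCATE',
#   )

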
def JobExists(
    bqclient: bigquery_client.BigqueryClient,
    reference: bq_id_utils.ApiClientHelper.JobReference,
) -> bool:
  """Returns true if the job exists."""
  bq_id_utils.typecheck(
      reference, bq_id_utils.ApiClientHelper.JobReference, method='JobExists'
  )
  try:
    # The get() call returns the job resource; normalize it to a strict bool
    # so the annotated return type holds.
    return bool(bqclient.apiclient.jobs().get(**dict(reference)).execute())
  except bq_error.BigqueryNotFoundError:
    return False


def DeleteJob(
    bqclient: bigquery_client.BigqueryClient,
    reference: bq_id_utils.ApiClientHelper.JobReference,
    ignore_not_found: Optional[bool] = False,
):
  """Deletes the job referenced by JobReference.

  Args:
    bqclient: A BigqueryClient to get state and request clients from.
    reference: The JobReference to delete.
    ignore_not_found: Whether to ignore "not found" errors.

  Raises:
    BigqueryTypeError: if reference is not a JobReference.
    bq_error.BigqueryNotFoundError: if reference does not exist and
      ignore_not_found is False.
  """
  bq_id_utils.typecheck(
      reference, bq_id_utils.ApiClientHelper.JobReference, method='DeleteJob'
  )
  try:
    bqclient.apiclient.jobs().delete(**dict(reference)).execute()
  except bq_error.BigqueryNotFoundError:
    if not ignore_not_found:
      raise


#################################
## Job control
#################################


def StartJob(
    bqclient: bigquery_client.BigqueryClient,
    configuration,
    project_id: Optional[str] = None,
    upload_file: Optional[str] = None,
    job_id: Optional[str] = None,
    location: Optional[str] = None,
):
  """Start a job with the given configuration.

  Args:
    bqclient: A BigqueryClient to get state and request clients from.
    configuration: The configuration for a job.
    project_id: The project_id to run the job under. If None,
      bqclient.project_id is used.
    upload_file: A file to include as a media upload to this request. Only
      valid on job requests that expect a media upload file.
    job_id: A unique job_id to use for this job. If a JobIdGenerator, a job id
      will be generated from the job configuration. If None, a unique job_id
      will be created for this request.
    location: Optional. The geographic location where the job should run.

  Returns:
    The job resource returned from the insert job request. If there is an
    error, the jobReference field will still be filled out with the job
    reference used in the request.

  Raises:
    bq_error.BigqueryClientConfigurationError: if project_id and
      bqclient.project_id are None.
  """
  project_id = project_id or bqclient.project_id
  if not project_id:
    raise bq_error.BigqueryClientConfigurationError(
        'Cannot start a job without a project id.'
    )
  configuration = configuration.copy()
  if bqclient.job_property:
    configuration['properties'] = dict(
        prop.partition('=')[0::2] for prop in bqclient.job_property
    )
  job_request = {'configuration': configuration}

  # Use the default job id generator if no job id was supplied.
  job_id = job_id or bqclient.job_id_generator

  if isinstance(job_id, bq_client_utils.JobIdGenerator):
    job_id = job_id.Generate(configuration)

  if job_id is not None:
    job_reference = {'jobId': job_id, 'projectId': project_id}
    job_request['jobReference'] = job_reference
    if location:
      job_reference['location'] = location
  media_upload = ''
  if upload_file:
    resumable = bqclient.enable_resumable_uploads
    # There is a bug in the apiclient http lib that makes uploading resumable
    # files with 0 length broken.
    if os.stat(upload_file).st_size == 0:
      resumable = False
    media_upload = http_request.MediaFileUpload(
        filename=upload_file,
        mimetype='application/octet-stream',
        resumable=resumable,
    )
  request = bqclient.apiclient.jobs().insert(
      body=job_request, media_body=media_upload, projectId=project_id
  )
  if upload_file and resumable:
    result = wait_printer.execute_in_chunks_with_progress(request)
  else:
    result = request.execute()
  return result


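# Example (illustrative sketch, not part of the library): starting an
# asynchronous query job. `client` is assumed to be an initialized
# bigquery_client.BigqueryClient; the SQL and project id are hypothetical.
#
#   job = StartJob(
#       client,
#       {'query': {'query': 'SELECT 1', 'useLegacySql': False}},
#       project_id='my-project',
#   )
#   print(job['jobReference']['jobId'])

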
def _StartQueryRpc(
    bqclient: bigquery_client.BigqueryClient,
    query: str,
    dry_run: Optional[bool] = None,
    use_cache: Optional[bool] = None,
    preserve_nulls: Optional[bool] = None,
    request_id: Optional[str] = None,
    maximum_bytes_billed: Optional[int] = None,
    max_results: Optional[int] = None,
    timeout_ms: Optional[int] = None,
    job_timeout_ms: Optional[int] = None,
    max_slots: Optional[int] = None,
    min_completion_ratio: Optional[float] = None,
    project_id: Optional[str] = None,
    external_table_definitions_json=None,
    udf_resources=None,
    use_legacy_sql: Optional[bool] = None,
    location: Optional[str] = None,
    connection_properties=None,
    job_creation_mode: Optional[
        bigquery_client.BigqueryClient.JobCreationMode
    ] = None,
    reservation_id: Optional[str] = None,
    create_session: Optional[bool] = None,
    query_parameters=None,
    positional_parameter_mode=None,
    destination_encryption_configuration=None,
    **kwds,
):
  """Executes the given query using the rpc-style query api.

  Args:
    bqclient: A BigqueryClient to get state and request clients from.
    query: Query to execute.
    dry_run: Optional. Indicates whether the query will only be validated and
      return processing statistics instead of actually running.
    use_cache: Optional. Whether to use the query cache. Caching is
      best-effort only and you should not make assumptions about whether or
      how long a query result will be cached.
    preserve_nulls: Optional. Indicates whether to preserve nulls in input
      data. Temporary flag; will be removed in a future version.
    request_id: Optional. The idempotency token for jobs.query.
    maximum_bytes_billed: Optional. Upper limit on the number of billed bytes.
    max_results: Maximum number of results to return.
    timeout_ms: Timeout, in milliseconds, for the call to query().
    job_timeout_ms: Optional. How long to let the job run.
    max_slots: Optional. Cap on target rate of slot consumption by the query.
    min_completion_ratio: Optional. Specifies the minimum fraction of data
      that must be scanned before a query returns. This value should be
      between 0.0 and 1.0 inclusive.
    project_id: Project id to use.
    external_table_definitions_json: Json representation of external table
      definitions.
    udf_resources: Array of inline and external UDF code resources.
    use_legacy_sql: The choice of using Legacy SQL for the query is optional.
      If not specified, the server will automatically determine the dialect
      based on query information, such as dialect prefixes. If no prefixes
      are found, it will default to Legacy SQL.
    location: Optional. The geographic location where the job should run.
    connection_properties: Optional. Connection properties to use when running
      the query, presented as a list of key/value pairs. A key of "time_zone"
      indicates that the query will be run with the default timezone
      corresponding to the value.
    job_creation_mode: Optional. An option for job creation. The valid values
      are JOB_CREATION_REQUIRED and JOB_CREATION_OPTIONAL.
    reservation_id: Optional. An option to set the reservation to use when
      executing the job. The reservation should be in the format
      "project_id:reservation_id", "project_id:location.reservation_id", or
      "reservation_id".
    create_session: Optional. True to create a session for the query.
    query_parameters: Parameter values for use_legacy_sql=False queries.
    positional_parameter_mode: If true, set the parameter mode to POSITIONAL
      instead of the default NAMED.
    destination_encryption_configuration: Optional. Allows user to encrypt the
      table created from a query job with a Cloud KMS key.
    **kwds: Extra keyword arguments passed directly to jobs.Query().

  Returns:
    The query response.

  Raises:
    bq_error.BigqueryClientConfigurationError: if project_id and
      bqclient.project_id are None.
    bq_error.BigqueryError: if query execution fails.
  """
  project_id = project_id or bqclient.project_id
  if not project_id:
    raise bq_error.BigqueryClientConfigurationError(
        'Cannot run a query without a project id.'
    )
  request = {'query': query}
  if external_table_definitions_json:
    request['tableDefinitions'] = external_table_definitions_json
  if udf_resources:
    request['userDefinedFunctionResources'] = udf_resources
  if bqclient.dataset_id:
    request['defaultDataset'] = bq_client_utils.GetQueryDefaultDataset(
        bqclient.dataset_id
    )

  # If the request id flag is set, generate a random one if it is not provided
  # explicitly.
  if request_id is None and flags.FLAGS.jobs_query_use_request_id:
    request_id = str(uuid.uuid4())

  reservation_path = _GetReservationPath(
      bqclient,
      reservation_id,
  )

  bq_processor_utils.ApplyParameters(
      request,
      preserve_nulls=preserve_nulls,
      request_id=request_id,
      maximum_bytes_billed=maximum_bytes_billed,
      use_query_cache=use_cache,
      timeout_ms=timeout_ms,
      job_timeout_ms=job_timeout_ms,
      max_slots=max_slots,
      max_results=max_results,
      use_legacy_sql=use_legacy_sql,
      min_completion_ratio=min_completion_ratio,
      job_creation_mode=job_creation_mode,
      reservation=reservation_path,
      location=location,
      create_session=create_session,
      query_parameters=query_parameters,
      destination_encryption_configuration=destination_encryption_configuration,
      parameter_mode=None
      if positional_parameter_mode is None
      else ('POSITIONAL' if positional_parameter_mode else 'NAMED'),
  )
  bq_processor_utils.ApplyParameters(
      request, connection_properties=connection_properties
  )
  bq_processor_utils.ApplyParameters(request, dry_run=dry_run)
  logging.debug(
      'Calling bqclient.apiclient.jobs().query(%s, %s, %s)',
      request,
      project_id,
      kwds,
  )
  return (
      bqclient.apiclient.jobs()
      .query(body=request, projectId=project_id, **kwds)
      .execute()
  )


def GetQueryResults(
    bqclient: bigquery_client.BigqueryClient,
    job_id: Optional[str] = None,
    project_id: Optional[str] = None,
    max_results: Optional[int] = None,
    timeout_ms: Optional[int] = None,
    location: Optional[str] = None,
):
  """Waits for a query job to run and returns results if complete.

  By default, waits 10s for the provided job to complete and either returns
  the results or a response where jobComplete is set to false. The timeout can
  be increased but the call is not guaranteed to wait for the specified
  timeout.

  Args:
    bqclient: A BigqueryClient to get state and request clients from.
    job_id: The job id of the query job that we are waiting to complete.
    project_id: The project id of the query job.
    max_results: The maximum number of results.
    timeout_ms: The number of milliseconds to wait for the query to complete.
    location: Optional. The geographic location of the job.

  Returns:
    The getQueryResults() result.

  Raises:
    bq_error.BigqueryClientConfigurationError: if project_id and
      bqclient.project_id are None.
  """
  project_id = project_id or bqclient.project_id
  if not project_id:
    raise bq_error.BigqueryClientConfigurationError(
        'Cannot get query results without a project id.'
    )
  kwds = {}
  bq_processor_utils.ApplyParameters(
      kwds,
      job_id=job_id,
      project_id=project_id,
      timeout_ms=timeout_ms,
      max_results=max_results,
      location=location,
  )
  return bqclient.apiclient.jobs().getQueryResults(**kwds).execute()


def RunJobSynchronously(
    bqclient: bigquery_client.BigqueryClient,
    configuration,
    project_id: Optional[str] = None,
    upload_file: Optional[str] = None,
    job_id: Optional[str] = None,
    location: Optional[str] = None,
):
  """Starts a job and waits for it to complete.

  Args:
    bqclient: A BigqueryClient to get state and request clients from.
    configuration: The configuration for a job.
    project_id: The project_id to run the job under. If None,
      bqclient.project_id is used.
    upload_file: A file to include as a media upload to this request. Only
      valid on job requests that expect a media upload file.
    job_id: A unique job_id to use for this job. If a JobIdGenerator, a job id
      will be generated from the job configuration. If None, a unique job_id
      will be created for this request.
    location: Optional. The geographic location where the job should run.

  Returns:
    The job, if it did not fail.

  Raises:
    BigQueryError: if the job fails.
  """
  result = StartJob(
      bqclient,
      configuration,
      project_id=project_id,
      upload_file=upload_file,
      job_id=job_id,
      location=location,
  )
  if result['status']['state'] != 'DONE':
    job_reference = bq_processor_utils.ConstructObjectReference(result)
    result = WaitJob(bqclient, job_reference)
  return bq_client_utils.RaiseIfJobError(result)


def ExecuteJob(
    bqclient: bigquery_client.BigqueryClient,
    configuration,
    sync: Optional[bool] = None,
    project_id: Optional[str] = None,
    upload_file: Optional[str] = None,
    job_id: Optional[str] = None,
    location: Optional[str] = None,
):
  """Execute a job, possibly waiting for results."""
  if sync is None:
    sync = bqclient.sync

  if sync:
    job = RunJobSynchronously(
        bqclient,
        configuration,
        project_id=project_id,
        upload_file=upload_file,
        job_id=job_id,
        location=location,
    )
  else:
    job = StartJob(
        bqclient,
        configuration,
        project_id=project_id,
        upload_file=upload_file,
        job_id=job_id,
        location=location,
    )
    bq_client_utils.RaiseIfJobError(job)
  return job


def CancelJob(
    bqclient: bigquery_client.BigqueryClient,
    project_id: Optional[str] = None,
    job_id: Optional[str] = None,
    location: Optional[str] = None,
):
  """Attempt to cancel the specified job if it is running.

  Args:
    bqclient: A BigqueryClient to get state and request clients from.
    project_id: The project_id the job is running under. If None,
      bqclient.project_id is used.
    job_id: The job id for this job.
    location: Optional. The geographic location of the job.

  Returns:
    The job resource returned for the job for which cancel is being requested.

  Raises:
    bq_error.BigqueryClientConfigurationError: if project_id or job_id
      are None.
  """
  project_id = project_id or bqclient.project_id
  if not project_id:
    raise bq_error.BigqueryClientConfigurationError(
        'Cannot cancel a job without a project id.'
    )
  if not job_id:
    raise bq_error.BigqueryClientConfigurationError(
        'Cannot cancel a job without a job id.'
    )

  job_reference = bq_id_utils.ApiClientHelper.JobReference.Create(
      projectId=project_id, jobId=job_id, location=location
  )
  result = (
      bqclient.apiclient.jobs().cancel(**dict(job_reference)).execute()['job']
  )
  if result['status']['state'] != 'DONE' and bqclient.sync:
    job_reference = bq_processor_utils.ConstructObjectReference(result)
    result = WaitJob(bqclient, job_reference=job_reference)
  return result


def WaitJob(
    bqclient: bigquery_client.BigqueryClient,
    job_reference: bq_id_utils.ApiClientHelper.JobReference,
    status: str = 'DONE',  # Should be an enum.
    wait: int = sys.maxsize,
    wait_printer_factory: Optional[
        Callable[[], wait_printer.WaitPrinter]
    ] = None,
):
  """Poll for a job to run until it reaches the requested status.

  Arguments:
    bqclient: A BigqueryClient to get state and request clients from.
    job_reference: JobReference to poll.
    status: (optional, default 'DONE') Desired job status.
    wait: (optional, default maxint) Max wait time.
    wait_printer_factory: (optional, defaults to bqclient.wait_printer_factory)
      Returns a subclass of WaitPrinter that will be called after each job
      poll.

  Returns:
    The job object returned by the final status call.

  Raises:
    StopIteration: If polling does not reach the desired state before
      timing out.
    ValueError: If given an invalid wait value.
  """
  bq_id_utils.typecheck(
      job_reference,
      bq_id_utils.ApiClientHelper.JobReference,
      method='WaitJob',
  )
  start_time = time.time()
  job = None
  if wait_printer_factory:
    printer = wait_printer_factory()
  else:
    printer = bqclient.wait_printer_factory()

  # This is a first pass at wait logic: we poll at 1s intervals a few
  # times, then back off to increasingly long intervals (capped at 30s),
  # and keep polling at that cadence until we've run out of time.
  waits = itertools.chain(
      itertools.repeat(1, 8), range(2, 30, 3), itertools.repeat(30)
  )
  current_wait = 0
  current_status = 'UNKNOWN'
  in_error_state = False
  while current_wait <= wait:
    try:
      done, job = PollJob(bqclient, job_reference, status=status, wait=wait)
      current_status = job['status']['state']
      in_error_state = False
      if done:
        printer.print(job_reference.jobId, current_wait, current_status)
        break
    except bq_error.BigqueryCommunicationError as e:
      # Communication errors while waiting on a job are okay.
      logging.warning('Transient error during job status check: %s', e)
    except bq_error.BigqueryBackendError as e:
      # Temporary server errors while waiting on a job are okay.
      logging.warning('Transient error during job status check: %s', e)
    except bq_error.BigqueryServiceError:
      # Among this catch-all class, some kinds are permanent
      # errors, so we don't want to retry indefinitely, but if
      # the error is transient we'd like "wait" to get past it.
      if in_error_state:
        raise
      in_error_state = True

    # For every second we're polling, update the message to the user.
    for _ in range(next(waits)):
      current_wait = time.time() - start_time
      printer.print(job_reference.jobId, current_wait, current_status)
      time.sleep(1)
  else:
    raise StopIteration(
        'Wait timed out. Operation not finished, in state %s'
        % (current_status,)
    )
  printer.done()
  return job


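# Example (illustrative sketch, not part of the library): the first few
# values produced by the WaitJob backoff schedule above.
#
#   import itertools
#   waits = itertools.chain(
#       itertools.repeat(1, 8), range(2, 30, 3), itertools.repeat(30))
#   print(list(itertools.islice(waits, 12)))
#   # -> [1, 1, 1, 1, 1, 1, 1, 1, 2, 5, 8, 11]

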
def PollJob(
    bqclient: bigquery_client.BigqueryClient,
    job_reference: bq_id_utils.ApiClientHelper.JobReference,
    status: str = 'DONE',  # Actually an enum.
    wait: int = 0,
):
  """Poll a job once for a specific status.

  Arguments:
    bqclient: A BigqueryClient to get state and request clients from.
    job_reference: JobReference to poll.
    status: (optional, default 'DONE') Desired job status.
    wait: (optional, default 0) Max server-side wait time for one poll call.

  Returns:
    Tuple (in_state, job) where in_state is True if job is
    in the desired state.

  Raises:
    ValueError: If given an invalid wait value.
  """
  bq_id_utils.typecheck(
      job_reference,
      bq_id_utils.ApiClientHelper.JobReference,
      method='PollJob',
  )
  wait = bq_client_utils.NormalizeWait(wait)
  job = bqclient.apiclient.jobs().get(**dict(job_reference)).execute()
  current = job['status']['state']
  return (current == status, job)


#################################
## Wrappers for job types
#################################


def RunQuery(
    bqclient: bigquery_client.BigqueryClient,
    start_row: int,
    max_rows: int,
    **kwds,
):
  """Run a query job synchronously, and return the result.

  Args:
    bqclient: A BigqueryClient to get state and request clients from.
    start_row: First row to read.
    max_rows: Number of rows to read.
    **kwds: Passed on to Query.

  Returns:
    A tuple where the first item is the list of fields and the
    second item a list of rows.
  """
  new_kwds = dict(kwds)
  new_kwds['sync'] = True
  job = Query(bqclient, **new_kwds)

  return ReadSchemaAndJobRows(
      bqclient,
      job['jobReference'],
      start_row=start_row,
      max_rows=max_rows,
  )


def RunQueryRpc(
    bqclient: bigquery_client.BigqueryClient,
    query: str,
    dry_run: Optional[bool] = None,
    use_cache: Optional[bool] = None,
    preserve_nulls: Optional[bool] = None,
    request_id: Optional[str] = None,
    maximum_bytes_billed: Optional[int] = None,
    max_results: Optional[int] = None,
    wait: int = sys.maxsize,
    min_completion_ratio: Optional[float] = None,
    wait_printer_factory: Optional[
        Callable[[], wait_printer.WaitPrinter]
    ] = None,
    max_single_wait: Optional[int] = None,
    external_table_definitions_json=None,
    udf_resources=None,
    location: Optional[str] = None,
    connection_properties=None,
    job_creation_mode: Optional[
        bigquery_client.BigqueryClient.JobCreationMode
    ] = None,
    reservation_id: Optional[str] = None,
    job_timeout_ms: Optional[int] = None,
    max_slots: Optional[int] = None,
    **kwds,
):
  """Executes the given query using the rpc-style query api.

  Args:
    bqclient: A BigqueryClient to get state and request clients from.
    query: Query to execute.
    dry_run: Optional. Indicates whether the query will only be validated and
      return processing statistics instead of actually running.
    use_cache: Optional. Whether to use the query cache. Caching is
      best-effort only and you should not make assumptions about whether or
      how long a query result will be cached.
    preserve_nulls: Optional. Indicates whether to preserve nulls in input
      data. Temporary flag; will be removed in a future version.
    request_id: Optional. Specifies the idempotency token for the request.
    maximum_bytes_billed: Optional. Upper limit on maximum bytes billed.
    max_results: Optional. Maximum number of results to return.
    wait: (optional, default maxint) Max wait time in seconds.
    min_completion_ratio: Optional. Specifies the minimum fraction of data
      that must be scanned before a query returns. This value should be
      between 0.0 and 1.0 inclusive.
    wait_printer_factory: (optional, defaults to bqclient.wait_printer_factory)
      Returns a subclass of WaitPrinter that will be called after each job
      poll.
    max_single_wait: Optional. Maximum number of seconds to wait for each call
      to query() / getQueryResults().
    external_table_definitions_json: Json representation of external table
      definitions.
    udf_resources: Array of inline and remote UDF resources.
    location: Optional. The geographic location where the job should run.
    connection_properties: Optional. Connection properties to use when running
      the query, presented as a list of key/value pairs. A key of "time_zone"
      indicates that the query will be run with the default timezone
      corresponding to the value.
    job_creation_mode: Optional. An option for job creation. The valid values
      are JOB_CREATION_REQUIRED and JOB_CREATION_OPTIONAL.
    reservation_id: Optional. An option to set the reservation to use when
      executing the job. The reservation should be in the format
      "project_id:reservation_id", "project_id:location.reservation_id", or
      "reservation_id".
    job_timeout_ms: Optional. How long to let the job run.
    max_slots: Optional. Cap on target rate of slot consumption by the query.
    **kwds: Passed directly to ExecuteSyncQuery.

  Raises:
    bq_error.BigqueryClientError: if no query is provided.
    StopIteration: if the query does not complete within wait seconds.
    bq_error.BigqueryError: if query fails.

  Returns:
    A tuple (schema fields, row results, execution metadata).
    For regular queries, the execution metadata dict contains
    the 'State' and 'status' elements that would be in a job result
    after FormatJobInfo().
    For dry run queries, schema and rows are empty and the execution
    metadata dict contains statistics.
  """
  if not bqclient.sync:
    raise bq_error.BigqueryClientError(
        'Running RPC-style query asynchronously is not supported'
    )
  if not query:
    raise bq_error.BigqueryClientError('No query string provided')

  if request_id is not None and not flags.FLAGS.jobs_query_use_request_id:
    raise bq_error.BigqueryClientError('request_id is not yet supported')

  if wait_printer_factory:
    printer = wait_printer_factory()
  else:
    printer = bqclient.wait_printer_factory()

  start_time = time.time()
  elapsed_time = 0
  job_reference = None
  current_wait_ms = None
  while True:
    try:
      elapsed_time = 0 if job_reference is None else time.time() - start_time
      remaining_time = wait - elapsed_time
      if max_single_wait is not None:
        # Compute the current wait, being careful about overflow, since
        # remaining_time may be counting down from sys.maxint.
        current_wait_ms = int(min(remaining_time, max_single_wait) * 1000)
        if current_wait_ms < 0:
          current_wait_ms = sys.maxsize
      if remaining_time < 0:
        raise StopIteration('Wait timed out. Query not finished.')
      if job_reference is None:
        # We haven't yet run a successful Query(), so we don't
        # have a job id to check on.
        rows_to_read = max_results
        if bqclient.max_rows_per_request is not None:
          if rows_to_read is None:
            rows_to_read = bqclient.max_rows_per_request
          else:
            rows_to_read = min(
                bqclient.max_rows_per_request, int(rows_to_read)
            )
        result = _StartQueryRpc(
            bqclient=bqclient,
            query=query,
            preserve_nulls=preserve_nulls,
            request_id=request_id,
            maximum_bytes_billed=maximum_bytes_billed,
            use_cache=use_cache,
            dry_run=dry_run,
            min_completion_ratio=min_completion_ratio,
            job_timeout_ms=job_timeout_ms,
            max_slots=max_slots,
            max_results=rows_to_read,
            external_table_definitions_json=external_table_definitions_json,
            udf_resources=udf_resources,
            location=location,
            connection_properties=connection_properties,
            job_creation_mode=job_creation_mode,
            reservation_id=reservation_id,
            **kwds,
        )
        if dry_run:
          execution = dict(
              statistics=dict(
                  query=dict(
                      totalBytesProcessed=result['totalBytesProcessed'],
                  )
              )
          )
          if 'cacheHit' in result:
            execution['statistics']['query']['cacheHit'] = result['cacheHit']
          if 'schema' in result:
            execution['statistics']['query']['schema'] = result['schema']
          return ([], [], execution)
        if 'jobReference' in result:
          job_reference = bq_id_utils.ApiClientHelper.JobReference.Create(
              **result['jobReference']
          )
      else:
        # The query/getQueryResults methods do not return the job state,
        # so we just print 'RUNNING' while we are actively waiting.
        printer.print(job_reference.jobId, elapsed_time, 'RUNNING')
        result = GetQueryResults(
            bqclient,
            job_reference.jobId,
            max_results=max_results,
            timeout_ms=current_wait_ms,
            location=location,
        )
      if result['jobComplete']:
        (schema, rows) = ReadSchemaAndJobRows(
            bqclient,
            dict(job_reference) if job_reference else {},
            start_row=0,
            max_rows=max_results,
            result_first_page=result,
        )
        # If we get here, we must have succeeded. We could still have
        # non-fatal errors though.
        status = {}
        if 'errors' in result:
          status['errors'] = result['errors']
        execution = {
            'State': 'SUCCESS',
            'status': status,
            'jobReference': job_reference,
        }
        return (schema, rows, execution)
    except bq_error.BigqueryCommunicationError as e:
      # Communication errors while waiting on a job are okay.
      logging.warning('Transient error during query: %s', e)
    except bq_error.BigqueryBackendError as e:
      # Temporary server errors while waiting on a job are okay.
      logging.warning('Transient error during query: %s', e)


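# Example (illustrative sketch, not part of the library): running a query via
# the rpc-style API and printing the rows. `client` is assumed to be an
# initialized, synchronous bigquery_client.BigqueryClient, and the schema
# entries are assumed to be field dicts with a 'name' key.
#
#   schema, rows, execution = RunQueryRpc(
#       client, 'SELECT 17 AS answer', max_results=10
#   )
#   print([field['name'] for field in schema])
#   print(rows, execution['State'])

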
def Query(
    bqclient: bigquery_client.BigqueryClient,
    query: str,
    destination_table: Optional[str] = None,
    create_disposition: Optional[str] = None,
    write_disposition: Optional[str] = None,
    priority: Optional[str] = None,
    preserve_nulls: Optional[bool] = None,
    allow_large_results: Optional[bool] = None,
    dry_run: Optional[bool] = None,
    use_cache: Optional[bool] = None,
    min_completion_ratio: Optional[float] = None,
    flatten_results: Optional[bool] = None,
    external_table_definitions_json=None,
    udf_resources=None,
    maximum_billing_tier: Optional[int] = None,
    maximum_bytes_billed: Optional[int] = None,
    use_legacy_sql: Optional[bool] = None,
    schema_update_options: Optional[List[str]] = None,
    labels: Optional[Dict[str, str]] = None,
    query_parameters=None,
    time_partitioning=None,
    destination_encryption_configuration=None,
    clustering=None,
    range_partitioning=None,
    script_options=None,
    job_timeout_ms: Optional[int] = None,
    max_slots: Optional[int] = None,
    create_session: Optional[bool] = None,
    connection_properties=None,
    continuous=None,
    job_creation_mode: Optional[
        bigquery_client.BigqueryClient.JobCreationMode
    ] = None,
    reservation_id: Optional[str] = None,
    **kwds,
):
  # pylint: disable=g-doc-args
  """Execute the given query, returning the created job.

  The job will execute synchronously if sync=True is provided as an
  argument or if bqclient.sync is true.

  Args:
    bqclient: A BigqueryClient to get state and request clients from.
    query: Query to execute.
    destination_table: (default None) If provided, send the results to the
      given table.
    create_disposition: Optional. Specifies the create_disposition for the
      destination_table.
    write_disposition: Optional. Specifies the write_disposition for the
      destination_table.
    priority: Optional. Priority to run the query with. Either 'INTERACTIVE'
      (default) or 'BATCH'.
    preserve_nulls: Optional. Indicates whether to preserve nulls in input
      data. Temporary flag; will be removed in a future version.
    allow_large_results: Enables larger destination table sizes.
    dry_run: Optional. Indicates whether the query will only be validated and
      return processing statistics instead of actually running.
    use_cache: Optional. Whether to use the query cache. If create_disposition
      is CREATE_NEVER, will only run the query if the result is already
      cached. Caching is best-effort only and you should not make assumptions
      about whether or how long a query result will be cached.
    min_completion_ratio: Optional. Specifies the minimum fraction of data
      that must be scanned before a query returns. This value should be
      between 0.0 and 1.0 inclusive.
    flatten_results: Whether to flatten nested and repeated fields in the
      result schema. If not set, the default behavior is to flatten.
    external_table_definitions_json: Json representation of external table
      definitions.
    udf_resources: Array of inline and remote UDF resources.
    maximum_billing_tier: Upper limit for billing tier.
    maximum_bytes_billed: Upper limit for bytes billed.
    use_legacy_sql: The choice of using Legacy SQL for the query is optional.
      If not specified, the server will automatically determine the dialect
      based on query information, such as dialect prefixes. If no prefixes
      are found, it will default to Legacy SQL.
    schema_update_options: Schema update options when appending to the
      destination table or truncating a table partition.
    labels: An optional dict of labels to set on the query job.
    query_parameters: Parameter values for use_legacy_sql=False queries.
    time_partitioning: Optional. Provides time based partitioning
      specification for the destination table.
    clustering: Optional. Provides clustering specification for the
      destination table.
    destination_encryption_configuration: Optional. Allows user to encrypt the
      table created from a query job with a Cloud KMS key.
    range_partitioning: Optional. Provides range partitioning specification
      for the destination table.
    script_options: Optional. Options controlling script execution.
    job_timeout_ms: Optional. How long to let the job run.
    continuous: Optional. Whether the query should be executed as a
      continuous query.
    job_creation_mode: Optional. An option for job creation. The valid values
      are JOB_CREATION_REQUIRED and JOB_CREATION_OPTIONAL.
    reservation_id: Optional. An option to set the reservation to use when
      executing the job. The reservation should be in the format
      "project_id:reservation_id", "project_id:location.reservation_id", or
      "reservation_id". If reservation_id is "none", the job will be executed
      without an assigned reservation, using on-demand slots.
    **kwds: Passed on to ExecuteJob.

  Raises:
    bq_error.BigqueryClientError: if no query is provided.

  Returns:
    The resulting job info.
  """
  if not query:
    raise bq_error.BigqueryClientError('No query string provided')
  query_config = {'query': query}
  if bqclient.dataset_id:
    query_config['defaultDataset'] = bq_client_utils.GetQueryDefaultDataset(
        bqclient.dataset_id
    )
  if external_table_definitions_json:
    query_config['tableDefinitions'] = external_table_definitions_json
  if udf_resources:
    query_config['userDefinedFunctionResources'] = udf_resources
  if destination_table:
    try:
      reference = bq_client_utils.GetTableReference(
          id_fallbacks=bqclient, identifier=destination_table
      )
    except bq_error.BigqueryError as e:
      raise bq_error.BigqueryError(
          'Invalid value %s for destination_table: %s' % (destination_table, e)
      )
    query_config['destinationTable'] = dict(reference)
  if destination_encryption_configuration:
    query_config['destinationEncryptionConfiguration'] = (
        destination_encryption_configuration
    )
  if script_options:
    query_config['scriptOptions'] = script_options
  if job_creation_mode:
    query_config['jobCreationMode'] = job_creation_mode.name
  bq_processor_utils.ApplyParameters(
      query_config,
      allow_large_results=allow_large_results,
      create_disposition=create_disposition,
      preserve_nulls=preserve_nulls,
      priority=priority,
      write_disposition=write_disposition,
      use_query_cache=use_cache,
      flatten_results=flatten_results,
      maximum_billing_tier=maximum_billing_tier,
      maximum_bytes_billed=maximum_bytes_billed,
      use_legacy_sql=use_legacy_sql,
      schema_update_options=schema_update_options,
      query_parameters=query_parameters,
      time_partitioning=time_partitioning,
      clustering=clustering,
      create_session=create_session,
      min_completion_ratio=min_completion_ratio,
      continuous=continuous,
      job_creation_mode=job_creation_mode,
      range_partitioning=range_partitioning,
  )
  bq_processor_utils.ApplyParameters(
      query_config, connection_properties=connection_properties
  )
  request = {'query': query_config}
  reservation_path = _GetReservationPath(
      bqclient,
      reservation_id,
      check_reservation_project=False,
  )
  bq_processor_utils.ApplyParameters(
      request,
      dry_run=dry_run,
      labels=labels,
      job_timeout_ms=job_timeout_ms,
      reservation=reservation_path,
  )
  bq_processor_utils.ApplyParameters(
      request,
      max_slots=max_slots,
  )
  return ExecuteJob(bqclient, request, **kwds)


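# Example (illustrative sketch, not part of the library): running a GoogleSQL
# query synchronously into a destination table. `client` is assumed to be an
# initialized BigqueryClient; the identifiers are hypothetical.
#
#   job = Query(
#       client,
#       'SELECT name FROM my_dataset.my_table',
#       destination_table='my_dataset.results',
#       write_disposition='WRITE_TRUNCATE',
#       use_legacy_sql=False,
#       sync=True,
#   )

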
def _GetReservationPath(
    bqclient: bigquery_client.BigqueryClient,
    reservation_id: Optional[str],
    check_reservation_project: bool = True,
) -> Optional[str]:
  """Converts a reservation id to a fully qualified reservation path.

  The input format `<project_id>:<location>.<reservation_id>` is converted to
  `projects/<project_id>/locations/<location>/reservations/<reservation_id>`.
  The special value "none" is returned as is.

  Args:
    bqclient: A BigqueryClient to get state and request clients from.
    reservation_id: The reservation id to convert.
    check_reservation_project: Whether to validate the reservation project.

  Returns:
    The fully qualified reservation path, or "none" if reservation_id is
    "none".
  """
  if reservation_id is None or reservation_id == 'none':
    return reservation_id
  reference = bq_client_utils.GetReservationReference(
      id_fallbacks=bqclient,
      identifier=reservation_id,
      default_location=bq_flags.LOCATION.value,
      check_reservation_project=check_reservation_project,
  )
  return reference.path()


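# Example (illustrative sketch, not part of the library): the id-to-path
# conversion performed above, for a hypothetical reservation.
#
#   _GetReservationPath(client, 'my-project:us-central1.my-reservation')
#   # -> 'projects/my-project/locations/us-central1/reservations/my-reservation'
#   _GetReservationPath(client, 'none')  # -> 'none'

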
def Load(
    bqclient: bigquery_client.BigqueryClient,
    destination_table_reference: bq_id_utils.ApiClientHelper.TableReference,
    source: str,
    schema=None,
    create_disposition: Optional[str] = None,
    write_disposition: Optional[str] = None,
    field_delimiter: Optional[str] = None,
    skip_leading_rows: Optional[bool] = None,
    encoding: Optional[str] = None,
    quote: Optional[str] = None,
    max_bad_records: Optional[int] = None,
    allow_quoted_newlines: Optional[bool] = None,
    source_format: Optional[str] = None,
    allow_jagged_rows: Optional[bool] = None,
    preserve_ascii_control_characters: Optional[bool] = None,
    ignore_unknown_values: Optional[bool] = None,
    projection_fields: Optional[List[str]] = None,
    autodetect: Optional[bool] = None,
    schema_update_options: Optional[List[str]] = None,
    null_marker: Optional[str] = None,
    null_markers: Optional[List[str]] = None,
    time_partitioning=None,
    clustering=None,
    destination_encryption_configuration=None,
    use_avro_logical_types: Optional[bool] = None,
    reference_file_schema_uri=None,
    range_partitioning=None,
    hive_partitioning_options=None,
    decimal_target_types=None,
    json_extension: Optional[str] = None,  # Actually an enum.
    column_name_character_map=None,
    time_zone=None,
    date_format=None,
    datetime_format=None,
    time_format=None,
    timestamp_format=None,
    file_set_spec_type=None,
    thrift_options=None,
    parquet_options=None,
    connection_properties=None,
    reservation_id: Optional[str] = None,
    copy_files_only: Optional[bool] = None,
    source_column_match: Optional[str] = None,
    timestamp_target_precision: Optional[List[int]] = None,
    **kwds,
):
  """Load the given data into BigQuery.

  The job will execute synchronously if sync=True is provided as an
  argument or if bqclient.sync is true.

  Args:
    bqclient: A BigqueryClient to get state and request clients from.
    destination_table_reference: TableReference to load data into.
    source: String specifying source data to load.
    schema: (default None) Schema of the created table. (Can be left blank
      for append operations.)
    create_disposition: Optional. Specifies the create_disposition for the
      destination_table_reference.
    write_disposition: Optional. Specifies the write_disposition for the
      destination_table_reference.
    field_delimiter: Optional. Specifies the single byte field delimiter.
    skip_leading_rows: Optional. Number of rows of initial data to skip.
    encoding: Optional. Specifies character encoding of the input data. May be
      "UTF-8" or "ISO-8859-1". Defaults to UTF-8 if not specified.
    quote: Optional. Quote character to use. Default is '"'. Note that quoting
      is done on the raw binary data before encoding is applied.
    max_bad_records: Optional. Maximum number of bad records that should be
      ignored before the entire job is aborted. Only supported for CSV and
      NEWLINE_DELIMITED_JSON file formats.
    allow_quoted_newlines: Optional. Whether to allow quoted newlines in CSV
      import data.
    source_format: Optional. Format of source data. May be "CSV",
      "DATASTORE_BACKUP", or "NEWLINE_DELIMITED_JSON".
    allow_jagged_rows: Optional. Whether to allow missing trailing optional
      columns in CSV import data.
    preserve_ascii_control_characters: Optional. Whether to preserve embedded
      Ascii Control characters in CSV import data.
    ignore_unknown_values: Optional. Whether to allow extra, unrecognized
      values in CSV or JSON data.
    projection_fields: Optional. If sourceFormat is set to "DATASTORE_BACKUP",
      indicates which entity properties to load into BigQuery from a Cloud
      Datastore backup.
    autodetect: Optional. If true, then we automatically infer the schema and
      options of the source files if they are CSV or JSON formats.
    schema_update_options: Schema update options when appending to the
      destination table or truncating a table partition.
    null_marker: Optional. String that will be interpreted as a NULL value.
    null_markers: Optional. List of strings that will be interpreted as a
      NULL value.
    time_partitioning: Optional. Provides time based partitioning
      specification for the destination table.
    clustering: Optional. Provides clustering specification for the
      destination table.
    destination_encryption_configuration: Optional. Allows user to encrypt the
      table created from a load job with Cloud KMS key.
    use_avro_logical_types: Optional. Allows user to override default
      behaviour for Avro logical types. If this is set, Avro fields with
      logical types will be interpreted into their corresponding types (ie.
      TIMESTAMP), instead of only using their raw types (ie. INTEGER).
    reference_file_schema_uri: Optional. Allows user to provide a reference
      file with the reader schema, enabled for the formats: AVRO, PARQUET,
      ORC.
    range_partitioning: Optional. Provides range partitioning specification
      for the destination table.
    hive_partitioning_options: (experimental) Options for configuring hive
      partitioning.
    decimal_target_types: (experimental) Defines the list of possible SQL data
      types to which the source decimal values are converted. This list and
      the precision and the scale parameters of the decimal field determine
      the target type. In the order of NUMERIC, BIGNUMERIC, and STRING, a
      type is picked if it is in the specified list and if it supports the
      precision and the scale. STRING supports all precision and scale
      values. If none of the listed types supports the precision and the
      scale, the type supporting the widest range in the specified list is
      picked, and if a value exceeds the supported range when reading the
      data, an error will be returned. This field cannot contain duplicate
      types. The order of the types in this field is ignored. For example,
      ["BIGNUMERIC", "NUMERIC"] is the same as ["NUMERIC", "BIGNUMERIC"] and
      NUMERIC always takes precedence over BIGNUMERIC. Defaults to
      ["NUMERIC", "STRING"] for ORC and ["NUMERIC"] for the other file
      formats.
    json_extension: (experimental) Specify alternative parsing for JSON source
      format. To load newline-delimited JSON, specify 'GEOJSON'. Only
      applicable if `source_format` is 'NEWLINE_DELIMITED_JSON'.
    column_name_character_map: Indicates the character map used for column
      names. Specify 'STRICT' to use the latest character map and reject
      invalid column names. Specify 'V1' to support alphanumeric + underscore
      and name must start with a letter or underscore. Invalid column names
      will be normalized. Specify 'V2' to support flexible column name.
      Invalid column names will be normalized.
    file_set_spec_type: Set how to discover files for loading. Specify
      'FILE_SYSTEM_MATCH' (default behavior) to expand source URIs by listing
      files from the underlying object store. Specify
      'NEW_LINE_DELIMITED_MANIFEST' to parse the URIs as new line delimited
      manifest files, where each line contains a URI (No wild-card URIs are
      supported).
    thrift_options: (experimental) Options for configuring Apache Thrift load,
      which is required if `source_format` is 'THRIFT'.
    parquet_options: Options for configuring parquet files load, only
      applicable if `source_format` is 'PARQUET'.
    connection_properties: Optional. ConnectionProperties for load job.
    reservation_id: Optional. An option to set the reservation to use when
      executing the job. The reservation should be in the format
      "project_id:reservation_id", "project_id:location.reservation_id", or
      "reservation_id".
    copy_files_only: Optional. True to configure the load job to only copy
      files to the destination BigLake managed table, without reading file
      content and writing them to new files.
    source_column_match: Optional. Controls the strategy used to match loaded
      columns to the schema.
    timestamp_target_precision: Precision (maximum number of total digits in
      base 10) for the seconds of the TIMESTAMP type. Available for the
      formats: CSV.
    **kwds: Passed on to ExecuteJob.

  Returns:
    The resulting job info.
  """
  bq_id_utils.typecheck(
      destination_table_reference, bq_id_utils.ApiClientHelper.TableReference
  )
  load_config = {'destinationTable': dict(destination_table_reference)}
  sources = bq_processor_utils.ProcessSources(source)
  if sources[0].startswith(bq_processor_utils.GCS_SCHEME_PREFIX):
    load_config['sourceUris'] = sources
    upload_file = None
  else:
    upload_file = sources[0]
  if schema is not None:
    load_config['schema'] = {'fields': bq_client_utils.ReadSchema(schema)}
  if use_avro_logical_types is not None:
    load_config['useAvroLogicalTypes'] = use_avro_logical_types
  if reference_file_schema_uri is not None:
    # The request body uses camelCase field names throughout.
    load_config['referenceFileSchemaUri'] = reference_file_schema_uri
  if file_set_spec_type is not None:
    load_config['fileSetSpecType'] = file_set_spec_type
  if json_extension is not None:
    load_config['jsonExtension'] = json_extension
  if column_name_character_map is not None:
    load_config['columnNameCharacterMap'] = column_name_character_map
  if parquet_options is not None:
    load_config['parquetOptions'] = parquet_options
  load_config['decimalTargetTypes'] = decimal_target_types
  if destination_encryption_configuration:
    load_config['destinationEncryptionConfiguration'] = (
        destination_encryption_configuration
    )

  if time_zone is not None:
    load_config['timeZone'] = time_zone
  if date_format is not None:
    load_config['dateFormat'] = date_format
  if datetime_format is not None:
    load_config['datetimeFormat'] = datetime_format
  if time_format is not None:
    load_config['timeFormat'] = time_format
  if timestamp_format is not None:
    load_config['timestampFormat'] = timestamp_format

  if source_column_match is not None:
    load_config['sourceColumnMatch'] = source_column_match
  if timestamp_target_precision is not None:
    load_config['timestampTargetPrecision'] = timestamp_target_precision

  bq_processor_utils.ApplyParameters(
      load_config,
      create_disposition=create_disposition,
      write_disposition=write_disposition,
      field_delimiter=field_delimiter,
      skip_leading_rows=skip_leading_rows,
      encoding=encoding,
      quote=quote,
      max_bad_records=max_bad_records,
      source_format=source_format,
      allow_quoted_newlines=allow_quoted_newlines,
      allow_jagged_rows=allow_jagged_rows,
      preserve_ascii_control_characters=preserve_ascii_control_characters,
      ignore_unknown_values=ignore_unknown_values,
      projection_fields=projection_fields,
      schema_update_options=schema_update_options,
      null_marker=null_marker,
      null_markers=null_markers,
      time_partitioning=time_partitioning,
      clustering=clustering,
      autodetect=autodetect,
      range_partitioning=range_partitioning,
      hive_partitioning_options=hive_partitioning_options,
      thrift_options=thrift_options,
      connection_properties=connection_properties,
      copy_files_only=copy_files_only,
      parquet_options=parquet_options,
  )
  configuration = {'load': load_config}
  if reservation_id is not None:
    reference = bq_client_utils.GetReservationReference(
        id_fallbacks=bqclient,
        identifier=reservation_id,
        default_location=bq_flags.LOCATION.value,
        check_reservation_project=False,
    )
    configuration['reservation'] = reference.path()
  return ExecuteJob(
      bqclient, configuration=configuration, upload_file=upload_file, **kwds
  )


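# Example (illustrative sketch, not part of the library): loading a CSV from
# GCS into a table. `client` and `table_ref` are assumed to be an initialized
# BigqueryClient and an ApiClientHelper.TableReference; the URI is
# hypothetical.
#
#   job = Load(
#       client,
#       table_ref,
#       'gs://my-bucket/data.csv',
#       source_format='CSV',
#       skip_leading_rows=1,
#       autodetect=True,
#   )

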
def Extract(
    bqclient: bigquery_client.BigqueryClient,
    reference: bq_id_utils.ApiClientHelper.TableReference,
    destination_uris: str,
    print_header: Optional[bool] = None,
    field_delimiter: Optional[str] = None,
    destination_format: Optional[str] = None,  # Actually an enum.
    trial_id=None,
    add_serving_default_signature=None,
    compression: Optional[str] = None,  # Actually an enum.
    use_avro_logical_types: Optional[bool] = None,
    reservation_id: Optional[str] = None,
    **kwds,
):
  """Extract the given table from BigQuery.

  The job will execute synchronously if sync=True is provided as an
  argument or if bqclient.sync is true.

  Args:
    bqclient: A BigqueryClient to get state and request clients from.
    reference: TableReference to read data from.
    destination_uris: String specifying one or more destination locations,
      separated by commas.
    print_header: Optional. Whether to print out a header row in the results.
    field_delimiter: Optional. Specifies the single byte field delimiter.
    destination_format: Optional. Format to extract table to. May be "CSV",
      "AVRO" or "NEWLINE_DELIMITED_JSON".
    trial_id: Optional. 1-based ID of the trial to be exported from a
      hyperparameter tuning model.
    add_serving_default_signature: Optional. Whether to add serving_default
      signature for BigQuery ML trained tf based models.
    compression: Optional. The compression type to use for exported files.
      Possible values include "GZIP" and "NONE". The default value is NONE.
    use_avro_logical_types: Optional. Whether to use avro logical types for
      applicable column types on extract jobs.
    reservation_id: Optional. An option to set the reservation to use when
      executing the job. The reservation should be in the format
      "project_id:reservation_id", "project_id:location.reservation_id", or
      "reservation_id".
    **kwds: Passed on to ExecuteJob.

  Returns:
    The resulting job info.

  Raises:
    bq_error.BigqueryClientError: if required parameters are invalid.
  """
  bq_id_utils.typecheck(
      reference,
      (
          bq_id_utils.ApiClientHelper.TableReference,
          bq_id_utils.ApiClientHelper.ModelReference,
      ),
      method='Extract',
  )
  uris = destination_uris.split(',')
  for uri in uris:
    if not uri.startswith(bq_processor_utils.GCS_SCHEME_PREFIX):
      raise bq_error.BigqueryClientError(
          'Illegal URI: {}. Extract URI must start with "{}".'.format(
              uri, bq_processor_utils.GCS_SCHEME_PREFIX
          )
      )
  extract_config = {}
  if isinstance(reference, bq_id_utils.ApiClientHelper.TableReference):
    extract_config = {'sourceTable': dict(reference)}
  elif isinstance(reference, bq_id_utils.ApiClientHelper.ModelReference):
    extract_config = {'sourceModel': dict(reference)}
    if trial_id:
      extract_config.update({'modelExtractOptions': {'trialId': trial_id}})
    if add_serving_default_signature:
      extract_config.update({
          'modelExtractOptions': {
              'addServingDefaultSignature': add_serving_default_signature
          }
      })
  bq_processor_utils.ApplyParameters(
      extract_config,
      destination_uris=uris,
      destination_format=destination_format,
      print_header=print_header,
      field_delimiter=field_delimiter,
      compression=compression,
      use_avro_logical_types=use_avro_logical_types,
  )
  configuration = {'extract': extract_config}
  if reservation_id is not None:
    reference = bq_client_utils.GetReservationReference(
        id_fallbacks=bqclient,
        identifier=reservation_id,
        default_location=bq_flags.LOCATION.value,
        check_reservation_project=False,
    )
    configuration['reservation'] = reference.path()
  return ExecuteJob(bqclient, configuration=configuration, **kwds)
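

# Example (illustrative sketch, not part of the library): exporting a table to
# GCS as gzipped CSV. `client` and `table_ref` are assumed to be an
# initialized BigqueryClient and an ApiClientHelper.TableReference; the URI is
# hypothetical.
#
#   job = Extract(
#       client,
#       table_ref,
#       'gs://my-bucket/export-*.csv',
#       destination_format='CSV',
#       compression='GZIP',
#   )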