504 lines
18 KiB
Python
504 lines
18 KiB
Python
# -*- coding: utf-8 -*- #
|
|
# Copyright 2016 Google LLC. All Rights Reserved.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
|
|
"""Implementation of gcloud genomics pipelines run.
|
|
"""
|
|
|
|
from __future__ import absolute_import
|
|
from __future__ import division
|
|
from __future__ import unicode_literals
|
|
|
|
import base64
|
|
from googlecloudsdk.api_lib import genomics as lib
|
|
from googlecloudsdk.api_lib.genomics import exceptions
|
|
from googlecloudsdk.api_lib.genomics import genomics_util
|
|
from googlecloudsdk.calliope import arg_parsers
|
|
from googlecloudsdk.calliope import base
|
|
from googlecloudsdk.command_lib.util.args import labels_util
|
|
from googlecloudsdk.core import log
|
|
from googlecloudsdk.core import properties
|
|
from googlecloudsdk.core.util import files
|
|
import six
|
|
|
|
# Default docker image for --docker-image; also the image used by the helper
# actions that copy inputs, outputs and logs with gsutil.
CLOUD_SDK_IMAGE = 'google/cloud-sdk:slim'
# Name of the disk (mounted at '/<name>') on which generated input/output
# paths live.
SHARED_DISK = 'gcloud-shared'


class _SharedPathGenerator(object):
  """Produces sequentially numbered paths under the shared disk mount.

  Successive Generate() calls return '/<SHARED_DISK>/<root>0',
  '/<SHARED_DISK>/<root>1', and so on.
  """

  def __init__(self, root):
    """Stores the path prefix and resets the counter.

    Args:
      root: Prefix placed directly before the running number in each path.
    """
    self.root = root
    # Number used by the most recent Generate() call; -1 means none yet.
    self.index = -1

  def Generate(self):
    """Advances the counter and returns the path for the new index."""
    self.index = self.index + 1
    return '/{0}/{1}{2:d}'.format(SHARED_DISK, self.root, self.index)
|
|
|
|
|
|
def _ValidateAndMergeArgInputs(args):
  """Merges args.inputs and args.inputs_from_file into a single dict.

  Args:
    args: The parsed command-line arguments. Only the `inputs` and
      `inputs_from_file` attributes are read.

  Returns:
    An (arg_inputs, is_local_file) tuple.
    arg_inputs maps each input name to its value; for names supplied through
    --inputs-from-file the value is the content of the referenced file.
    is_local_file maps every name whose value was read from a file to True.
    When no --inputs-from-file values were given, args.inputs is returned
    unchanged (not copied) together with an empty is_local_file dict.

  Raises:
    exceptions.GenomicsError: The same name appears in both --inputs and
      --inputs-from-file.
    files.Error: Propagated from files.ReadFileContents when a file named in
      --inputs-from-file could not be read.
  """
  is_local_file = {}

  # Without --inputs-from-file there is nothing to validate or merge.
  if not args.inputs_from_file:
    return args.inputs, is_local_file

  arg_inputs = {}

  if args.inputs:
    # A name must not be defined both literally and from a file.
    overlap = set(args.inputs).intersection(args.inputs_from_file)
    if overlap:
      # Sort so the error message is deterministic across runs.
      raise exceptions.GenomicsError(
          '--{0} and --{1} may not specify overlapping values: {2}'
          .format('inputs', 'inputs-from-file', ', '.join(sorted(overlap))))

    arg_inputs.update(args.inputs)

  # Replace each file name with the file's content and remember which names
  # came from local files.
  for key, value in six.iteritems(args.inputs_from_file):
    arg_inputs[key] = files.ReadFileContents(value)
    is_local_file[key] = True

  return arg_inputs, is_local_file
|
|
|
|
|
|
class Run(base.SilentCommand):
  """Defines and runs a pipeline.

  A pipeline is a transformation of a set of inputs to a set of outputs.
  Supports docker-based commands.
  """

  @staticmethod
  def Args(parser):
    """Args is called by calliope to gather arguments for this command.

    Args:
      parser: An argparse parser that you can use to add arguments that go
          on the command line after this command. Positional arguments are
          allowed.
    """
    parser.add_argument(
        '--pipeline-file',
        help='''A YAML or JSON file containing a v2alpha1 Pipeline object. See
[](https://cloud.google.com/genomics/reference/rest/v2alpha1/pipelines#Pipeline)
''')

    parser.add_argument(
        '--docker-image',
        category=base.COMMONLY_USED_FLAGS,
        default=CLOUD_SDK_IMAGE,
        help='''A docker image to run. Requires --command-line to
            be specified and cannot be used with --pipeline-file.''')

    parser.add_argument(
        '--command-line',
        category=base.COMMONLY_USED_FLAGS,
        help='''Command line to run with /bin/sh in the specified
            docker image. Cannot be used with --pipeline-file.''')

    parser.add_argument(
        '--inputs',
        category=base.COMMONLY_USED_FLAGS,
        metavar='NAME=VALUE',
        type=arg_parsers.ArgDict(),
        action=arg_parsers.UpdateAction,
        help='''Map of input PipelineParameter names to values.
            Used to pass literal parameters to the pipeline, and to specify
            input files in Google Cloud Storage that will have a localCopy
            made. Specified as a comma-separated list: --inputs
            file=gs://my-bucket/in.txt,name=hello''')

    parser.add_argument(
        '--inputs-from-file',
        category=base.COMMONLY_USED_FLAGS,
        metavar='NAME=FILE',
        type=arg_parsers.ArgDict(),
        action=arg_parsers.UpdateAction,
        help='''Map of input PipelineParameter names to values.
            Used to pass literal parameters to the pipeline where values come
            from local files; this can be used to send large pipeline input
            parameters, such as code, data, or configuration values.
            Specified as a comma-separated list:
            --inputs-from-file script=myshellscript.sh,pyfile=mypython.py''')

    parser.add_argument(
        '--outputs',
        category=base.COMMONLY_USED_FLAGS,
        metavar='NAME=VALUE',
        type=arg_parsers.ArgDict(),
        action=arg_parsers.UpdateAction,
        help='''Map of output PipelineParameter names to values.
            Used to specify output files in Google Cloud Storage that will be
            made from a localCopy. Specified as a comma-separated list:
            --outputs ref=gs://my-bucket/foo,ref2=gs://my-bucket/bar''')

    parser.add_argument(
        '--logging',
        category=base.COMMONLY_USED_FLAGS,
        help='''The location in Google Cloud Storage to which the pipeline logs
            will be copied. Can be specified as a fully qualified directory
            path, in which case logs will be output with a unique identifier
            as the filename in that directory, or as a fully specified path,
            which must end in `.log`, in which case that path will be
            used. Stdout and stderr logs from the run are also generated and
            output as `-stdout.log` and `-stderr.log`.''')

    parser.add_argument(
        '--env-vars',
        category=base.COMMONLY_USED_FLAGS,
        metavar='NAME=VALUE',
        type=arg_parsers.ArgDict(),
        help='''List of key-value pairs to set as environment variables.''')

    labels_util.AddCreateLabelsFlags(parser)

    parser.add_argument(
        '--memory',
        category=base.COMMONLY_USED_FLAGS,
        type=float,
        help='''The number of GB of RAM needed to run the pipeline. Overrides
            any value specified in the pipeline-file.''')

    parser.add_argument(
        '--cpus',
        category=base.COMMONLY_USED_FLAGS,
        type=int,
        help='''The minimum number of CPUs to run the pipeline. Overrides
            any value specified in the pipeline-file.''')

    parser.add_argument(
        '--disk-size',
        category=base.COMMONLY_USED_FLAGS,
        default=None,
        help='''The disk size(s) in GB, specified as a comma-separated list of
            pairs of disk name and size. For example:
            --disk-size "name:size,name2:size2".
            Overrides any values specified in the pipeline-file.''')

    parser.add_argument(
        '--preemptible',
        category=base.COMMONLY_USED_FLAGS,
        action='store_true',
        help='''Whether to use a preemptible VM for this pipeline. The
            "resource" section of the pipeline-file must also set preemptible
            to "true" for this flag to take effect.''')

    parser.add_argument(
        '--run-id',
        hidden=True,
        help='THIS ARGUMENT NEEDS HELP TEXT.')

    parser.add_argument(
        '--service-account-email',
        default='default',
        help='''The service account used to run the pipeline. If unspecified,
            defaults to the Compute Engine service account for your project.''')

    parser.add_argument(
        '--service-account-scopes',
        metavar='SCOPE',
        type=arg_parsers.ArgList(),
        default=[],
        help='''List of additional scopes to be made available for this service
            account. The following scopes are always requested:

            https://www.googleapis.com/auth/devstorage.read_write
            https://www.googleapis.com/auth/genomics''')

    parser.add_argument(
        '--zones',
        metavar='ZONE',
        type=arg_parsers.ArgList(),
        help='''List of Compute Engine zones the pipeline can run in.

If no zones are specified with the zones flag, then zones in the
pipeline definition file will be used.

If no zones are specified in the pipeline definition, then the
default zone in your local client configuration is used.

If you have no default zone then at least one zone or region must be specified.

For more information on default zones, see
https://cloud.google.com/compute/docs/gcloud-compute/#set_default_zone_and_region_in_your_local_client'''
    )

    parser.add_argument(
        '--regions',
        metavar='REGION',
        type=arg_parsers.ArgList(),
        help='''List of Compute Engine regions the pipeline can
run in.

If no regions are specified with the regions flag, then regions in the
pipeline definition file will be used.

If no regions are specified in the pipeline definition, then the
default region in your local client configuration is used.

At least one zone or region must be specified.

For more information on default regions, see
https://cloud.google.com/compute/docs/gcloud-compute/#set_default_zone_and_region_in_your_local_client'''
    )

    parser.add_argument(
        '--network',
        help='''The network name to attach the VM's network
interface to.

The value will be prefixed with global/networks/ unless it contains a /, in
which case it is assumed to be a fully specified network resource URL.

If unspecified, the global default network is used.''')

    parser.add_argument(
        '--subnetwork',
        help='''The subnetwork to use on the provided network.

If the specified network is configured for custom subnet creation, the name of
the subnetwork to attach the instance to must be specified here.

The value is prefixed with regions/*/subnetworks/ unless it contains a /, in
which case it is assumed to be a fully specified subnetwork resource URL.

If the * character appears in the value, it is replaced with the region that
the virtual machine has been allocated in.''')

    parser.add_argument(
        '--boot-disk-size',
        type=int,
        help='''The size of the boot disk in GB.

The boot disk size must be large enough to accommodate all Docker images from
each action in the pipeline at the same time. If not specified, a small but
reasonable default value is used.''')

  def Run(self, args):
    """This is what gets called when the user runs this command.

    Builds a v2alpha1 Pipeline either from --pipeline-file or from
    --command-line, applies the resource, network, input/output, environment,
    logging and disk-size flags to it, and submits it.

    Args:
      args: argparse.Namespace, All the arguments that were provided to this
        command invocation.

    Raises:
      files.Error: A file argument could not be read.
      GenomicsError: User input was invalid.
      HttpException: An http error response was received while executing api
        request.

    Returns:
      Operation representing the running pipeline.
    """
    apitools_client = genomics_util.GetGenomicsClient('v2alpha1')
    genomics_messages = genomics_util.GetGenomicsMessages('v2alpha1')

    # The pipeline comes from exactly one of --pipeline-file / --command-line.
    if args.pipeline_file:
      if args.command_line:
        # TODO(b/79982664): Use a mutex argument group instead.
        raise exceptions.GenomicsError(
            '--command-line cannot be used with --pipeline-file.')

      pipeline = genomics_util.GetFileAsMessage(
          args.pipeline_file,
          genomics_messages.Pipeline,
          self.context[lib.STORAGE_V1_CLIENT_KEY])
    elif args.command_line:
      pipeline = genomics_messages.Pipeline(
          actions=[genomics_messages.Action(
              imageUri=args.docker_image,
              commands=['-c', args.command_line],
              entrypoint='bash')])
    else:
      raise exceptions.GenomicsError(
          'Either --pipeline-file or --command-line is required.')

    arg_inputs, is_local_file = _ValidateAndMergeArgInputs(args)

    # Create messages up front to avoid checking for None everywhere.
    if not pipeline.resources:
      pipeline.resources = genomics_messages.Resources()
    resources = pipeline.resources

    if not resources.virtualMachine:
      resources.virtualMachine = genomics_messages.VirtualMachine(
          machineType='n1-standard-1')
    virtual_machine = resources.virtualMachine

    if not virtual_machine.serviceAccount:
      virtual_machine.serviceAccount = genomics_messages.ServiceAccount()

    # Always set the project id.
    resources.projectId = genomics_util.GetProjectId()

    # Update the pipeline based on arguments.
    if args.memory or args.cpus:
      # Default to n1-standard1 sizes. --memory is given in GB and is
      # multiplied by 1024 for the custom machine type name; %d truncates
      # any fractional part.
      virtual_machine.machineType = 'custom-%d-%d' % (args.cpus or 1,
                                                      (args.memory or 3.75) *
                                                      1024)

    if args.preemptible:
      virtual_machine.preemptible = args.preemptible

    # Flag values win over the pipeline file, which wins over the locally
    # configured default zone/region.
    if args.zones:
      resources.zones = args.zones
    elif not resources.zones and properties.VALUES.compute.zone.Get():
      resources.zones = [properties.VALUES.compute.zone.Get()]

    if args.regions:
      resources.regions = args.regions
    elif not resources.regions and properties.VALUES.compute.region.Get():
      resources.regions = [properties.VALUES.compute.region.Get()]

    # 'default' is the flag's default value and means "leave the email unset".
    if args.service_account_email != 'default':
      virtual_machine.serviceAccount.email = args.service_account_email

    if args.service_account_scopes:
      virtual_machine.serviceAccount.scopes = args.service_account_scopes

    # Always add a scope for GCS in case any arguments need it.
    virtual_machine.serviceAccount.scopes.append(
        'https://www.googleapis.com/auth/devstorage.read_write')

    # Attach custom network/subnetwork (if set).
    if args.network or args.subnetwork:
      if not virtual_machine.network:
        virtual_machine.network = genomics_messages.Network()
      if args.network:
        virtual_machine.network.name = args.network
      if args.subnetwork:
        virtual_machine.network.subnetwork = args.subnetwork

    if args.boot_disk_size is not None:
      if args.boot_disk_size <= 0:
        raise exceptions.GenomicsError(
            'Boot disk size must be greater than zero.')
      virtual_machine.bootDiskSizeGb = args.boot_disk_size

    # Generate paths for inputs and outputs in a shared location and put them
    # into the environment for actions based on their name.
    env = {}
    if arg_inputs:
      input_generator = _SharedPathGenerator('input')
      for name, value in arg_inputs.items():
        if genomics_util.IsGcsPath(value):
          # Cloud Storage input: prepend an action that copies the object to
          # the generated shared path, which is exposed as ${name}.
          env[name] = input_generator.Generate()
          pipeline.actions.insert(
              0,
              genomics_messages.Action(
                  imageUri=CLOUD_SDK_IMAGE,
                  commands=[
                      '/bin/sh', '-c',
                      'gsutil -m -q cp %s ${%s}' % (value, name)
                  ]))
        elif name in is_local_file:
          # Local file content: prepend an action that recreates the file on
          # the shared disk; base64 keeps the content intact inside the shell
          # command.
          # TODO(b/183206325): Get test coverage to 100%.
          env[name] = input_generator.Generate()
          pipeline.actions.insert(
              0,
              genomics_messages.Action(
                  imageUri=CLOUD_SDK_IMAGE,
                  commands=[
                      '/bin/sh', '-c',
                      'echo "%s" | base64 -d > ${%s}' %
                      (base64.b64encode(value.encode()).decode(), name)
                  ]))
        else:
          # Literal value: passed straight through as an environment variable.
          env[name] = value

    if args.outputs:
      output_generator = _SharedPathGenerator('output')
      for name, value in args.outputs.items():
        # Append an action that copies the generated shared path (${name})
        # to its destination.
        env[name] = output_generator.Generate()
        pipeline.actions.append(
            genomics_messages.Action(
                imageUri=CLOUD_SDK_IMAGE,
                commands=[
                    '/bin/sh', '-c',
                    'gsutil -m -q cp ${%s} %s' % (name, value)
                ]))
    if args.env_vars:
      for name, value in args.env_vars.items():
        env[name] = value

    # Merge any existing pipeline arguments into the generated environment and
    # update the pipeline. Values generated above take precedence over the
    # ones already present in the pipeline.
    if pipeline.environment:
      for val in pipeline.environment.additionalProperties:
        if val.key not in env:
          env[val.key] = val.value

    pipeline.environment = genomics_messages.Pipeline.EnvironmentValue(
        additionalProperties=genomics_util.ArgDictToAdditionalPropertiesList(
            env,
            genomics_messages.Pipeline.EnvironmentValue.AdditionalProperty))

    if arg_inputs or args.outputs:
      # The shared disk is only needed when there are inputs or outputs; it is
      # mounted into every action defined so far.
      virtual_machine.disks.append(genomics_messages.Disk(name=SHARED_DISK))

      for action in pipeline.actions:
        action.mounts.append(
            genomics_messages.Mount(disk=SHARED_DISK, path='/' + SHARED_DISK))

    if args.logging:
      # Added after the mount loop above, so this action gets no shared-disk
      # mount. It is flagged ALWAYS_RUN, presumably so logs are copied even
      # when an earlier action fails.
      pipeline.actions.append(
          genomics_messages.Action(
              imageUri=CLOUD_SDK_IMAGE,
              commands=[
                  '/bin/sh', '-c',
                  'gsutil -m -q cp /google/logs/output ' + args.logging
              ],
              flags=[(genomics_messages.Action.FlagsValueListEntryValuesEnum
                      .ALWAYS_RUN)]))

    # Update disk sizes if specified, potentially including the shared disk.
    if args.disk_size:
      disk_sizes = {}
      for disk_encoding in args.disk_size.split(','):
        parts = disk_encoding.split(':', 1)
        try:
          disk_sizes[parts[0]] = int(parts[1])
        except (IndexError, ValueError):
          # IndexError: no ':' separator; ValueError: size is not an integer.
          raise exceptions.GenomicsError('Invalid --disk-size.')

      for disk in virtual_machine.disks:
        if disk.name in disk_sizes:
          disk.sizeGb = disk_sizes[disk.name]

    request = genomics_messages.RunPipelineRequest(
        pipeline=pipeline,
        labels=labels_util.ParseCreateArgs(
            args, genomics_messages.RunPipelineRequest.LabelsValue))

    result = apitools_client.pipelines.Run(request)
    log.status.Print('Running [{0}].'.format(result.name))
    return result
|