# -*- coding: utf-8 -*- #
# Copyright 2024 Google LLC. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Model Garden deploy command."""

from __future__ import absolute_import
from __future__ import division
from __future__ import unicode_literals

import time

from apitools.base.py import exceptions as apitools_exceptions
from googlecloudsdk.api_lib.ai import operations
from googlecloudsdk.api_lib.ai.model_garden import client as client_mg
from googlecloudsdk.api_lib.util import apis
from googlecloudsdk.calliope import arg_parsers
from googlecloudsdk.calliope import base
from googlecloudsdk.calliope import exceptions as c_exceptions
from googlecloudsdk.command_lib.ai import constants
from googlecloudsdk.command_lib.ai import endpoint_util
from googlecloudsdk.command_lib.ai import flags
from googlecloudsdk.command_lib.ai import model_garden_utils
from googlecloudsdk.command_lib.ai import region_util
from googlecloudsdk.command_lib.ai import validation
from googlecloudsdk.command_lib.ai.region_util import (
    _IsDefaultUniverse,
)
from googlecloudsdk.core import properties


@base.ReleaseTracks(
    base.ReleaseTrack.ALPHA, base.ReleaseTrack.BETA, base.ReleaseTrack.GA
)
@base.UniverseCompatible
class Deploy(base.Command):
  """Deploy a model in Model Garden to a Vertex AI endpoint.

  ## EXAMPLES

  To deploy a Model Garden model `google/gemma2@gemma-2-9b` under project
  `example` in region
  `us-central1`, run:

    $ gcloud ai model-garden models deploy
    --model=google/gemma2@gemma-2-9b
    --project=example
    --region=us-central1

  To deploy a Hugging Face model `meta-llama/Meta-Llama-3-8B` under project
  `example` in region `us-central1`, run:

    $ gcloud ai model-garden models deploy
    --model=meta-llama/Meta-Llama-3-8B
    --hugging-face-access-token={hf_token}
    --project=example
    --region=us-central1
  """

  @staticmethod
  def Args(parser):
    """Registers the command's flags on the given argparse parser.

    Args:
      parser: The argparse parser to add the flags to.
    """
    # --- Model selection and endpoint naming flags. ---
    base.Argument(
        '--model',
        required=True,
        help=(
            'The model to be deployed. If it is a Model Garden model, it should'
            ' be in the format of'
            ' `{publisher_name}/{model_name}@{model_version_name}, e.g.'
            ' `google/gemma2@gemma-2-2b`. If it is a Hugging Face model, it'
            ' should be in the convention of Hugging Face models, e.g.'
            ' `meta-llama/Meta-Llama-3-8B`. If it is a Custom Weights model, it'
            ' should be in the format of `gs://{gcs_bucket_uri}`, e.g. `gs://'
            '-model-garden-public-us/llama3.1/Meta-Llama-3.1-8B-Instruct`.'
        ),
    ).AddToParser(parser)
    base.Argument(
        '--hugging-face-access-token',
        required=False,
        help=(
            'The access token from Hugging Face needed to read the'
            ' model artifacts of gated models. It is only needed when'
            ' the Hugging Face model to deploy is gated.'
        ),
    ).AddToParser(parser)
    base.Argument(
        '--endpoint-display-name',
        required=False,
        help='Display name of the endpoint with the deployed model.',
    ).AddToParser(parser)

    flags.AddRegionResourceArg(
        parser, 'to deploy the model', prompt_func=region_util.PromptForOpRegion
    )
    # --- Machine/accelerator configuration flags. ---
    base.Argument(
        '--machine-type',
        help=(
            'The machine type to deploy the model to. It should be a supported'
            ' machine type from the deployment configurations of the model. Use'
            ' `gcloud ai model-garden models list-deployment-config` to check'
            ' the supported machine types.'
        ),
        required=False,
    ).AddToParser(parser)
    base.Argument(
        '--accelerator-type',
        help=(
            'The accelerator type to serve the model. It should be a supported'
            ' accelerator type from the verified deployment configurations of'
            ' the model. Use `gcloud ai model-garden models'
            ' list-deployment-config` to check the supported accelerator types.'
        ),
        required=False,
    ).AddToParser(parser)
    base.Argument(
        '--accelerator-count',
        help=(
            'The accelerator count to serve the model. Accelerator count'
            ' should be non-negative.'
        ),
        type=int,
        required=False,
    ).AddToParser(parser)
    # --- Deployment behavior flags. ---
    base.Argument(
        '--accept-eula',
        help=(
            'When set, the user accepts the End User License Agreement (EULA)'
            ' of the model.'
        ),
        action='store_true',
        default=False,
        required=False,
    ).AddToParser(parser)
    base.Argument(
        '--asynchronous',
        help=(
            'If set to true, the command will terminate immediately and not'
            ' keep polling the operation status.'
        ),
        action='store_true',
        default=False,
        required=False,
    ).AddToParser(parser)
    base.Argument(
        '--reservation-affinity',
        type=arg_parsers.ArgDict(
            spec={
                'reservation-affinity-type': str,
                'key': str,
                'values': arg_parsers.ArgList(),
            },
            # Only the affinity type is mandatory; key/values qualify it.
            required_keys=['reservation-affinity-type'],
        ),
        help=(
            'A ReservationAffinity can be used to configure a Vertex AI'
            ' resource (e.g., a DeployedModel) to draw its Compute Engine'
            ' resources from a Shared Reservation, or exclusively from'
            ' on-demand capacity.'
        ),
    ).AddToParser(parser)

    base.Argument(
        '--spot',
        action='store_true',
        default=False,
        required=False,
        help='If true, schedule the deployment workload on Spot VM.',
    ).AddToParser(parser)

    base.Argument(
        '--use-dedicated-endpoint',
        action='store_true',
        default=False,
        required=False,
        help=(
            'If true, the endpoint will be exposed through a dedicated DNS.'
            ' Your request to the dedicated DNS will be isolated from other'
            " users' traffic and will have better performance and reliability."
        ),
    ).AddToParser(parser)

    base.Argument(
        '--enable-fast-tryout',
        action='store_true',
        default=False,
        required=False,
        help=(
            'If True, model will be deployed using faster deployment path.'
            ' Useful for quick experiments. Not for production workloads. Only'
            ' available for most popular models with certain machine types.'
        ),
    ).AddToParser(parser)

    # --- Serving-container override flags. ---
    base.Argument(
        '--container-image-uri',
        help=("""\
      URI of the Model serving container file in the Container Registry
      (e.g. gcr.io/myproject/server:latest).
      """),
    ).AddToParser(parser)

    parser.add_argument(
        '--container-env-vars',
        metavar='KEY=VALUE',
        type=arg_parsers.ArgDict(),
        action=arg_parsers.UpdateAction,
        help='List of key-value pairs to set as environment variables.',
    )
    parser.add_argument(
        '--container-command',
        type=arg_parsers.ArgList(),
        metavar='COMMAND',
        action=arg_parsers.UpdateAction,
        help="""\
      Entrypoint for the container image. If not specified, the container
      image's default entrypoint is run.
      """,
    )
    parser.add_argument(
        '--container-args',
        metavar='ARG',
        type=arg_parsers.ArgList(),
        help="""\
      Comma-separated arguments passed to the command run by the container
      image. If not specified and no `--command` is provided, the container
      image's default command is used.
      """,
    )
    parser.add_argument(
        '--container-ports',
        metavar='PORT',
        type=arg_parsers.ArgList(element_type=arg_parsers.BoundedInt(1, 65535)),
        action=arg_parsers.UpdateAction,
        help="""\
      Container ports to receive http requests at. Must be a number between 1 and
      65535, inclusive.
      """,
    )
    parser.add_argument(
        '--container-grpc-ports',
        metavar='PORT',
        type=arg_parsers.ArgList(element_type=arg_parsers.BoundedInt(1, 65535)),
        action=arg_parsers.UpdateAction,
        help="""\
      Container ports to receive grpc requests at. Must be a number between 1 and
      65535, inclusive.
      """,
    )
    parser.add_argument(
        '--container-predict-route',
        help='HTTP path to send prediction requests to inside the container.',
    )
    parser.add_argument(
        '--container-health-route',
        help='HTTP path to send health checks to inside the container.',
    )
    parser.add_argument(
        '--container-deployment-timeout-seconds',
        type=int,
        help='Deployment timeout in seconds.',
    )
    parser.add_argument(
        '--container-shared-memory-size-mb',
        type=int,
        help="""\
      The amount of the VM memory to reserve as the shared memory for the model in
      megabytes.
      """,
    )
    # --- Container probe flags (startup and health). ---
    parser.add_argument(
        '--container-startup-probe-exec',
        type=arg_parsers.ArgList(),
        metavar='STARTUP_PROBE_EXEC',
        help="""\
      Exec specifies the action to take. Used by startup probe. An example of this
      argument would be ["cat", "/tmp/healthy"].
      """,
    )
    parser.add_argument(
        '--container-startup-probe-period-seconds',
        type=int,
        help="""\
      How often (in seconds) to perform the startup probe. Default to 10 seconds.
      Minimum value is 1.
      """,
    )
    parser.add_argument(
        '--container-startup-probe-timeout-seconds',
        type=int,
        help="""\
      Number of seconds after which the startup probe times out. Defaults to 1 second.
      Minimum value is 1.
      """,
    )
    parser.add_argument(
        '--container-health-probe-exec',
        type=arg_parsers.ArgList(),
        metavar='HEALTH_PROBE_EXEC',
        help="""\
      Exec specifies the action to take. Used by health probe. An example of this
      argument would be ["cat", "/tmp/healthy"].
      """,
    )
    parser.add_argument(
        '--container-health-probe-period-seconds',
        type=int,
        help="""\
      How often (in seconds) to perform the health probe. Default to 10 seconds.
      Minimum value is 1.
      """,
    )
    parser.add_argument(
        '--container-health-probe-timeout-seconds',
        type=int,
        help="""\
      Number of seconds after which the health probe times out. Defaults to 1 second.
      Minimum value is 1.
      """,
    )

  def Run(self, args):
    """Deploys the requested model to a Vertex AI endpoint.

    Three model sources are handled, distinguished by the --model value:
    a Custom Weights model (`gs://...`), a Hugging Face model (no `@` in
    the name), or a Model Garden model (`publisher/model@version`).

    Args:
      args: argparse.Namespace, the parsed command-line arguments.
    """
    # Custom Weights models are identified by a GCS URI; they skip the
    # Model Garden model-name validation below.
    is_custom_weights_model = args.model.startswith('gs://')
    if not is_custom_weights_model:
      validation.ValidateModelGardenModelArgs(args)
    validation.ValidateDisplayName(args.endpoint_display_name)
    region_ref = args.CONCEPTS.region.Parse()
    args.region = region_ref.AsDict()['locationsId']
    version = constants.BETA_VERSION
    # Hugging Face model names have no '@' version suffix, unlike Model
    # Garden names (`publisher/model@version`).
    is_hf_model = '@' not in args.model

    # NOTE(review): metadata lookups go to us-central1 in the default
    # universe (see the Step 1 comment below); other universes keep the
    # endpoint unset here.
    region = 'us-central1' if _IsDefaultUniverse() else None
    with endpoint_util.AiplatformEndpointOverrides(version, region=region):
      # Custom weights model deployment.
      if is_custom_weights_model:
        # The three machine-spec flags must be provided together or not
        # at all; comparing their truthiness enforces all-or-none.
        if not (
            bool(args.machine_type)
            == bool(args.accelerator_type)
            == bool(args.accelerator_count)
        ):
          raise c_exceptions.InvalidArgumentException(
              '--machine-type, --accelerator-type and --accelerator-count',
              ' Arguments for MachineType, AcceleratorType and AcceleratorCount'
              ' must either all be provided or all be empty for custom weights'
              ' model deployment.',
          )

        # None lets the service pick a machine spec when no flags were given.
        machine_spec = None
        # Check accelerator quota.
        if args.machine_type:
          model_garden_utils.CheckAcceleratorQuota(
              args,
              machine_type=args.machine_type,
              accelerator_type=args.accelerator_type,
              accelerator_count=args.accelerator_count,
          )
          client = apis.GetClientInstance(
              constants.AI_PLATFORM_API_NAME,
              constants.AI_PLATFORM_API_VERSION[version],
          )

          machine_spec = client.MESSAGES_MODULE.GoogleCloudAiplatformV1beta1MachineSpec(
              machineType=args.machine_type,
              acceleratorType=client.MESSAGES_MODULE.GoogleCloudAiplatformV1beta1MachineSpec.AcceleratorTypeValueValuesEnum(
                  args.accelerator_type
              ),
              acceleratorCount=args.accelerator_count,
          )

        # Deploy the model. Re-enter the endpoint override so the deploy
        # call targets the user-requested region.
        with endpoint_util.AiplatformEndpointOverrides(
            version, region=args.region
        ):
          # Timestamp (whole seconds) keeps generated endpoint names unique.
          default_endpoint_name = '-'.join([
              'custom-weights',
              str(time.time()).split('.')[0],
              'mg-cli-deploy',
          ])
          mg_client = client_mg.ModelGardenClient()
          operation_client = operations.OperationsClient(version=version)
          endpoint_name = (
              args.endpoint_display_name
              if args.endpoint_display_name
              else default_endpoint_name
          )

          model_garden_utils.Deploy(
              args,
              machine_spec,
              endpoint_name,
              args.model,
              operation_client,
              mg_client,
          )
      else:
        # Model Garden model deployment.
        # Step 1: Fetch PublisherModel data, including deployment configs. Use
        # us-central1 because all data are stored in us-central1.
        mg_client = client_mg.ModelGardenClient()
        if is_hf_model:
          # Convert to lower case because API only takes in lower case.
          publisher_name, model_name = args.model.lower().split('/')

          try:
            publisher_model = mg_client.GetPublisherModel(
                model_name=f'publishers/{publisher_name}/models/{model_name}',
                is_hugging_face_model=True,
            )
          except apitools_exceptions.HttpNotFoundError:
            # Surface a flag-level error instead of a raw HTTP 404.
            raise c_exceptions.UnknownArgumentException(
                '--model',
                f'{args.model} is not a supported Hugging Face'
                ' model for deployment in Model Garden.',
            )

          default_endpoint_name = '-'.join(
              [publisher_name, model_name, 'hf', 'mg-cli-deploy']
          )
          api_model_arg = f'{publisher_name}/{model_name}'
        else:
          # Convert to lower case because API only takes in lower case.
          publisher_name, model_and_version_name = args.model.lower().split('/')

          try:
            publisher_model = mg_client.GetPublisherModel(
                f'publishers/{publisher_name}/models/{model_and_version_name}'
            )
          except apitools_exceptions.HttpNotFoundError:
            # Surface a flag-level error instead of a raw HTTP 404.
            raise c_exceptions.UnknownArgumentException(
                '--model',
                f'{args.model} is not a supported Model Garden model for'
                ' deployment in Model Garden.',
            )

          # Use the version suffix (after '@') in the generated endpoint name.
          default_endpoint_name = '-'.join([
              publisher_name,
              model_and_version_name.split('@')[1],
              'mg-cli-deploy',
          ])
          api_model_arg = (
              f'publishers/{publisher_name}/models/{model_and_version_name}'
          )

        deploy_config = model_garden_utils.GetDeployConfig(
            args, publisher_model
        )

        # Step 2: Check accelerator quota.
        model_garden_utils.CheckAcceleratorQuota(
            args,
            machine_type=deploy_config.dedicatedResources.machineSpec.machineType,
            accelerator_type=str(
                deploy_config.dedicatedResources.machineSpec.acceleratorType
            ),
            accelerator_count=deploy_config.dedicatedResources.machineSpec.acceleratorCount,
        )
        # Clear the aiplatform URI value so that new values can be set.
        properties.VALUES.api_endpoint_overrides.aiplatform.Set(None)

        # Step 3: Deploy the model in the user-requested region.
        with endpoint_util.AiplatformEndpointOverrides(
            version, region=args.region
        ):
          mg_client = client_mg.ModelGardenClient()
          operation_client = operations.OperationsClient(version=version)
          endpoint_name = (
              args.endpoint_display_name
              if args.endpoint_display_name
              else default_endpoint_name
          )

          model_garden_utils.Deploy(
              args,
              deploy_config.dedicatedResources.machineSpec,
              endpoint_name,
              api_model_arg,
              operation_client,
              mg_client,
          )