feat: Add new gcloud commands, API clients, and third-party libraries across various services.

This commit is contained in:
2026-01-01 20:26:35 +01:00
parent 5e23cbece0
commit a19e592eb7
25221 changed files with 8324611 additions and 0 deletions

View File

@@ -0,0 +1,45 @@
# -*- coding: utf-8 -*- #
# Copyright 2025 Google LLC. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""The command group for the profiles CLI."""
from __future__ import absolute_import
from __future__ import division
from __future__ import unicode_literals
from googlecloudsdk.calliope import base
from googlecloudsdk.command_lib.projects import util
@base.UniverseCompatible
@base.ReleaseTracks(base.ReleaseTrack.ALPHA, base.ReleaseTrack.GA)
class Profiles(base.Group):
  """Quickstart engine for GKE AI workloads.

  The GKE Inference Quickstart helps simplify deploying AI inference on Google
  Kubernetes Engine (GKE). It provides tailored profiles based on
  Google's internal benchmarks. Provide inputs like your preferred open-source
  model (e.g. Llama, Gemma, or Mistral) and your application's performance
  target. Based on these inputs, the quickstart generates accelerator choices
  with performance metrics, and detailed, ready-to-deploy profiles for
  compute, load balancing, and autoscaling. These profiles are provided
  as standard Kubernetes YAML manifests, which you can deploy or modify.

  To visualize the benchmarking data that support these estimates, see the
  accompanying Colab notebook:
  https://colab.research.google.com/github/GoogleCloudPlatform/kubernetes-engine-samples/blob/main/ai-ml/notebooks/giq_visualizations.ipynb
  """

  # Places this group under the SDK tools category in `gcloud` help output.
  category = base.SDK_TOOLS_CATEGORY

View File

@@ -0,0 +1,29 @@
# -*- coding: utf-8 -*- #
# Copyright 2025 Google LLC. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""The command group for the accelerators CLI."""
from __future__ import absolute_import
from __future__ import division
from __future__ import unicode_literals
from googlecloudsdk.calliope import base
from googlecloudsdk.command_lib.projects import util
@base.UniverseCompatible
@base.ReleaseTracks(base.ReleaseTrack.ALPHA)
class ModelServers(base.Group):
  """Manage supported accelerators for GKE Inference Quickstart."""
  # NOTE(review): the class is named ModelServers, but this docstring (and
  # the module docstring above) describe accelerators -- confirm which
  # resource this command group is meant to expose.

View File

@@ -0,0 +1,190 @@
# -*- coding: utf-8 -*- #
# Copyright 2025 Google LLC. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Lists compatible accelerator profiles for GKE Inference Quickstart."""
from googlecloudsdk.api_lib.ai.recommender import util
from googlecloudsdk.calliope import base
from googlecloudsdk.command_lib.run import commands
from googlecloudsdk.command_lib.run.printers import profiles_printer
from googlecloudsdk.core import exceptions
from googlecloudsdk.core import log
from googlecloudsdk.core.resource import resource_printer
_EXAMPLES = """
To list compatible accelerator profiles for a model, run:
$ {command} --model=deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
"""
def decimal_to_amount(decimal_value):
  """Converts a decimal value to the (units, nanos) pair of an Amount proto.

  Args:
    decimal_value: A numeric value, e.g. a cost in USD.

  Returns:
    A (units, nanos) tuple: units is the integral part (truncated toward
    zero) and nanos is the fractional part expressed in nano units, carrying
    the same sign as the value.
  """
  units = int(decimal_value)
  # round() instead of int(): plain truncation loses a nano unit to float
  # representation error (e.g. 1.2 would yield nanos == 199999999).
  nanos = round((decimal_value - units) * 1e9)
  return (units, nanos)
@base.DefaultUniverseOnly
@base.ReleaseTracks(base.ReleaseTrack.ALPHA)
class List(commands.List):
  """List compatible accelerator profiles.

  This command lists all supported accelerators with their performance details.
  By default, the supported accelerators are displayed in a table format with
  select information for each accelerator. To see all details, use
  --format=yaml.

  To get supported model, model servers, and model server versions, run `gcloud
  alpha container ai profiles models list`, `gcloud alpha container ai
  profiles model-servers list`, and `gcloud alpha container ai profiles
  model-server-versions list`.

  Alternatively, run `gcloud alpha container ai profiles
  model-and-server-combinations list` to get all supported model and server
  combinations.
  """

  @staticmethod
  def Args(parser):
    """Registers command flags and output formats on the parser."""
    parser.add_argument(
        "--model",
        required=True,
        help="The model.",
    )
    parser.add_argument(
        "--model-server",
        help=(
            "The model server. If not specified, this defaults to any model"
            " server."
        ),
    )
    parser.add_argument(
        "--model-server-version",
        help=(
            "The model server version. If not specified, this defaults to the"
            " latest version."
        ),
    )
    parser.add_argument(
        "--max-ntpot-milliseconds",
        type=int,
        help=(
            "The maximum normalized time per output token (NTPOT) in"
            " milliseconds. NTPOT is measured as the request_latency /"
            " output_tokens. If this field is set, the command will only return"
            " accelerators that can meet the target ntpot milliseconds and"
            " display their throughput performance at the target latency."
            " Otherwise, the command will return all accelerators and display"
            " their highest throughput performance."
        ),
    )
    parser.add_argument(
        "--target-cost-per-million-output-tokens",
        hidden=True,
        type=float,
        required=False,
        help=(
            "The target cost per million output tokens to filter profiles by,"
            " unit is 1 USD up to 5 decimal places."
        ),
    )
    parser.add_argument(
        "--target-cost-per-million-input-tokens",
        hidden=True,
        type=float,
        required=False,
        help=(
            "The target cost per million input tokens to filter profiles by,"
            " unit is 1 USD up to 5 decimal places."
        ),
    )
    parser.add_argument(
        "--pricing-model",
        hidden=True,
        required=False,
        type=str,
        help=(
            "The pricing model to use to calculate token cost. Currently, this"
            " supports on-demand, spot, 3-years-cud, 1-year-cud"
        ),
    )
    parser.add_argument(
        "--format",
        type=str,
        help="The format to use for the output. Default is table. yaml|table",
    )
    resource_printer.RegisterFormatter(
        profiles_printer.PROFILES_PRINTER_FORMAT,
        profiles_printer.ProfilePrinter,
    )
    parser.display_info.AddFormat(profiles_printer.PROFILES_PRINTER_FORMAT)
    # NOTE(review): this second AddFormat overrides the profiles-printer
    # format registered just above, making the table below the effective
    # default -- confirm both calls are intentional.
    parser.display_info.AddFormat(
        "table("
        "acceleratorType,"
        "modelAndModelServerInfo.modelName,"
        "modelAndModelServerInfo.modelServerName,"
        "modelAndModelServerInfo.modelServerVersion,"
        "resourcesUsed.acceleratorCount,"
        "performanceStats.outputTokensPerSecond,"
        "performanceStats.ntpotMilliseconds"
        ")"
    )

  def Run(self, args):
    """Fetches compatible accelerators from the recommender API.

    Args:
      args: The parsed command-line arguments.

    Returns:
      The accelerators List response, or an empty list when nothing was
      returned or an API error occurred (the error is logged and printed
      instead of being raised).
    """
    client = util.GetClientInstance(base.ReleaseTrack.ALPHA)
    messages = util.GetMessagesModule(base.ReleaseTrack.ALPHA)
    try:
      # The underscore-joined field names mirror the flattened query
      # parameters of the generated apitools request message.
      request = messages.GkerecommenderAcceleratorsListRequest(
          modelName=args.model,
          modelServerName=args.model_server,
          modelServerVersion=args.model_server_version,
          performanceRequirements_maxNtpotMilliseconds=args.max_ntpot_milliseconds,
          performanceRequirements_cost_pricingModel=args.pricing_model,
      )
      if args.target_cost_per_million_output_tokens:
        units, nanos = decimal_to_amount(
            args.target_cost_per_million_output_tokens
        )
        request.performanceRequirements_cost_costPerMillionNormalizedOutputTokens_units = (
            units
        )
        request.performanceRequirements_cost_costPerMillionNormalizedOutputTokens_nanos = (
            nanos
        )
      if args.target_cost_per_million_input_tokens:
        units, nanos = decimal_to_amount(
            args.target_cost_per_million_input_tokens
        )
        request.performanceRequirements_cost_costPerMillionInputTokens_units = (
            units
        )
        request.performanceRequirements_cost_costPerMillionInputTokens_nanos = (
            nanos
        )
      response = client.accelerators.List(request)
      # Stash server-provided comments so the display layer can surface them.
      self.comments = response.comments
      if response:
        return response
      else:
        return []
    except exceptions.Error as e:
      log.error(f"An error has occurred: {e}")
      log.status.Print(f"An error has occurred: {e}")
      return []

View File

@@ -0,0 +1,29 @@
# -*- coding: utf-8 -*- #
# Copyright 2025 Google LLC. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""The command group for the benchmarks CLI."""
from __future__ import absolute_import
from __future__ import division
from __future__ import unicode_literals
from googlecloudsdk.calliope import base
from googlecloudsdk.command_lib.projects import util
@base.UniverseCompatible
@base.ReleaseTracks(base.ReleaseTrack.GA)
class Benchmarks(base.Group):
  """Manage benchmarks for GKE Inference Quickstart."""
  # Command group container only; subcommands are registered by calliope
  # from sibling modules.

View File

@@ -0,0 +1,179 @@
# -*- coding: utf-8 -*- #
# Copyright 2025 Google LLC. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Outputs benchmarking data for GKE Inference Quickstart."""
from apitools.base.py import exceptions as apitools_exceptions
from googlecloudsdk.api_lib.ai.recommender import util
from googlecloudsdk.api_lib.util import exceptions
from googlecloudsdk.calliope import base
from googlecloudsdk.command_lib.run import commands
from googlecloudsdk.command_lib.run.printers import profiles_csv_printer
from googlecloudsdk.core.resource import resource_printer
_EXAMPLE = """
To get benchmarking data for a given model and model server, run:
$ {command} --model=google/gemma-2-27b-it --model-server=vllm --pricing-model=spot
"""
def amount_to_decimal(cost):
  """Converts an Amount-like cost message to a decimal string.

  Args:
    cost: A message with integer `units` and `nanos` fields; either field
      may be None/unset on the proto.

  Returns:
    The value formatted with three decimal places, e.g. "3.500".
  """
  units = cost.units or 0
  # nanos may also be unset on the proto; default it to 0 instead of
  # raising a TypeError on the division below.
  nanos = cost.nanos or 0
  decimal_value = units + nanos / 1e9
  return f"{decimal_value:.3f}"
def get_decimal_cost(costs):
  """Returns per-million input and output token costs as decimal strings.

  Args:
    costs: A list of cost messages; only the first entry is consulted.

  Returns:
    A (input_token_cost, output_token_cost) tuple of strings; "N/A" is used
    for either component whose field is missing or unset.
  """
  output_token_cost = "N/A"
  if costs and costs[0].costPerMillionOutputTokens:
    output_token_cost = amount_to_decimal(costs[0].costPerMillionOutputTokens)
  input_token_cost = "N/A"
  if costs and costs[0].costPerMillionInputTokens:
    input_token_cost = amount_to_decimal(costs[0].costPerMillionInputTokens)
  return (input_token_cost, output_token_cost)
@base.DefaultUniverseOnly
@base.ReleaseTracks(base.ReleaseTrack.GA)
class List(commands.List):
  """List benchmarks for a given model and model server.

  This command lists all benchmarking data for a given model and model server.
  By default, the benchmarks are displayed in a CSV format.

  For examples of visualizing the benchmarking data, see the accompanying Colab
  notebook:
  https://colab.research.google.com/github/GoogleCloudPlatform/kubernetes-engine-samples/blob/main/ai-ml/notebooks/giq_visualizations.ipynb
  """

  @staticmethod
  def Args(parser):
    """Registers command flags and the CSV output format on the parser."""
    parser.add_argument(
        "--model",
        required=True,
        help="The model.",
    )
    parser.add_argument(
        "--model-server",
        required=True,
        help="The model server.",
    )
    parser.add_argument(
        "--model-server-version",
        help=(
            "The model server version. Default is latest. Other options include"
            " the model server version of a profile, all which returns all"
            " versions."
        ),
    )
    parser.add_argument(
        "--instance-type",
        help=(
            # Fixed: fragments previously concatenated to "anyinstance type."
            # (missing space at the join).
            "The instance type. If not specified, this defaults to any"
            " instance type."
        ),
    )
    parser.add_argument(
        "--format",
        help=(
            # Fixed: fragments previously concatenated to
            # "includingcost conversions." (missing space at the join).
            "The format to print the output in. Default is csvprofile, which"
            " displays the profile information in a CSV format, including"
            " cost conversions."
        ),
    )
    parser.add_argument(
        "--pricing-model",
        required=False,
        help=(
            "The pricing model to use to calculate token cost. Currently, this"
            " supports on-demand, spot, 3-years-cud, 1-year-cud"
        ),
    )
    parser.add_argument(
        "--use-case",
        required=False,
        help=(
            "If specified, results will only show profiles that match the"
            " provided use case. Options are: Advanced Customer Support, Code"
            " Completion, Text Summarization, Chatbot (ShareGPT), Code"
            " Generation, Deep Research."
        ),
    )
    parser.add_argument(
        "--serving-stack",
        required=False,
        help=(
            "The serving stack to filter benchmarking data by. If not"
            " provided, benchmarking data for all serving stacks that support"
            " the given model and model server will be returned."
        ),
    )
    parser.add_argument(
        "--serving-stack-version",
        required=False,
        help=(
            "The serving stack version to filter benchmarking data by. If not"
            " provided, benchmarking data for all versions that support"
            " the given model and model server will be returned."
        ),
    )
    resource_printer.RegisterFormatter(
        profiles_csv_printer.PROFILES_PRINTER_FORMAT,
        profiles_csv_printer.ProfileCSVPrinter,
    )
    parser.display_info.AddFormat(profiles_csv_printer.PROFILES_PRINTER_FORMAT)

  def Run(self, args):
    """Fetches benchmarking data from the recommender API.

    Args:
      args: The parsed command-line arguments.

    Returns:
      The list of benchmark profiles, or an empty list when none matched.

    Raises:
      exceptions.HttpException: On an HTTP error from the API.
    """
    client = util.GetClientInstance(base.ReleaseTrack.GA)
    messages = util.GetMessagesModule(base.ReleaseTrack.GA)
    try:
      model_server_info = messages.ModelServerInfo(
          model=args.model,
          modelServer=args.model_server,
          modelServerVersion=args.model_server_version,
      )
      # Only build a ServingStack filter when requested; the version flag is
      # meaningful only together with --serving-stack.
      serving_stack = None
      if args.serving_stack:
        serving_stack = messages.ServingStack(
            name=args.serving_stack,
        )
        if args.serving_stack_version:
          serving_stack.version = args.serving_stack_version
      request = messages.FetchBenchmarkingDataRequest(
          modelServerInfo=model_server_info,
          instanceType=args.instance_type,
          pricingModel=args.pricing_model,
          useCase=args.use_case,
          servingStack=serving_stack,
      )
      response = client.benchmarkingData.Fetch(request)
      if not response.profile:
        return []
      else:
        return response.profile
    except apitools_exceptions.HttpError as error:
      raise exceptions.HttpException(error, util.HTTP_ERROR_FORMAT)

View File

@@ -0,0 +1,323 @@
# -*- coding: utf-8 -*- #
# Copyright 2025 Google LLC. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Lists compatible accelerator profiles for GKE Inference Quickstart."""
from apitools.base.py import exceptions as apitools_exceptions
from googlecloudsdk.api_lib.ai.recommender import util
from googlecloudsdk.api_lib.util import exceptions
from googlecloudsdk.calliope import base
from googlecloudsdk.command_lib.run import commands
from googlecloudsdk.command_lib.run.printers import profiles_csv_printer
from googlecloudsdk.command_lib.run.printers import profiles_printer_ga as profiles_printer
from googlecloudsdk.core.resource import resource_printer
_EXAMPLES = """
To list compatible accelerator profiles for a model, run:
$ {command} --model=deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
"""
def decimal_to_amount(decimal_value):
  """Converts a decimal value to the (units, nanos) pair of an Amount proto.

  Args:
    decimal_value: A numeric value, e.g. a cost in USD.

  Returns:
    A (units, nanos) tuple: units is the integral part (truncated toward
    zero) and nanos is the fractional part expressed in nano units, carrying
    the same sign as the value.
  """
  units = int(decimal_value)
  # round() instead of int(): plain truncation loses a nano unit to float
  # representation error (e.g. 1.2 would yield nanos == 199999999).
  nanos = round((decimal_value - units) * 1e9)
  return (units, nanos)
def amount_to_decimal(cost):
  """Converts an Amount-like cost message to a decimal string.

  Args:
    cost: A message with integer `units` and `nanos` fields; either field
      may be None/unset on the proto.

  Returns:
    The value formatted with three decimal places, e.g. "3.500".
  """
  units = cost.units or 0
  # nanos may also be unset on the proto; default it to 0 instead of
  # raising a TypeError on the division below.
  nanos = cost.nanos or 0
  decimal_value = units + nanos / 1e9
  return f"{decimal_value:.3f}"
def get_decimal_cost(costs):
  """Returns per-million input and output token costs as decimal strings.

  Args:
    costs: A list of cost messages; only the first entry is consulted.

  Returns:
    A (input_token_cost, output_token_cost) tuple of strings; "N/A" is used
    for either component whose field is missing or unset.
  """
  output_token_cost = "N/A"
  if costs and costs[0].costPerMillionOutputTokens:
    output_token_cost = amount_to_decimal(
        costs[0].costPerMillionOutputTokens
    )
  input_token_cost = "N/A"
  if costs and costs[0].costPerMillionInputTokens:
    input_token_cost = amount_to_decimal(costs[0].costPerMillionInputTokens)
  return (input_token_cost, output_token_cost)
@base.DefaultUniverseOnly
@base.ReleaseTracks(base.ReleaseTrack.GA)
class List(commands.List):
  """List compatible accelerator profiles.

  This command lists all supported accelerators with their performance details.
  By default, the supported accelerators are displayed in a table format with
  select information for each accelerator. To see all details, use
  --format=yaml or --format=csvprofile.

  To get supported model, model servers, and model server versions, run `gcloud
  container ai profiles models list`, `gcloud container ai
  profiles model-servers list`, and `gcloud container ai profiles
  model-server-versions list`.
  """

  @staticmethod
  def Args(parser):
    """Registers command flags and output formats on the parser."""
    parser.add_argument(
        "--model",
        help="The model.",
    )
    parser.add_argument(
        "--model-server",
        help=(
            "The model server version. Default is latest. Other options include"
            " the model server version of a profile, all which returns all"
            " versions."
        ),
    )
    parser.add_argument(
        "--model-server-version",
        help=(
            "The model server version. If not specified, this defaults to the"
            " latest version."
        ),
    )
    parser.add_argument(
        "--target-ntpot-milliseconds",
        type=int,
        help=(
            "The target normalized time per output token (NTPOT) in"
            " milliseconds. NTPOT is measured as the request_latency /"
            " output_tokens. If this field is set, the command will only return"
            " accelerators that can meet the target ntpot milliseconds and"
            " display their throughput performance at the target latency."
            " Otherwise, the command will return all accelerators and display"
            " their highest throughput performance."
        ),
    )
    parser.add_argument(
        "--target-ttft-milliseconds",
        type=int,
        help=(
            # Fixed: previous text described TTFT with the NTPOT formula.
            "The target time to first token (TTFT) in milliseconds. TTFT is"
            " the time from when the request is sent until the first output"
            " token is received. If this field is set, the command will only"
            " return profiles that can meet the target ttft milliseconds and"
            " display their throughput performance at the target latency."
            " Otherwise, the command will return all profiles and display"
            " their highest throughput performance."
        ),
    )
    parser.add_argument(
        "--target-itl-milliseconds",
        type=int,
        help=(
            "If specified, results will only show profiles with instance types"
            " that can meet the latency target and will show their throughput"
            " performances at the target inter-token latency (ITL)."
        ),
    )
    parser.add_argument(
        "--target-cost-per-million-output-tokens",
        type=float,
        required=False,
        help=(
            "The target cost per million output tokens to filter profiles by,"
            " unit is 1 USD up to 5 decimal places."
        ),
    )
    parser.add_argument(
        "--target-cost-per-million-input-tokens",
        type=float,
        required=False,
        help=(
            "The target cost per million input tokens to filter profiles by,"
            " unit is 1 USD up to 5 decimal places."
        ),
    )
    parser.add_argument(
        "--pricing-model",
        required=False,
        type=str,
        help=(
            "The pricing model to use to calculate token cost. Currently, this"
            " supports on-demand, spot, 3-years-cud, 1-year-cud"
        ),
    )
    parser.add_argument(
        "--format",
        help=(
            # Fixed: missing space after the period ("format.Options") and a
            # stray trailing space.
            "The output format. Default is profile, which displays the profile"
            " information in a table format, including cost conversions."
            " csvprofile displays the profile information in a CSV"
            " format. Options include csvprofile, profile, and yaml."
        ),
    )
    parser.add_argument(
        "--use-case",
        required=False,
        type=str,
        help=(
            "If specified, results will only show profiles that match the"
            " provided use case. Options are: Advanced Customer Support, Code"
            " Completion, Text Summarization, Chatbot (ShareGPT), Text"
            " Generation, Deep Research"
        ),
    )
    parser.add_argument(
        "--target-input-length",
        required=False,
        type=int,
        help=(
            "If specified, results will only show profiles that have an input"
            " length within 20% of the specified one. Only works alongside"
            " output length."
        ),
    )
    parser.add_argument(
        "--target-output-length",
        required=False,
        type=int,
        help=(
            "If specified, results will only show profiles that have an output"
            " length within 20% of the specified one. Only works alongside"
            " input length."
        ),
    )
    parser.add_argument(
        "--serving-stack",
        required=False,
        help=(
            "The serving stack to filter profiles by. If not"
            " provided, profiles for all serving stacks that support"
            " the given model and model server will be returned."
        ),
    )
    parser.add_argument(
        "--serving-stack-version",
        required=False,
        help=(
            "The serving stack version to filter profiles by. If not"
            " provided, profiles for all versions that support"
            " the given model and model server will be returned."
        ),
    )
    resource_printer.RegisterFormatter(
        profiles_printer.PROFILES_PRINTER_FORMAT,
        profiles_printer.ProfilePrinter,
    )
    resource_printer.RegisterFormatter(
        profiles_csv_printer.PROFILES_PRINTER_FORMAT,
        profiles_csv_printer.ProfileCSVPrinter,
    )
    parser.display_info.AddFormat(profiles_printer.PROFILES_PRINTER_FORMAT)

  def Run(self, args):
    """Fetches compatible accelerator profiles from the recommender API.

    Args:
      args: The parsed command-line arguments.

    Returns:
      The list of matching profiles, or an empty list when none matched.

    Raises:
      exceptions.HttpException: On an HTTP error from the API.
    """
    client = util.GetClientInstance(base.ReleaseTrack.GA)
    messages = util.GetMessagesModule(base.ReleaseTrack.GA)
    performance_requirements = messages.PerformanceRequirements()
    workload_spec = messages.WorkloadSpec()
    if args.target_ntpot_milliseconds:
      performance_requirements.targetNtpotMilliseconds = (
          args.target_ntpot_milliseconds
      )
    if args.target_ttft_milliseconds:
      performance_requirements.targetTtftMilliseconds = (
          args.target_ttft_milliseconds
      )
    if args.target_itl_milliseconds:
      performance_requirements.targetItlMilliseconds = (
          args.target_itl_milliseconds
      )
    if args.use_case:
      workload_spec.useCase = args.use_case
    if args.target_input_length:
      workload_spec.averageInputLength = args.target_input_length
    if args.target_output_length:
      workload_spec.averageOutputLength = args.target_output_length
    if (
        args.target_cost_per_million_output_tokens
        or args.target_cost_per_million_input_tokens
        or args.pricing_model
    ):
      cost = messages.Cost()
      if args.target_cost_per_million_output_tokens:
        units, nanos = decimal_to_amount(
            args.target_cost_per_million_output_tokens
        )
        cost.costPerMillionOutputTokens = messages.Amount(
            units=units, nanos=nanos
        )
      if args.target_cost_per_million_input_tokens:
        units, nanos = decimal_to_amount(
            args.target_cost_per_million_input_tokens
        )
        cost.costPerMillionInputTokens = messages.Amount(
            units=units, nanos=nanos
        )
      if args.pricing_model:
        cost.pricingModel = args.pricing_model
      performance_requirements.targetCost = cost
    serving_stack = None
    if args.serving_stack:
      serving_stack = messages.ServingStack(
          name=args.serving_stack,
      )
      if args.serving_stack_version:
        serving_stack.version = args.serving_stack_version
    try:
      request = messages.FetchProfilesRequest(
          model=args.model,
          modelServer=args.model_server,
          modelServerVersion=args.model_server_version,
          servingStack=serving_stack,
      )
      # Attach performanceRequirements only when at least one target was set.
      # Fixed: targetItlMilliseconds was missing from this check, so
      # --target-itl-milliseconds alone was silently dropped (the parallel
      # check in the manifests Create command does include it).
      if (
          performance_requirements.targetNtpotMilliseconds is not None
          or performance_requirements.targetTtftMilliseconds is not None
          or performance_requirements.targetItlMilliseconds is not None
          or performance_requirements.targetCost is not None
      ):
        request.performanceRequirements = performance_requirements
      if (
          workload_spec.useCase is not None
          or workload_spec.averageInputLength is not None
          or workload_spec.averageOutputLength is not None
      ):
        request.workloadSpec = workload_spec
      response = client.profiles.Fetch(request)
      if response.profile:
        return response.profile
      else:
        return []
    except apitools_exceptions.HttpError as error:
      raise exceptions.HttpException(error, util.HTTP_ERROR_FORMAT)

View File

@@ -0,0 +1,29 @@
# -*- coding: utf-8 -*- #
# Copyright 2025 Google LLC. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""The command group for the manifests CLI."""
from __future__ import absolute_import
from __future__ import division
from __future__ import unicode_literals
from googlecloudsdk.calliope import base
from googlecloudsdk.command_lib.projects import util
@base.UniverseCompatible
@base.ReleaseTracks(base.ReleaseTrack.ALPHA, base.ReleaseTrack.GA)
class Manifests(base.Group):
  """Generate optimized Kubernetes manifests."""
  # Command group container only; subcommands are registered by calliope
  # from sibling modules.

View File

@@ -0,0 +1,358 @@
# -*- coding: utf-8 -*- #
# Copyright 2025 Google LLC. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Generates optimized Kubernetes manifests for GKE Inference Quickstart."""
from apitools.base.py import exceptions as apitools_exceptions
from googlecloudsdk.api_lib.ai.recommender import util
from googlecloudsdk.api_lib.util import exceptions as api_lib_exceptions
from googlecloudsdk.calliope import base
from googlecloudsdk.core import exceptions
from googlecloudsdk.core import log
from googlecloudsdk.core.util import files
@base.DefaultUniverseOnly
@base.ReleaseTracks(base.ReleaseTrack.GA)
class Create(base.CreateCommand):
  """Generate ready-to-deploy Kubernetes manifests with compute, load balancing, and autoscaling capabilities.

  To get supported model, model servers, and model server versions, run `gcloud
  alpha container ai profiles model-and-server-combinations list`. To get
  supported accelerators with their performance metrics, run `gcloud alpha
  container ai profiles accelerators list`.
  """

  @staticmethod
  def Args(parser):
    """Registers command flags on the parser."""
    parser.add_argument(
        "--model",
        required=True,
        help="The model.",
    )
    parser.add_argument(
        "--model-server",
        required=True,
        help="The model server.",
    )
    parser.add_argument(
        "--model-server-version",
        help=(
            "The model server version. If not specified, this defaults to the"
            " latest version."
        ),
    )
    parser.add_argument(
        "--target-ntpot-milliseconds",
        type=int,
        help=(
            "The maximum normalized time per output token (NTPOT) in"
            " milliseconds. NTPOT is measured as the request_latency /"
            " output_tokens. If this is set, the manifests will include"
            " Horizontal Pod Autoscaler (HPA) resources which automatically"
            " adjust the model server replica count in response to changes in"
            " model server load to keep p50 NTPOT below the specified"
            " threshold. If the provided target-ntpot-milliseconds is too low"
            " to achieve, the HPA manifest will not be generated. "
        ),
    )
    parser.add_argument(
        "--target-ttft-milliseconds",
        type=int,
        help=(
            # Fixed: the previous help text was a garbled merge of two
            # sentences; reworded to parallel the NTPOT/ITL flags.
            "The target time to first token (TTFT) in milliseconds. If this"
            " is set, the manifests will include Horizontal Pod Autoscaler"
            " (HPA) resources which automatically adjust the model server"
            " replica count in response to changes in model server load to"
            " keep p50 TTFT below the specified threshold. If the provided"
            " target-ttft-milliseconds is too low to achieve, the HPA"
            " manifest will not be generated."
        ),
    )
    parser.add_argument(
        "--accelerator-type",
        required=True,
        help="The accelerator type.",
    )
    parser.add_argument(
        "--namespace",
        help=(
            "The namespace to deploy the manifests in. Default namespace is"
            " 'default'."
        ),
    )
    parser.add_argument(
        "--output",
        choices=["manifest", "comments", "all"],
        default="all",
        help="The output to display. Default is all.",
    )
    parser.add_argument(
        "--output-path",
        help=(
            "The path to save the output to. If not specified, output to the"
            " terminal."
        ),
    )
    parser.add_argument(
        "--model-bucket-uri",
        help=(
            "The Google Cloud Storage bucket URI to load the model from. This"
            " URI must point to the directory containing the model's config"
            " file (config.json) and model weights. If unspecified, defaults to"
            " loading the model from Hugging Face."
        ),
    )
    parser.add_argument(
        "--target-itl-milliseconds",
        type=int,
        help=(
            "The target inter-token latency (ITL) in milliseconds. If this is"
            " set, the manifest will include Horizontal Pod Autoscaler (HPA)"
            " resources which automatically adjust the model server replica"
            " count in response to changes in model server load to keep p50 ITL"
            " below the specified threshold. If the provided"
            " target-itl-milliseconds is too low to achieve, the HPA manifest"
            " will not be generated."
        ),
    )
    parser.add_argument(
        "--use-case",
        help=(
            "The manifest will be optimized for this use case. Options are:"
            " Advanced Customer Support, Code Completion, Text Summarization,"
            " Chatbot (ShareGPT), Code Generation, Deep Research. Will default"
            " to Chatbot if not specified."
        ),
    )
    parser.add_argument(
        "--serving-stack",
        required=False,
        help=(
            "The serving stack to filter manifests by. If not"
            " provided, manifests for all serving stacks that support"
            " the given model and model server will be considered."
        ),
    )
    parser.add_argument(
        "--serving-stack-version",
        required=False,
        help=(
            "The serving stack version to filter manifests by. If not"
            " provided, manifests for all versions that support"
            " the given model and model server will be considered."
        ),
    )

  def Run(self, args):
    """Calls the GenerateOptimizedManifest API.

    Args:
      args: The parsed command-line arguments.

    Returns:
      The GenerateOptimizedManifest response message.

    Raises:
      api_lib_exceptions.HttpException: On an HTTP error from the API.
    """
    client = util.GetClientInstance(base.ReleaseTrack.GA)
    messages = util.GetMessagesModule(base.ReleaseTrack.GA)
    try:
      model_server_info = messages.ModelServerInfo(
          model=args.model,
          modelServer=args.model_server,
          modelServerVersion=args.model_server_version,
      )
      performance_requirements = messages.PerformanceRequirements()
      if args.target_ntpot_milliseconds:
        performance_requirements.targetNtpotMilliseconds = (
            args.target_ntpot_milliseconds
        )
      if args.target_ttft_milliseconds:
        performance_requirements.targetTtftMilliseconds = (
            args.target_ttft_milliseconds
        )
      if args.target_itl_milliseconds:
        performance_requirements.targetItlMilliseconds = (
            args.target_itl_milliseconds
        )
      storage_config = messages.StorageConfig()
      if args.model_bucket_uri:
        storage_config.modelBucketUri = args.model_bucket_uri
      serving_stack = None
      if args.serving_stack:
        serving_stack = messages.ServingStack(
            name=args.serving_stack,
        )
        if args.serving_stack_version:
          serving_stack.version = args.serving_stack_version
      request = messages.GenerateOptimizedManifestRequest(
          modelServerInfo=model_server_info,
          acceleratorType=args.accelerator_type,
          kubernetesNamespace=args.namespace,
          servingStack=serving_stack,
      )
      # Attach optional sub-messages only when at least one field was set.
      if (
          performance_requirements.targetNtpotMilliseconds is not None
          or performance_requirements.targetTtftMilliseconds is not None
          or performance_requirements.targetItlMilliseconds is not None
      ):
        request.performanceRequirements = performance_requirements
      if storage_config.modelBucketUri is not None:
        request.storageConfig = storage_config
      if args.use_case:
        request.useCase = args.use_case
      response = client.optimizedManifest.Generate(request)
      return response
    except apitools_exceptions.HttpError as error:
      raise api_lib_exceptions.HttpException(error, util.HTTP_ERROR_FORMAT)

  def Display(self, args, resources):
    """Renders the generated manifests and/or comments.

    Args:
      args: The parsed command-line arguments (uses --output and
        --output-path).
      resources: The GenerateOptimizedManifest response from Run.
    """
    if not resources:
      log.out.Print("No manifests generated.")
      return
    output_content = ""
    if args.output != "comments":
      # Join manifests with YAML document separators so the concatenated
      # output is a valid multi-document stream.
      for manifest in resources.kubernetesManifests:
        output_content += manifest.content + "\n---\n"
    if resources.comments:
      comment_string = "\n".join([f"# {line}" for line in resources.comments])
      output_content += comment_string
    if args.output_path:
      try:
        # Fixed: output_content was previously passed as FileWriter's second
        # positional argument (`private`), unintentionally toggling private
        # file permissions instead of being written.
        with files.FileWriter(args.output_path) as f:
          f.write(output_content)
        log.out.Print(f"Output saved to {args.output_path}")
      except exceptions.Error as e:
        log.error(f"An error occurred while saving output to file: {e}")
    else:
      log.out.Print(output_content)
@base.DefaultUniverseOnly
@base.ReleaseTracks(base.ReleaseTrack.ALPHA)
class CreateAlpha(base.CreateCommand):
  """Generate ready-to-deploy Kubernetes manifests with compute, load balancing, and autoscaling capabilities.

  To get supported model, model servers, and model server versions, run `gcloud
  alpha container ai profiles model-and-server-combinations list`. To get
  supported accelerators with their performance metrics, run `gcloud alpha
  container ai profiles accelerators list`.
  """

  @staticmethod
  def Args(parser):
    """Registers flags for the alpha manifest-generation command."""
    parser.add_argument(
        "--model",
        required=True,
        help="The model.",
    )
    parser.add_argument(
        "--model-server",
        required=True,
        help="The model server.",
    )
    parser.add_argument(
        "--model-server-version",
        help=(
            "The model server version. If not specified, this defaults to the"
            " latest version."
        ),
    )
    parser.add_argument(
        "--target-ntpot-milliseconds",
        type=int,
        help=(
            "The maximum normalized time per output token (NTPOT) in"
            " milliseconds. NTPOT is measured as the request_latency /"
            " output_tokens. If this is set, the manifests will include"
            " Horizontal Pod Autoscaler (HPA) resources which automatically"
            " adjust the model server replica count in response to changes in"
            " model server load to keep p50 NTPOT below the specified"
            " threshold. If the provided target-ntpot-milliseconds is too low"
            " to achieve, the HPA manifest will not be generated. "
        ),
    )
    parser.add_argument(
        "--accelerator-type",
        required=True,
        help="The accelerator type.",
    )
    parser.add_argument(
        "--namespace",
        help=(
            "The namespace to deploy the manifests in. Default namespace is"
            " 'default'."
        ),
    )
    parser.add_argument(
        "--output",
        choices=["manifest", "comments", "all"],
        default="all",
        help="The output to display. Default is all.",
    )
    parser.add_argument(
        "--output-path",
        help=(
            "The path to save the output to. If not specified, output to the"
            " terminal."
        ),
    )
    parser.add_argument(
        "--model-bucket-uri",
        hidden=True,
        help=(
            "GCS bucket URI to pull model from. If not specified, default"
            " to the model hoster."
        ),
    )

  def Run(self, args):
    """Calls the alpha OptimizedManifest API.

    Args:
      args: The parsed command-line arguments.

    Returns:
      The OptimizedManifest API response, or [] if the call failed.
    """
    client = util.GetClientInstance(base.ReleaseTrack.ALPHA)
    messages = util.GetMessagesModule(base.ReleaseTrack.ALPHA)
    try:
      request = messages.GkerecommenderOptimizedManifestRequest(
          modelAndModelServerInfo_modelName=args.model,
          modelAndModelServerInfo_modelServerName=args.model_server,
          modelAndModelServerInfo_modelServerVersion=args.model_server_version,
          targetNtpotMilliseconds=args.target_ntpot_milliseconds,
          acceleratorType=args.accelerator_type,
          kubernetesNamespace=args.namespace,
          storageConfig_modelBucketUri=args.model_bucket_uri,
      )
      response = client.v1alpha1.OptimizedManifest(request)
      return response
    except exceptions.Error as e:
      log.error(f"An error has occurred: {e}")
      log.status.Print(f"An error has occurred: {e}")
      return []

  def Display(self, args, resources):
    """Writes generated manifests and/or comments to a file or the terminal."""
    if not resources:
      log.out.Print("No manifests generated.")
      return
    output_content = ""
    if args.output == "manifest" or args.output == "all":
      for manifest in resources.k8sManifests:
        output_content += manifest.content + "\n---\n"
    if args.output == "comments" or args.output == "all":
      if resources.comments:
        comment_string = "\n".join([f"# {line}" for line in resources.comments])
        output_content += comment_string
    if args.output_path:
      try:
        # Bug fix: FileWriter's second positional parameter is `private`, not
        # the content; passing the content string silently forced a private
        # (0600) file. The content is written via f.write below.
        with files.FileWriter(args.output_path) as f:
          f.write(output_content)
        log.out.Print(f"Output saved to {args.output_path}")
      except exceptions.Error as e:
        log.error(f"An error occurred while saving output to file: {e}")
    else:
      log.out.Print(output_content)

View File

@@ -0,0 +1,29 @@
# -*- coding: utf-8 -*- #
# Copyright 2025 Google LLC. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""The command group for the model and model server combinations CLI."""
from __future__ import absolute_import
from __future__ import division
from __future__ import unicode_literals
from googlecloudsdk.calliope import base
from googlecloudsdk.command_lib.projects import util
@base.UniverseCompatible
@base.ReleaseTracks(base.ReleaseTrack.ALPHA)
class ModelServers(base.Group):
  """Manage supported model and model server combinations for GKE Inference Quickstart.

  Parent group for the command that lists the model, model server, and model
  server version combinations supported by GKE Inference Quickstart.
  """

View File

@@ -0,0 +1,81 @@
# -*- coding: utf-8 -*- #
# Copyright 2025 Google LLC. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Lists supported model and server combinations for GKE Inference Quickstart."""
from googlecloudsdk.api_lib.ai.recommender import util
from googlecloudsdk.calliope import base
from googlecloudsdk.command_lib.run import commands
from googlecloudsdk.core import exceptions
from googlecloudsdk.core import log
_EXAMPLES = """
To list all supported model and server combinations, run:
$ {command}
"""
@base.DefaultUniverseOnly
@base.ReleaseTracks(base.ReleaseTrack.ALPHA)
class List(commands.List):
  """List supported model and server combinations.

  This command lists all supported model, model server, and model server version
  combinations.
  """

  # Surface the previously-unused module-level examples in `--help`.
  detailed_help = {"EXAMPLES": _EXAMPLES}

  @staticmethod
  def Args(parser):
    """Registers the optional filter flags and the table format."""
    parser.add_argument(
        "--model",
        help="The model. If not specified, this defaults to any model.",
    )
    parser.add_argument(
        "--model-server",
        help=(
            "The model server. If not specified, this defaults to any model"
            " server."
        ),
    )
    # Typo fix: help text previously read "defaults to the any model server
    # version".
    parser.add_argument(
        "--model-server-version",
        help=(
            "The model server version. If not specified, this defaults to"
            " any model server version."
        ),
    )
    parser.display_info.AddFormat(
        "table(modelName, modelServerName, modelServerVersion)"
    )

  def Run(self, args):
    """Lists matching combinations; returns [] and logs on API error."""
    client = util.GetClientInstance(base.ReleaseTrack.ALPHA)
    messages = util.GetMessagesModule(base.ReleaseTrack.ALPHA)
    try:
      request = messages.GkerecommenderModelsAndServersListRequest(
          modelName=args.model,
          modelServerName=args.model_server,
          modelServerVersion=args.model_server_version,
      )
      response = client.modelsAndServers.List(request)
      return response.modelAndModelServerInfo or []
    except exceptions.Error as e:
      log.error(f"An error has occurred: {e}")
      log.status.Print(f"An error has occurred: {e}")
      return []

View File

@@ -0,0 +1,29 @@
# -*- coding: utf-8 -*- #
# Copyright 2025 Google LLC. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""The command group for the model server versions CLI."""
from __future__ import absolute_import
from __future__ import division
from __future__ import unicode_literals
from googlecloudsdk.calliope import base
from googlecloudsdk.command_lib.projects import util
@base.UniverseCompatible
@base.ReleaseTracks(base.ReleaseTrack.ALPHA, base.ReleaseTrack.GA)
class ModelServers(base.Group):
  # NOTE(review): the class is named ModelServers inside the
  # model-server-versions module — looks like copy-paste; presumably calliope
  # derives the CLI group name from the directory, so this is cosmetic.
  # Confirm before renaming.
  """Manage supported model server versions for GKE Inference Quickstart."""

View File

@@ -0,0 +1,137 @@
# -*- coding: utf-8 -*- #
# Copyright 2025 Google LLC. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Lists supported model server versions for GKE Inference Quickstart."""
from apitools.base.py import exceptions as apitools_exceptions
from googlecloudsdk.api_lib.ai.recommender import util
from googlecloudsdk.api_lib.util import exceptions as api_lib_exceptions
from googlecloudsdk.calliope import base
from googlecloudsdk.command_lib.run import commands
from googlecloudsdk.core import exceptions
from googlecloudsdk.core import log
_EXAMPLES = """
To list all supported model server versions for a model and model server, run:
$ {command} --model=deepseek-ai/DeepSeek-R1-Distill-Qwen-7B --model-server=vllm
"""
@base.DefaultUniverseOnly
@base.ReleaseTracks(base.ReleaseTrack.GA)
class List(commands.List):
  """List supported model server versions.

  To get supported model and model servers, run `gcloud container ai
  profiles models list` and `gcloud container ai profiles
  model-servers list`.
  """

  # Surface the previously-unused module-level examples in `--help`.
  detailed_help = {"EXAMPLES": _EXAMPLES}

  @staticmethod
  def Args(parser):
    """Registers the required model and model-server flags."""
    parser.add_argument(
        "--model",
        required=True,
        help="The model.",
    )
    # Help-text fix: the flag is required, so it cannot "default to any model
    # server" as the previous help claimed.
    parser.add_argument(
        "--model-server",
        required=True,
        help="The model server.",
    )

  def Run(self, args):
    """Fetches supported versions for the model / model-server pair.

    Args:
      args: The parsed command-line arguments.

    Returns:
      The supported model server versions, possibly empty.

    Raises:
      api_lib_exceptions.HttpException: If the API call fails.
    """
    client = util.GetClientInstance(base.ReleaseTrack.GA)
    messages = util.GetMessagesModule(base.ReleaseTrack.GA)
    try:
      request = messages.GkerecommenderModelServerVersionsFetchRequest(
          model=args.model, modelServer=args.model_server
      )
      response = client.modelServerVersions.Fetch(request)
      return response.modelServerVersions or []
    except apitools_exceptions.HttpError as error:
      raise api_lib_exceptions.HttpException(error, util.HTTP_ERROR_FORMAT)

  def Display(self, _, resources):
    """Prints the versions as a bulleted list."""
    if resources:
      log.out.Print("Supported model server versions:")
      for model_server_version in resources:
        log.out.Print("- ", model_server_version)
    else:
      log.out.Print("No supported model server versions found.")
@base.DefaultUniverseOnly
@base.ReleaseTracks(base.ReleaseTrack.ALPHA)
class ListAlpha(commands.List):
  """List supported model server versions.

  To get supported model and model servers, run `gcloud alpha container ai
  profiles models list` and `gcloud alpha container ai profiles
  model-servers list`.

  Alternatively, run `gcloud alpha container ai profiles
  model-and-server-combinations list` to get all supported model and server
  combinations.
  """

  @staticmethod
  def Args(parser):
    """Registers the required model and model-server flags."""
    parser.add_argument(
        "--model",
        required=True,
        help="The model.",
    )
    # Help-text fix: the flag is required, so it cannot "default to any model
    # server" as the previous help claimed.
    parser.add_argument(
        "--model-server",
        required=True,
        help="The model server.",
    )

  def Run(self, args):
    """Lists supported versions; returns [] and logs on API error."""
    client = util.GetClientInstance(base.ReleaseTrack.ALPHA)
    messages = util.GetMessagesModule(base.ReleaseTrack.ALPHA)
    try:
      request = messages.GkerecommenderModelServersVersionsListRequest(
          modelName=args.model, modelServerName=args.model_server
      )
      response = client.modelServers_versions.List(request)
      return response.modelServerVersions or []
    except exceptions.Error as e:
      log.error(f"An error has occurred: {e}")
      log.status.Print(f"An error has occurred: {e}")
      return []

  def Display(self, _, resources):
    """Prints the versions as a bulleted list."""
    if resources:
      log.out.Print("Supported model server versions:")
      for model_server_version in resources:
        log.out.Print("- ", model_server_version)
    else:
      log.out.Print("No supported model server versions found.")

View File

@@ -0,0 +1,29 @@
# -*- coding: utf-8 -*- #
# Copyright 2025 Google LLC. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""The command group for the model servers CLI."""
from __future__ import absolute_import
from __future__ import division
from __future__ import unicode_literals
from googlecloudsdk.calliope import base
from googlecloudsdk.command_lib.projects import util
@base.UniverseCompatible
@base.ReleaseTracks(base.ReleaseTrack.ALPHA, base.ReleaseTrack.GA)
class ModelServers(base.Group):
  """Manage supported model servers for GKE Inference Quickstart.

  Parent group for the command that lists the model servers supported for a
  given model.
  """

View File

@@ -0,0 +1,118 @@
# -*- coding: utf-8 -*- #
# Copyright 2025 Google LLC. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Lists supported model servers for GKE Inference Quickstart."""
from apitools.base.py import exceptions as apitools_exceptions
from googlecloudsdk.api_lib.ai.recommender import util
from googlecloudsdk.api_lib.util import exceptions as api_lib_exceptions
from googlecloudsdk.calliope import base
from googlecloudsdk.command_lib.run import commands
from googlecloudsdk.core import exceptions
from googlecloudsdk.core import log
_EXAMPLE = """
To list all supported model servers for a model, run:
$ {command} --model=deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
"""
@base.DefaultUniverseOnly
@base.ReleaseTracks(base.ReleaseTrack.GA)
class List(commands.List):
  """List supported model servers for a given model.

  To get supported models, run `gcloud container ai profiles models
  list`.
  """

  @staticmethod
  def Args(parser):
    """Registers the required --model flag."""
    parser.add_argument(
        "--model",
        required=True,
        help="The model.",
    )

  def Run(self, args):
    """Fetches the model servers supported for args.model.

    Raises:
      api_lib_exceptions.HttpException: If the API call fails.
    """
    ga_track = base.ReleaseTrack.GA
    client = util.GetClientInstance(ga_track)
    messages = util.GetMessagesModule(ga_track)
    request = messages.GkerecommenderModelServersFetchRequest(model=args.model)
    try:
      response = client.modelServers.Fetch(request)
    except apitools_exceptions.HttpError as error:
      raise api_lib_exceptions.HttpException(error, util.HTTP_ERROR_FORMAT)
    return response.modelServers or []

  def Display(self, _, resources):
    """Prints the model server names as a bulleted list."""
    if not resources:
      log.out.Print("No supported model servers found.")
      return
    log.out.Print("Supported model servers:")
    for server_name in resources:
      log.out.Print("- ", server_name)
@base.DefaultUniverseOnly
@base.ReleaseTracks(base.ReleaseTrack.ALPHA)
class ListAlpha(commands.List):
  """List supported model servers for a given model.

  To get supported models, run `gcloud alpha container ai profiles models
  list` or to get all supported model and server combinations, run `gcloud alpha
  container ai profiles model-and-server-combinations
  list`.
  """

  @staticmethod
  def Args(parser):
    """Registers the required --model flag."""
    parser.add_argument(
        "--model",
        required=True,
        help="The model.",
    )

  def Run(self, args):
    """Lists model servers for args.model; returns [] and logs on error."""
    alpha_track = base.ReleaseTrack.ALPHA
    client = util.GetClientInstance(alpha_track)
    messages = util.GetMessagesModule(alpha_track)
    request = messages.GkerecommenderModelServersListRequest(
        modelName=args.model
    )
    try:
      response = client.modelServers.List(request)
    except exceptions.Error as e:
      log.error(f"An error has occurred: {e}")
      log.status.Print(f"An error has occurred: {e}")
      return []
    return response.modelServerNames or []

  def Display(self, _, resources):
    """Prints the model server names as a bulleted list."""
    if not resources:
      log.out.Print("No supported model servers found.")
      return
    log.out.Print("Supported model servers:")
    for server_name in resources:
      log.out.Print("- ", server_name)

View File

@@ -0,0 +1,29 @@
# -*- coding: utf-8 -*- #
# Copyright 2025 Google LLC. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""The command group for the models CLI."""
from __future__ import absolute_import
from __future__ import division
from __future__ import unicode_literals
from googlecloudsdk.calliope import base
from googlecloudsdk.command_lib.projects import util
@base.UniverseCompatible
@base.ReleaseTracks(base.ReleaseTrack.ALPHA, base.ReleaseTrack.GA)
class Models(base.Group):
  """Manage supported models for GKE Inference Quickstart.

  Parent group for the command that lists the models supported by GKE
  Inference Quickstart.
  """

View File

@@ -0,0 +1,88 @@
# -*- coding: utf-8 -*- #
# Copyright 2025 Google LLC. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Lists supported models for GKE Inference Quickstart."""
from apitools.base.py import exceptions as apitools_exceptions
from googlecloudsdk.api_lib.ai.recommender import util
from googlecloudsdk.api_lib.util import exceptions as api_lib_exceptions
from googlecloudsdk.calliope import base
from googlecloudsdk.command_lib.run import commands
from googlecloudsdk.core import exceptions
from googlecloudsdk.core import log
_EXAMPLES = """
To list all supported models, run:
$ {command}
"""
@base.DefaultUniverseOnly
@base.ReleaseTracks(base.ReleaseTrack.GA)
class List(commands.List):
  """List supported models."""

  def Run(self, _):
    """Fetches all models supported by GKE Inference Quickstart.

    Raises:
      api_lib_exceptions.HttpException: If the API call fails.
    """
    ga_track = base.ReleaseTrack.GA
    client = util.GetClientInstance(ga_track)
    messages = util.GetMessagesModule(ga_track)
    try:
      response = client.models.Fetch(
          messages.GkerecommenderModelsFetchRequest()
      )
    except apitools_exceptions.HttpError as error:
      raise api_lib_exceptions.HttpException(error, util.HTTP_ERROR_FORMAT)
    return response.models or []

  def Display(self, _, resources):
    """Prints the model names as a bulleted list."""
    if not resources:
      log.out.Print("No supported models found.")
      return
    log.out.Print("Supported models:")
    for model_name in resources:
      log.out.Print("- ", model_name)
@base.DefaultUniverseOnly
@base.ReleaseTracks(base.ReleaseTrack.ALPHA)
class ListAlpha(commands.List):
  """List supported models."""

  def Run(self, _):
    """Lists all models via the alpha API; returns [] and logs on error."""
    client = util.GetClientInstance(base.ReleaseTrack.ALPHA)
    messages = util.GetMessagesModule(base.ReleaseTrack.ALPHA)
    try:
      response = client.models.List(messages.GkerecommenderModelsListRequest())
      return response.modelNames or []
    except exceptions.Error as e:
      log.error(f"An error has occurred: {e}")
      # Typo fix: the user-facing message previously read "occured".
      log.status.Print(f"An error has occurred: {e}")
      return []

  def Display(self, _, resources):
    """Prints the model names as a bulleted list."""
    if resources:
      log.out.Print("Supported models:")
      for model_name in resources:
        log.out.Print("- ", model_name)
    else:
      log.out.Print("No supported models found.")

View File

@@ -0,0 +1,24 @@
# -*- coding: utf-8 -*- #
# Copyright 2025 Google LLC. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""The command group for the serving stack versions CLI."""
from googlecloudsdk.calliope import base
from googlecloudsdk.command_lib.projects import util
@base.UniverseCompatible
@base.ReleaseTracks(base.ReleaseTrack.GA)
class ServingStackVersions(base.Group):
  """List supported serving stack versions for GKE Inference Quickstart.

  Parent group for the command that lists the serving stack versions used to
  generate the inference profiles.
  """

View File

@@ -0,0 +1,69 @@
# -*- coding: utf-8 -*- #
# Copyright 2025 Google LLC. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Lists supported serving stack versions for GKE Inference Quickstart."""
from apitools.base.py import exceptions as apitools_exceptions
from googlecloudsdk.api_lib.ai.recommender import util
from googlecloudsdk.api_lib.util import exceptions as api_lib_exceptions
from googlecloudsdk.calliope import base
from googlecloudsdk.command_lib.run import commands
_EXAMPLES = """
To list all supported serving stack versions, run:
$ {command} --serving-stack=llm-d
"""
@base.ReleaseTracks(base.ReleaseTrack.GA)
class List(commands.List):
  """List supported serving stack versions that were used to generate the inference profiles."""

  # Surface the previously-unused module-level examples in `--help`.
  detailed_help = {"EXAMPLES": _EXAMPLES}

  @staticmethod
  def Args(parser):
    """Registers filter flags and the single-column table format."""
    parser.display_info.AddFormat("table(version)")
    parser.add_argument(
        "--model",
        help="The model to filter serving stack versions by.",
    )
    parser.add_argument(
        "--model-server",
        help="The model server to filter serving stack versions by.",
    )
    parser.add_argument(
        "--serving-stack",
        required=True,
        help="The serving stack to filter serving stack versions by.",
    )

  def Run(self, args):
    """Fetches serving stack versions matching the given filters.

    Args:
      args: The parsed command-line arguments.

    Returns:
      A list of {"version": ...} rows for the table formatter, possibly empty.

    Raises:
      api_lib_exceptions.HttpException: If the API call fails.
    """
    client = util.GetClientInstance(base.ReleaseTrack.GA)
    messages = util.GetMessagesModule(base.ReleaseTrack.GA)
    try:
      response = client.servingStackVersions.Fetch(
          messages.GkerecommenderServingStackVersionsFetchRequest(
              model=args.model,
              modelServer=args.model_server,
              servingStack=args.serving_stack,
          )
      )
      # Wrap each bare version string in a dict so table(version) can
      # project it.
      return [{"version": v} for v in response.servingStackVersions or []]
    except apitools_exceptions.HttpError as error:
      raise api_lib_exceptions.HttpException(error, util.HTTP_ERROR_FORMAT)

View File

@@ -0,0 +1,24 @@
# -*- coding: utf-8 -*- #
# Copyright 2025 Google LLC. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""The command group for the serving stack CLI."""
from googlecloudsdk.calliope import base
from googlecloudsdk.command_lib.projects import util
@base.UniverseCompatible
@base.ReleaseTracks(base.ReleaseTrack.GA)
class ServingStacks(base.Group):
  """List supported serving stacks for GKE Inference Quickstart.

  Parent group for the command that lists the serving stacks used to generate
  the inference profiles.
  """

View File

@@ -0,0 +1,65 @@
# -*- coding: utf-8 -*- #
# Copyright 2025 Google LLC. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Lists supported serving stacks for GKE Inference Quickstart."""
from apitools.base.py import exceptions as apitools_exceptions
from googlecloudsdk.api_lib.ai.recommender import util
from googlecloudsdk.api_lib.util import exceptions as api_lib_exceptions
from googlecloudsdk.calliope import base
from googlecloudsdk.command_lib.run import commands
_EXAMPLES = """
To list all supported serving stacks, run:
$ {command}
"""
@base.ReleaseTracks(base.ReleaseTrack.GA)
class List(commands.List):
  """List supported serving stacks that were used to generate the inference profiles."""

  # Surface the previously-unused module-level examples in `--help`.
  detailed_help = {"EXAMPLES": _EXAMPLES}

  @staticmethod
  def Args(parser):
    """Registers the optional filter flags and the table format."""
    parser.display_info.AddFormat("table(name,version)")
    parser.add_argument(
        "--model",
        help="The model to filter serving stacks by.",
    )
    parser.add_argument(
        "--model-server",
        help="The model server to filter serving stacks by.",
    )

  def Run(self, args):
    """Fetches serving stacks matching the optional filters.

    Args:
      args: The parsed command-line arguments.

    Returns:
      The supported serving stacks, possibly empty.

    Raises:
      api_lib_exceptions.HttpException: If the API call fails.
    """
    client = util.GetClientInstance(base.ReleaseTrack.GA)
    messages = util.GetMessagesModule(base.ReleaseTrack.GA)
    try:
      response = client.servingStacks.Fetch(
          messages.GkerecommenderServingStacksFetchRequest(
              model=args.model,
              modelServer=args.model_server,
          )
      )
      return response.servingStacks or []
    except apitools_exceptions.HttpError as error:
      raise api_lib_exceptions.HttpException(error, util.HTTP_ERROR_FORMAT)

View File

@@ -0,0 +1,29 @@
# -*- coding: utf-8 -*- #
# Copyright 2025 Google LLC. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""The command group for the models CLI."""
from __future__ import absolute_import
from __future__ import division
from __future__ import unicode_literals
from googlecloudsdk.calliope import base
from googlecloudsdk.command_lib.projects import util
@base.UniverseCompatible
@base.ReleaseTracks(base.ReleaseTrack.GA)
class UseCase(base.Group):
  """List supported use cases for GKE Inference Quickstart.

  Parent group for the command that lists the use cases used to generate the
  inference profiles.
  """

View File

@@ -0,0 +1,54 @@
# -*- coding: utf-8 -*- #
# Copyright 2025 Google LLC. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Lists supported use cases for GKE Inference Quickstart."""
from apitools.base.py import exceptions as apitools_exceptions
from googlecloudsdk.api_lib.ai.recommender import util
from googlecloudsdk.api_lib.util import exceptions as api_lib_exceptions
from googlecloudsdk.calliope import base
from googlecloudsdk.command_lib.run import commands
_EXAMPLES = """
To list all supported use cases, run:
$ {command}
"""
@base.ReleaseTracks(base.ReleaseTrack.GA)
class List(commands.List):
"""List supported use cases that were used to generate the inference profiles."""
  @staticmethod
  def Args(parser):
    # One table row per use case: the use-case name plus its average input and
    # output lengths.
    parser.display_info.AddFormat(
        "table(useCase,averageInputLength,averageOutputLength)"
    )
def Run(self, _):
client = util.GetClientInstance(base.ReleaseTrack.GA)
messages = util.GetMessagesModule(base.ReleaseTrack.GA)
try:
response = client.useCases.Fetch(
messages.FetchUseCasesRequest()
)
if response.workloadSpecs:
return response.workloadSpecs
else:
return []
except apitools_exceptions.HttpError as error:
raise api_lib_exceptions.HttpException(error, util.HTTP_ERROR_FORMAT)