feat: Add new gcloud commands, API clients, and third-party libraries across various services.

This commit is contained in:
2026-01-01 20:26:35 +01:00
parent 5e23cbece0
commit a19e592eb7
25221 changed files with 8324611 additions and 0 deletions

View File

@@ -0,0 +1,45 @@
# -*- coding: utf-8 -*- #
# Copyright 2025 Google LLC. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""The command group for the profiles CLI."""
from __future__ import absolute_import
from __future__ import division
from __future__ import unicode_literals
from googlecloudsdk.calliope import base
from googlecloudsdk.command_lib.projects import util
@base.UniverseCompatible
@base.ReleaseTracks(base.ReleaseTrack.ALPHA, base.ReleaseTrack.GA)
class Profiles(base.Group):
  """Quickstart engine for GKE AI workloads.

  The GKE Inference Quickstart helps simplify deploying AI inference on Google
  Kubernetes Engine (GKE). It provides tailored profiles based on
  Google's internal benchmarks. Provide inputs like your preferred open-source
  model (e.g. Llama, Gemma, or Mistral) and your application's performance
  target. Based on these inputs, the quickstart generates accelerator choices
  with performance metrics, and detailed, ready-to-deploy profiles for
  compute, load balancing, and autoscaling. These profiles are provided
  as standard Kubernetes YAML manifests, which you can deploy or modify.

  To visualize the benchmarking data that support these estimates, see the
  accompanying Colab notebook:
  https://colab.research.google.com/github/GoogleCloudPlatform/kubernetes-engine-samples/blob/main/ai-ml/notebooks/giq_visualizations.ipynb
  """

  # Places this group under the SDK tools category in `gcloud` help output.
  category = base.SDK_TOOLS_CATEGORY

View File

@@ -0,0 +1,29 @@
# -*- coding: utf-8 -*- #
# Copyright 2025 Google LLC. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""The command group for the accelerators CLI."""
from __future__ import absolute_import
from __future__ import division
from __future__ import unicode_literals
from googlecloudsdk.calliope import base
from googlecloudsdk.command_lib.projects import util
@base.UniverseCompatible
@base.ReleaseTracks(base.ReleaseTrack.ALPHA)
class ModelServers(base.Group):
  """Manage supported accelerators for GKE Inference Quickstart."""
  # NOTE(review): the class is named ModelServers, but this docstring (and
  # the module docstring above) describe accelerators -- confirm which
  # resource this command group is meant to expose.

View File

@@ -0,0 +1,190 @@
# -*- coding: utf-8 -*- #
# Copyright 2025 Google LLC. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Lists compatible accelerator profiles for GKE Inference Quickstart."""
from googlecloudsdk.api_lib.ai.recommender import util
from googlecloudsdk.calliope import base
from googlecloudsdk.command_lib.run import commands
from googlecloudsdk.command_lib.run.printers import profiles_printer
from googlecloudsdk.core import exceptions
from googlecloudsdk.core import log
from googlecloudsdk.core.resource import resource_printer
_EXAMPLES = """
To list compatible accelerator profiles for a model, run:
$ {command} --model=deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
"""
def decimal_to_amount(decimal_value):
  """Converts a decimal value to the (units, nanos) pair of an Amount proto.

  Args:
    decimal_value: A numeric value, e.g. a cost in USD.

  Returns:
    A (units, nanos) tuple: units is the integral part (truncated toward
    zero) and nanos is the fractional part expressed in nano units, carrying
    the same sign as the value.
  """
  units = int(decimal_value)
  # round() instead of int(): plain truncation loses a nano unit to float
  # representation error (e.g. 1.2 would yield nanos == 199999999).
  nanos = round((decimal_value - units) * 1e9)
  return (units, nanos)
@base.DefaultUniverseOnly
@base.ReleaseTracks(base.ReleaseTrack.ALPHA)
class List(commands.List):
  """List compatible accelerator profiles.

  This command lists all supported accelerators with their performance details.
  By default, the supported accelerators are displayed in a table format with
  select information for each accelerator. To see all details, use
  --format=yaml.

  To get supported model, model servers, and model server versions, run `gcloud
  alpha container ai profiles models list`, `gcloud alpha container ai
  profiles model-servers list`, and `gcloud alpha container ai profiles
  model-server-versions list`.

  Alternatively, run `gcloud alpha container ai profiles
  model-and-server-combinations list` to get all supported model and server
  combinations.
  """

  @staticmethod
  def Args(parser):
    """Registers command flags and output formats on the parser."""
    parser.add_argument(
        "--model",
        required=True,
        help="The model.",
    )
    parser.add_argument(
        "--model-server",
        help=(
            "The model server. If not specified, this defaults to any model"
            " server."
        ),
    )
    parser.add_argument(
        "--model-server-version",
        help=(
            "The model server version. If not specified, this defaults to the"
            " latest version."
        ),
    )
    parser.add_argument(
        "--max-ntpot-milliseconds",
        type=int,
        help=(
            "The maximum normalized time per output token (NTPOT) in"
            " milliseconds. NTPOT is measured as the request_latency /"
            " output_tokens. If this field is set, the command will only return"
            " accelerators that can meet the target ntpot milliseconds and"
            " display their throughput performance at the target latency."
            " Otherwise, the command will return all accelerators and display"
            " their highest throughput performance."
        ),
    )
    parser.add_argument(
        "--target-cost-per-million-output-tokens",
        hidden=True,
        type=float,
        required=False,
        help=(
            "The target cost per million output tokens to filter profiles by,"
            " unit is 1 USD up to 5 decimal places."
        ),
    )
    parser.add_argument(
        "--target-cost-per-million-input-tokens",
        hidden=True,
        type=float,
        required=False,
        help=(
            "The target cost per million input tokens to filter profiles by,"
            " unit is 1 USD up to 5 decimal places."
        ),
    )
    parser.add_argument(
        "--pricing-model",
        hidden=True,
        required=False,
        type=str,
        help=(
            "The pricing model to use to calculate token cost. Currently, this"
            " supports on-demand, spot, 3-years-cud, 1-year-cud"
        ),
    )
    parser.add_argument(
        "--format",
        type=str,
        help="The format to use for the output. Default is table. yaml|table",
    )
    resource_printer.RegisterFormatter(
        profiles_printer.PROFILES_PRINTER_FORMAT,
        profiles_printer.ProfilePrinter,
    )
    parser.display_info.AddFormat(profiles_printer.PROFILES_PRINTER_FORMAT)
    # NOTE(review): this second AddFormat overrides the profiles-printer
    # format registered just above, making the table below the effective
    # default -- confirm both calls are intentional.
    parser.display_info.AddFormat(
        "table("
        "acceleratorType,"
        "modelAndModelServerInfo.modelName,"
        "modelAndModelServerInfo.modelServerName,"
        "modelAndModelServerInfo.modelServerVersion,"
        "resourcesUsed.acceleratorCount,"
        "performanceStats.outputTokensPerSecond,"
        "performanceStats.ntpotMilliseconds"
        ")"
    )

  def Run(self, args):
    """Fetches compatible accelerators from the recommender API.

    Args:
      args: The parsed command-line arguments.

    Returns:
      The accelerators List response, or an empty list when nothing was
      returned or an API error occurred (the error is logged and printed
      instead of being raised).
    """
    client = util.GetClientInstance(base.ReleaseTrack.ALPHA)
    messages = util.GetMessagesModule(base.ReleaseTrack.ALPHA)
    try:
      # The underscore-joined field names mirror the flattened query
      # parameters of the generated apitools request message.
      request = messages.GkerecommenderAcceleratorsListRequest(
          modelName=args.model,
          modelServerName=args.model_server,
          modelServerVersion=args.model_server_version,
          performanceRequirements_maxNtpotMilliseconds=args.max_ntpot_milliseconds,
          performanceRequirements_cost_pricingModel=args.pricing_model,
      )
      if args.target_cost_per_million_output_tokens:
        units, nanos = decimal_to_amount(
            args.target_cost_per_million_output_tokens
        )
        request.performanceRequirements_cost_costPerMillionNormalizedOutputTokens_units = (
            units
        )
        request.performanceRequirements_cost_costPerMillionNormalizedOutputTokens_nanos = (
            nanos
        )
      if args.target_cost_per_million_input_tokens:
        units, nanos = decimal_to_amount(
            args.target_cost_per_million_input_tokens
        )
        request.performanceRequirements_cost_costPerMillionInputTokens_units = (
            units
        )
        request.performanceRequirements_cost_costPerMillionInputTokens_nanos = (
            nanos
        )
      response = client.accelerators.List(request)
      # Stash server-provided comments so the display layer can surface them.
      self.comments = response.comments
      if response:
        return response
      else:
        return []
    except exceptions.Error as e:
      log.error(f"An error has occurred: {e}")
      log.status.Print(f"An error has occurred: {e}")
      return []

View File

@@ -0,0 +1,29 @@
# -*- coding: utf-8 -*- #
# Copyright 2025 Google LLC. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""The command group for the benchmarks CLI."""
from __future__ import absolute_import
from __future__ import division
from __future__ import unicode_literals
from googlecloudsdk.calliope import base
from googlecloudsdk.command_lib.projects import util
@base.UniverseCompatible
@base.ReleaseTracks(base.ReleaseTrack.GA)
class Benchmarks(base.Group):
  """Manage benchmarks for GKE Inference Quickstart."""
  # Command group container only; subcommands are registered by calliope
  # from sibling modules.

View File

@@ -0,0 +1,179 @@
# -*- coding: utf-8 -*- #
# Copyright 2025 Google LLC. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Outputs benchmarking data for GKE Inference Quickstart."""
from apitools.base.py import exceptions as apitools_exceptions
from googlecloudsdk.api_lib.ai.recommender import util
from googlecloudsdk.api_lib.util import exceptions
from googlecloudsdk.calliope import base
from googlecloudsdk.command_lib.run import commands
from googlecloudsdk.command_lib.run.printers import profiles_csv_printer
from googlecloudsdk.core.resource import resource_printer
_EXAMPLE = """
To get benchmarking data for a given model and model server, run:
$ {command} --model=google/gemma-2-27b-it --model-server=vllm --pricing-model=spot
"""
def amount_to_decimal(cost):
  """Converts an Amount-like cost message to a decimal string.

  Args:
    cost: A message with integer `units` and `nanos` fields; either field
      may be None/unset on the proto.

  Returns:
    The value formatted with three decimal places, e.g. "3.500".
  """
  units = cost.units or 0
  # nanos may also be unset on the proto; default it to 0 instead of
  # raising a TypeError on the division below.
  nanos = cost.nanos or 0
  decimal_value = units + nanos / 1e9
  return f"{decimal_value:.3f}"
def get_decimal_cost(costs):
  """Returns per-million input and output token costs as decimal strings.

  Args:
    costs: A list of cost messages; only the first entry is consulted.

  Returns:
    A (input_token_cost, output_token_cost) tuple of strings; "N/A" is used
    for either component whose field is missing or unset.
  """
  output_token_cost = "N/A"
  if costs and costs[0].costPerMillionOutputTokens:
    output_token_cost = amount_to_decimal(costs[0].costPerMillionOutputTokens)
  input_token_cost = "N/A"
  if costs and costs[0].costPerMillionInputTokens:
    input_token_cost = amount_to_decimal(costs[0].costPerMillionInputTokens)
  return (input_token_cost, output_token_cost)
@base.DefaultUniverseOnly
@base.ReleaseTracks(base.ReleaseTrack.GA)
class List(commands.List):
  """List benchmarks for a given model and model server.

  This command lists all benchmarking data for a given model and model server.
  By default, the benchmarks are displayed in a CSV format.

  For examples of visualizing the benchmarking data, see the accompanying Colab
  notebook:
  https://colab.research.google.com/github/GoogleCloudPlatform/kubernetes-engine-samples/blob/main/ai-ml/notebooks/giq_visualizations.ipynb
  """

  @staticmethod
  def Args(parser):
    """Registers command flags and the CSV output format on the parser."""
    parser.add_argument(
        "--model",
        required=True,
        help="The model.",
    )
    parser.add_argument(
        "--model-server",
        required=True,
        help="The model server.",
    )
    parser.add_argument(
        "--model-server-version",
        help=(
            "The model server version. Default is latest. Other options include"
            " the model server version of a profile, all which returns all"
            " versions."
        ),
    )
    parser.add_argument(
        "--instance-type",
        help=(
            # Fixed: fragments previously concatenated to "anyinstance type."
            # (missing space at the join).
            "The instance type. If not specified, this defaults to any"
            " instance type."
        ),
    )
    parser.add_argument(
        "--format",
        help=(
            # Fixed: fragments previously concatenated to
            # "includingcost conversions." (missing space at the join).
            "The format to print the output in. Default is csvprofile, which"
            " displays the profile information in a CSV format, including"
            " cost conversions."
        ),
    )
    parser.add_argument(
        "--pricing-model",
        required=False,
        help=(
            "The pricing model to use to calculate token cost. Currently, this"
            " supports on-demand, spot, 3-years-cud, 1-year-cud"
        ),
    )
    parser.add_argument(
        "--use-case",
        required=False,
        help=(
            "If specified, results will only show profiles that match the"
            " provided use case. Options are: Advanced Customer Support, Code"
            " Completion, Text Summarization, Chatbot (ShareGPT), Code"
            " Generation, Deep Research."
        ),
    )
    parser.add_argument(
        "--serving-stack",
        required=False,
        help=(
            "The serving stack to filter benchmarking data by. If not"
            " provided, benchmarking data for all serving stacks that support"
            " the given model and model server will be returned."
        ),
    )
    parser.add_argument(
        "--serving-stack-version",
        required=False,
        help=(
            "The serving stack version to filter benchmarking data by. If not"
            " provided, benchmarking data for all versions that support"
            " the given model and model server will be returned."
        ),
    )
    resource_printer.RegisterFormatter(
        profiles_csv_printer.PROFILES_PRINTER_FORMAT,
        profiles_csv_printer.ProfileCSVPrinter,
    )
    parser.display_info.AddFormat(profiles_csv_printer.PROFILES_PRINTER_FORMAT)

  def Run(self, args):
    """Fetches benchmarking data from the recommender API.

    Args:
      args: The parsed command-line arguments.

    Returns:
      The list of benchmark profiles, or an empty list when none matched.

    Raises:
      exceptions.HttpException: On an HTTP error from the API.
    """
    client = util.GetClientInstance(base.ReleaseTrack.GA)
    messages = util.GetMessagesModule(base.ReleaseTrack.GA)
    try:
      model_server_info = messages.ModelServerInfo(
          model=args.model,
          modelServer=args.model_server,
          modelServerVersion=args.model_server_version,
      )
      # Only build a ServingStack filter when requested; the version flag is
      # meaningful only together with --serving-stack.
      serving_stack = None
      if args.serving_stack:
        serving_stack = messages.ServingStack(
            name=args.serving_stack,
        )
        if args.serving_stack_version:
          serving_stack.version = args.serving_stack_version
      request = messages.FetchBenchmarkingDataRequest(
          modelServerInfo=model_server_info,
          instanceType=args.instance_type,
          pricingModel=args.pricing_model,
          useCase=args.use_case,
          servingStack=serving_stack,
      )
      response = client.benchmarkingData.Fetch(request)
      if not response.profile:
        return []
      else:
        return response.profile
    except apitools_exceptions.HttpError as error:
      raise exceptions.HttpException(error, util.HTTP_ERROR_FORMAT)

View File

@@ -0,0 +1,323 @@
# -*- coding: utf-8 -*- #
# Copyright 2025 Google LLC. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Lists compatible accelerator profiles for GKE Inference Quickstart."""
from apitools.base.py import exceptions as apitools_exceptions
from googlecloudsdk.api_lib.ai.recommender import util
from googlecloudsdk.api_lib.util import exceptions
from googlecloudsdk.calliope import base
from googlecloudsdk.command_lib.run import commands
from googlecloudsdk.command_lib.run.printers import profiles_csv_printer
from googlecloudsdk.command_lib.run.printers import profiles_printer_ga as profiles_printer
from googlecloudsdk.core.resource import resource_printer
_EXAMPLES = """
To list compatible accelerator profiles for a model, run:
$ {command} --model=deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
"""
def decimal_to_amount(decimal_value):
  """Converts a decimal value to the (units, nanos) pair of an Amount proto.

  Args:
    decimal_value: A numeric value, e.g. a cost in USD.

  Returns:
    A (units, nanos) tuple: units is the integral part (truncated toward
    zero) and nanos is the fractional part expressed in nano units, carrying
    the same sign as the value.
  """
  units = int(decimal_value)
  # round() instead of int(): plain truncation loses a nano unit to float
  # representation error (e.g. 1.2 would yield nanos == 199999999).
  nanos = round((decimal_value - units) * 1e9)
  return (units, nanos)
def amount_to_decimal(cost):
  """Converts an Amount-like cost message to a decimal string.

  Args:
    cost: A message with integer `units` and `nanos` fields; either field
      may be None/unset on the proto.

  Returns:
    The value formatted with three decimal places, e.g. "3.500".
  """
  units = cost.units or 0
  # nanos may also be unset on the proto; default it to 0 instead of
  # raising a TypeError on the division below.
  nanos = cost.nanos or 0
  decimal_value = units + nanos / 1e9
  return f"{decimal_value:.3f}"
def get_decimal_cost(costs):
  """Returns per-million input and output token costs as decimal strings.

  Args:
    costs: A list of cost messages; only the first entry is consulted.

  Returns:
    A (input_token_cost, output_token_cost) tuple of strings; "N/A" is used
    for either component whose field is missing or unset.
  """
  output_token_cost = "N/A"
  if costs and costs[0].costPerMillionOutputTokens:
    output_token_cost = amount_to_decimal(
        costs[0].costPerMillionOutputTokens
    )
  input_token_cost = "N/A"
  if costs and costs[0].costPerMillionInputTokens:
    input_token_cost = amount_to_decimal(costs[0].costPerMillionInputTokens)
  return (input_token_cost, output_token_cost)
@base.DefaultUniverseOnly
@base.ReleaseTracks(base.ReleaseTrack.GA)
class List(commands.List):
  """List compatible accelerator profiles.

  This command lists all supported accelerators with their performance details.
  By default, the supported accelerators are displayed in a table format with
  select information for each accelerator. To see all details, use
  --format=yaml or --format=csvprofile.

  To get supported model, model servers, and model server versions, run `gcloud
  container ai profiles models list`, `gcloud container ai
  profiles model-servers list`, and `gcloud container ai profiles
  model-server-versions list`.
  """

  @staticmethod
  def Args(parser):
    """Registers command flags and output formats on the parser."""
    parser.add_argument(
        "--model",
        help="The model.",
    )
    parser.add_argument(
        "--model-server",
        help=(
            "The model server version. Default is latest. Other options include"
            " the model server version of a profile, all which returns all"
            " versions."
        ),
    )
    parser.add_argument(
        "--model-server-version",
        help=(
            "The model server version. If not specified, this defaults to the"
            " latest version."
        ),
    )
    parser.add_argument(
        "--target-ntpot-milliseconds",
        type=int,
        help=(
            "The target normalized time per output token (NTPOT) in"
            " milliseconds. NTPOT is measured as the request_latency /"
            " output_tokens. If this field is set, the command will only return"
            " accelerators that can meet the target ntpot milliseconds and"
            " display their throughput performance at the target latency."
            " Otherwise, the command will return all accelerators and display"
            " their highest throughput performance."
        ),
    )
    parser.add_argument(
        "--target-ttft-milliseconds",
        type=int,
        help=(
            # Fixed: previous text described TTFT with the NTPOT formula.
            "The target time to first token (TTFT) in milliseconds. TTFT is"
            " the time from when the request is sent until the first output"
            " token is received. If this field is set, the command will only"
            " return profiles that can meet the target ttft milliseconds and"
            " display their throughput performance at the target latency."
            " Otherwise, the command will return all profiles and display"
            " their highest throughput performance."
        ),
    )
    parser.add_argument(
        "--target-itl-milliseconds",
        type=int,
        help=(
            "If specified, results will only show profiles with instance types"
            " that can meet the latency target and will show their throughput"
            " performances at the target inter-token latency (ITL)."
        ),
    )
    parser.add_argument(
        "--target-cost-per-million-output-tokens",
        type=float,
        required=False,
        help=(
            "The target cost per million output tokens to filter profiles by,"
            " unit is 1 USD up to 5 decimal places."
        ),
    )
    parser.add_argument(
        "--target-cost-per-million-input-tokens",
        type=float,
        required=False,
        help=(
            "The target cost per million input tokens to filter profiles by,"
            " unit is 1 USD up to 5 decimal places."
        ),
    )
    parser.add_argument(
        "--pricing-model",
        required=False,
        type=str,
        help=(
            "The pricing model to use to calculate token cost. Currently, this"
            " supports on-demand, spot, 3-years-cud, 1-year-cud"
        ),
    )
    parser.add_argument(
        "--format",
        help=(
            # Fixed: missing space after the period ("format.Options") and a
            # stray trailing space.
            "The output format. Default is profile, which displays the profile"
            " information in a table format, including cost conversions."
            " csvprofile displays the profile information in a CSV"
            " format. Options include csvprofile, profile, and yaml."
        ),
    )
    parser.add_argument(
        "--use-case",
        required=False,
        type=str,
        help=(
            "If specified, results will only show profiles that match the"
            " provided use case. Options are: Advanced Customer Support, Code"
            " Completion, Text Summarization, Chatbot (ShareGPT), Text"
            " Generation, Deep Research"
        ),
    )
    parser.add_argument(
        "--target-input-length",
        required=False,
        type=int,
        help=(
            "If specified, results will only show profiles that have an input"
            " length within 20% of the specified one. Only works alongside"
            " output length."
        ),
    )
    parser.add_argument(
        "--target-output-length",
        required=False,
        type=int,
        help=(
            "If specified, results will only show profiles that have an output"
            " length within 20% of the specified one. Only works alongside"
            " input length."
        ),
    )
    parser.add_argument(
        "--serving-stack",
        required=False,
        help=(
            "The serving stack to filter profiles by. If not"
            " provided, profiles for all serving stacks that support"
            " the given model and model server will be returned."
        ),
    )
    parser.add_argument(
        "--serving-stack-version",
        required=False,
        help=(
            "The serving stack version to filter profiles by. If not"
            " provided, profiles for all versions that support"
            " the given model and model server will be returned."
        ),
    )
    resource_printer.RegisterFormatter(
        profiles_printer.PROFILES_PRINTER_FORMAT,
        profiles_printer.ProfilePrinter,
    )
    resource_printer.RegisterFormatter(
        profiles_csv_printer.PROFILES_PRINTER_FORMAT,
        profiles_csv_printer.ProfileCSVPrinter,
    )
    parser.display_info.AddFormat(profiles_printer.PROFILES_PRINTER_FORMAT)

  def Run(self, args):
    """Fetches compatible accelerator profiles from the recommender API.

    Args:
      args: The parsed command-line arguments.

    Returns:
      The list of matching profiles, or an empty list when none matched.

    Raises:
      exceptions.HttpException: On an HTTP error from the API.
    """
    client = util.GetClientInstance(base.ReleaseTrack.GA)
    messages = util.GetMessagesModule(base.ReleaseTrack.GA)
    performance_requirements = messages.PerformanceRequirements()
    workload_spec = messages.WorkloadSpec()
    if args.target_ntpot_milliseconds:
      performance_requirements.targetNtpotMilliseconds = (
          args.target_ntpot_milliseconds
      )
    if args.target_ttft_milliseconds:
      performance_requirements.targetTtftMilliseconds = (
          args.target_ttft_milliseconds
      )
    if args.target_itl_milliseconds:
      performance_requirements.targetItlMilliseconds = (
          args.target_itl_milliseconds
      )
    if args.use_case:
      workload_spec.useCase = args.use_case
    if args.target_input_length:
      workload_spec.averageInputLength = args.target_input_length
    if args.target_output_length:
      workload_spec.averageOutputLength = args.target_output_length
    if (
        args.target_cost_per_million_output_tokens
        or args.target_cost_per_million_input_tokens
        or args.pricing_model
    ):
      cost = messages.Cost()
      if args.target_cost_per_million_output_tokens:
        units, nanos = decimal_to_amount(
            args.target_cost_per_million_output_tokens
        )
        cost.costPerMillionOutputTokens = messages.Amount(
            units=units, nanos=nanos
        )
      if args.target_cost_per_million_input_tokens:
        units, nanos = decimal_to_amount(
            args.target_cost_per_million_input_tokens
        )
        cost.costPerMillionInputTokens = messages.Amount(
            units=units, nanos=nanos
        )
      if args.pricing_model:
        cost.pricingModel = args.pricing_model
      performance_requirements.targetCost = cost
    serving_stack = None
    if args.serving_stack:
      serving_stack = messages.ServingStack(
          name=args.serving_stack,
      )
      if args.serving_stack_version:
        serving_stack.version = args.serving_stack_version
    try:
      request = messages.FetchProfilesRequest(
          model=args.model,
          modelServer=args.model_server,
          modelServerVersion=args.model_server_version,
          servingStack=serving_stack,
      )
      # Attach performanceRequirements only when at least one target was set.
      # Fixed: targetItlMilliseconds was missing from this check, so
      # --target-itl-milliseconds alone was silently dropped (the parallel
      # check in the manifests Create command does include it).
      if (
          performance_requirements.targetNtpotMilliseconds is not None
          or performance_requirements.targetTtftMilliseconds is not None
          or performance_requirements.targetItlMilliseconds is not None
          or performance_requirements.targetCost is not None
      ):
        request.performanceRequirements = performance_requirements
      if (
          workload_spec.useCase is not None
          or workload_spec.averageInputLength is not None
          or workload_spec.averageOutputLength is not None
      ):
        request.workloadSpec = workload_spec
      response = client.profiles.Fetch(request)
      if response.profile:
        return response.profile
      else:
        return []
    except apitools_exceptions.HttpError as error:
      raise exceptions.HttpException(error, util.HTTP_ERROR_FORMAT)

View File

@@ -0,0 +1,29 @@
# -*- coding: utf-8 -*- #
# Copyright 2025 Google LLC. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""The command group for the manifests CLI."""
from __future__ import absolute_import
from __future__ import division
from __future__ import unicode_literals
from googlecloudsdk.calliope import base
from googlecloudsdk.command_lib.projects import util
@base.UniverseCompatible
@base.ReleaseTracks(base.ReleaseTrack.ALPHA, base.ReleaseTrack.GA)
class Manifests(base.Group):
  """Generate optimized Kubernetes manifests."""
  # Command group container only; subcommands are registered by calliope
  # from sibling modules.

View File

@@ -0,0 +1,358 @@
# -*- coding: utf-8 -*- #
# Copyright 2025 Google LLC. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Generates optimized Kubernetes manifests for GKE Inference Quickstart."""
from apitools.base.py import exceptions as apitools_exceptions
from googlecloudsdk.api_lib.ai.recommender import util
from googlecloudsdk.api_lib.util import exceptions as api_lib_exceptions
from googlecloudsdk.calliope import base
from googlecloudsdk.core import exceptions
from googlecloudsdk.core import log
from googlecloudsdk.core.util import files
@base.DefaultUniverseOnly
@base.ReleaseTracks(base.ReleaseTrack.GA)
class Create(base.CreateCommand):
  """Generate ready-to-deploy Kubernetes manifests with compute, load balancing, and autoscaling capabilities.

  To get supported model, model servers, and model server versions, run `gcloud
  alpha container ai profiles model-and-server-combinations list`. To get
  supported accelerators with their performance metrics, run `gcloud alpha
  container ai profiles accelerators list`.
  """

  @staticmethod
  def Args(parser):
    """Registers command flags on the parser."""
    parser.add_argument(
        "--model",
        required=True,
        help="The model.",
    )
    parser.add_argument(
        "--model-server",
        required=True,
        help="The model server.",
    )
    parser.add_argument(
        "--model-server-version",
        help=(
            "The model server version. If not specified, this defaults to the"
            " latest version."
        ),
    )
    parser.add_argument(
        "--target-ntpot-milliseconds",
        type=int,
        help=(
            "The maximum normalized time per output token (NTPOT) in"
            " milliseconds. NTPOT is measured as the request_latency /"
            " output_tokens. If this is set, the manifests will include"
            " Horizontal Pod Autoscaler (HPA) resources which automatically"
            " adjust the model server replica count in response to changes in"
            " model server load to keep p50 NTPOT below the specified"
            " threshold. If the provided target-ntpot-milliseconds is too low"
            " to achieve, the HPA manifest will not be generated. "
        ),
    )
    parser.add_argument(
        "--target-ttft-milliseconds",
        type=int,
        help=(
            # Fixed: the previous help text was a garbled merge of two
            # sentences; reworded to parallel the NTPOT/ITL flags.
            "The target time to first token (TTFT) in milliseconds. If this"
            " is set, the manifests will include Horizontal Pod Autoscaler"
            " (HPA) resources which automatically adjust the model server"
            " replica count in response to changes in model server load to"
            " keep p50 TTFT below the specified threshold. If the provided"
            " target-ttft-milliseconds is too low to achieve, the HPA"
            " manifest will not be generated."
        ),
    )
    parser.add_argument(
        "--accelerator-type",
        required=True,
        help="The accelerator type.",
    )
    parser.add_argument(
        "--namespace",
        help=(
            "The namespace to deploy the manifests in. Default namespace is"
            " 'default'."
        ),
    )
    parser.add_argument(
        "--output",
        choices=["manifest", "comments", "all"],
        default="all",
        help="The output to display. Default is all.",
    )
    parser.add_argument(
        "--output-path",
        help=(
            "The path to save the output to. If not specified, output to the"
            " terminal."
        ),
    )
    parser.add_argument(
        "--model-bucket-uri",
        help=(
            "The Google Cloud Storage bucket URI to load the model from. This"
            " URI must point to the directory containing the model's config"
            " file (config.json) and model weights. If unspecified, defaults to"
            " loading the model from Hugging Face."
        ),
    )
    parser.add_argument(
        "--target-itl-milliseconds",
        type=int,
        help=(
            "The target inter-token latency (ITL) in milliseconds. If this is"
            " set, the manifest will include Horizontal Pod Autoscaler (HPA)"
            " resources which automatically adjust the model server replica"
            " count in response to changes in model server load to keep p50 ITL"
            " below the specified threshold. If the provided"
            " target-itl-milliseconds is too low to achieve, the HPA manifest"
            " will not be generated."
        ),
    )
    parser.add_argument(
        "--use-case",
        help=(
            "The manifest will be optimized for this use case. Options are:"
            " Advanced Customer Support, Code Completion, Text Summarization,"
            " Chatbot (ShareGPT), Code Generation, Deep Research. Will default"
            " to Chatbot if not specified."
        ),
    )
    parser.add_argument(
        "--serving-stack",
        required=False,
        help=(
            "The serving stack to filter manifests by. If not"
            " provided, manifests for all serving stacks that support"
            " the given model and model server will be considered."
        ),
    )
    parser.add_argument(
        "--serving-stack-version",
        required=False,
        help=(
            "The serving stack version to filter manifests by. If not"
            " provided, manifests for all versions that support"
            " the given model and model server will be considered."
        ),
    )

  def Run(self, args):
    """Calls the GenerateOptimizedManifest API.

    Args:
      args: The parsed command-line arguments.

    Returns:
      The GenerateOptimizedManifest response message.

    Raises:
      api_lib_exceptions.HttpException: On an HTTP error from the API.
    """
    client = util.GetClientInstance(base.ReleaseTrack.GA)
    messages = util.GetMessagesModule(base.ReleaseTrack.GA)
    try:
      model_server_info = messages.ModelServerInfo(
          model=args.model,
          modelServer=args.model_server,
          modelServerVersion=args.model_server_version,
      )
      performance_requirements = messages.PerformanceRequirements()
      if args.target_ntpot_milliseconds:
        performance_requirements.targetNtpotMilliseconds = (
            args.target_ntpot_milliseconds
        )
      if args.target_ttft_milliseconds:
        performance_requirements.targetTtftMilliseconds = (
            args.target_ttft_milliseconds
        )
      if args.target_itl_milliseconds:
        performance_requirements.targetItlMilliseconds = (
            args.target_itl_milliseconds
        )
      storage_config = messages.StorageConfig()
      if args.model_bucket_uri:
        storage_config.modelBucketUri = args.model_bucket_uri
      serving_stack = None
      if args.serving_stack:
        serving_stack = messages.ServingStack(
            name=args.serving_stack,
        )
        if args.serving_stack_version:
          serving_stack.version = args.serving_stack_version
      request = messages.GenerateOptimizedManifestRequest(
          modelServerInfo=model_server_info,
          acceleratorType=args.accelerator_type,
          kubernetesNamespace=args.namespace,
          servingStack=serving_stack,
      )
      # Attach optional sub-messages only when at least one field was set.
      if (
          performance_requirements.targetNtpotMilliseconds is not None
          or performance_requirements.targetTtftMilliseconds is not None
          or performance_requirements.targetItlMilliseconds is not None
      ):
        request.performanceRequirements = performance_requirements
      if storage_config.modelBucketUri is not None:
        request.storageConfig = storage_config
      if args.use_case:
        request.useCase = args.use_case
      response = client.optimizedManifest.Generate(request)
      return response
    except apitools_exceptions.HttpError as error:
      raise api_lib_exceptions.HttpException(error, util.HTTP_ERROR_FORMAT)

  def Display(self, args, resources):
    """Renders the generated manifests and/or comments.

    Args:
      args: The parsed command-line arguments (uses --output and
        --output-path).
      resources: The GenerateOptimizedManifest response from Run.
    """
    if not resources:
      log.out.Print("No manifests generated.")
      return
    output_content = ""
    if args.output != "comments":
      # Join manifests with YAML document separators so the concatenated
      # output is a valid multi-document stream.
      for manifest in resources.kubernetesManifests:
        output_content += manifest.content + "\n---\n"
    if resources.comments:
      comment_string = "\n".join([f"# {line}" for line in resources.comments])
      output_content += comment_string
    if args.output_path:
      try:
        # Fixed: output_content was previously passed as FileWriter's second
        # positional argument (`private`), unintentionally toggling private
        # file permissions instead of being written.
        with files.FileWriter(args.output_path) as f:
          f.write(output_content)
        log.out.Print(f"Output saved to {args.output_path}")
      except exceptions.Error as e:
        log.error(f"An error occurred while saving output to file: {e}")
    else:
      log.out.Print(output_content)
@base.DefaultUniverseOnly
@base.ReleaseTracks(base.ReleaseTrack.ALPHA)
class CreateAlpha(base.CreateCommand):
  """Generate ready-to-deploy Kubernetes manifests with compute, load balancing, and autoscaling capabilities.

  To get supported model, model servers, and model server versions, run `gcloud
  alpha container ai profiles model-and-server-combinations list`. To get
  supported accelerators with their performance metrics, run `gcloud alpha
  container ai profiles accelerators list`.
  """

  @staticmethod
  def Args(parser):
    """Registers flags for the alpha manifest-generation command."""
    parser.add_argument(
        "--model",
        required=True,
        help="The model.",
    )
    parser.add_argument(
        "--model-server",
        required=True,
        help="The model server.",
    )
    parser.add_argument(
        "--model-server-version",
        help=(
            "The model server version. If not specified, this defaults to the"
            " latest version."
        ),
    )
    parser.add_argument(
        "--target-ntpot-milliseconds",
        type=int,
        help=(
            "The maximum normalized time per output token (NTPOT) in"
            " milliseconds. NTPOT is measured as the request_latency /"
            " output_tokens. If this is set, the manifests will include"
            " Horizontal Pod Autoscaler (HPA) resources which automatically"
            " adjust the model server replica count in response to changes in"
            " model server load to keep p50 NTPOT below the specified"
            " threshold. If the provided target-ntpot-milliseconds is too low"
            " to achieve, the HPA manifest will not be generated. "
        ),
    )
    parser.add_argument(
        "--accelerator-type",
        required=True,
        help="The accelerator type.",
    )
    parser.add_argument(
        "--namespace",
        help=(
            "The namespace to deploy the manifests in. Default namespace is"
            " 'default'."
        ),
    )
    parser.add_argument(
        "--output",
        choices=["manifest", "comments", "all"],
        default="all",
        help="The output to display. Default is all.",
    )
    parser.add_argument(
        "--output-path",
        help=(
            "The path to save the output to. If not specified, output to the"
            " terminal."
        ),
    )
    parser.add_argument(
        "--model-bucket-uri",
        hidden=True,
        help=(
            "GCS bucket URI to pull model from. If not specified, default"
            " to the model hoster."
        ),
    )

  def Run(self, args):
    """Calls the alpha OptimizedManifest API.

    Args:
      args: The parsed command-line arguments.

    Returns:
      The OptimizedManifest API response, or [] if the call failed.
    """
    client = util.GetClientInstance(base.ReleaseTrack.ALPHA)
    messages = util.GetMessagesModule(base.ReleaseTrack.ALPHA)
    try:
      request = messages.GkerecommenderOptimizedManifestRequest(
          modelAndModelServerInfo_modelName=args.model,
          modelAndModelServerInfo_modelServerName=args.model_server,
          modelAndModelServerInfo_modelServerVersion=args.model_server_version,
          targetNtpotMilliseconds=args.target_ntpot_milliseconds,
          acceleratorType=args.accelerator_type,
          kubernetesNamespace=args.namespace,
          storageConfig_modelBucketUri=args.model_bucket_uri,
      )
      response = client.v1alpha1.OptimizedManifest(request)
      return response
    except exceptions.Error as e:
      log.error(f"An error has occurred: {e}")
      log.status.Print(f"An error has occurred: {e}")
      return []

  def Display(self, args, resources):
    """Writes generated manifests and/or comments to a file or the terminal."""
    if not resources:
      log.out.Print("No manifests generated.")
      return
    output_content = ""
    if args.output == "manifest" or args.output == "all":
      for manifest in resources.k8sManifests:
        output_content += manifest.content + "\n---\n"
    if args.output == "comments" or args.output == "all":
      if resources.comments:
        comment_string = "\n".join([f"# {line}" for line in resources.comments])
        output_content += comment_string
    if args.output_path:
      try:
        # Bug fix: FileWriter's second positional parameter is `private`, not
        # the content; passing the content string silently forced a private
        # (0600) file. The content is written via f.write below.
        with files.FileWriter(args.output_path) as f:
          f.write(output_content)
        log.out.Print(f"Output saved to {args.output_path}")
      except exceptions.Error as e:
        log.error(f"An error occurred while saving output to file: {e}")
    else:
      log.out.Print(output_content)

View File

@@ -0,0 +1,29 @@
# -*- coding: utf-8 -*- #
# Copyright 2025 Google LLC. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""The command group for the model and model server combinations CLI."""
from __future__ import absolute_import
from __future__ import division
from __future__ import unicode_literals
from googlecloudsdk.calliope import base
from googlecloudsdk.command_lib.projects import util
@base.UniverseCompatible
@base.ReleaseTracks(base.ReleaseTrack.ALPHA)
class ModelServers(base.Group):
  """Manage supported model and model server combinations for GKE Inference Quickstart.

  Parent group for the command that lists the model, model server, and model
  server version combinations supported by GKE Inference Quickstart.
  """

View File

@@ -0,0 +1,81 @@
# -*- coding: utf-8 -*- #
# Copyright 2025 Google LLC. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Lists supported model and server combinations for GKE Inference Quickstart."""
from googlecloudsdk.api_lib.ai.recommender import util
from googlecloudsdk.calliope import base
from googlecloudsdk.command_lib.run import commands
from googlecloudsdk.core import exceptions
from googlecloudsdk.core import log
_EXAMPLES = """
To list all supported model and server combinations, run:
$ {command}
"""
@base.DefaultUniverseOnly
@base.ReleaseTracks(base.ReleaseTrack.ALPHA)
class List(commands.List):
  """List supported model and server combinations.

  This command lists all supported model, model server, and model server version
  combinations.
  """

  # Surface the previously-unused module-level examples in `--help`.
  detailed_help = {"EXAMPLES": _EXAMPLES}

  @staticmethod
  def Args(parser):
    """Registers the optional filter flags and the table format."""
    parser.add_argument(
        "--model",
        help="The model. If not specified, this defaults to any model.",
    )
    parser.add_argument(
        "--model-server",
        help=(
            "The model server. If not specified, this defaults to any model"
            " server."
        ),
    )
    # Typo fix: help text previously read "defaults to the any model server
    # version".
    parser.add_argument(
        "--model-server-version",
        help=(
            "The model server version. If not specified, this defaults to"
            " any model server version."
        ),
    )
    parser.display_info.AddFormat(
        "table(modelName, modelServerName, modelServerVersion)"
    )

  def Run(self, args):
    """Lists matching combinations; returns [] and logs on API error."""
    client = util.GetClientInstance(base.ReleaseTrack.ALPHA)
    messages = util.GetMessagesModule(base.ReleaseTrack.ALPHA)
    try:
      request = messages.GkerecommenderModelsAndServersListRequest(
          modelName=args.model,
          modelServerName=args.model_server,
          modelServerVersion=args.model_server_version,
      )
      response = client.modelsAndServers.List(request)
      return response.modelAndModelServerInfo or []
    except exceptions.Error as e:
      log.error(f"An error has occurred: {e}")
      log.status.Print(f"An error has occurred: {e}")
      return []

View File

@@ -0,0 +1,29 @@
# -*- coding: utf-8 -*- #
# Copyright 2025 Google LLC. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""The command group for the model server versions CLI."""
from __future__ import absolute_import
from __future__ import division
from __future__ import unicode_literals
from googlecloudsdk.calliope import base
from googlecloudsdk.command_lib.projects import util
@base.UniverseCompatible
@base.ReleaseTracks(base.ReleaseTrack.ALPHA, base.ReleaseTrack.GA)
class ModelServers(base.Group):
  # NOTE(review): the class is named ModelServers inside the
  # model-server-versions module — looks like copy-paste; presumably calliope
  # derives the CLI group name from the directory, so this is cosmetic.
  # Confirm before renaming.
  """Manage supported model server versions for GKE Inference Quickstart."""

View File

@@ -0,0 +1,137 @@
# -*- coding: utf-8 -*- #
# Copyright 2025 Google LLC. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Lists supported model server versions for GKE Inference Quickstart."""
from apitools.base.py import exceptions as apitools_exceptions
from googlecloudsdk.api_lib.ai.recommender import util
from googlecloudsdk.api_lib.util import exceptions as api_lib_exceptions
from googlecloudsdk.calliope import base
from googlecloudsdk.command_lib.run import commands
from googlecloudsdk.core import exceptions
from googlecloudsdk.core import log
_EXAMPLES = """
To list all supported model server versions for a model and model server, run:
$ {command} --model=deepseek-ai/DeepSeek-R1-Distill-Qwen-7B --model-server=vllm
"""
@base.DefaultUniverseOnly
@base.ReleaseTracks(base.ReleaseTrack.GA)
class List(commands.List):
  """List supported model server versions.

  To get supported model and model servers, run `gcloud container ai
  profiles models list` and `gcloud container ai profiles
  model-servers list`.
  """

  # Surface the previously-unused module-level examples in `--help`.
  detailed_help = {"EXAMPLES": _EXAMPLES}

  @staticmethod
  def Args(parser):
    """Registers the required model and model-server flags."""
    parser.add_argument(
        "--model",
        required=True,
        help="The model.",
    )
    # Help-text fix: the flag is required, so it cannot "default to any model
    # server" as the previous help claimed.
    parser.add_argument(
        "--model-server",
        required=True,
        help="The model server.",
    )

  def Run(self, args):
    """Fetches supported versions for the model / model-server pair.

    Args:
      args: The parsed command-line arguments.

    Returns:
      The supported model server versions, possibly empty.

    Raises:
      api_lib_exceptions.HttpException: If the API call fails.
    """
    client = util.GetClientInstance(base.ReleaseTrack.GA)
    messages = util.GetMessagesModule(base.ReleaseTrack.GA)
    try:
      request = messages.GkerecommenderModelServerVersionsFetchRequest(
          model=args.model, modelServer=args.model_server
      )
      response = client.modelServerVersions.Fetch(request)
      return response.modelServerVersions or []
    except apitools_exceptions.HttpError as error:
      raise api_lib_exceptions.HttpException(error, util.HTTP_ERROR_FORMAT)

  def Display(self, _, resources):
    """Prints the versions as a bulleted list."""
    if resources:
      log.out.Print("Supported model server versions:")
      for model_server_version in resources:
        log.out.Print("- ", model_server_version)
    else:
      log.out.Print("No supported model server versions found.")
@base.DefaultUniverseOnly
@base.ReleaseTracks(base.ReleaseTrack.ALPHA)
class ListAlpha(commands.List):
  """List supported model server versions.

  To get supported model and model servers, run `gcloud alpha container ai
  profiles models list` and `gcloud alpha container ai profiles
  model-servers list`.

  Alternatively, run `gcloud alpha container ai profiles
  model-and-server-combinations list` to get all supported model and server
  combinations.
  """

  @staticmethod
  def Args(parser):
    """Registers the required model and model-server flags."""
    parser.add_argument(
        "--model",
        required=True,
        help="The model.",
    )
    # Help-text fix: the flag is required, so it cannot "default to any model
    # server" as the previous help claimed.
    parser.add_argument(
        "--model-server",
        required=True,
        help="The model server.",
    )

  def Run(self, args):
    """Lists supported versions; returns [] and logs on API error."""
    client = util.GetClientInstance(base.ReleaseTrack.ALPHA)
    messages = util.GetMessagesModule(base.ReleaseTrack.ALPHA)
    try:
      request = messages.GkerecommenderModelServersVersionsListRequest(
          modelName=args.model, modelServerName=args.model_server
      )
      response = client.modelServers_versions.List(request)
      return response.modelServerVersions or []
    except exceptions.Error as e:
      log.error(f"An error has occurred: {e}")
      log.status.Print(f"An error has occurred: {e}")
      return []

  def Display(self, _, resources):
    """Prints the versions as a bulleted list."""
    if resources:
      log.out.Print("Supported model server versions:")
      for model_server_version in resources:
        log.out.Print("- ", model_server_version)
    else:
      log.out.Print("No supported model server versions found.")

View File

@@ -0,0 +1,29 @@
# -*- coding: utf-8 -*- #
# Copyright 2025 Google LLC. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""The command group for the model servers CLI."""
from __future__ import absolute_import
from __future__ import division
from __future__ import unicode_literals
from googlecloudsdk.calliope import base
from googlecloudsdk.command_lib.projects import util
@base.UniverseCompatible
@base.ReleaseTracks(base.ReleaseTrack.ALPHA, base.ReleaseTrack.GA)
class ModelServers(base.Group):
  """Manage supported model servers for GKE Inference Quickstart.

  Parent group for the command that lists the model servers supported for a
  given model.
  """

View File

@@ -0,0 +1,118 @@
# -*- coding: utf-8 -*- #
# Copyright 2025 Google LLC. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Lists supported model servers for GKE Inference Quickstart."""
from apitools.base.py import exceptions as apitools_exceptions
from googlecloudsdk.api_lib.ai.recommender import util
from googlecloudsdk.api_lib.util import exceptions as api_lib_exceptions
from googlecloudsdk.calliope import base
from googlecloudsdk.command_lib.run import commands
from googlecloudsdk.core import exceptions
from googlecloudsdk.core import log
_EXAMPLE = """
To list all supported model servers for a model, run:
$ {command} --model=deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
"""
@base.DefaultUniverseOnly
@base.ReleaseTracks(base.ReleaseTrack.GA)
class List(commands.List):
  """List supported model servers for a given model.

  To get supported models, run `gcloud container ai profiles models
  list`.
  """

  @staticmethod
  def Args(parser):
    """Registers the required --model flag."""
    parser.add_argument(
        "--model",
        required=True,
        help="The model.",
    )

  def Run(self, args):
    """Fetches the model servers supported for args.model.

    Raises:
      api_lib_exceptions.HttpException: If the API call fails.
    """
    ga_track = base.ReleaseTrack.GA
    client = util.GetClientInstance(ga_track)
    messages = util.GetMessagesModule(ga_track)
    request = messages.GkerecommenderModelServersFetchRequest(model=args.model)
    try:
      response = client.modelServers.Fetch(request)
    except apitools_exceptions.HttpError as error:
      raise api_lib_exceptions.HttpException(error, util.HTTP_ERROR_FORMAT)
    return response.modelServers or []

  def Display(self, _, resources):
    """Prints the model server names as a bulleted list."""
    if not resources:
      log.out.Print("No supported model servers found.")
      return
    log.out.Print("Supported model servers:")
    for server_name in resources:
      log.out.Print("- ", server_name)
@base.DefaultUniverseOnly
@base.ReleaseTracks(base.ReleaseTrack.ALPHA)
class ListAlpha(commands.List):
  """List supported model servers for a given model.

  To get supported models, run `gcloud alpha container ai profiles models
  list` or to get all supported model and server combinations, run `gcloud alpha
  container ai profiles model-and-server-combinations
  list`.
  """

  @staticmethod
  def Args(parser):
    """Registers the required --model flag."""
    parser.add_argument(
        "--model",
        required=True,
        help="The model.",
    )

  def Run(self, args):
    """Lists model servers for args.model; returns [] and logs on error."""
    alpha_track = base.ReleaseTrack.ALPHA
    client = util.GetClientInstance(alpha_track)
    messages = util.GetMessagesModule(alpha_track)
    request = messages.GkerecommenderModelServersListRequest(
        modelName=args.model
    )
    try:
      response = client.modelServers.List(request)
    except exceptions.Error as e:
      log.error(f"An error has occurred: {e}")
      log.status.Print(f"An error has occurred: {e}")
      return []
    return response.modelServerNames or []

  def Display(self, _, resources):
    """Prints the model server names as a bulleted list."""
    if not resources:
      log.out.Print("No supported model servers found.")
      return
    log.out.Print("Supported model servers:")
    for server_name in resources:
      log.out.Print("- ", server_name)

View File

@@ -0,0 +1,29 @@
# -*- coding: utf-8 -*- #
# Copyright 2025 Google LLC. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""The command group for the models CLI."""
from __future__ import absolute_import
from __future__ import division
from __future__ import unicode_literals
from googlecloudsdk.calliope import base
from googlecloudsdk.command_lib.projects import util
@base.UniverseCompatible
@base.ReleaseTracks(base.ReleaseTrack.ALPHA, base.ReleaseTrack.GA)
class Models(base.Group):
  """Manage supported models for GKE Inference Quickstart.

  Parent group for the command that lists the models supported by GKE
  Inference Quickstart.
  """

View File

@@ -0,0 +1,88 @@
# -*- coding: utf-8 -*- #
# Copyright 2025 Google LLC. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Lists supported models for GKE Inference Quickstart."""
from apitools.base.py import exceptions as apitools_exceptions
from googlecloudsdk.api_lib.ai.recommender import util
from googlecloudsdk.api_lib.util import exceptions as api_lib_exceptions
from googlecloudsdk.calliope import base
from googlecloudsdk.command_lib.run import commands
from googlecloudsdk.core import exceptions
from googlecloudsdk.core import log
_EXAMPLES = """
To list all supported models, run:
$ {command}
"""
@base.DefaultUniverseOnly
@base.ReleaseTracks(base.ReleaseTrack.GA)
class List(commands.List):
  """List supported models."""

  def Run(self, _):
    """Fetches all models supported by GKE Inference Quickstart.

    Raises:
      api_lib_exceptions.HttpException: If the API call fails.
    """
    ga_track = base.ReleaseTrack.GA
    client = util.GetClientInstance(ga_track)
    messages = util.GetMessagesModule(ga_track)
    try:
      response = client.models.Fetch(
          messages.GkerecommenderModelsFetchRequest()
      )
    except apitools_exceptions.HttpError as error:
      raise api_lib_exceptions.HttpException(error, util.HTTP_ERROR_FORMAT)
    return response.models or []

  def Display(self, _, resources):
    """Prints the model names as a bulleted list."""
    if not resources:
      log.out.Print("No supported models found.")
      return
    log.out.Print("Supported models:")
    for model_name in resources:
      log.out.Print("- ", model_name)
@base.DefaultUniverseOnly
@base.ReleaseTracks(base.ReleaseTrack.ALPHA)
class ListAlpha(commands.List):
  """List supported models."""

  def Run(self, _):
    """Lists all models via the alpha API; returns [] and logs on error."""
    client = util.GetClientInstance(base.ReleaseTrack.ALPHA)
    messages = util.GetMessagesModule(base.ReleaseTrack.ALPHA)
    try:
      response = client.models.List(messages.GkerecommenderModelsListRequest())
      return response.modelNames or []
    except exceptions.Error as e:
      log.error(f"An error has occurred: {e}")
      # Typo fix: the user-facing message previously read "occured".
      log.status.Print(f"An error has occurred: {e}")
      return []

  def Display(self, _, resources):
    """Prints the model names as a bulleted list."""
    if resources:
      log.out.Print("Supported models:")
      for model_name in resources:
        log.out.Print("- ", model_name)
    else:
      log.out.Print("No supported models found.")

View File

@@ -0,0 +1,24 @@
# -*- coding: utf-8 -*- #
# Copyright 2025 Google LLC. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""The command group for the serving stack versions CLI."""
from googlecloudsdk.calliope import base
from googlecloudsdk.command_lib.projects import util
@base.UniverseCompatible
@base.ReleaseTracks(base.ReleaseTrack.GA)
class ServingStackVersions(base.Group):
  """List supported serving stack versions for GKE Inference Quickstart.

  Parent group for the command that lists the serving stack versions used to
  generate the inference profiles.
  """

View File

@@ -0,0 +1,69 @@
# -*- coding: utf-8 -*- #
# Copyright 2025 Google LLC. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Lists supported serving stack versions for GKE Inference Quickstart."""
from apitools.base.py import exceptions as apitools_exceptions
from googlecloudsdk.api_lib.ai.recommender import util
from googlecloudsdk.api_lib.util import exceptions as api_lib_exceptions
from googlecloudsdk.calliope import base
from googlecloudsdk.command_lib.run import commands
_EXAMPLES = """
To list all supported serving stack versions, run:
$ {command} --serving-stack=llm-d
"""
@base.ReleaseTracks(base.ReleaseTrack.GA)
class List(commands.List):
  """List supported serving stack versions that were used to generate the inference profiles."""

  # Surface the previously-unused module-level examples in `--help`.
  detailed_help = {"EXAMPLES": _EXAMPLES}

  @staticmethod
  def Args(parser):
    """Registers filter flags and the single-column table format."""
    parser.display_info.AddFormat("table(version)")
    parser.add_argument(
        "--model",
        help="The model to filter serving stack versions by.",
    )
    parser.add_argument(
        "--model-server",
        help="The model server to filter serving stack versions by.",
    )
    parser.add_argument(
        "--serving-stack",
        required=True,
        help="The serving stack to filter serving stack versions by.",
    )

  def Run(self, args):
    """Fetches serving stack versions matching the given filters.

    Args:
      args: The parsed command-line arguments.

    Returns:
      A list of {"version": ...} rows for the table formatter, possibly empty.

    Raises:
      api_lib_exceptions.HttpException: If the API call fails.
    """
    client = util.GetClientInstance(base.ReleaseTrack.GA)
    messages = util.GetMessagesModule(base.ReleaseTrack.GA)
    try:
      response = client.servingStackVersions.Fetch(
          messages.GkerecommenderServingStackVersionsFetchRequest(
              model=args.model,
              modelServer=args.model_server,
              servingStack=args.serving_stack,
          )
      )
      # Wrap each bare version string in a dict so table(version) can
      # project it.
      return [{"version": v} for v in response.servingStackVersions or []]
    except apitools_exceptions.HttpError as error:
      raise api_lib_exceptions.HttpException(error, util.HTTP_ERROR_FORMAT)

View File

@@ -0,0 +1,24 @@
# -*- coding: utf-8 -*- #
# Copyright 2025 Google LLC. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""The command group for the serving stack CLI."""
from googlecloudsdk.calliope import base
from googlecloudsdk.command_lib.projects import util
@base.UniverseCompatible
@base.ReleaseTracks(base.ReleaseTrack.GA)
class ServingStacks(base.Group):
  """List supported serving stacks for GKE Inference Quickstart.

  Parent group for the command that lists the serving stacks used to generate
  the inference profiles.
  """

View File

@@ -0,0 +1,65 @@
# -*- coding: utf-8 -*- #
# Copyright 2025 Google LLC. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Lists supported serving stacks for GKE Inference Quickstart."""
from apitools.base.py import exceptions as apitools_exceptions
from googlecloudsdk.api_lib.ai.recommender import util
from googlecloudsdk.api_lib.util import exceptions as api_lib_exceptions
from googlecloudsdk.calliope import base
from googlecloudsdk.command_lib.run import commands
_EXAMPLES = """
To list all supported serving stacks, run:
$ {command}
"""
@base.ReleaseTracks(base.ReleaseTrack.GA)
class List(commands.List):
  """List supported serving stacks that were used to generate the inference profiles."""

  # Surface the previously-unused module-level examples in `--help`.
  detailed_help = {"EXAMPLES": _EXAMPLES}

  @staticmethod
  def Args(parser):
    """Registers the optional filter flags and the table format."""
    parser.display_info.AddFormat("table(name,version)")
    parser.add_argument(
        "--model",
        help="The model to filter serving stacks by.",
    )
    parser.add_argument(
        "--model-server",
        help="The model server to filter serving stacks by.",
    )

  def Run(self, args):
    """Fetches serving stacks matching the optional filters.

    Args:
      args: The parsed command-line arguments.

    Returns:
      The supported serving stacks, possibly empty.

    Raises:
      api_lib_exceptions.HttpException: If the API call fails.
    """
    client = util.GetClientInstance(base.ReleaseTrack.GA)
    messages = util.GetMessagesModule(base.ReleaseTrack.GA)
    try:
      response = client.servingStacks.Fetch(
          messages.GkerecommenderServingStacksFetchRequest(
              model=args.model,
              modelServer=args.model_server,
          )
      )
      return response.servingStacks or []
    except apitools_exceptions.HttpError as error:
      raise api_lib_exceptions.HttpException(error, util.HTTP_ERROR_FORMAT)

View File

@@ -0,0 +1,29 @@
# -*- coding: utf-8 -*- #
# Copyright 2025 Google LLC. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""The command group for the models CLI."""
from __future__ import absolute_import
from __future__ import division
from __future__ import unicode_literals
from googlecloudsdk.calliope import base
from googlecloudsdk.command_lib.projects import util
@base.UniverseCompatible
@base.ReleaseTracks(base.ReleaseTrack.GA)
class UseCase(base.Group):
  """List supported use cases for GKE Inference Quickstart.

  Parent group for the command that lists the use cases used to generate the
  inference profiles.
  """

View File

@@ -0,0 +1,54 @@
# -*- coding: utf-8 -*- #
# Copyright 2025 Google LLC. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Lists supported use cases for GKE Inference Quickstart."""
from apitools.base.py import exceptions as apitools_exceptions
from googlecloudsdk.api_lib.ai.recommender import util
from googlecloudsdk.api_lib.util import exceptions as api_lib_exceptions
from googlecloudsdk.calliope import base
from googlecloudsdk.command_lib.run import commands
_EXAMPLES = """
To list all supported use cases, run:
$ {command}
"""
@base.ReleaseTracks(base.ReleaseTrack.GA)
class List(commands.List):
"""List supported use cases that were used to generate the inference profiles."""
  @staticmethod
  def Args(parser):
    # One table row per use case: the use-case name plus its average input and
    # output lengths.
    parser.display_info.AddFormat(
        "table(useCase,averageInputLength,averageOutputLength)"
    )
def Run(self, _):
client = util.GetClientInstance(base.ReleaseTrack.GA)
messages = util.GetMessagesModule(base.ReleaseTrack.GA)
try:
response = client.useCases.Fetch(
messages.FetchUseCasesRequest()
)
if response.workloadSpecs:
return response.workloadSpecs
else:
return []
except apitools_exceptions.HttpError as error:
raise api_lib_exceptions.HttpException(error, util.HTTP_ERROR_FORMAT)