feat: Add new gcloud commands, API clients, and third-party libraries across various services.

2026-01-01 20:26:35 +01:00
parent 5e23cbece0
commit a19e592eb7
25221 changed files with 8324611 additions and 0 deletions
--- a/login/google-cloud-sdk/lib/googlecloudsdk/command_lib/ml/speech/init.py
+++ b/login/google-cloud-sdk/lib/googlecloudsdk/command_lib/ml/speech/init.py
--- a/login/google-cloud-sdk/lib/googlecloudsdk/command_lib/ml/speech/flag_validations.py
+++ b/login/google-cloud-sdk/lib/googlecloudsdk/command_lib/ml/speech/flag_validations.py
@@ -0,0 +1,96 @@
+# -*- coding: utf-8 -*- #
+# Copyright 2024 Google LLC. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Validation functions for speech commands flags."""
+
+import os
+from googlecloudsdk.api_lib.storage import storage_util
+from googlecloudsdk.calliope import exceptions
+
+
+# Encodings for which ValidateDecodingConfig requires both a sample rate and
+# an audio channel count.
+EXPLICIT_ENCODING_OPTIONS = ('LINEAR16', 'MULAW', 'ALAW')
+# Every value accepted for --encoding: the explicit encodings plus 'AUTO'.
+ENCODING_OPTIONS = frozenset(EXPLICIT_ENCODING_OPTIONS) | {'AUTO'}
+
+
+def ValidateSpeakerDiarization(args):
+  """Checks that the speaker count bounds are consistent with each other.
+
+  Args:
+    args: Parsed arguments exposing `min_speaker_count` and
+      `max_speaker_count`; either may be None.
+
+  Raises:
+    exceptions.InvalidArgumentException: If both bounds are set and the
+      minimum is greater than the maximum.
+  """
+  # There is nothing to compare unless both bounds were supplied.
+  if args.min_speaker_count is None or args.max_speaker_count is None:
+    return
+  if args.min_speaker_count > args.max_speaker_count:
+    raise exceptions.InvalidArgumentException(
+        '--max-speaker-count',
+        '[--max-speaker-count] must be equal to or larger than'
+        ' min-speaker-count.',
+    )
+
+
+def ValidateAudioSource(args, batch=False):
+  """Checks that `args.audio` names an acceptable audio source.
+
+  A Google Cloud Storage URL is always accepted. Otherwise the source is only
+  accepted when `batch` is false and the value is an existing local file.
+
+  Args:
+    args: Parsed arguments exposing `audio`.
+    batch: When true, only Google Cloud Storage URLs are accepted.
+
+  Raises:
+    exceptions.InvalidArgumentException: If the audio source is not
+      acceptable for the requested mode.
+  """
+  if storage_util.ObjectReference.IsStorageUrl(args.audio):
+    return
+
+  if batch:
+    # Batch requests cannot read local files at all.
+    problem = (
+        'Invalid audio source [{}]. The source must be a Google Cloud'
+        ' Storage URL (such as gs://bucket/object).'
+    )
+  elif not os.path.isfile(args.audio):
+    problem = (
+        'Invalid audio source [{}]. The source must either be a local '
+        'path or a Google Cloud Storage URL '
+        '(such as gs://bucket/object).'
+    )
+  else:
+    return
+
+  raise exceptions.InvalidArgumentException(
+      '--audio', problem.format(args.audio)
+  )
+
+
+def ValidateDecodingConfig(args):
+  """Checks that the encoding flags form a consistent decoding config.
+
+  No validation is performed when `args.encoding` is None. 'AUTO' must not be
+  combined with a sample rate or channel count; every other accepted encoding
+  requires both of them.
+
+  Args:
+    args: Parsed arguments exposing `encoding`, `sample_rate` and
+      `audio_channel_count`.
+
+  Raises:
+    exceptions.InvalidArgumentException: If the encoding is not one of
+      ENCODING_OPTIONS or the accompanying flags contradict it.
+  """
+  encoding = args.encoding
+  if encoding is None:
+    return
+
+  if encoding not in ENCODING_OPTIONS:
+    raise exceptions.InvalidArgumentException(
+        '--encoding',
+        '[--encoding] must be set to one of '
+        + ', '.join(sorted(ENCODING_OPTIONS)),
+    )
+
+  if encoding == 'AUTO':
+    # Blame whichever conflicting flag was given, sample rate first.
+    if args.sample_rate is not None:
+      conflicting_flag = '--sample-rate'
+    elif args.audio_channel_count is not None:
+      conflicting_flag = '--audio-channel-count'
+    else:
+      return
+    raise exceptions.InvalidArgumentException(
+        conflicting_flag,
+        'AUTO encoding does not support setting sample rate or audio'
+        ' channel count.',
+    )
+
+  explicit_names = ', '.join(sorted(EXPLICIT_ENCODING_OPTIONS))
+  if args.sample_rate is None:
+    raise exceptions.InvalidArgumentException(
+        '--sample-rate',
+        '[--sample-rate] must be specified when configuring explicit'
+        ' encoding options ' + explicit_names + '.',
+    )
+  if args.audio_channel_count is None:
+    raise exceptions.InvalidArgumentException(
+        '--audio-channel-count',
+        '[--audio-channel-count] must be specified when configuring'
+        ' explicit encoding options ' + explicit_names,
+    )
+
--- a/login/google-cloud-sdk/lib/googlecloudsdk/command_lib/ml/speech/flags.py
+++ b/login/google-cloud-sdk/lib/googlecloudsdk/command_lib/ml/speech/flags.py
@@ -0,0 +1,486 @@
+# -*- coding: utf-8 -*- #
+# Copyright 2022 Google LLC. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Flags for speech commands."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import unicode_literals
+
+from googlecloudsdk.api_lib.util import apis
+from googlecloudsdk.calliope import actions
+from googlecloudsdk.calliope import arg_parsers
+from googlecloudsdk.calliope import exceptions
+from googlecloudsdk.command_lib.ml.speech import util
+from googlecloudsdk.command_lib.util.apis import arg_utils
+
+
+def GetEncodingTypeMapper(version):
+  """Builds the choice mapper that backs the `--encoding` flag.
+
+  Args:
+    version: Speech API version whose messages module supplies the
+      RecognitionConfig encoding enum.
+
+  Returns:
+    An arg_utils.ChoiceEnumMapper for `--encoding` that defaults to
+    'encoding-unspecified'.
+  """
+  encoding_enum = apis.GetMessagesModule(
+      util.SPEECH_API, version
+  ).RecognitionConfig.EncodingValueValuesEnum
+  encoding_help = (
+      'The type of encoding of the file. Required if the file format '
+      'is not WAV or FLAC.'
+  )
+  return arg_utils.ChoiceEnumMapper(
+      '--encoding',
+      encoding_enum,
+      default='encoding-unspecified',
+      help_str=encoding_help,
+  )
+
+
+class RecognizeArgsToRequestMapper:
+  """Utility class to map arguments to Recognize request."""
+
+  def __init__(self):
+    """Starts with no choice mappers; they are built when flags are added."""
+    # Set by AddRecognizeArgsToParser.
+    self._encoding_type_mapper = None
+    # Set by AddAlphaRecognizeArgsToParser.
+    self._device_type_mapper = None
+    self._microphone_distance_type_mapper = None
+    self._interaction_type_mapper = None
+    self._original_media_type_mapper = None
+
+  def AddRecognizeArgsToParser(self, parser, api_version):
+    """Add common, GA level flags for recognize commands.
+
+    Args:
+      parser: The parser to which the positional `audio` argument and the
+        recognize flags are added.
+      api_version: Speech API version handed to GetEncodingTypeMapper to build
+        the `--encoding` choices.
+    """
+    parser.add_argument(
+        'audio',
+        help='The location of the audio file to transcribe. '
+        'Must be a local path or a Google Cloud Storage URL '
+        '(in the format gs://bucket/object).')
+    # The language flags live in a required, mutually exclusive group:
+    # --language-code and its hidden, deprecated alias --language.
+    language_args = parser.add_group(mutex=True, required=True)
+    language_args.add_argument(
+        '--language-code',
+        help='The language of the supplied audio as a BCP-47 '
+        '(https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag. Example: '
+        '"en-US". See https://cloud.google.com/speech/docs/languages for a list '
+        'of the currently supported language codes.')
+    language_args.add_argument(
+        '--language',
+        action=actions.DeprecationAction(
+            '--language',
+            warn=('The `--language` flag is deprecated. '
+                  'Use the `--language-code` flag instead.')),
+        hidden=True,
+        help='The language of the supplied audio as a BCP-47 '
+        '(https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag. Example: '
+        '"en-US". See https://cloud.google.com/speech/docs/languages for a list '
+        'of the currently supported language codes.')
+    # The mapper is kept on the instance because MakeRecognitionConfig uses it
+    # to turn the chosen --encoding value back into an enum.
+    self._encoding_type_mapper = GetEncodingTypeMapper(api_version)
+    self._encoding_type_mapper.choice_arg.AddToParser(parser)
+    parser.add_argument(
+        '--sample-rate',
+        type=int,
+        required=False,
+        help='The sample rate in Hertz. For best results, set the sampling rate '
+        'of the audio source to 16000 Hz. If that\'s not possible, '
+        'use the native sample rate of the audio source '
+        '(instead of re-sampling).')
+
+    # NOTE(review): both flags below are required=True inside an optional
+    # group - presumably they must be supplied together; confirm against the
+    # calliope argument group semantics.
+    audio_channel_args = parser.add_group(
+        required=False, help='Audio channel settings.')
+    audio_channel_args.add_argument(
+        '--audio-channel-count',
+        type=int,
+        required=True,
+        help='The number of channels in the input audio data.  Set this for '
+        'separate-channel-recognition. Valid values are: '
+        '1)LINEAR16 and FLAC are 1-8 '
+        '2)OGG_OPUS are 1-254 '
+        '3) MULAW, AMR, AMR_WB and SPEEX_WITH_HEADER_BYTE is only `1`.')
+    audio_channel_args.add_argument(
+        '--separate-channel-recognition',
+        action='store_true',
+        required=True,
+        help='Recognition result will contain a `channel_tag` field to state '
+        'which channel that result belongs to. If this is not true, only '
+        'the first channel will be recognized.')
+
+    # The `*_enhanced` choices are not sent as-is: MakeRecognitionConfig maps
+    # them to a base model name plus useEnhanced=True.
+    parser.add_argument(
+        '--model',
+        choices={
+            'default': (
+                'audio that is not one of the specific audio models. '
+                'For example, long-form audio. '
+                'Ideally the audio is high-fidelity, recorded at a 16khz '
+                'or greater sampling rate.'
+            ),
+            'command_and_search': (
+                'short queries such as voice commands or voice search.'
+            ),
+            'latest_long': (
+                'Use this model for any kind of long form content such as media'
+                ' or spontaneous speech and conversations. Consider using this'
+                ' model in place of the video model, especially if the video'
+                ' model is not available in your target language. You can also'
+                ' use this in place of the default model.'
+            ),
+            'latest_short': (
+                'Use this model for short utterances that are a few seconds in'
+                ' length. It is useful for trying to capture commands or other'
+                ' single shot directed speech use cases. Consider using this'
+                ' model instead of the command and search model.'
+            ),
+            'medical_conversation': (
+                'Best for audio that originated from a conversation between a '
+                'medical provider and patient.'
+            ),
+            'medical_dictation': (
+                'Best for audio that originated from dictation notes by a'
+                ' medical provider.'
+            ),
+            'phone_call': (
+                'audio that originated from a phone call (typically recorded at'
+                ' an 8khz sampling rate).'
+            ),
+            'phone_call_enhanced': (
+                'audio that originated from a phone call (typically recorded at'
+                ' an 8khz sampling rate). This is a premium model and can'
+                ' produce better results but costs more than the standard rate.'
+            ),
+            'telephony': (
+                'Improved version of the "phone_call" model, best for audio '
+                'that originated from a phone call, typically recorded at an '
+                '8kHz sampling rate.'
+            ),
+            'telephony_short': (
+                'Dedicated version of the modern "telephony" model for short '
+                'or even single-word utterances for audio that originated from '
+                'a phone call, typically recorded at an 8kHz sampling rate.'
+            ),
+            'video_enhanced': (
+                'audio that originated from video or includes multiple'
+                ' speakers. Ideally the audio is recorded at a 16khz or greater'
+                ' sampling rate. This is a premium model that costs more than'
+                ' the standard rate.'
+            ),
+        },
+        help=(
+            'Select the model best suited to your domain to get best results.'
+            ' If you do not explicitly specify a model, Speech-to-Text will'
+            ' auto-select a model based on your other specified parameters.'
+            ' Some models are premium and cost more than standard models'
+            ' (although you can reduce the price by opting into'
+            ' https://cloud.google.com/speech-to-text/docs/data-logging)'
+        ),
+    )
+
+    parser.add_argument(
+        '--max-alternatives',
+        type=int,
+        default=1,
+        help='Maximum number of recognition hypotheses to be returned. '
+        'The server may return fewer than max_alternatives. '
+        'Valid values are 0-30. A value of 0 or 1 will return a maximum '
+        'of one.')
+    parser.add_argument(
+        '--hints',
+        type=arg_parsers.ArgList(),
+        metavar='HINT',
+        default=[],
+        help='A list of strings containing word and phrase "hints" so that the '
+        'speech recognition is more likely to recognize them. This can be '
+        'used to improve the accuracy for specific words and phrases, '
+        'for example, if specific commands are typically spoken by '
+        'the user. This can also be used to add additional words to the '
+        'vocabulary of the recognizer. '
+        'See https://cloud.google.com/speech/limits#content.')
+    parser.add_argument(
+        '--include-word-time-offsets',
+        action='store_true',
+        default=False,
+        help='If True, the top result includes a list of words with the start '
+        'and end time offsets (timestamps) for those words. If False, '
+        'no word-level time offset information is returned.')
+    parser.add_argument(
+        '--filter-profanity',
+        action='store_true',
+        default=False,
+        help='If True, the server will attempt to filter out profanities, '
+        'replacing all but the initial character in each filtered word with '
+        'asterisks, e.g. ```f***```.')
+    parser.add_argument(
+        '--enable-automatic-punctuation',
+        action='store_true',
+        help='Adds punctuation to recognition result hypotheses.')
+
+  def MakeRecognitionConfig(self, args, messages):
+    """Builds a RecognitionConfig message from the parsed recognize flags.
+
+    Args:
+      args: Parsed arguments from a parser that was set up with
+        AddRecognizeArgsToParser.
+      messages: Messages module providing RecognitionConfig and SpeechContext.
+
+    Returns:
+      The populated RecognitionConfig message.
+    """
+    # --language-code wins; the deprecated --language is only a fallback.
+    language_code = args.language_code or args.language
+    # Rewrite the value into the lower-case, dash-separated choice form before
+    # asking the mapper for the enum.
+    encoding_choice = args.encoding.replace('_', '-').lower()
+    config = messages.RecognitionConfig(
+        languageCode=language_code,
+        encoding=self._encoding_type_mapper.GetEnumForChoice(encoding_choice),
+        sampleRateHertz=args.sample_rate,
+        audioChannelCount=args.audio_channel_count,
+        maxAlternatives=args.max_alternatives,
+        enableWordTimeOffsets=args.include_word_time_offsets,
+        enableSeparateRecognitionPerChannel=args.separate_channel_recognition,
+        profanityFilter=args.filter_profanity,
+        speechContexts=[messages.SpeechContext(phrases=args.hints)])
+    if args.enable_automatic_punctuation:
+      config.enableAutomaticPunctuation = args.enable_automatic_punctuation
+
+    requested_model = args.model
+    if requested_model is None:
+      return config
+
+    # Models that are forwarded under the name the user chose.
+    passthrough_models = (
+        'default',
+        'command_and_search',
+        'phone_call',
+        'latest_long',
+        'latest_short',
+        'medical_conversation',
+        'medical_dictation',
+        'telephony',
+        'telephony_short',
+    )
+    # "Enhanced" choices are sent as a base model plus the useEnhanced switch.
+    enhanced_base_models = {
+        'phone_call_enhanced': 'phone_call',
+        'video_enhanced': 'video',
+    }
+    if requested_model in passthrough_models:
+      config.model = requested_model
+    elif requested_model in enhanced_base_models:
+      config.model = enhanced_base_models[requested_model]
+      config.useEnhanced = True
+    return config
+
+  def AddBetaRecognizeArgsToParser(self, parser):
+    """Add beta arguments.
+
+    Args:
+      parser: The parser to which the beta-level recognize flags are added.
+    """
+    parser.add_argument(
+        '--additional-language-codes',
+        type=arg_parsers.ArgList(),
+        default=[],
+        metavar='LANGUAGE_CODE',
+        help="""\
+The BCP-47 language tags of other languages that the speech may be in.
+Up to 3 can be provided.
+
+If alternative languages are listed, recognition result will contain recognition
+in the most likely language detected including the main language-code.""")
+
+    # Speaker diarization flags. UpdateBetaArgsInRecognitionConfig rejects the
+    # deprecated --diarization-speaker-count when it is combined with the
+    # min/max variants.
+    speaker_args = parser.add_group(required=False)
+    speaker_args.add_argument(
+        '--diarization-speaker-count',
+        type=int,
+        hidden=True,
+        action=actions.DeprecationAction(
+            '--diarization-speaker-count',
+            warn=('The `--diarization-speaker-count` flag is deprecated. '
+                  'Use the `--min-diarization-speaker-count` and/or '
+                  '`--max-diarization-speaker-count` flag instead.')),
+        help='Estimated number of speakers in the conversation '
+        'being recognized.')
+    speaker_args.add_argument(
+        '--min-diarization-speaker-count',
+        type=int,
+        help='Minimum estimated number of speakers in the conversation '
+        'being recognized.')
+    speaker_args.add_argument(
+        '--max-diarization-speaker-count',
+        type=int,
+        help='Maximum estimated number of speakers in the conversation '
+        'being recognized.')
+    # NOTE(review): required=True inside an optional group - presumably this
+    # flag must accompany any other flag of the group; confirm against the
+    # calliope argument group semantics.
+    speaker_args.add_argument(
+        '--enable-speaker-diarization',
+        action='store_true',
+        required=True,
+        help='Enable speaker detection for each recognized word in the top '
+        'alternative of the recognition result using an integer '
+        'speaker_tag provided in the WordInfo.')
+
+    parser.add_argument(
+        '--include-word-confidence',
+        action='store_true',
+        help='Include a list of words and the confidence for those words in '
+        'the top result.')
+
+  def UpdateBetaArgsInRecognitionConfig(self, args, config):
+    """Copies the beta-level flags onto an existing RecognitionConfig.
+
+    Args:
+      args: Parsed arguments from a parser that was set up with
+        AddBetaRecognizeArgsToParser.
+      config: The RecognitionConfig message to update in place.
+
+    Raises:
+      exceptions.InvalidArgumentException: If the deprecated
+        --diarization-speaker-count flag is combined with the min/max
+        speaker count flags.
+    """
+    config.alternativeLanguageCodes = args.additional_language_codes
+
+    min_speakers = args.min_diarization_speaker_count
+    max_speakers = args.max_diarization_speaker_count
+    legacy_speakers = args.diarization_speaker_count
+    # Using any diarization flag switches diarization on.
+    wants_diarization = any((
+        args.enable_speaker_diarization,
+        min_speakers,
+        max_speakers,
+        legacy_speakers,
+    ))
+    if wants_diarization:
+      diarization_type = config.field_by_name('diarizationConfig').message_type
+      speaker_config = diarization_type(enableSpeakerDiarization=True)
+      config.diarizationConfig = speaker_config
+      if min_speakers:
+        speaker_config.minSpeakerCount = min_speakers
+      if max_speakers:
+        speaker_config.maxSpeakerCount = max_speakers
+      if legacy_speakers:
+        # The legacy flag pins both bounds, so it cannot coexist with them.
+        if min_speakers or max_speakers:
+          raise exceptions.InvalidArgumentException(
+              '--diarization-speaker-count',
+              'deprecated flag cannot be used with '
+              '--max/min_diarization_speaker_count flags')
+        speaker_config.minSpeakerCount = legacy_speakers
+        speaker_config.maxSpeakerCount = legacy_speakers
+
+    config.enableWordConfidence = args.include_word_confidence
+
+  def AddAlphaRecognizeArgsToParser(self, parser, api_version):
+    """Add alpha arguments.
+
+    Args:
+      parser: The parser to which the recognition metadata flag group is
+        added.
+      api_version: Speech API version handed to the Get*Mapper helpers that
+        build the enum-backed metadata flags.
+    """
+    meta_args = parser.add_group(
+        required=False,
+        help='Description of audio data to be recognized. '
+        'Note that the Google Cloud Speech-to-text-api does not use this '
+        'information, and only passes it through back into response.')
+    meta_args.add_argument(
+        '--naics-code',
+        action=MakeDeprecatedRecgonitionFlagAction('naics-code'),
+        type=int,
+        help='The industry vertical to which this speech recognition request '
+        'most closely applies.')
+    # The mappers are kept on the instance because
+    # UpdateAlphaArgsInRecognitionConfig uses them to turn the chosen values
+    # back into enums.
+    self._original_media_type_mapper = GetOriginalMediaTypeMapper(api_version)
+    self._original_media_type_mapper.choice_arg.AddToParser(meta_args)
+    self._interaction_type_mapper = GetInteractionTypeMapper(api_version)
+    self._interaction_type_mapper.choice_arg.AddToParser(meta_args)
+    self._microphone_distance_type_mapper = GetMicrophoneDistanceTypeMapper(
+        api_version)
+    self._microphone_distance_type_mapper.choice_arg.AddToParser(meta_args)
+    self._device_type_mapper = GetRecordingDeviceTypeMapper(api_version)
+    self._device_type_mapper.choice_arg.AddToParser(meta_args)
+    meta_args.add_argument(
+        '--recording-device-name',
+        action=MakeDeprecatedRecgonitionFlagAction('recording-device-name'),
+        help='The device used to make the recording.  Examples: `Nexus 5X`, '
+        '`Polycom SoundStation IP 6000`')
+    meta_args.add_argument(
+        '--original-mime-type',
+        action=MakeDeprecatedRecgonitionFlagAction('original-mime-type'),
+        help='Mime type of the original audio file. Examples: `audio/m4a`, '
+        ' `audio/mp3`.')
+    meta_args.add_argument(
+        '--audio-topic',
+        action=MakeDeprecatedRecgonitionFlagAction('audio-topic'),
+        help='Description of the content, e.g. "Recordings of federal supreme '
+        'court hearings from 2012".')
+
+  def UpdateAlphaArgsInRecognitionConfig(self, args, config):
+    """Copies the alpha-level metadata flags onto a RecognitionConfig.
+
+    The config is left untouched when none of the metadata flags was given.
+    Otherwise `config.metadata` is created if it is missing and every
+    metadata field is assigned from its flag.
+
+    Args:
+      args: Parsed arguments from a parser that was set up with
+        AddAlphaRecognizeArgsToParser.
+      config: The RecognitionConfig message to update in place.
+    """
+    metadata_flag_values = (
+        args.interaction_type,
+        args.original_media_type,
+        args.naics_code,
+        args.microphone_distance,
+        args.recording_device_type,
+        args.recording_device_name,
+        args.original_mime_type,
+        args.audio_topic,
+    )
+    if all(value is None for value in metadata_flag_values):
+      return
+
+    if config.metadata is None:
+      config.metadata = config.field_by_name('metadata').message_type()
+    metadata = config.metadata
+
+    interaction = self._interaction_type_mapper.GetEnumForChoice(
+        args.interaction_type)
+    metadata.interactionType = interaction
+    original_media = self._original_media_type_mapper.GetEnumForChoice(
+        args.original_media_type)
+    metadata.originalMediaType = original_media
+    metadata.industryNaicsCodeOfAudio = args.naics_code
+    distance = self._microphone_distance_type_mapper.GetEnumForChoice(
+        args.microphone_distance)
+    metadata.microphoneDistance = distance
+    device_type = self._device_type_mapper.GetEnumForChoice(
+        args.recording_device_type)
+    metadata.recordingDeviceType = device_type
+    metadata.recordingDeviceName = args.recording_device_name
+    metadata.originalMimeType = args.original_mime_type
+    metadata.audioTopic = args.audio_topic
+
+
+def MakeDeprecatedRecgonitionFlagAction(flag_name):
+  """Returns a deprecation action for a pass-through recognition flag.
+
+  Args:
+    flag_name: Name of the flag without the leading dashes, for example
+      'naics-code'.
+
+  Returns:
+    The action built by actions.DeprecationAction for `--<flag_name>`.
+  """
+  flag = '--' + flag_name
+  # The warning names the flag the way users type it (with the leading
+  # dashes), matching the `--language` and `--diarization-speaker-count`
+  # deprecation warnings.
+  return actions.DeprecationAction(
+      flag,
+      warn='The `{}` flag is deprecated and will be removed. '
+      'The Google Cloud Speech-to-text api does not use it, and only '
+      'passes it through back into response.'.format(flag))
+
+
+def GetRecordingDeviceTypeMapper(version):
+  """Builds the choice mapper for the `--recording-device-type` flag.
+
+  Args:
+    version: Speech API version whose messages module supplies the
+      RecognitionMetadata recording device type enum.
+
+  Returns:
+    An arg_utils.ChoiceEnumMapper that offers every enum value whose name
+    does not end in 'UNSPECIFIED'.
+  """
+  device_enum = apis.GetMessagesModule(
+      util.SPEECH_API, version
+  ).RecognitionMetadata.RecordingDeviceTypeValueValuesEnum
+  # Enum name -> (flag choice, help text).
+  device_choices = {
+      'SMARTPHONE': ('smartphone', 'Speech was recorded on a smartphone.'),
+      'PC': (
+          'pc',
+          'Speech was recorded using a personal computer or tablet.',
+      ),
+      'PHONE_LINE': ('phone-line', 'Speech was recorded over a phone line.'),
+      'VEHICLE': ('vehicle', 'Speech was recorded in a vehicle.'),
+      'OTHER_OUTDOOR_DEVICE': ('outdoor', 'Speech was recorded outdoors.'),
+      'OTHER_INDOOR_DEVICE': ('indoor', 'Speech was recorded indoors.'),
+  }
+
+  def _IsSpecified(enum_name):
+    return not enum_name.endswith('UNSPECIFIED')
+
+  return arg_utils.ChoiceEnumMapper(
+      '--recording-device-type',
+      device_enum,
+      action=MakeDeprecatedRecgonitionFlagAction('recording-device-type'),
+      custom_mappings=device_choices,
+      help_str=(
+          'The device type through which the original audio was '
+          'recorded on.'
+      ),
+      include_filter=_IsSpecified,
+  )
+
+
+def GetMicrophoneDistanceTypeMapper(version):
+  """Builds the choice mapper for the `--microphone-distance` flag.
+
+  Args:
+    version: Speech API version whose messages module supplies the
+      RecognitionMetadata microphone distance enum.
+
+  Returns:
+    An arg_utils.ChoiceEnumMapper that offers every enum value whose name
+    does not end in 'UNSPECIFIED'.
+  """
+  distance_enum = apis.GetMessagesModule(
+      util.SPEECH_API, version
+  ).RecognitionMetadata.MicrophoneDistanceValueValuesEnum
+  # Enum name -> (flag choice, help text). The embedded newline in the
+  # nearfield help text is part of the original wording.
+  distance_choices = {
+      'NEARFIELD': (
+          'nearfield',
+          'The audio was captured from a microphone close to the speaker, '
+          'generally within\n 1 meter. Examples include a phone, dictaphone, '
+          'or handheld microphone.',
+      ),
+      'MIDFIELD': (
+          'midfield',
+          'The speaker is within 3 meters of the microphone.',
+      ),
+      'FARFIELD': (
+          'farfield',
+          'The speaker is more than 3 meters away from the microphone.',
+      ),
+  }
+
+  def _IsSpecified(enum_name):
+    return not enum_name.endswith('UNSPECIFIED')
+
+  return arg_utils.ChoiceEnumMapper(
+      '--microphone-distance',
+      distance_enum,
+      action=MakeDeprecatedRecgonitionFlagAction('microphone-distance'),
+      custom_mappings=distance_choices,
+      help_str=(
+          'The distance at which the audio device is placed to record '
+          'the conversation.'
+      ),
+      include_filter=_IsSpecified,
+  )
+
+
+def GetInteractionTypeMapper(version):
+  """Builds the choice mapper for the `--interaction-type` flag.
+
+  Args:
+    version: Speech API version whose messages module supplies the
+      RecognitionMetadata interaction type enum.
+
+  Returns:
+    An arg_utils.ChoiceEnumMapper that offers every enum value whose name
+    does not end in 'UNSPECIFIED'.
+  """
+  interaction_enum = apis.GetMessagesModule(
+      util.SPEECH_API, version
+  ).RecognitionMetadata.InteractionTypeValueValuesEnum
+  # Enum name -> (flag choice, help text).
+  interaction_choices = {
+      'DICTATION': (
+          'dictation',
+          'Transcribe speech to text to create a written document, such as '
+          'a text-message, email or report.',
+      ),
+      'DISCUSSION': (
+          'discussion',
+          'Multiple people in a conversation or discussion.',
+      ),
+      'PRESENTATION': (
+          'presentation',
+          'One or more persons lecturing or presenting to '
+          'others, mostly uninterrupted.',
+      ),
+      'PHONE_CALL': (
+          'phone-call',
+          'A phone-call or video-conference in which two or more people, '
+          'who are not in the same room, are actively participating.',
+      ),
+      'PROFESSIONALLY_PRODUCED': (
+          'professionally-produced',
+          'Professionally produced audio (eg. TV Show, Podcast).',
+      ),
+      'VOICE_COMMAND': (
+          'voice-command',
+          'Transcribe voice commands, such as for controlling a device.',
+      ),
+      'VOICE_SEARCH': (
+          'voice-search',
+          'Transcribe spoken questions and queries into text.',
+      ),
+      'VOICEMAIL': (
+          'voicemail',
+          'A recorded message intended for another person to listen to.',
+      ),
+  }
+
+  def _IsSpecified(enum_name):
+    return not enum_name.endswith('UNSPECIFIED')
+
+  return arg_utils.ChoiceEnumMapper(
+      '--interaction-type',
+      interaction_enum,
+      action=MakeDeprecatedRecgonitionFlagAction('interaction-type'),
+      custom_mappings=interaction_choices,
+      help_str='Determining the interaction type in the conversation.',
+      include_filter=_IsSpecified,
+  )
+
+
+def GetOriginalMediaTypeMapper(version):
+  """Builds the choice mapper for the `--original-media-type` flag.
+
+  Args:
+    version: Speech API version whose messages module supplies the
+      RecognitionMetadata original media type enum.
+
+  Returns:
+    An arg_utils.ChoiceEnumMapper that offers every enum value whose name
+    does not end in 'UNSPECIFIED'.
+  """
+  media_enum = apis.GetMessagesModule(
+      util.SPEECH_API, version
+  ).RecognitionMetadata.OriginalMediaTypeValueValuesEnum
+  # Enum name -> (flag choice, help text).
+  media_choices = {
+      'AUDIO': ('audio', 'The speech data is an audio recording.'),
+      'VIDEO': ('video', 'The speech data originally recorded on a video.'),
+  }
+
+  def _IsSpecified(enum_name):
+    return not enum_name.endswith('UNSPECIFIED')
+
+  return arg_utils.ChoiceEnumMapper(
+      '--original-media-type',
+      media_enum,
+      action=MakeDeprecatedRecgonitionFlagAction('original-media-type'),
+      custom_mappings=media_choices,
+      help_str='The media type of the original audio conversation.',
+      include_filter=_IsSpecified,
+  )
--- a/login/google-cloud-sdk/lib/googlecloudsdk/command_lib/ml/speech/flags_v2.py
+++ b/login/google-cloud-sdk/lib/googlecloudsdk/command_lib/ml/speech/flags_v2.py
@@ -0,0 +1,330 @@
+# -*- coding: utf-8 -*- #
+# Copyright 2022 Google LLC. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Flags for speech commands."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import unicode_literals
+
+from googlecloudsdk.calliope import arg_parsers
+from googlecloudsdk.calliope import base
+from googlecloudsdk.calliope.concepts import concepts
+from googlecloudsdk.command_lib.util.apis import yaml_data
+from googlecloudsdk.command_lib.util.concepts import concept_parsers
+from googlecloudsdk.command_lib.util.concepts import presentation_specs
+
+
+# Bounds for the numeric flags defined in this module. Each MIN/MAX pair is
+# passed to arg_parsers.BoundedInt for the matching flag and is also formatted
+# into that flag's help text.
+
+# --min-speaker-count / --max-speaker-count
+SPEAKER_COUNT_MAX_VALUE = 6
+SPEAKER_COUNT_MIN_VALUE = 1
+# --max-alternatives
+ALTERNATIVES_MAX_VALUE = 30
+ALTERNATIVES_MIN_VALUE = 1
+# --audio-channel-count
+AUDIO_CHANNEL_COUNT_MAX_VALUE = 8
+AUDIO_CHANNEL_COUNT_MIN_VALUE = 1
+# --sample-rate (help text describes the value as Hertz)
+SAMPLE_RATE_MAX_VALUE = 48000
+SAMPLE_RATE_MIN_VALUE = 8000
+
+
+def AddRecognizerArgToParser(parser):
+  """Sets up an argument for the recognizer resource."""
+  resource_data = yaml_data.ResourceYAMLData.FromPath('ml.speech.recognizer')
+  resource_spec = concepts.ResourceSpec.FromYaml(
+      resource_data.GetData(), api_version='v2'
+  )
+  presentation_spec = presentation_specs.ResourcePresentationSpec(
+      name='recognizer',
+      concept_spec=resource_spec,
+      required=True,
+      group_help='recognizer.',
+  )
+  return concept_parsers.ConceptParser([presentation_spec]).AddToParser(parser)
+
+
+def AddLocationArgToParser(parser):
+  """Parses location flag."""
+  location_data = yaml_data.ResourceYAMLData.FromPath('ml.speech.location')
+  resource_spec = concepts.ResourceSpec.FromYaml(location_data.GetData())
+  presentation_spec = presentation_specs.ResourcePresentationSpec(
+      name='--location',
+      concept_spec=resource_spec,
+      required=True,
+      group_help='location.',
+  )
+  return concept_parsers.ConceptParser([presentation_spec]).AddToParser(parser)
+
+
+def AddLocationPositionalArgToParser(parser):
+  """Parses location when there is no flag."""
+  location_data = yaml_data.ResourceYAMLData.FromPath('ml.speech.location')
+  resource_spec = concepts.ResourceSpec.FromYaml(location_data.GetData())
+  presentation_spec = presentation_specs.ResourcePresentationSpec(
+      name='location',
+      concept_spec=resource_spec,
+      required=True,
+      group_help='location.',
+  )
+  return concept_parsers.ConceptParser([presentation_spec]).AddToParser(parser)
+
+
+def AddAllFlagsToParser(
+    parser, require_base_recognizer_attributes=False, use_store_true=False
+):
+  """Parses all flags for v2 STT API."""
+  AddRecognizerArgToParser(parser)
+  AddAsyncFlagToParser(parser)
+  parser.add_argument(
+      '--display-name',
+      help="""\
+      Name of this recognizer as it appears in UIs.
+      """,
+  )
+  AddBaseRecognizerAttributeFlagsToParser(
+      parser, required=require_base_recognizer_attributes
+  )
+  AddFeatureFlagsToParser(parser, use_store_true)
+  AddDecodingConfigFlagsToParser(parser)
+
+
+def AddRecognizeRequestFlagsToParser(parser, add_async_flag=False):
+  """Parses all flags for v2 STT API for command run-batch."""
+  AddRecognizerArgToParser(parser)
+  parser.add_argument(
+      '--audio',
+      required=True,
+      help=(
+          'Location of the audio file to transcribe. '
+          'Must be a audio data bytes, local file, or Google Cloud Storage URL '
+          '(in the format gs://bucket/object).'
+      ),
+  )
+  AddFeatureFlagsToParser(parser)
+  AddDecodingConfigFlagsToParser(parser)
+  AddBaseRecognizerAttributeFlagsToParser(parser)
+  parser.add_argument(
+      '--hint-phrases',
+      metavar='PHRASE',
+      type=arg_parsers.ArgList(),
+      help="""\
+        A list of strings containing word and phrase "hints" so that the '
+        'speech recognition is more likely to recognize them. This can be '
+        'used to improve the accuracy for specific words and phrases, '
+        'for example, if specific commands are typically spoken by '
+        'the user. This can also be used to add additional words to the '
+        'vocabulary of the recognizer. '
+        'See https://cloud.google.com/speech/limits#content.
+      """,
+  )
+  parser.add_argument(
+      '--hint-phrase-sets',
+      metavar='PHRASE_SET',
+      type=arg_parsers.ArgList(),
+      help="""\
+        A list of phrase set resource names to use for speech recognition.
+      """,
+  )
+  parser.add_argument(
+      '--hint-boost',
+      type=arg_parsers.BoundedFloat(1, 20),
+      help="""\
+        Boost value for the phrases passed to --phrases.
+        Can have a value between 1 and 20.
+      """,
+  )
+
+  if add_async_flag:
+    AddAsyncFlagToParser(parser)
+
+
+def AddAsyncFlagToParser(parser):
+  """Adds async flag to parser."""
+  base.ASYNC_FLAG.AddToParser(parser)
+  base.ASYNC_FLAG.SetDefault(parser, False)
+
+
+def AddBaseRecognizerAttributeFlagsToParser(parser, required=False):
+  """Adds base recognizer attribute flags to parser."""
+  parser.add_argument(
+      '--model',
+      required=required,
+      help="""\
+          Which model to use for recognition requests.
+          Select the model best suited to your domain to get best results.
+          Guidance for choosing which model to use can be found in the
+          [Transcription Models Documentation](https://cloud.google.com/speech-to-text/v2/docs/transcription-model)
+          and the models supported in each region can be found in the
+          [Table Of Supported Models](https://cloud.google.com/speech-to-text/v2/docs/speech-to-text-supported-languages).
+          """,
+  )
+  parser.add_argument(
+      '--language-codes',
+      metavar='LANGUAGE_CODE',
+      required=required,
+      type=arg_parsers.ArgList(),
+      help="""\
+          Language code is one of `en-US`, `en-GB`, `fr-FR`.
+          Check [documentation](https://cloud.google.com/speech-to-text/docs/multiple-languages)
+          for using more than one language code.
+          """,
+  )
+
+
+def AddDecodingConfigFlagsToParser(parser):
+  """Adds decoding config flags to parser."""
+  decoding_config_group = parser.add_group(help='Encoding format')
+  decoding_config_group.add_argument(
+      '--encoding',
+      help="""\
+          Encoding format of the provided audio.
+          For headerless formats, must be set to `LINEAR16`, `MULAW,` or `ALAW`.
+          For other formats, set to `AUTO`. Overrides the recognizer
+          configuration if present, else uses recognizer encoding.
+          """,
+  )
+  sample_rate_help = (
+      'Sample rate in Hertz of the audio data sent for recognition. '
+      'Required if --encoding flag is specified and is not AUTO. '
+      'Must be set to a value between {} and {}.'.format(
+          SAMPLE_RATE_MIN_VALUE, SAMPLE_RATE_MAX_VALUE
+      )
+  )
+  decoding_config_group.add_argument(
+      '--sample-rate',
+      type=arg_parsers.BoundedInt(SAMPLE_RATE_MIN_VALUE, SAMPLE_RATE_MAX_VALUE),
+      help=sample_rate_help,
+  )
+  audio_channel_count_help = (
+      'Number of channels present in the audio data sent for recognition. '
+      'Required if --encoding flag is specified and is not AUTO. '
+      'Must be set to a value between {} and {}.'.format(
+          AUDIO_CHANNEL_COUNT_MIN_VALUE, AUDIO_CHANNEL_COUNT_MAX_VALUE
+      )
+  )
+  decoding_config_group.add_argument(
+      '--audio-channel-count',
+      type=arg_parsers.BoundedInt(
+          AUDIO_CHANNEL_COUNT_MIN_VALUE, AUDIO_CHANNEL_COUNT_MAX_VALUE
+      ),
+      help=audio_channel_count_help,
+  )
+
+
+def AddFeatureFlagsToParser(parser, use_store_true=False):
+  """Adds feature flags to parser."""
+  features_group = parser.add_group(help='ASR Features')
+  speaker_diarization_group = features_group.add_group(
+      help='Speaker Diarization'
+  )
+  features_group.add_argument(
+      '--profanity-filter',
+      action='store_true'
+      if use_store_true
+      else arg_parsers.StoreTrueFalseAction,
+      help="""\
+      If set, the server will censor profanities.
+      """,
+  )
+  features_group.add_argument(
+      '--enable-word-time-offsets',
+      action='store_true'
+      if use_store_true
+      else arg_parsers.StoreTrueFalseAction,
+      help="""\
+      If set, the top result includes a list of words and their timestamps.
+      """,
+  )
+  features_group.add_argument(
+      '--enable-word-confidence',
+      action='store_true'
+      if use_store_true
+      else arg_parsers.StoreTrueFalseAction,
+      help="""\
+      If set, the top result includes a list of words and the confidence for
+      those words.
+      """,
+  )
+  features_group.add_argument(
+      '--enable-automatic-punctuation',
+      action='store_true'
+      if use_store_true
+      else arg_parsers.StoreTrueFalseAction,
+      help="""\
+      If set, adds punctuation to recognition result hypotheses.
+      """,
+  )
+  features_group.add_argument(
+      '--enable-spoken-punctuation',
+      action='store_true'
+      if use_store_true
+      else arg_parsers.StoreTrueFalseAction,
+      help="""\
+      If set, replaces spoken punctuation with the corresponding symbols in the request.
+      """,
+  )
+  features_group.add_argument(
+      '--enable-spoken-emojis',
+      action='store_true'
+      if use_store_true
+      else arg_parsers.StoreTrueFalseAction,
+      help="""\
+      If set, adds spoken emoji formatting.
+      """,
+  )
+  min_speaker_count_help = (
+      'Minimum number of speakers in the conversation. Must be less than or'
+      ' equal to --max-speaker-count. Must be set to a value between {} and {}.'
+      .format(SPEAKER_COUNT_MIN_VALUE, SPEAKER_COUNT_MAX_VALUE)
+  )
+  max_speaker_count_help = (
+      'Maximum number of speakers in the conversation. Must be greater than or'
+      ' equal to --min-speaker-count. Must be set to a value between {} and {}.'
+      .format(SPEAKER_COUNT_MIN_VALUE, SPEAKER_COUNT_MAX_VALUE)
+  )
+  speaker_diarization_group.add_argument(
+      '--min-speaker-count',
+      required=True,
+      type=arg_parsers.BoundedInt(
+          SPEAKER_COUNT_MIN_VALUE, SPEAKER_COUNT_MAX_VALUE
+      ),
+      help=min_speaker_count_help,
+  )
+  speaker_diarization_group.add_argument(
+      '--max-speaker-count',
+      required=True,
+      type=arg_parsers.BoundedInt(
+          SPEAKER_COUNT_MIN_VALUE, SPEAKER_COUNT_MAX_VALUE
+      ),
+      help=max_speaker_count_help,
+  )
+  features_group.add_argument(
+      '--separate-channel-recognition',
+      action='store_true'
+      if use_store_true
+      else arg_parsers.StoreTrueFalseAction,
+      help="""\
+        Mode for recognizing multi-channel audio using Separate Channel Recognition.
+        When set, the service will recognize each channel independently.
+        """,
+  )
+  max_alternatives_help = (
+      'Maximum number of recognition hypotheses to be returned. Must be set to'
+      ' a value between {} and {}.'.format(
+          ALTERNATIVES_MIN_VALUE, ALTERNATIVES_MAX_VALUE
+      )
+  )
+  features_group.add_argument(
+      '--max-alternatives',
+      type=arg_parsers.BoundedInt(
+          ALTERNATIVES_MIN_VALUE, ALTERNATIVES_MAX_VALUE
+      ),
+      help=max_alternatives_help,
+  )
--- a/login/google-cloud-sdk/lib/googlecloudsdk/command_lib/ml/speech/resources.yaml
+++ b/login/google-cloud-sdk/lib/googlecloudsdk/command_lib/ml/speech/resources.yaml
@@ -0,0 +1,40 @@
+# Resource definitions for the speech commands. Each top-level key names one
+# resource and gives its API collection plus the attributes that identify it.
+# NOTE(review): flags_v2.py loads 'ml.speech.recognizer' and
+# 'ml.speech.location' through yaml_data.ResourceYAMLData.FromPath, presumably
+# resolving to the entries below - confirm.
+project:
+  name: project
+  collection: speech.projects
+  attributes:
+  # Anchored as `project` so the resources below can reuse it through the
+  # *project alias.
+  # NOTE(review): `property: core/project` presumably names the property used
+  # when the attribute is not given explicitly - confirm.
+  - &project
+    parameter_name: projectsId
+    attribute_name: project
+    help: |
+      Project of the {resource}.
+    property: core/project
+
+location:
+  name: location
+  collection: speech.projects.locations
+  attributes:
+  - *project
+  # Anchored as `location`; reused by the recognizer resource below.
+  - &location
+    parameter_name: locationsId
+    attribute_name: location
+    help: |
+      Location of the {resource}.
+
+# Unlike the other resources, operation lists only its own ID attribute and
+# does not reuse the project/location attributes.
+operation:
+  name: operation
+  collection: speech.operations
+  attributes:
+  - parameter_name: operationsId
+    attribute_name: operation
+    help: The ID of the operation
+
+recognizer:
+  name: recognizer
+  collection: speech.projects.locations.recognizers
+  attributes:
+  - *project
+  - *location
+  # The `recognizer` anchor is not referenced by any alias in this file.
+  - &recognizer
+    parameter_name: recognizersId
+    attribute_name: recognizer
+    help: Speech-to-text recognizer.
--- a/login/google-cloud-sdk/lib/googlecloudsdk/command_lib/ml/speech/util.py
+++ b/login/google-cloud-sdk/lib/googlecloudsdk/command_lib/ml/speech/util.py
@@ -0,0 +1,142 @@
+# -*- coding: utf-8 -*- #
+# Copyright 2017 Google LLC. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Wrapper for interacting with speech API."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import unicode_literals
+
+import os
+
+from googlecloudsdk.api_lib.storage import storage_util
+from googlecloudsdk.api_lib.util import apis
+from googlecloudsdk.core import exceptions
+from googlecloudsdk.core import log
+from googlecloudsdk.core import properties
+from googlecloudsdk.core.console import console_io
+from googlecloudsdk.core.util import files
+
+from six.moves import urllib
+
+
+# API name passed to apis.GetMessagesModule when building speech messages.
+SPEECH_API = 'speech'
+# Default API version used by GetAudioHook when the caller supplies none.
+SPEECH_API_VERSION = 'v1'
+
+
+# Template for the UriFormatError message raised by ValidateOutputUri; the
+# placeholder receives the rejected URI.
+OUTPUT_ERROR_MESSAGE = ('[{}] is not a valid format for result output. Must be '
+                        'a Google Cloud Storage URI '
+                        '(format: gs://bucket/file).')
+
+
+class Error(exceptions.Error):
+  """Base class for the exceptions defined in this module."""
+
+
+class AudioException(Error):
+  """Raised if the audio source is neither a local file nor a GCS URL."""
+
+
+class UriFormatError(Error):
+  """Raised if the specified output URI is not a Google Cloud Storage URL."""
+
+
+def GetRecognitionAudioFromPath(path, version):
+  """Determine whether path to audio is local, set RecognitionAudio message."""
+  messages = apis.GetMessagesModule(SPEECH_API, version)
+  audio = messages.RecognitionAudio()
+
+  if os.path.isfile(path):
+    audio.content = files.ReadBinaryFileContents(path)
+  elif storage_util.ObjectReference.IsStorageUrl(path):
+    audio.uri = path
+  else:
+    raise AudioException(
+        'Invalid audio source [{}]. The source must either be a local path '
+        'or a Google Cloud Storage URL (such as gs://bucket/object).'.format(
+            path))
+  return audio
+
+
+def GetAudioHook(version=SPEECH_API_VERSION):
+  """Returns a hook to get the RecognitionAudio message for an API version."""
+  def GetAudioFromPath(path):
+    """Determine whether path to audio is local, build RecognitionAudio message.
+
+    Args:
+      path: str, the path to the audio.
+
+    Raises:
+      AudioException: If audio is not found locally and does not appear to be
+        Google Cloud Storage URL.
+
+    Returns:
+      speech_v1_messages.RecognitionAudio, the audio message.
+    """
+    return GetRecognitionAudioFromPath(path, version)
+  return GetAudioFromPath
+
+
+def ValidateOutputUri(output_uri):
+  """Validates given output URI against validator function.
+
+  Args:
+    output_uri: str, the output URI for the analysis.
+
+  Raises:
+    UriFormatError: if the URI is not valid.
+
+  Returns:
+    str, The same output_uri.
+  """
+  if output_uri and not storage_util.ObjectReference.IsStorageUrl(output_uri):
+    raise UriFormatError(OUTPUT_ERROR_MESSAGE.format(output_uri))
+  return output_uri
+
+
+def MaybePrintSttUiLink(request):
+  """Print Url to the Speech-to-text UI console for given recognize request."""
+  if (console_io.IsRunFromShellScript() or
+      properties.VALUES.core.disable_prompts.GetBool()):
+    return
+  audio_uri = request.audio.uri
+  if not audio_uri:
+    return
+  payload = {
+      'audio':
+          urllib.parse.quote_plus(
+              audio_uri[5:] if audio_uri.startswith('gs://') else audio_uri),
+      'encoding':
+          request.config.encoding,
+      'model':
+          request.config.model,
+      'locale':
+          request.config.languageCode,
+      'sampling':
+          request.config.sampleRateHertz,
+      'channels':
+          request.config.audioChannelCount,
+      'enhanced':
+          request.config.useEnhanced,
+  }
+
+  params = ';'.join('{}={}'.format(key, value)
+                    for (key, value) in sorted(payload.items())
+                    if value and ('unspecified' not in str(value).lower()))
+  url_tuple = ('https', 'console.cloud.google.com',
+               '/speech/transcriptions/create', params, '', '')
+  target_url = urllib.parse.urlunparse(url_tuple)
+  log.status.Print(
+      'Try this using the Speech-to-Text UI at {}'.format(target_url))