feat: Add new gcloud commands, API clients, and third-party libraries across various services.

This commit is contained in:
2026-01-01 20:26:35 +01:00
parent 5e23cbece0
commit a19e592eb7
25221 changed files with 8324611 additions and 0 deletions

View File

@@ -0,0 +1,96 @@
# -*- coding: utf-8 -*- #
# Copyright 2024 Google LLC. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Validation functions for speech commands flags."""
import os
from googlecloudsdk.api_lib.storage import storage_util
from googlecloudsdk.calliope import exceptions
# Encodings that ValidateDecodingConfig treats as "explicit": selecting one of
# them makes --sample-rate and --audio-channel-count mandatory.
EXPLICIT_ENCODING_OPTIONS = ('LINEAR16', 'MULAW', 'ALAW')
# Every value accepted for --encoding: the explicit encodings plus 'AUTO'.
ENCODING_OPTIONS = frozenset(EXPLICIT_ENCODING_OPTIONS) | {'AUTO'}
def ValidateSpeakerDiarization(args):
  """Rejects a speaker-count range whose minimum exceeds its maximum.

  Args:
    args: Parsed arguments exposing `min_speaker_count` and
      `max_speaker_count`; either may be None.

  Raises:
    exceptions.InvalidArgumentException: If both counts are set and the
      minimum is greater than the maximum.
  """
  # Nothing to compare unless both bounds were supplied.
  if args.min_speaker_count is None or args.max_speaker_count is None:
    return
  if args.min_speaker_count <= args.max_speaker_count:
    return
  raise exceptions.InvalidArgumentException(
      '--max-speaker-count',
      '[--max-speaker-count] must be equal to or larger than'
      ' min-speaker-count.',
  )
def ValidateAudioSource(args, batch=False):
  """Checks that `args.audio` names a usable audio source.

  A Cloud Storage URL is always accepted. Otherwise a local file is accepted
  only when `batch` is False.

  Args:
    args: Parsed arguments exposing `audio`.
    batch: When True, only Cloud Storage URLs are accepted.

  Raises:
    exceptions.InvalidArgumentException: If the source is not acceptable.
  """
  source = args.audio
  if storage_util.ObjectReference.IsStorageUrl(source):
    return

  if batch:
    # Batch mode never falls back to the local file system.
    problem = (
        'Invalid audio source [{}]. The source must be a Google Cloud'
        ' Storage URL (such as gs://bucket/object).'
    )
  elif not os.path.isfile(source):
    problem = (
        'Invalid audio source [{}]. The source must either be a local '
        'path or a Google Cloud Storage URL '
        '(such as gs://bucket/object).'
    )
  else:
    return

  raise exceptions.InvalidArgumentException('--audio', problem.format(source))
def ValidateDecodingConfig(args):
  """Checks --encoding against --sample-rate and --audio-channel-count.

  No validation happens when `args.encoding` is None. 'AUTO' must not be
  combined with a sample rate or channel count, while every other accepted
  encoding needs both of them.

  Args:
    args: Parsed arguments exposing `encoding`, `sample_rate` and
      `audio_channel_count`.

  Raises:
    exceptions.InvalidArgumentException: If the combination is invalid.
  """
  encoding = args.encoding
  if encoding is None:
    return

  if encoding not in ENCODING_OPTIONS:
    raise exceptions.InvalidArgumentException(
        '--encoding',
        '[--encoding] must be set to one of '
        + ', '.join(sorted(ENCODING_OPTIONS)),
    )

  if encoding == 'AUTO':
    if args.sample_rate is None and args.audio_channel_count is None:
      return
    # Report whichever conflicting flag was actually supplied.
    if args.sample_rate is not None:
      conflicting_flag = '--sample-rate'
    else:
      conflicting_flag = '--audio-channel-count'
    raise exceptions.InvalidArgumentException(
        conflicting_flag,
        'AUTO encoding does not support setting sample rate or audio'
        ' channel count.',
    )

  # Explicit encodings: both decoding parameters are mandatory.
  explicit_choices = ', '.join(sorted(EXPLICIT_ENCODING_OPTIONS))
  if args.sample_rate is None:
    raise exceptions.InvalidArgumentException(
        '--sample-rate',
        '[--sample-rate] must be specified when configuring explicit'
        ' encoding options '
        + explicit_choices
        + '.',
    )
  if args.audio_channel_count is None:
    raise exceptions.InvalidArgumentException(
        '--audio-channel-count',
        '[--audio-channel-count] must be specified when configuring'
        ' explicit encoding options '
        + explicit_choices,
    )

View File

@@ -0,0 +1,486 @@
# -*- coding: utf-8 -*- #
# Copyright 2022 Google LLC. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Flags for speech commands."""
from __future__ import absolute_import
from __future__ import division
from __future__ import unicode_literals
from googlecloudsdk.api_lib.util import apis
from googlecloudsdk.calliope import actions
from googlecloudsdk.calliope import arg_parsers
from googlecloudsdk.calliope import exceptions
from googlecloudsdk.command_lib.ml.speech import util
from googlecloudsdk.command_lib.util.apis import arg_utils
def GetEncodingTypeMapper(version):
  """Builds the `--encoding` choice mapper for a Speech API version.

  Args:
    version: API version passed to `apis.GetMessagesModule`.

  Returns:
    An `arg_utils.ChoiceEnumMapper` over
    `RecognitionConfig.EncodingValueValuesEnum`, defaulting to
    'encoding-unspecified'.
  """
  speech_messages = apis.GetMessagesModule(util.SPEECH_API, version)
  encoding_enum = speech_messages.RecognitionConfig.EncodingValueValuesEnum
  encoding_help = (
      'The type of encoding of the file. Required if the file format '
      'is not WAV or FLAC.'
  )
  return arg_utils.ChoiceEnumMapper(
      '--encoding',
      encoding_enum,
      default='encoding-unspecified',
      help_str=encoding_help,
  )
class RecognizeArgsToRequestMapper:
  """Registers recognize flags and maps parsed values onto RecognitionConfig.

  The `Add*ArgsToParser` methods create the choice-enum mappers stored on the
  instance. `MakeRecognitionConfig` and `UpdateAlphaArgsInRecognitionConfig`
  read those mappers, so the matching `Add*` method has to run first.
  """

  def __init__(self):
    # Every mapper starts unset; the Add*ArgsToParser methods populate them.
    self._encoding_type_mapper = None
    self._original_media_type_mapper = None
    self._interaction_type_mapper = None
    self._microphone_distance_type_mapper = None
    self._device_type_mapper = None

  def AddRecognizeArgsToParser(self, parser, api_version):
    """Registers the GA-level flags shared by the recognize commands.

    Args:
      parser: Parser that receives the arguments.
      api_version: Speech API version used to build the `--encoding` mapper.
    """
    parser.add_argument(
        'audio',
        help=(
            'The location of the audio file to transcribe. '
            'Must be a local path or a Google Cloud Storage URL '
            '(in the format gs://bucket/object).'
        ),
    )

    # --language-code and its hidden deprecated alias --language share one
    # help text and sit in a required, mutually exclusive group.
    language_help = (
        'The language of the supplied audio as a BCP-47 '
        '(https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag. Example: '
        '"en-US". See https://cloud.google.com/speech/docs/languages for a list '
        'of the currently supported language codes.'
    )
    language_group = parser.add_group(mutex=True, required=True)
    language_group.add_argument('--language-code', help=language_help)
    language_group.add_argument(
        '--language',
        action=actions.DeprecationAction(
            '--language',
            warn=(
                'The `--language` flag is deprecated. '
                'Use the `--language-code` flag instead.'
            ),
        ),
        hidden=True,
        help=language_help,
    )

    encoding_mapper = GetEncodingTypeMapper(api_version)
    self._encoding_type_mapper = encoding_mapper
    encoding_mapper.choice_arg.AddToParser(parser)

    parser.add_argument(
        '--sample-rate',
        type=int,
        required=False,
        help=(
            'The sample rate in Hertz. For best results, set the sampling rate '
            'of the audio source to 16000 Hz. If that\'s not possible, '
            'use the native sample rate of the audio source '
            '(instead of re-sampling).'
        ),
    )

    # The group itself is optional while each flag inside it is marked
    # required.
    channel_group = parser.add_group(
        required=False, help='Audio channel settings.'
    )
    channel_group.add_argument(
        '--audio-channel-count',
        type=int,
        required=True,
        help=(
            'The number of channels in the input audio data. Set this for '
            'separate-channel-recognition. Valid values are: '
            '1)LINEAR16 and FLAC are 1-8 '
            '2)OGG_OPUS are 1-254 '
            '3) MULAW, AMR, AMR_WB and SPEEX_WITH_HEADER_BYTE is only `1`.'
        ),
    )
    channel_group.add_argument(
        '--separate-channel-recognition',
        action='store_true',
        required=True,
        help=(
            'Recognition result will contain a `channel_tag` field to state '
            'which channel that result belongs to. If this is not true, only '
            'the first channel will be recognized.'
        ),
    )

    model_choices = {
        'default': (
            'audio that is not one of the specific audio models. '
            'For example, long-form audio. '
            'Ideally the audio is high-fidelity, recorded at a 16khz '
            'or greater sampling rate.'
        ),
        'command_and_search': (
            'short queries such as voice commands or voice search.'
        ),
        'latest_long': (
            'Use this model for any kind of long form content such as media'
            ' or spontaneous speech and conversations. Consider using this'
            ' model in place of the video model, especially if the video'
            ' model is not available in your target language. You can also'
            ' use this in place of the default model.'
        ),
        'latest_short': (
            'Use this model for short utterances that are a few seconds in'
            ' length. It is useful for trying to capture commands or other'
            ' single shot directed speech use cases. Consider using this'
            ' model instead of the command and search model.'
        ),
        'medical_conversation': (
            'Best for audio that originated from a conversation between a '
            'medical provider and patient.'
        ),
        'medical_dictation': (
            'Best for audio that originated from dictation notes by a'
            ' medical provider.'
        ),
        'phone_call': (
            'audio that originated from a phone call (typically recorded at'
            ' an 8khz sampling rate).'
        ),
        'phone_call_enhanced': (
            'audio that originated from a phone call (typically recorded at'
            ' an 8khz sampling rate). This is a premium model and can'
            ' produce better results but costs more than the standard rate.'
        ),
        'telephony': (
            'Improved version of the "phone_call" model, best for audio '
            'that originated from a phone call, typically recorded at an '
            '8kHz sampling rate.'
        ),
        'telephony_short': (
            'Dedicated version of the modern "telephony" model for short '
            'or even single-word utterances for audio that originated from '
            'a phone call, typically recorded at an 8kHz sampling rate.'
        ),
        'video_enhanced': (
            'audio that originated from video or includes multiple'
            ' speakers. Ideally the audio is recorded at a 16khz or greater'
            ' sampling rate. This is a premium model that costs more than'
            ' the standard rate.'
        ),
    }
    model_help = (
        'Select the model best suited to your domain to get best results.'
        ' If you do not explicitly specify a model, Speech-to-Text will'
        ' auto-select a model based on your other specified parameters.'
        ' Some models are premium and cost more than standard models'
        ' (although you can reduce the price by opting into'
        ' https://cloud.google.com/speech-to-text/docs/data-logging)'
    )
    parser.add_argument('--model', choices=model_choices, help=model_help)

    parser.add_argument(
        '--max-alternatives',
        type=int,
        default=1,
        help=(
            'Maximum number of recognition hypotheses to be returned. '
            'The server may return fewer than max_alternatives. '
            'Valid values are 0-30. A value of 0 or 1 will return a maximum '
            'of one.'
        ),
    )
    parser.add_argument(
        '--hints',
        type=arg_parsers.ArgList(),
        metavar='HINT',
        default=[],
        help=(
            'A list of strings containing word and phrase "hints" so that the '
            'speech recognition is more likely to recognize them. This can be '
            'used to improve the accuracy for specific words and phrases, '
            'for example, if specific commands are typically spoken by '
            'the user. This can also be used to add additional words to the '
            'vocabulary of the recognizer. '
            'See https://cloud.google.com/speech/limits#content.'
        ),
    )
    parser.add_argument(
        '--include-word-time-offsets',
        action='store_true',
        default=False,
        help=(
            'If True, the top result includes a list of words with the start '
            'and end time offsets (timestamps) for those words. If False, '
            'no word-level time offset information is returned.'
        ),
    )
    parser.add_argument(
        '--filter-profanity',
        action='store_true',
        default=False,
        help=(
            'If True, the server will attempt to filter out profanities, '
            'replacing all but the initial character in each filtered word with '
            'asterisks, e.g. ```f***```.'
        ),
    )
    parser.add_argument(
        '--enable-automatic-punctuation',
        action='store_true',
        help='Adds punctuation to recognition result hypotheses.',
    )

  def MakeRecognitionConfig(self, args, messages):
    """Builds a RecognitionConfig message from the GA-level flags.

    Args:
      args: Parsed arguments produced by the flags that
        `AddRecognizeArgsToParser` registers.
      messages: Messages module providing `RecognitionConfig` and
        `SpeechContext`.

    Returns:
      The populated `messages.RecognitionConfig`.
    """
    # The flag value is normalized to the mapper's lower-case, dashed form.
    encoding_choice = args.encoding.replace('_', '-').lower()
    language = args.language_code if args.language_code else args.language

    config = messages.RecognitionConfig(
        languageCode=language,
        encoding=self._encoding_type_mapper.GetEnumForChoice(encoding_choice),
        sampleRateHertz=args.sample_rate,
        audioChannelCount=args.audio_channel_count,
        maxAlternatives=args.max_alternatives,
        enableWordTimeOffsets=args.include_word_time_offsets,
        enableSeparateRecognitionPerChannel=args.separate_channel_recognition,
        profanityFilter=args.filter_profanity,
        speechContexts=[messages.SpeechContext(phrases=args.hints)],
    )
    if args.enable_automatic_punctuation:
      config.enableAutomaticPunctuation = args.enable_automatic_punctuation

    requested_model = args.model
    if requested_model is None:
      return config

    passthrough_models = (
        'default',
        'command_and_search',
        'phone_call',
        'latest_long',
        'latest_short',
        'medical_conversation',
        'medical_dictation',
        'telephony',
        'telephony_short',
    )
    # "*_enhanced" choices are sent as the base model plus useEnhanced.
    enhanced_models = (
        ('phone_call_enhanced', 'phone_call'),
        ('video_enhanced', 'video'),
    )
    if requested_model in passthrough_models:
      config.model = requested_model
    else:
      for flag_value, base_model in enhanced_models:
        if requested_model == flag_value:
          config.model = base_model
          config.useEnhanced = True
          break
    return config

  def AddBetaRecognizeArgsToParser(self, parser):
    """Registers the beta-only recognize flags.

    Args:
      parser: Parser that receives the arguments.
    """
    parser.add_argument(
        '--additional-language-codes',
        type=arg_parsers.ArgList(),
        default=[],
        metavar='LANGUAGE_CODE',
        help=(
            'The BCP-47 language tags of other languages that the speech may'
            ' be in.\n'
            'Up to 3 can be provided.\n'
            'If alternative languages are listed, recognition result will'
            ' contain recognition\n'
            'in the most likely language detected including the main'
            ' language-code.'
        ),
    )

    diarization_group = parser.add_group(required=False)
    diarization_group.add_argument(
        '--diarization-speaker-count',
        type=int,
        hidden=True,
        action=actions.DeprecationAction(
            '--diarization-speaker-count',
            warn=(
                'The `--diarization-speaker-count` flag is deprecated. '
                'Use the `--min-diarization-speaker-count` and/or '
                '`--max-diarization-speaker-count` flag instead.'
            ),
        ),
        help=(
            'Estimated number of speakers in the conversation '
            'being recognized.'
        ),
    )
    diarization_group.add_argument(
        '--min-diarization-speaker-count',
        type=int,
        help=(
            'Minimum estimated number of speakers in the conversation '
            'being recognized.'
        ),
    )
    diarization_group.add_argument(
        '--max-diarization-speaker-count',
        type=int,
        help=(
            'Maximum estimated number of speakers in the conversation '
            'being recognized.'
        ),
    )
    diarization_group.add_argument(
        '--enable-speaker-diarization',
        action='store_true',
        required=True,
        help=(
            'Enable speaker detection for each recognized word in the top '
            'alternative of the recognition result using an integer '
            'speaker_tag provided in the WordInfo.'
        ),
    )

    parser.add_argument(
        '--include-word-confidence',
        action='store_true',
        help=(
            'Include a list of words and the confidence for those words in '
            'the top result.'
        ),
    )

  def UpdateBetaArgsInRecognitionConfig(self, args, config):
    """Copies the beta-only flag values onto an existing config.

    Args:
      args: Parsed arguments produced by the flags that
        `AddBetaRecognizeArgsToParser` registers.
      config: RecognitionConfig message, modified in place.

    Raises:
      exceptions.InvalidArgumentException: If the deprecated
        --diarization-speaker-count flag is combined with the min/max flags.
    """
    config.alternativeLanguageCodes = args.additional_language_codes

    # Using any diarization flag turns diarization on.
    wants_diarization = (
        args.enable_speaker_diarization
        or args.min_diarization_speaker_count
        or args.max_diarization_speaker_count
        or args.diarization_speaker_count
    )
    if wants_diarization:
      diarization_type = config.field_by_name('diarizationConfig').message_type
      diarization = diarization_type(enableSpeakerDiarization=True)
      config.diarizationConfig = diarization

      min_speakers = args.min_diarization_speaker_count
      max_speakers = args.max_diarization_speaker_count
      legacy_speakers = args.diarization_speaker_count
      if min_speakers:
        diarization.minSpeakerCount = min_speakers
      if max_speakers:
        diarization.maxSpeakerCount = max_speakers
      if legacy_speakers:
        # The legacy flag fixes both bounds, so it cannot be mixed with the
        # explicit min/max flags.
        if min_speakers or max_speakers:
          raise exceptions.InvalidArgumentException(
              '--diarization-speaker-count',
              'deprecated flag cannot be used with '
              '--max/min_diarization_speaker_count flags',
          )
        diarization.minSpeakerCount = legacy_speakers
        diarization.maxSpeakerCount = legacy_speakers

    config.enableWordConfidence = args.include_word_confidence

  def AddAlphaRecognizeArgsToParser(self, parser, api_version):
    """Registers the alpha-only recognition metadata flags.

    Args:
      parser: Parser that receives the arguments.
      api_version: Speech API version used to build the metadata enum mappers.
    """
    metadata_group = parser.add_group(
        required=False,
        help=(
            'Description of audio data to be recognized. '
            'Note that the Google Cloud Speech-to-text-api does not use this '
            'information, and only passes it through back into response.'
        ),
    )
    metadata_group.add_argument(
        '--naics-code',
        action=MakeDeprecatedRecgonitionFlagAction('naics-code'),
        type=int,
        help=(
            'The industry vertical to which this speech recognition request '
            'most closely applies.'
        ),
    )

    # Each mapper is stored on the instance for
    # UpdateAlphaArgsInRecognitionConfig and registers its own choice flag.
    media_mapper = GetOriginalMediaTypeMapper(api_version)
    self._original_media_type_mapper = media_mapper
    media_mapper.choice_arg.AddToParser(metadata_group)

    interaction_mapper = GetInteractionTypeMapper(api_version)
    self._interaction_type_mapper = interaction_mapper
    interaction_mapper.choice_arg.AddToParser(metadata_group)

    distance_mapper = GetMicrophoneDistanceTypeMapper(api_version)
    self._microphone_distance_type_mapper = distance_mapper
    distance_mapper.choice_arg.AddToParser(metadata_group)

    device_mapper = GetRecordingDeviceTypeMapper(api_version)
    self._device_type_mapper = device_mapper
    device_mapper.choice_arg.AddToParser(metadata_group)

    free_text_flags = (
        (
            'recording-device-name',
            'The device used to make the recording. Examples: `Nexus 5X`, '
            '`Polycom SoundStation IP 6000`',
        ),
        (
            'original-mime-type',
            'Mime type of the original audio file. Examples: `audio/m4a`, '
            ' `audio/mp3`.',
        ),
        (
            'audio-topic',
            'Description of the content, e.g. "Recordings of federal supreme '
            'court hearings from 2012".',
        ),
    )
    for flag_name, flag_help in free_text_flags:
      metadata_group.add_argument(
          '--' + flag_name,
          action=MakeDeprecatedRecgonitionFlagAction(flag_name),
          help=flag_help,
      )

  def UpdateAlphaArgsInRecognitionConfig(self, args, config):
    """Copies the alpha-only metadata flag values onto an existing config.

    The config is left untouched when none of the metadata flags is set.

    Args:
      args: Parsed arguments produced by the flags that
        `AddAlphaRecognizeArgsToParser` registers.
      config: RecognitionConfig message, modified in place.
    """
    metadata_values = (
        args.interaction_type,
        args.original_media_type,
        args.naics_code,
        args.microphone_distance,
        args.recording_device_type,
        args.recording_device_name,
        args.original_mime_type,
        args.audio_topic,
    )
    if all(value is None for value in metadata_values):
      return

    if config.metadata is None:
      config.metadata = config.field_by_name('metadata').message_type()
    metadata = config.metadata
    metadata.interactionType = self._interaction_type_mapper.GetEnumForChoice(
        args.interaction_type
    )
    metadata.originalMediaType = (
        self._original_media_type_mapper.GetEnumForChoice(
            args.original_media_type
        )
    )
    metadata.industryNaicsCodeOfAudio = args.naics_code
    metadata.microphoneDistance = (
        self._microphone_distance_type_mapper.GetEnumForChoice(
            args.microphone_distance
        )
    )
    metadata.recordingDeviceType = self._device_type_mapper.GetEnumForChoice(
        args.recording_device_type
    )
    metadata.recordingDeviceName = args.recording_device_name
    metadata.originalMimeType = args.original_mime_type
    metadata.audioTopic = args.audio_topic
def MakeDeprecatedRecgonitionFlagAction(flag_name):
  """Builds the deprecation action used by the recognition metadata flags.

  Args:
    flag_name: Flag name without the leading dashes, e.g. 'audio-topic'.

  Returns:
    An `actions.DeprecationAction` for `--<flag_name>` carrying the shared
    deprecation warning.
  """
  warning = (
      'The `{}` flag is deprecated and will be removed. '
      'The Google Cloud Speech-to-text api does not use it, and only '
      'passes it through back into response.'
  ).format(flag_name)
  return actions.DeprecationAction('--' + flag_name, warn=warning)
def GetRecordingDeviceTypeMapper(version):
  """Builds the deprecated `--recording-device-type` choice mapper.

  Args:
    version: API version passed to `apis.GetMessagesModule`.

  Returns:
    An `arg_utils.ChoiceEnumMapper` over
    `RecognitionMetadata.RecordingDeviceTypeValueValuesEnum`, filtered to
    names that do not end in 'UNSPECIFIED'.
  """
  metadata_type = apis.GetMessagesModule(
      util.SPEECH_API, version
  ).RecognitionMetadata
  # Enum name -> (flag choice, help text).
  device_choices = {
      'SMARTPHONE': ('smartphone', 'Speech was recorded on a smartphone.'),
      'PC': (
          'pc',
          'Speech was recorded using a personal computer or tablet.',
      ),
      'PHONE_LINE': ('phone-line', 'Speech was recorded over a phone line.'),
      'VEHICLE': ('vehicle', 'Speech was recorded in a vehicle.'),
      'OTHER_OUTDOOR_DEVICE': ('outdoor', 'Speech was recorded outdoors.'),
      'OTHER_INDOOR_DEVICE': ('indoor', 'Speech was recorded indoors.'),
  }
  return arg_utils.ChoiceEnumMapper(
      '--recording-device-type',
      metadata_type.RecordingDeviceTypeValueValuesEnum,
      action=MakeDeprecatedRecgonitionFlagAction('recording-device-type'),
      custom_mappings=device_choices,
      help_str=(
          'The device type through which the original audio was '
          'recorded on.'
      ),
      include_filter=lambda enum_name: not enum_name.endswith('UNSPECIFIED'),
  )
def GetMicrophoneDistanceTypeMapper(version):
  """Builds the deprecated `--microphone-distance` choice mapper.

  Args:
    version: API version passed to `apis.GetMessagesModule`.

  Returns:
    An `arg_utils.ChoiceEnumMapper` over
    `RecognitionMetadata.MicrophoneDistanceValueValuesEnum`, filtered to
    names that do not end in 'UNSPECIFIED'.
  """
  metadata_type = apis.GetMessagesModule(
      util.SPEECH_API, version
  ).RecognitionMetadata
  # Enum name -> (flag choice, help text).
  distance_choices = {
      'NEARFIELD': (
          'nearfield',
          'The audio was captured from a microphone close to the speaker,'
          ' generally within\n'
          '1 meter. Examples include a phone, dictaphone, or handheld'
          ' microphone.',
      ),
      'MIDFIELD': (
          'midfield',
          'The speaker is within 3 meters of the microphone.',
      ),
      'FARFIELD': (
          'farfield',
          'The speaker is more than 3 meters away from the microphone.',
      ),
  }
  return arg_utils.ChoiceEnumMapper(
      '--microphone-distance',
      metadata_type.MicrophoneDistanceValueValuesEnum,
      action=MakeDeprecatedRecgonitionFlagAction('microphone-distance'),
      custom_mappings=distance_choices,
      help_str=(
          'The distance at which the audio device is placed to record '
          'the conversation.'
      ),
      include_filter=lambda enum_name: not enum_name.endswith('UNSPECIFIED'),
  )
def GetInteractionTypeMapper(version):
  """Builds the deprecated `--interaction-type` choice mapper.

  Args:
    version: API version passed to `apis.GetMessagesModule`.

  Returns:
    An `arg_utils.ChoiceEnumMapper` over
    `RecognitionMetadata.InteractionTypeValueValuesEnum`, filtered to names
    that do not end in 'UNSPECIFIED'.
  """
  metadata_type = apis.GetMessagesModule(
      util.SPEECH_API, version
  ).RecognitionMetadata
  # Enum name -> (flag choice, help text).
  interaction_choices = {
      'DICTATION': (
          'dictation',
          'Transcribe speech to text to create a written document, such as '
          'a text-message, email or report.',
      ),
      'DISCUSSION': (
          'discussion',
          'Multiple people in a conversation or discussion.',
      ),
      'PRESENTATION': (
          'presentation',
          'One or more persons lecturing or presenting to '
          'others, mostly uninterrupted.',
      ),
      'PHONE_CALL': (
          'phone-call',
          'A phone-call or video-conference in which two or more people, '
          'who are not in the same room, are actively participating.',
      ),
      'PROFESSIONALLY_PRODUCED': (
          'professionally-produced',
          'Professionally produced audio (eg. TV Show, Podcast).',
      ),
      'VOICE_COMMAND': (
          'voice-command',
          'Transcribe voice commands, such as for controlling a device.',
      ),
      'VOICE_SEARCH': (
          'voice-search',
          'Transcribe spoken questions and queries into text.',
      ),
      'VOICEMAIL': (
          'voicemail',
          'A recorded message intended for another person to listen to.',
      ),
  }
  return arg_utils.ChoiceEnumMapper(
      '--interaction-type',
      metadata_type.InteractionTypeValueValuesEnum,
      action=MakeDeprecatedRecgonitionFlagAction('interaction-type'),
      custom_mappings=interaction_choices,
      help_str='Determining the interaction type in the conversation.',
      include_filter=lambda enum_name: not enum_name.endswith('UNSPECIFIED'),
  )
def GetOriginalMediaTypeMapper(version):
  """Builds the deprecated `--original-media-type` choice mapper.

  Args:
    version: API version passed to `apis.GetMessagesModule`.

  Returns:
    An `arg_utils.ChoiceEnumMapper` over
    `RecognitionMetadata.OriginalMediaTypeValueValuesEnum`, filtered to names
    that do not end in 'UNSPECIFIED'.
  """
  metadata_type = apis.GetMessagesModule(
      util.SPEECH_API, version
  ).RecognitionMetadata
  # Enum name -> (flag choice, help text).
  media_choices = {
      'AUDIO': ('audio', 'The speech data is an audio recording.'),
      'VIDEO': ('video', 'The speech data originally recorded on a video.'),
  }
  return arg_utils.ChoiceEnumMapper(
      '--original-media-type',
      metadata_type.OriginalMediaTypeValueValuesEnum,
      action=MakeDeprecatedRecgonitionFlagAction('original-media-type'),
      custom_mappings=media_choices,
      help_str='The media type of the original audio conversation.',
      include_filter=lambda enum_name: not enum_name.endswith('UNSPECIFIED'),
  )

View File

@@ -0,0 +1,330 @@
# -*- coding: utf-8 -*- #
# Copyright 2022 Google LLC. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Flags for speech commands."""
from __future__ import absolute_import
from __future__ import division
from __future__ import unicode_literals
from googlecloudsdk.calliope import arg_parsers
from googlecloudsdk.calliope import base
from googlecloudsdk.calliope.concepts import concepts
from googlecloudsdk.command_lib.util.apis import yaml_data
from googlecloudsdk.command_lib.util.concepts import concept_parsers
from googlecloudsdk.command_lib.util.concepts import presentation_specs
# Bounds handed to BoundedInt for --min-speaker-count / --max-speaker-count.
SPEAKER_COUNT_MAX_VALUE = 6
SPEAKER_COUNT_MIN_VALUE = 1
# Bounds handed to BoundedInt for --max-alternatives.
ALTERNATIVES_MAX_VALUE = 30
ALTERNATIVES_MIN_VALUE = 1
# Bounds handed to BoundedInt for --audio-channel-count.
AUDIO_CHANNEL_COUNT_MAX_VALUE = 8
AUDIO_CHANNEL_COUNT_MIN_VALUE = 1
# Bounds handed to BoundedInt for --sample-rate (help text states Hertz).
SAMPLE_RATE_MAX_VALUE = 48000
SAMPLE_RATE_MIN_VALUE = 8000
def AddRecognizerArgToParser(parser):
  """Registers the required positional `recognizer` resource argument.

  Args:
    parser: Parser that receives the argument.

  Returns:
    Whatever `ConceptParser.AddToParser` returns.
  """
  recognizer_yaml = yaml_data.ResourceYAMLData.FromPath(
      'ml.speech.recognizer'
  ).GetData()
  recognizer_spec = presentation_specs.ResourcePresentationSpec(
      name='recognizer',
      concept_spec=concepts.ResourceSpec.FromYaml(
          recognizer_yaml, api_version='v2'
      ),
      required=True,
      group_help='recognizer.',
  )
  recognizer_parser = concept_parsers.ConceptParser([recognizer_spec])
  return recognizer_parser.AddToParser(parser)
def AddLocationArgToParser(parser):
  """Registers the required `--location` resource flag.

  Args:
    parser: Parser that receives the flag.

  Returns:
    Whatever `ConceptParser.AddToParser` returns.
  """
  location_yaml = yaml_data.ResourceYAMLData.FromPath(
      'ml.speech.location'
  ).GetData()
  location_spec = presentation_specs.ResourcePresentationSpec(
      name='--location',
      concept_spec=concepts.ResourceSpec.FromYaml(location_yaml),
      required=True,
      group_help='location.',
  )
  location_parser = concept_parsers.ConceptParser([location_spec])
  return location_parser.AddToParser(parser)
def AddLocationPositionalArgToParser(parser):
  """Registers the required positional `location` resource argument.

  Args:
    parser: Parser that receives the argument.

  Returns:
    Whatever `ConceptParser.AddToParser` returns.
  """
  location_yaml = yaml_data.ResourceYAMLData.FromPath(
      'ml.speech.location'
  ).GetData()
  location_spec = presentation_specs.ResourcePresentationSpec(
      name='location',
      concept_spec=concepts.ResourceSpec.FromYaml(location_yaml),
      required=True,
      group_help='location.',
  )
  location_parser = concept_parsers.ConceptParser([location_spec])
  return location_parser.AddToParser(parser)
def AddAllFlagsToParser(
    parser, require_base_recognizer_attributes=False, use_store_true=False
):
  """Registers the recognizer argument and every v2 STT recognizer flag.

  Args:
    parser: Parser that receives the arguments.
    require_base_recognizer_attributes: Forwarded as `required` to
      `AddBaseRecognizerAttributeFlagsToParser`.
    use_store_true: Forwarded to `AddFeatureFlagsToParser`.
  """
  AddRecognizerArgToParser(parser)
  AddAsyncFlagToParser(parser)

  display_name_help = 'Name of this recognizer as it appears in UIs.\n'
  parser.add_argument('--display-name', help=display_name_help)

  AddBaseRecognizerAttributeFlagsToParser(
      parser, required=require_base_recognizer_attributes
  )
  AddFeatureFlagsToParser(parser, use_store_true)
  AddDecodingConfigFlagsToParser(parser)
def AddRecognizeRequestFlagsToParser(parser, add_async_flag=False):
  """Registers the arguments used by v2 STT recognition requests.

  Adds the recognizer resource argument, the required --audio flag, the
  feature, decoding-config and base recognizer attribute flags, and the
  speech adaptation hint flags.

  Args:
    parser: Parser that receives the arguments.
    add_async_flag: When True, the async flag is registered as well.
  """
  AddRecognizerArgToParser(parser)
  parser.add_argument(
      '--audio',
      required=True,
      help=(
          'Location of the audio file to transcribe. '
          'Must be a audio data bytes, local file, or Google Cloud Storage URL '
          '(in the format gs://bucket/object).'
      ),
  )
  AddFeatureFlagsToParser(parser)
  AddDecodingConfigFlagsToParser(parser)
  AddBaseRecognizerAttributeFlagsToParser(parser)
  parser.add_argument(
      '--hint-phrases',
      metavar='PHRASE',
      type=arg_parsers.ArgList(),
      # The previous triple-quoted text still carried the quote characters of
      # an older concatenated string, which showed up verbatim in the help.
      help=(
          'A list of strings containing word and phrase "hints" so that the '
          'speech recognition is more likely to recognize them. This can be '
          'used to improve the accuracy for specific words and phrases, '
          'for example, if specific commands are typically spoken by '
          'the user. This can also be used to add additional words to the '
          'vocabulary of the recognizer. '
          'See https://cloud.google.com/speech/limits#content.'
      ),
  )
  parser.add_argument(
      '--hint-phrase-sets',
      metavar='PHRASE_SET',
      type=arg_parsers.ArgList(),
      help="""\
A list of phrase set resource names to use for speech recognition.
""",
  )
  parser.add_argument(
      '--hint-boost',
      type=arg_parsers.BoundedFloat(1, 20),
      # Refers to --hint-phrases; no --phrases flag is registered here.
      help="""\
Boost value for the phrases passed to --hint-phrases.
Can have a value between 1 and 20.
""",
  )
  if add_async_flag:
    AddAsyncFlagToParser(parser)
def AddAsyncFlagToParser(parser):
  """Registers `base.ASYNC_FLAG` on the parser with a default of False.

  Args:
    parser: Parser that receives the flag.
  """
  async_flag = base.ASYNC_FLAG
  async_flag.AddToParser(parser)
  async_flag.SetDefault(parser, False)
def AddBaseRecognizerAttributeFlagsToParser(parser, required=False):
  """Registers the --model and --language-codes flags.

  Args:
    parser: Parser that receives the flags.
    required: Whether both flags are marked as required.
  """
  model_help = (
      'Which model to use for recognition requests.\n'
      'Select the model best suited to your domain to get best results.\n'
      'Guidance for choosing which model to use can be found in the\n'
      '[Transcription Models Documentation]'
      '(https://cloud.google.com/speech-to-text/v2/docs/transcription-model)\n'
      'and the models supported in each region can be found in the\n'
      '[Table Of Supported Models]'
      '(https://cloud.google.com/speech-to-text/v2/docs/'
      'speech-to-text-supported-languages).\n'
  )
  language_codes_help = (
      'Language code is one of `en-US`, `en-GB`, `fr-FR`.\n'
      'Check [documentation]'
      '(https://cloud.google.com/speech-to-text/docs/multiple-languages)\n'
      'for using more than one language code.\n'
  )
  parser.add_argument('--model', required=required, help=model_help)
  parser.add_argument(
      '--language-codes',
      metavar='LANGUAGE_CODE',
      required=required,
      type=arg_parsers.ArgList(),
      help=language_codes_help,
  )
def AddDecodingConfigFlagsToParser(parser):
  """Registers --encoding, --sample-rate and --audio-channel-count.

  All three flags are placed in a single 'Encoding format' group.

  Args:
    parser: Parser that receives the flag group.
  """
  encoding_group = parser.add_group(help='Encoding format')
  encoding_group.add_argument(
      '--encoding',
      help=(
          'Encoding format of the provided audio.\n'
          'For headerless formats, must be set to `LINEAR16`, `MULAW,` or'
          ' `ALAW`.\n'
          'For other formats, set to `AUTO`. Overrides the recognizer\n'
          'configuration if present, else uses recognizer encoding.\n'
      ),
  )

  sample_rate_template = (
      'Sample rate in Hertz of the audio data sent for recognition. '
      'Required if --encoding flag is specified and is not AUTO. '
      'Must be set to a value between {} and {}.'
  )
  encoding_group.add_argument(
      '--sample-rate',
      type=arg_parsers.BoundedInt(SAMPLE_RATE_MIN_VALUE, SAMPLE_RATE_MAX_VALUE),
      help=sample_rate_template.format(
          SAMPLE_RATE_MIN_VALUE, SAMPLE_RATE_MAX_VALUE
      ),
  )

  channel_count_template = (
      'Number of channels present in the audio data sent for recognition. '
      'Required if --encoding flag is specified and is not AUTO. '
      'Must be set to a value between {} and {}.'
  )
  encoding_group.add_argument(
      '--audio-channel-count',
      type=arg_parsers.BoundedInt(
          AUDIO_CHANNEL_COUNT_MIN_VALUE, AUDIO_CHANNEL_COUNT_MAX_VALUE
      ),
      help=channel_count_template.format(
          AUDIO_CHANNEL_COUNT_MIN_VALUE, AUDIO_CHANNEL_COUNT_MAX_VALUE
      ),
  )
def AddFeatureFlagsToParser(parser, use_store_true=False):
  """Registers the 'ASR Features' flag group.

  Args:
    parser: Parser that receives the flag group.
    use_store_true: When True the boolean flags use the plain 'store_true'
      action; otherwise they use `arg_parsers.StoreTrueFalseAction`.
  """
  asr_group = parser.add_group(help='ASR Features')
  diarization_group = asr_group.add_group(help='Speaker Diarization')

  # Every boolean feature flag shares this action.
  if use_store_true:
    toggle_action = 'store_true'
  else:
    toggle_action = arg_parsers.StoreTrueFalseAction

  leading_toggles = (
      ('--profanity-filter', 'If set, the server will censor profanities.\n'),
      (
          '--enable-word-time-offsets',
          'If set, the top result includes a list of words and their'
          ' timestamps.\n',
      ),
      (
          '--enable-word-confidence',
          'If set, the top result includes a list of words and the confidence'
          ' for\nthose words.\n',
      ),
      (
          '--enable-automatic-punctuation',
          'If set, adds punctuation to recognition result hypotheses.\n',
      ),
      (
          '--enable-spoken-punctuation',
          'If set, replaces spoken punctuation with the corresponding symbols'
          ' in the request.\n',
      ),
      ('--enable-spoken-emojis', 'If set, adds spoken emoji formatting.\n'),
  )
  for toggle_name, toggle_help in leading_toggles:
    asr_group.add_argument(toggle_name, action=toggle_action, help=toggle_help)

  # Both speaker-count flags are marked required inside the nested group.
  diarization_group.add_argument(
      '--min-speaker-count',
      required=True,
      type=arg_parsers.BoundedInt(
          SPEAKER_COUNT_MIN_VALUE, SPEAKER_COUNT_MAX_VALUE
      ),
      help=(
          'Minimum number of speakers in the conversation. Must be less than or'
          ' equal to --max-speaker-count. Must be set to a value between {} and'
          ' {}.'
      ).format(SPEAKER_COUNT_MIN_VALUE, SPEAKER_COUNT_MAX_VALUE),
  )
  diarization_group.add_argument(
      '--max-speaker-count',
      required=True,
      type=arg_parsers.BoundedInt(
          SPEAKER_COUNT_MIN_VALUE, SPEAKER_COUNT_MAX_VALUE
      ),
      help=(
          'Maximum number of speakers in the conversation. Must be greater than'
          ' or equal to --min-speaker-count. Must be set to a value between {}'
          ' and {}.'
      ).format(SPEAKER_COUNT_MIN_VALUE, SPEAKER_COUNT_MAX_VALUE),
  )

  asr_group.add_argument(
      '--separate-channel-recognition',
      action=toggle_action,
      help=(
          'Mode for recognizing multi-channel audio using Separate Channel'
          ' Recognition.\n'
          'When set, the service will recognize each channel independently.\n'
      ),
  )
  asr_group.add_argument(
      '--max-alternatives',
      type=arg_parsers.BoundedInt(
          ALTERNATIVES_MIN_VALUE, ALTERNATIVES_MAX_VALUE
      ),
      help=(
          'Maximum number of recognition hypotheses to be returned. Must be set'
          ' to a value between {} and {}.'
      ).format(ALTERNATIVES_MIN_VALUE, ALTERNATIVES_MAX_VALUE),
  )

View File

@@ -0,0 +1,40 @@
# Resource definitions for the speech API collections. Each top-level entry
# gives a resource name, the API collection it maps to, and the attributes
# that identify it. Attributes are declared once with a YAML anchor (&name)
# and reused by later entries through an alias (*name).
project:
name: project
collection: speech.projects
attributes:
- &project
parameter_name: projectsId
attribute_name: project
help: |
Project of the {resource}.
property: core/project
# A location reuses the project attribute and adds its own.
location:
name: location
collection: speech.projects.locations
attributes:
- *project
- &location
parameter_name: locationsId
attribute_name: location
help: |
Location of the {resource}.
# An operation is identified by its ID alone; no project or location attribute
# is listed here.
operation:
name: operation
collection: speech.operations
attributes:
- parameter_name: operationsId
attribute_name: operation
help: The ID of the operation
# A recognizer reuses the project and location attributes declared above.
recognizer:
name: recognizer
collection: speech.projects.locations.recognizers
attributes:
- *project
- *location
- &recognizer
parameter_name: recognizersId
attribute_name: recognizer
help: Speech-to-text recognizer.

View File

@@ -0,0 +1,142 @@
# -*- coding: utf-8 -*- #
# Copyright 2017 Google LLC. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Wrapper for interacting with speech API."""
from __future__ import absolute_import
from __future__ import division
from __future__ import unicode_literals
import os
from googlecloudsdk.api_lib.storage import storage_util
from googlecloudsdk.api_lib.util import apis
from googlecloudsdk.core import exceptions
from googlecloudsdk.core import log
from googlecloudsdk.core import properties
from googlecloudsdk.core.console import console_io
from googlecloudsdk.core.util import files
from six.moves import urllib
# API name and the version used by default when building messages.
SPEECH_API = 'speech'
SPEECH_API_VERSION = 'v1'

# Template for the error reported when an output location is not a Google
# Cloud Storage URI; the placeholder receives the offending value.
OUTPUT_ERROR_MESSAGE = (
    '[{}] is not a valid format for result output. '
    'Must be a Google Cloud Storage URI (format: gs://bucket/file).'
)
class Error(exceptions.Error):
  """Base class for the errors raised by this module."""
class AudioException(Error):
  """Raised when the audio source is not a local file or Cloud Storage URL."""
class UriFormatError(Error):
  """Raised when a supplied URI does not have the required format."""
def GetRecognitionAudioFromPath(path, version):
  """Builds a RecognitionAudio message from a local file or Cloud Storage URL.

  Args:
    path: str, a local file path or a Google Cloud Storage URL.
    version: str, the API version whose messages module is used.

  Raises:
    AudioException: If path is neither an existing local file nor a Google
      Cloud Storage URL.

  Returns:
    The RecognitionAudio message: `content` holds the data read from the file
    for a local path, `uri` holds the path itself for a Cloud Storage URL.
  """
  recognition_audio = apis.GetMessagesModule(
      SPEECH_API, version).RecognitionAudio()

  # The local-file check runs first, so an existing file wins over the URL
  # check.
  if os.path.isfile(path):
    recognition_audio.content = files.ReadBinaryFileContents(path)
    return recognition_audio

  if storage_util.ObjectReference.IsStorageUrl(path):
    recognition_audio.uri = path
    return recognition_audio

  raise AudioException(
      'Invalid audio source [{}]. The source must either be a local path '
      'or a Google Cloud Storage URL (such as gs://bucket/object).'.format(
          path))
def GetAudioHook(version=SPEECH_API_VERSION):
  """Creates a hook that turns an audio path into a RecognitionAudio message.

  Args:
    version: str, the API version the returned hook builds messages for.

  Returns:
    A function that takes the audio path and returns the RecognitionAudio
    message for the given API version.
  """

  def GetAudioFromPath(path):
    """Builds the RecognitionAudio message for a local or remote audio path.

    Args:
      path: str, the path to the audio.

    Raises:
      AudioException: If audio is not found locally and does not appear to be
        Google Cloud Storage URL.

    Returns:
      The RecognitionAudio message for the bound API version.
    """
    return GetRecognitionAudioFromPath(path=path, version=version)

  return GetAudioFromPath
def ValidateOutputUri(output_uri):
  """Checks that an output URI, when given, is a Google Cloud Storage URL.

  Args:
    output_uri: str, the output URI for the analysis.

  Raises:
    UriFormatError: if the URI is non-empty and is not a Cloud Storage URL.

  Returns:
    str, the output_uri that was passed in, unchanged.
  """
  if not output_uri:
    # Nothing to validate; the falsy value is handed back as-is.
    return output_uri

  if storage_util.ObjectReference.IsStorageUrl(output_uri):
    return output_uri

  raise UriFormatError(OUTPUT_ERROR_MESSAGE.format(output_uri))
def MaybePrintSttUiLink(request):
  """Prints a link to the Speech-to-Text UI console for a recognize request.

  Nothing is printed when running from a shell script, when prompts are
  disabled, or when the request audio has no URI.

  Args:
    request: the recognize request; its `audio.uri` and `config` fields are
      read to build the link.
  """
  non_interactive = (
      console_io.IsRunFromShellScript()
      or properties.VALUES.core.disable_prompts.GetBool())
  if non_interactive:
    return

  uri = request.audio.uri
  if not uri:
    return

  # The 'gs://' scheme prefix is dropped before URL-encoding.
  if uri.startswith('gs://'):
    audio_path = uri[5:]
  else:
    audio_path = uri

  config = request.config
  fields = {
      'audio': urllib.parse.quote_plus(audio_path),
      'encoding': config.encoding,
      'model': config.model,
      'locale': config.languageCode,
      'sampling': config.sampleRateHertz,
      'channels': config.audioChannelCount,
      'enhanced': config.useEnhanced,
  }

  # Emit fields in key order, leaving out anything unset or whose text
  # contains "unspecified".
  pairs = []
  for name in sorted(fields):
    setting = fields[name]
    if setting and 'unspecified' not in str(setting).lower():
      pairs.append('{}={}'.format(name, setting))
  params = ';'.join(pairs)

  # The joined pairs go in the URL's params component, not the query string.
  target_url = urllib.parse.urlunparse((
      'https',
      'console.cloud.google.com',
      '/speech/transcriptions/create',
      params,
      '',
      '',
  ))
  log.status.Print(
      'Try this using the Speech-to-Text UI at {}'.format(target_url))