# Usage examples surfaced in the command's help output. `{command}` is
# expanded by the gcloud help system to the full command path.
examples:
  recognize: |
    To get a transcript of an audio file 'my-recording.wav':

      $ {command} 'my-recording.wav' --language-code=en-US

    To get a transcript of an audio file in bucket 'gs://bucket/myaudio' with a
    custom sampling rate and encoding that uses hints and filters profanity:

      $ {command} 'gs://bucket/myaudio' --language-code=es-ES --sample-rate=2200 --hints=Bueno --encoding=OGG_OPUS --filter-profanity
# Arguments available when the command targets the v1 Speech API.
args_v1:
- api_field: audio
  arg_name: audio
  help_text: |
    The location of the audio file to transcribe. Must be a local path or a
    Google Cloud Storage URL (in the format gs://bucket/object).
  is_positional: true
  # Hook resolves the positional into either inline audio content or a GCS URI.
  processor: googlecloudsdk.command_lib.ml.speech.util:GetAudioHook:version=v1
# Arguments available when the command targets the v1p1beta1 Speech API.
args_v1p1beta1:
- api_field: audio
  arg_name: audio
  help_text: |
    The location of the audio file to transcribe. Must be a local path or a
    Google Cloud Storage URL (in the format gs://bucket/object).
  is_positional: true
  # Same hook as v1, but builds the v1p1beta1 message types.
  processor: googlecloudsdk.command_lib.ml.speech.util:GetAudioHook:version=v1p1beta1

- api_field: config.enableWordConfidence
  arg_name: include-word-confidence
  help_text: |
    Include a list of words and the confidence for those words in the top
    result.

# Speaker-diarization flags are grouped so the count flag is only meaningful
# together with the (required) enable flag.
- group:
    params:
    - api_field: config.diarizationSpeakerCount
      arg_name: diarization-speaker-count
      type: int
      help_text: |
        Estimated number of speakers in the conversation being recognized.
    - api_field: config.enableSpeakerDiarization
      arg_name: enable-speaker-diarization
      help_text: |
        Enable speaker detection for each recognized word in the top
        alternative of the recognition result using an integer speaker_tag
        provided in the WordInfo.
      type: bool
      required: true

- api_field: config.alternativeLanguageCodes
  arg_name: additional-language-codes
  metavar: language_code
  repeated: true
  help_text: |
    The BCP-47 language tags of other languages that the speech may be
    in. Up to 3 can be provided.

    If alternative languages are listed, recognition result will contain
    recognition in the most likely language detected including the main
    language-code.
args_v1p1beta1_alpha_track:  # available only in 'gcloud alpha'
- arg_name: enable-automatic-punctuation
  api_field: config.enableAutomaticPunctuation
  help_text: |
    Adds punctuation to recognition result hypotheses.

# Recognition-metadata flags: every param below maps to a field of
# config.metadata, so they are grouped under one help heading.
- group:
    help_text: Description of audio data to be recognized.
    params:
    - arg_name: interaction-type
      api_field: config.metadata.interactionType
      help_text: |
        Determining the interaction type in the conversation.
      choices:
      - arg_value: discussion
        enum_value: DISCUSSION
        help_text: Multiple people in a conversation or discussion.
      - arg_value: phone-call
        enum_value: PHONE_CALL
        help_text: A phone-call or video-conference in which two or more people, who are not in the same room, are actively participating.
      - arg_value: voicemail
        enum_value: VOICEMAIL
        help_text: A recorded message intended for another person to listen to.
      - arg_value: professionally-produced
        enum_value: PROFESSIONALLY_PRODUCED
        help_text: Professionally produced audio (eg. TV Show, Podcast).
      - arg_value: voice-search
        enum_value: VOICE_SEARCH
        help_text: Transcribe spoken questions and queries into text.
      - arg_value: voice-command
        enum_value: VOICE_COMMAND
        help_text: Transcribe voice commands, such as for controlling a device.
      - arg_value: dictation
        enum_value: DICTATION
        help_text: Transcribe speech to text to create a written document, such as a text-message, email or report.

    - arg_name: naics-code
      api_field: config.metadata.industryNaicsCodeOfAudio
      type: int
      help_text: |
        The industry vertical to which this speech recognition request most closely applies.
    - arg_name: microphone-distance
      api_field: config.metadata.microphoneDistance
      help_text: |
        The distance at which the audio device is placed to record the conversation.
      choices:
      - arg_value: nearfield
        enum_value: NEARFIELD
        help_text: The speaker is within 1 meter of the microphone.
      - arg_value: midfield
        enum_value: MIDFIELD
        help_text: The speaker is within 3 meters of the microphone.
      - arg_value: farfield
        enum_value: FARFIELD
        help_text: The speaker is more than 3 meters away from the microphone.
    - arg_name: original-media-type
      api_field: config.metadata.originalMediaType
      help_text: |
        The media type of the original audio conversation.
      choices:
      - arg_value: audio
        enum_value: AUDIO
        help_text: The speech data is an audio recording.
      - arg_value: video
        enum_value: VIDEO
        help_text: The speech data originally recorded on a video.
    - arg_name: recording-device-type
      api_field: config.metadata.recordingDeviceType
      help_text: |
        The device type through which the original audio was recorded on.
      choices:
      - arg_value: smartphone
        enum_value: SMARTPHONE
        help_text: Speech was recorded on a smartphone.
      - arg_value: pc
        enum_value: PC
        help_text: Speech was recorded using a personal computer or tablet.
      - arg_value: phone-line
        enum_value: PHONE_LINE
        help_text: Speech was recorded over a phone line.
      - arg_value: vehicle
        enum_value: VEHICLE
        help_text: Speech was recorded in a vehicle.
      - arg_value: outdoor
        enum_value: OTHER_OUTDOOR_DEVICE
        help_text: Speech was recorded outdoors.
      - arg_value: indoor
        enum_value: OTHER_INDOOR_DEVICE
        help_text: Speech was recorded indoors.
    - arg_name: recording-device-name
      api_field: config.metadata.recordingDeviceName
      help_text: |
        The device used to make the recording. Examples: `Nexus 5X`, `Polycom SoundStation IP 6000`
    - arg_name: original-mime-type
      api_field: config.metadata.originalMimeType
      help_text: |
        Mime type of the original audio file. Examples: `audio/m4a`, `audio/mp3`.
    - arg_name: audio-topic
      api_field: config.metadata.audioTopic
      help_text: |
        Description of the content, e.g. "Recordings of federal supreme court hearings from 2012".
# Arguments common to every API version of the command.
args:
- group:
    help_text: Audio channel settings.
    params:
    - arg_name: separate-channel-recognition
      api_field: config.enableSeparateRecognitionPerChannel
      required: true
      default: false
      help_text: |
        Recognition result will contain a `channel_tag` field to state which channel that
        result belongs to. If this is not true, only the first channel will be recognized.
    - arg_name: audio-channel-count
      api_field: config.audioChannelCount
      required: true
      type: int
      help_text: |
        The number of channels in the input audio data. Set this for
        separate-channel-recognition. Valid values are:
        1) LINEAR16 and FLAC are `1`-`8`
        2) OGG_OPUS are `1`-`254`
        3) MULAW, AMR, AMR_WB and SPEEX_WITH_HEADER_BYTE is only `1`.

# Exactly one of --language-code / --language must be given (mutex + required);
# --language is a hidden, deprecated alias kept for backward compatibility.
- group:
    mutex: true
    required: true
    params:
    - api_field: config.languageCode
      arg_name: language-code
      help_text: |
        The language of the supplied audio as a BCP-47
        (https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag. Example:
        "en-US". See https://cloud.google.com/speech/docs/languages for a list
        of the currently supported language codes.
    - api_field: config.languageCode
      arg_name: language
      hidden: true
      action:
        deprecated:
          warn: Flag {flag_name} is deprecated. Use --language-code instead.
      help_text: |
        The language of the supplied audio as a BCP-47
        (https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag. Example:
        "en-US". See https://cloud.google.com/speech/docs/languages for a list
        of the currently supported language codes.

- api_field: config.speechContexts.phrases
  arg_name: hints
  default: []
  help_text: |
    A list of strings containing word and phrase "hints" so that the speech
    recognition is more likely to recognize them. This can be used to
    improve the accuracy for specific words and phrases, for example, if
    specific commands are typically spoken by the user. This can also be
    used to add additional words to the vocabulary of the recognizer. See
    https://cloud.google.com/speech/limits#content.

- api_field: config.maxAlternatives
  arg_name: max-alternatives
  default: 1
  help_text: |
    Maximum number of recognition hypotheses to be returned. The server
    may return fewer than max_alternatives. Valid values are 0-30. A value
    of 0 or 1 will return a maximum of one.

- api_field: config.profanityFilter
  arg_name: filter-profanity
  help_text: |
    If True, the server will attempt to filter out profanities, replacing
    all but the initial character in each filtered word with asterisks,
    e.g. ```f***```.

- api_field: config.encoding
  arg_name: encoding
  default: encoding-unspecified
  help_text: |
    The type of encoding of the file. Required if the file format is not
    WAV or FLAC.

- api_field: config.sampleRateHertz
  arg_name: sample-rate
  help_text: |
    The sample rate in Hertz. For best results, set the sampling rate of
    the audio source to 16000 Hz. If that's not possible, use the native
    sample rate of the audio source (instead of re-sampling).

- api_field: config.enableWordTimeOffsets
  arg_name: include-word-time-offsets
  help_text: |
    If True, the top result includes a list of words with the start and
    end time offsets (timestamps) for those words. If False, no word-level
    time offset information is returned.