# -*- coding: utf-8 -*- # # Copyright 2016 Google LLC. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Methods for suggesting corrections to command typos.""" from __future__ import absolute_import from __future__ import division from __future__ import unicode_literals import collections import os import re from googlecloudsdk.command_lib.static_completion import lookup from googlecloudsdk.core import log from googlecloudsdk.core.util import files import six # Command noun and verb variants mapped to most likely gcloud counterpart. SYNONYMS = { 'change': 'update', 'copy-files': 'scp', 'create': 'add', 'delete': 'remove', 'describe': 'get', 'docker': 'auth-configure-docker', 'get': 'describe', 'image': 'images', 'instance': 'instances', 'instances': 'instance', 'make': 'create', 'modify': 'update', 'patch': 'update', 'remove': 'delete', 'show': 'describe', } MIN_RATIO = 0.7 # Minimum score/top_score ratio of accepted suggestions. MIN_SUGGESTED_GROUPS = 4 # Check for group prefix if less groups than this. MAX_SUGGESTIONS = 10 # Maximum number of suggestions. # Factor to multiply logged command frequencies by before incrementing # canonical command scores. FREQUENCY_FACTOR = 100 def _GetSurfaceHistoryFrequencies(logs_dir): """Load the last 100 surfaces user used today from local command history. Args: logs_dir: str, the path to today's logs directory Returns: dict mapping surfaces to normalized frequencies. """ surfaces_count = collections.defaultdict(int) if not logs_dir: return surfaces_count total = 0 last_100_invocations = sorted(os.listdir(logs_dir), reverse=True)[:100] for filename in last_100_invocations: file_path = os.path.join(logs_dir, filename) with files.FileReader(file_path) as log_file: for line in log_file: match = re.search(log.USED_SURFACE_PATTERN, line) if match: surface = match.group(1) total += 1 surfaces_count[surface] += 1 # normalize surface frequencies return { surface: count / total for surface, count in six.iteritems(surfaces_count) } def _GetCanonicalCommandsHelper(tree, results, prefix): """Helper method to _GetCanonicalCommands. Args: tree: The root of the tree that will be traversed to find commands. results: The results list to append to. prefix: [str], the canonical command line words so far. Once we reach a leaf node, prefix contains a canonical command and a copy is appended to results. Returns: None """ if not tree.get(lookup.LOOKUP_COMMANDS): results.append(prefix[:]) return for command, command_tree in six.iteritems(tree[lookup.LOOKUP_COMMANDS]): prefix.append(command) _GetCanonicalCommandsHelper(command_tree, results, prefix) prefix.pop() def _GetCanonicalCommands(tree): """Return list of all canonical commands in CLI tree in arbitrary order. Args: tree: The root of the tree that will be traversed to find commands. Returns: [[canonical_command_words]]: List of lists, all possible sequences of canonical command words in the tree. """ results = [] _GetCanonicalCommandsHelper(tree, results, prefix=[]) return results def _WordScore( index, normalized_command_word, canonical_command_word, canonical_command_length, ): """Returns the integer word match score for a command word. Args: index: The position of the word in the command. normalized_command_word: The normalized command word. canonical_command_word: The actual command word to compare with. canonical_command_length: The length of the actual command. Returns: The integer word match score, always >= 0. """ score = 0 # The match can go either way. if normalized_command_word in canonical_command_word: shorter_word = normalized_command_word longer_word = canonical_command_word elif canonical_command_word in normalized_command_word: shorter_word = canonical_command_word longer_word = normalized_command_word else: return score # Inner match must be a word boundary. hit = longer_word.find(shorter_word) if hit > 0 and longer_word[hit - 1] != '-': return score # Partial hit. score += 10 # Prefer a match in less words. if canonical_command_length == 1: score += 30 elif canonical_command_length == 2: score += 20 elif canonical_command_length == 3: score += 10 # Prefer a match in order. if index == 0: score += 25 elif index == 1: score += 15 else: score += 5 # Prefer matching more chars and beginning of word. # This also handles minor suffix diffs, like singular vs. plural. extra = len(longer_word) - len(shorter_word) if extra <= 2: extra = 3 - extra if longer_word.startswith(shorter_word): extra *= 2 score += extra # Prefer matching on surface words. if index == 0 and canonical_command_length > 1: score += 30 # Also prefer matching on group words. elif index > 0 and canonical_command_length > index + 1: score += 15 return score def _GetScoredCommandsContaining(command_words): """Return scored canonical commands containing input command words. Args: command_words: List of input command words. Returns: [(canonical_command_words, score)]: List of tuples, where canonical_command_words is a list of strings and score is an integer > 0. The tuples are sorted from highest score to lowest, and commands with the same score appear in lexicographic order. """ root = lookup.LoadCompletionCliTree() surface_history = _GetSurfaceHistoryFrequencies(log.GetLogDir()) normalized_command_words = [ command_word.lower().replace('_', '-') for command_word in command_words ] scored_commands = [] all_canonical_commands = _GetCanonicalCommands(root) canonical_command_set = set(map(tuple, all_canonical_commands)) for canonical_command_words in all_canonical_commands: canonical_command_length = len(canonical_command_words) matched = set() score = 0 for index, canonical_command_word in enumerate(canonical_command_words): for normalized_command_word in normalized_command_words: # Prefer the higher score of the normalized word or its synonym if any. increment = _WordScore( index, normalized_command_word, canonical_command_word, canonical_command_length, ) alternate_command_word = SYNONYMS.get(normalized_command_word) if alternate_command_word: alternate_increment = _WordScore( index, alternate_command_word, canonical_command_word, canonical_command_length, ) if increment < alternate_increment: increment = alternate_increment if increment: matched.add(normalized_command_word) score += increment # Prefer all command words to match. if len(matched) == len(normalized_command_words): score += 10 # 0 score is always ignored, no need to save. if score > 0: surface = '.'.join(canonical_command_words[:-1]) if surface in surface_history: score += int(surface_history[surface] * FREQUENCY_FACTOR) # We want to display `alpha` and `beta` commands in the Maybe You Mean # list as well, however we should display them with a lower confidence # score, and not display them if their higher track counterpart exists. better_track_exists = False if 'alpha' == canonical_command_words[0]: score -= 5 if tuple(canonical_command_words[1:]) in canonical_command_set: better_track_exists = True if ( tuple(['beta'] + canonical_command_words[1:]) in canonical_command_set ): better_track_exists = True if 'beta' == canonical_command_words[0]: score -= 5 if tuple(canonical_command_words[1:]) in canonical_command_set: better_track_exists = True if not better_track_exists: scored_commands.append((canonical_command_words, score)) # Sort scores descending, commands ascending. scored_commands.sort(key=lambda tuple: (-tuple[1], tuple[0])) return scored_commands def GetCommandSuggestions(command_words): """Return suggested commands containing input command words. Args: command_words: List of input command words. Returns: [command]: A list of canonical command strings with 'gcloud' prepended. Only commands whose scores have a ratio of at least MIN_RATIO against the top score are returned. At most MAX_SUGGESTIONS command strings are returned. If many commands from the same group are being suggested, then the common groups are suggested instead. """ suggested_commands = [] try: scored_commands = _GetScoredCommandsContaining(command_words) except lookup.CannotHandleCompletionError: # Don't crash error reports on static completion misconfiguration. scored_commands = None if not scored_commands: return suggested_commands # Scores are greater than zero and sorted highest to lowest. top_score = float(scored_commands[0][1]) too_many = False suggested_groups = set() for command, score in scored_commands: if score / top_score >= MIN_RATIO: suggested_commands.append(' '.join(['gcloud'] + command)) suggested_groups.add(' '.join(command[:-1])) if len(suggested_commands) >= MAX_SUGGESTIONS: too_many = True break # Too many most likely indicates the suggested commands have common groups. if too_many and len(suggested_groups) < MIN_SUGGESTED_GROUPS: min_length = len(scored_commands[0][0]) for command, score in scored_commands: if score / top_score < MIN_RATIO: break if min_length > len(command): min_length = len(command) common_length = min_length - 1 if common_length: suggested_groups = set() for command, score in scored_commands: if score / top_score < MIN_RATIO: break suggested_groups.add(' '.join(['gcloud'] + command[:common_length])) if len(suggested_groups) >= MAX_SUGGESTIONS: break suggested_commands = sorted(suggested_groups) return suggested_commands