# -*- coding: utf-8 -*- #
# Copyright 2015 Google LLC. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Diagnose cluster command."""

from __future__ import absolute_import
from __future__ import division
from __future__ import unicode_literals

from apitools.base.py import encoding
from googlecloudsdk.api_lib.dataproc import dataproc as dp
from googlecloudsdk.api_lib.dataproc import exceptions
from googlecloudsdk.api_lib.dataproc import storage_helpers
from googlecloudsdk.api_lib.dataproc import util
from googlecloudsdk.calliope import actions
from googlecloudsdk.calliope import base
from googlecloudsdk.command_lib.dataproc import flags
from googlecloudsdk.command_lib.util.apis import arg_utils
from googlecloudsdk.core import log
from googlecloudsdk.core.util import retry


@base.UniverseCompatible
class Diagnose(base.Command):
  """Run a detailed diagnostic on a cluster."""

  detailed_help = {
      'EXAMPLES': """\
    To diagnose a cluster, run:

      $ {command} my-cluster --region=us-central1
""",
  }

  @classmethod
  def Args(cls, parser):
    # 26m is backend timeout + 4m for safety buffer.
    flags.AddTimeoutFlag(parser, default='30m')
    dataproc = dp.Dataproc(cls.ReleaseTrack())
    flags.AddClusterResourceArg(parser, 'diagnose', dataproc.api_version)
    Diagnose.addDiagnoseFlags(parser, dataproc)

  @staticmethod
  def _GetValidTarballAccessChoices(dataproc):
    tarball_access_enums = (
        dataproc.messages.DiagnoseClusterRequest.TarballAccessValueValuesEnum
    )
    return [
        arg_utils.ChoiceToEnumName(n)
        for n in tarball_access_enums.names()
        if n != 'TARBALL_ACCESS_UNSPECIFIED'
    ]

  @staticmethod
  def addDiagnoseFlags(parser, dataproc):
    parser.add_argument(
        '--tarball-access',
        type=arg_utils.ChoiceToEnumName,
        choices=Diagnose._GetValidTarballAccessChoices(dataproc),
        help='Target access privileges for diagnostic tarball.')
    parser.add_argument(
        '--start-time',
        help='Time instant to start the diagnosis from (in '
             '%Y-%m-%dT%H:%M:%S.%fZ format).')
    parser.add_argument(
        '--end-time',
        help='Time instant to stop the diagnosis at (in '
             '%Y-%m-%dT%H:%M:%S.%fZ format).')
    parser.add_argument(
        '--job-id',
        hidden=True,
        help='The job on which to perform the diagnosis.',
        action=actions.DeprecationAction(
            '--job-id',
            warn=(
                'The {flag_name} option is deprecated and will be removed in'
                ' an upcoming release; use --job-ids instead.'
            ),
            removed=False,
        ),
    )
    parser.add_argument(
        '--yarn-application-id',
        hidden=True,
        help='The YARN application on which to perform the diagnosis.',
        action=actions.DeprecationAction(
            '--yarn-application-id',
            warn=(
                'The {flag_name} option is deprecated and will be removed in'
                ' an upcoming release; use --yarn-application-ids instead.'
            ),
            removed=False,
        ),
    )
    parser.add_argument(
        '--workers',
        hidden=True,
        help='A comma-separated list of workers in the cluster on which to '
             'run the diagnostic script.')
    parser.add_argument(
        '--job-ids',
        help='A comma-separated list of jobs on which to perform the '
             'diagnosis.',
    )
    parser.add_argument(
        '--yarn-application-ids',
        help='A comma-separated list of YARN applications on which to '
             'perform the diagnosis.',
    )
    parser.add_argument(
        '--tarball-gcs-dir',
        help='The output Cloud Storage directory for the diagnostic tarball. '
             'If not specified, a task-specific directory in the cluster\'s '
             'staging bucket will be used.',
    )

  def Run(self, args):
    dataproc = dp.Dataproc(self.ReleaseTrack())

    cluster_ref = args.CONCEPTS.cluster.Parse()

    # Build the DiagnoseClusterRequest from the parsed flags.
    diagnose_request = dataproc.messages.DiagnoseClusterRequest(
        job=args.job_id, yarnApplicationId=args.yarn_application_id
    )
    diagnose_request.diagnosisInterval = dataproc.messages.Interval(
        startTime=args.start_time, endTime=args.end_time
    )
    if args.job_ids is not None:
      diagnose_request.jobs.extend(args.job_ids.split(','))
    if args.yarn_application_ids is not None:
      diagnose_request.yarnApplicationIds.extend(
          args.yarn_application_ids.split(','))
    if args.workers is not None:
      diagnose_request.workers.extend(args.workers.split(','))
    if args.tarball_access is not None:
      tarball_access = arg_utils.ChoiceToEnum(
          args.tarball_access,
          dataproc.messages.DiagnoseClusterRequest
          .TarballAccessValueValuesEnum)
      diagnose_request.tarballAccess = tarball_access
    if args.tarball_gcs_dir is not None:
      diagnose_request.tarballGcsDir = args.tarball_gcs_dir

    request = dataproc.messages.DataprocProjectsRegionsClustersDiagnoseRequest(
        clusterName=cluster_ref.clusterName,
        region=cluster_ref.region,
        projectId=cluster_ref.projectId,
        diagnoseClusterRequest=diagnose_request)

    operation = dataproc.client.projects_regions_clusters.Diagnose(request)

    # TODO(b/36052522): Stream output during polling.
    operation = util.WaitForOperation(
        dataproc,
        operation,
        message='Waiting for cluster diagnose operation',
        timeout_s=args.timeout)

    if not operation.response:
      raise exceptions.OperationError('Operation is missing response')

    properties = encoding.MessageToDict(operation.response)
    output_uri = properties['outputUri']

    if not output_uri:
      raise exceptions.OperationError('Response is missing outputUri')

    log.err.Print('Output from diagnostic:')
    log.err.Print('-----------------------------------------------')
    driver_log_stream = storage_helpers.StorageObjectSeriesStream(
        output_uri)
    # A single read might not read the whole stream. Retry a few times.
    read_retrier = retry.Retryer(max_retrials=4, jitter_ms=None)
    try:
      read_retrier.RetryOnResult(
          lambda: driver_log_stream.ReadIntoWritable(log.err),
          sleep_ms=100,
          should_retry_if=lambda *_: driver_log_stream.open)
    except retry.MaxRetrialsException:
      log.warning(
          'Diagnostic finished successfully, '
          'but output did not finish streaming.')
    log.err.Print('-----------------------------------------------')
    return output_uri