# -*- coding: utf-8 -*- # # Copyright 2015 Google LLC. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Base class for PySpark Job.""" from __future__ import absolute_import from __future__ import division from __future__ import unicode_literals import argparse from apitools.base.py import encoding from googlecloudsdk.calliope import arg_parsers from googlecloudsdk.command_lib.dataproc.jobs import base as job_base from googlecloudsdk.command_lib.dataproc.jobs import util as job_util class PySparkBase(job_base.JobBase): """Submit a PySpark job to a cluster.""" @staticmethod def Args(parser): """Performs command-line argument parsing specific to PySpark.""" parser.add_argument( 'py_file', help='Main .py file to run as the driver.') parser.add_argument( '--py-files', type=arg_parsers.ArgList(), metavar='PY_FILE', default=[], help=('Comma separated list of Python files to be provided to the job. ' 'Must be one of the following file formats ' '".py, .zip, or .egg".')) parser.add_argument( '--jars', type=arg_parsers.ArgList(), metavar='JAR', default=[], help=('Comma separated list of jar files to be provided to the ' 'executor and driver classpaths.')) parser.add_argument( '--files', type=arg_parsers.ArgList(), metavar='FILE', default=[], help=('Comma separated list of files to be placed in the working ' 'directory of both the app driver and executors.')) parser.add_argument( '--archives', type=arg_parsers.ArgList(), metavar='ARCHIVE', default=[], help=( 'Comma separated list of archives to be extracted into the working ' 'directory of each executor. ' 'Must be one of the following file formats: .zip, .tar, .tar.gz, ' 'or .tgz.')) parser.add_argument( 'job_args', nargs=argparse.REMAINDER, help='Arguments to pass to the driver.') parser.add_argument( '--properties', type=arg_parsers.ArgDict(), metavar='PROPERTY=VALUE', help='List of key value pairs to configure PySpark. For a list of ' 'available properties, see: ' 'https://spark.apache.org/docs/latest/' 'configuration.html#available-properties.') parser.add_argument( '--properties-file', help=job_util.PROPERTIES_FILE_HELP_TEXT) parser.add_argument( '--driver-log-levels', type=arg_parsers.ArgDict(), metavar='PACKAGE=LEVEL', help=('List of key value pairs to configure driver logging, where key ' 'is a package and value is the log4j log level. For ' 'example: root=FATAL,com.example=INFO')) @staticmethod def GetFilesByType(args): return { 'py_file': args.py_file, 'py_files': args.py_files, 'archives': args.archives, 'files': args.files, 'jars': args.jars} @staticmethod def ConfigureJob(messages, job, files_by_type, logging_config, args): """Populates the pysparkJob member of the given job.""" pyspark_job = messages.PySparkJob( args=args.job_args or [], archiveUris=files_by_type['archives'], fileUris=files_by_type['files'], jarFileUris=files_by_type['jars'], pythonFileUris=files_by_type['py_files'], mainPythonFileUri=files_by_type['py_file'], loggingConfig=logging_config, ) job_properties = job_util.BuildJobProperties( args.properties, args.properties_file ) if job_properties: # Sort properties to ensure tests comparing messages not fail on ordering. pyspark_job.properties = encoding.DictToAdditionalPropertyMessage( job_properties, messages.PySparkJob.PropertiesValue, sort_items=True ) job.pysparkJob = pyspark_job