# -*- coding: utf-8 -*- #
# Copyright 2015 Google LLC. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Base class for PySpark Job."""

from __future__ import absolute_import
from __future__ import division
from __future__ import unicode_literals

import argparse

from apitools.base.py import encoding

from googlecloudsdk.calliope import arg_parsers
from googlecloudsdk.command_lib.dataproc.jobs import base as job_base
from googlecloudsdk.command_lib.dataproc.jobs import util as job_util

class PySparkBase(job_base.JobBase):
  """Submit a PySpark job to a cluster."""

  @staticmethod
  def Args(parser):
    """Performs command-line argument parsing specific to PySpark."""
    parser.add_argument(
        'py_file',
        help='Main .py file to run as the driver.')
    parser.add_argument(
        '--py-files',
        type=arg_parsers.ArgList(),
        metavar='PY_FILE',
        default=[],
        help=('Comma separated list of Python files to be provided to the '
              'job. Must be one of the following file formats: .py, .zip, '
              'or .egg.'))
    parser.add_argument(
        '--jars',
        type=arg_parsers.ArgList(),
        metavar='JAR',
        default=[],
        help=('Comma separated list of jar files to be provided to the '
              'executor and driver classpaths.'))
    parser.add_argument(
        '--files',
        type=arg_parsers.ArgList(),
        metavar='FILE',
        default=[],
        help=('Comma separated list of files to be placed in the working '
              'directory of both the app driver and executors.'))
    parser.add_argument(
        '--archives',
        type=arg_parsers.ArgList(),
        metavar='ARCHIVE',
        default=[],
        help=('Comma separated list of archives to be extracted into the '
              'working directory of each executor. Must be one of the '
              'following file formats: .zip, .tar, .tar.gz, or .tgz.'))
    parser.add_argument(
        'job_args',
        nargs=argparse.REMAINDER,
        help='Arguments to pass to the driver.')
    parser.add_argument(
        '--properties',
        type=arg_parsers.ArgDict(),
        metavar='PROPERTY=VALUE',
        help=('List of key=value pairs to configure PySpark. For a list of '
              'available properties, see: '
              'https://spark.apache.org/docs/latest/'
              'configuration.html#available-properties.'))
    parser.add_argument(
        '--properties-file',
        help=job_util.PROPERTIES_FILE_HELP_TEXT)
    parser.add_argument(
        '--driver-log-levels',
        type=arg_parsers.ArgDict(),
        metavar='PACKAGE=LEVEL',
        help=('List of key=value pairs to configure driver logging, where '
              'the key is a package and the value is the log4j log level. '
              'For example: root=FATAL,com.example=INFO'))
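  # Editor's note (illustrative, not from the original source): these flags
  # back the `gcloud dataproc jobs submit pyspark` command surface. A typical
  # invocation, with hypothetical bucket and cluster names, looks like:
  #
  #   gcloud dataproc jobs submit pyspark gs://my-bucket/main.py \
  #       --cluster=my-cluster \
  #       --py-files=gs://my-bucket/deps.zip \
  #       --properties=spark.executor.cores=2 \
  #       -- --input gs://my-bucket/data
  #
  # (--cluster comes from the shared submit surface, not this class.)
  # Everything after the bare `--` is collected into job_args and passed
  # through to the driver unchanged.
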
  @staticmethod
  def GetFilesByType(args):
    """Returns a dict mapping file type to the parsed argument values."""
    return {
        'py_file': args.py_file,
        'py_files': args.py_files,
        'archives': args.archives,
        'files': args.files,
        'jars': args.jars}

  @staticmethod
  def ConfigureJob(messages, job, files_by_type, logging_config, args):
    """Populates the pysparkJob member of the given job."""
    pyspark_job = messages.PySparkJob(
        args=args.job_args or [],
        archiveUris=files_by_type['archives'],
        fileUris=files_by_type['files'],
        jarFileUris=files_by_type['jars'],
        pythonFileUris=files_by_type['py_files'],
        mainPythonFileUri=files_by_type['py_file'],
        loggingConfig=logging_config,
    )

    # Build job properties from --properties and --properties-file.
    job_properties = job_util.BuildJobProperties(
        args.properties, args.properties_file
    )
    if job_properties:
      # Sort properties so that tests comparing messages do not fail on
      # ordering.
      pyspark_job.properties = encoding.DictToAdditionalPropertyMessage(
          job_properties, messages.PySparkJob.PropertiesValue, sort_items=True
      )

    job.pysparkJob = pyspark_job
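
# ---------------------------------------------------------------------------
# Editor's usage sketch (illustrative; not part of the original module). It
# shows how the three static methods compose. In the real submit pipeline the
# file URIs in files_by_type are validated and staged by shared job code
# before ConfigureJob runs, and `messages`, `job`, and `logging_config` come
# from the generated Dataproc API client; the direct wiring below skips those
# steps, and all URIs are hypothetical.
#
#   import argparse
#   parser = argparse.ArgumentParser()
#   PySparkBase.Args(parser)
#   # Flags must precede the positional py_file here, because job_args uses
#   # argparse.REMAINDER and would otherwise swallow them.
#   args = parser.parse_args(
#       ['--py-files', 'gs://my-bucket/deps.zip',
#        'gs://my-bucket/main.py',
#        '--input', 'gs://my-bucket/data'])
#   files_by_type = PySparkBase.GetFilesByType(args)
#   # files_by_type['py_file'] == 'gs://my-bucket/main.py'
#   # args.job_args == ['--input', 'gs://my-bucket/data']
#   PySparkBase.ConfigureJob(messages, job, files_by_type, logging_config, args)
#   # job.pysparkJob is now populated and ready for submission.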