# -*- coding: utf-8 -*- #
# Copyright 2024 Google LLC. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Create a Flink job from a Java jar."""

from __future__ import absolute_import
from __future__ import division
from __future__ import unicode_literals

import os

from googlecloudsdk.api_lib.managed_flink import util as flink_util
from googlecloudsdk.api_lib.util import waiter
from googlecloudsdk.calliope import base
from googlecloudsdk.calliope import exceptions
from googlecloudsdk.command_lib.managed_flink import flags
from googlecloudsdk.command_lib.managed_flink import flink_backend
from googlecloudsdk.command_lib.util.args import common_args
from googlecloudsdk.core import exceptions as core_exceptions
from googlecloudsdk.core import log
from googlecloudsdk.core import properties
from googlecloudsdk.core import resources
from googlecloudsdk.core import yaml
from googlecloudsdk.core.util import encoding
from googlecloudsdk.core.util import files
from googlecloudsdk.core.util import platforms


def GetJobType(job_type, job_file):
  """Returns the job type based on the job_type and job_file."""
  if job_type == 'auto':
    job_type = None
    if job_file.endswith('.py'):
      job_type = 'python'
    elif job_file.endswith('.sql'):
      job_type = 'sql'
    elif job_file.endswith('.jar'):
      job_type = 'jar'
  if not job_type:
    raise UnknownJobType(
        'Unable to determine type of job [{}]. Job input files must end in'
        ' ".py", ".sql", or ".jar", or --job-type must be set'
        ' explicitly.'.format(job_file)
    )
  return job_type
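# Illustrative behavior (assumed from the extension checks above):
#   GetJobType('auto', 'wordcount.py')  -> 'python'
#   GetJobType('auto', 'pipeline.sql')  -> 'sql'
#   GetJobType('auto', 'job.jar')       -> 'jar'
#   GetJobType('jar', 'anything')       -> 'jar'  (an explicit type wins)
#   GetJobType('auto', 'job.txt')       -> raises UnknownJobType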


def GetInputType(job_file):
  """Returns the input type based on the job_file."""
  input_type = 'file://'
  # Format is:
  # ar://<project>/<location>/<repository>/<file/path/version/file.jar>
  if job_file.startswith('ar://') or job_file.startswith('artifactregistry://'):
    input_type = 'ar://'
  return input_type
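# Illustrative behavior (assumed): a local path such as '/tmp/job.jar' falls
# through to 'file://', while an Artifact Registry URI such as
# 'ar://my-project/us-central1/my-repo/path/1.0/job.jar' yields 'ar://'.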


class UnknownJobType(core_exceptions.Error):
  """Raised when the job type cannot be determined."""


@base.DefaultUniverseOnly
@base.ReleaseTracks(base.ReleaseTrack.ALPHA)
class Create(base.BinaryBackedCommand):
  """Create a Flink job from a Java jar."""

  detailed_help = {
      'EXAMPLES': """\
          To create a Flink job from a Java jar, run:

            $ {command} my-job.jar --project=my-project --location=us-central1
          """,
  }

  def _JobSubmitResponseHandler(self, response, job_type, temp_dir, args):
    """Process results of BinaryOperation Execution."""
    if response.stdout and (args.show_output or job_type == 'sql'):
      log.Print(response.stdout)

    if response.stderr:
      log.status.Print(response.stderr)

    if response.failed:
      return None

    jobgraph = os.path.join(temp_dir, 'jobgraph.bin')
    if not os.path.exists(jobgraph):
      return None
    jobspec = os.path.join(temp_dir, 'jobspec.yaml')
    if not os.path.exists(jobspec):
      return None
    with files.FileReader(jobspec) as f:
      jobspec_json = yaml.load(f)
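    # Sketch of the jobspec.yaml consumed here (assumed from the keys read
    # below; not an authoritative schema):
    #   job:
    #     id: <generated-job-id>
    #     name: <job-name>
    #     jars:
    #       - file:/local/path/to/job.jar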

    files_to_upload = [os.path.join(temp_dir, 'jobgraph.bin')]
    for jar in jobspec_json['job']['jars']:
      if jar.startswith('file:'):
        files_to_upload.append(jar[5:])

    dest = flink_backend.Upload(
        files_to_upload,
        os.path.join(args.staging_location, jobspec_json['job']['id']),
    )
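    # Upload is assumed to return a mapping from each local path to its
    # uploaded gs:// URI; the JobSpec below resolves paths through it.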
    msg = flink_util.GetMessagesModule(self.ReleaseTrack())

    jobspec = msg.JobSpec(
        jobName='{0}'.format(jobspec_json['job']['name']),
        jobGraphUri=dest[os.path.join(temp_dir, 'jobgraph.bin')],
        jarUris=[dest[jar[5:]] for jar in jobspec_json['job']['jars']],
    )

    # Configure optional arguments.
    if args.name:
      jobspec.displayName = args.name

    if args.network:
      config = msg.NetworkConfig(vpc=args.network)
      if args.subnetwork:
        config.subnetwork = args.subnetwork
      jobspec.networkConfig = config

    # Configure the autotuning mode.
    autotuning_config = msg.AutotuningConfig()
    if args.autotuning_mode == 'fixed':
      autotuning_config.fixed = msg.Fixed(parallelism=args.parallelism)
    else:
      # Elastic autotuning starts at the minimum parallelism.
      autotuning_config.throughputBased = msg.Elastic(
          parallelism=args.min_parallelism,
          minParallelism=args.min_parallelism,
          maxParallelism=args.max_parallelism,
      )
    jobspec.autotuningConfig = autotuning_config

    job = msg.Job(name=jobspec_json['job']['id'], jobSpec=jobspec)
    if args.deployment:
      job.deploymentId = args.deployment

    create = msg.ManagedflinkProjectsLocationsJobsCreateRequest(
        parent='projects/{0}/locations/{1}'.format(
            properties.VALUES.core.project.Get(required=True), args.location
        ),
        jobId=jobspec_json['job']['id'],
        job=job,
    )
    if args.show_output:
      log.Print(create)
    if args.dry_run:
      return response
    flink_client = flink_util.FlinkClient(self.ReleaseTrack())
    create_op = flink_client.client.projects_locations_jobs.Create(create)
    if args.show_output:
      log.Print(create_op)
    log.Print('Create request issued for [{0}]'.format(create.jobId))
    if args.async_submit:
      return response

    create_op_ref = resources.REGISTRY.Parse(
        create_op.name, collection='managedflink.projects.locations.operations'
    )
    # Poll the long-running operation until it completes, then return.
    waiter.WaitFor(
        waiter.CloudOperationPoller(
            flink_client.client.projects_locations_jobs,
            flink_client.client.projects_locations_operations,
        ),
        create_op_ref,
        'Waiting for operation [{0}] to complete...'.format(create_op.name),
    )
    return response

  @staticmethod
  def Args(parser):
    # Common arguments
    common_args.ProjectArgument(
        help_text_to_overwrite='Project to run the job in.'
    ).AddToParser(parser)
    # Specific arguments
    flags.AddDeploymentArgument(
        parser, help_text_to_overwrite='Deployment to run the job in.'
    )
    flags.AddShowOutputArgument(parser)
    flags.AddDryRunArgument(parser)
    flags.AddAsyncArgument(parser)
    flags.AddMainClassArgument(parser)
    flags.AddExtraJarsArgument(parser)
    flags.AddLocationArgument(parser)
    flags.AddStagingLocationArgument(parser)
    flags.AddAutotuningModeArgument(parser)
    flags.AddJobJarArgument(parser)
    flags.AddJobTypeArgument(parser)
    flags.AddNameArgument(parser)
    flags.AddFixedParallelismArgs(parser)
    flags.AddElasticParallelismArgs(parser)
    flags.AddNetworkConfigArgs(parser)
    flags.AddWorkloadIdentityArgument(parser)
    flags.AddJobArgsCollector(parser)
    flags.AddPythonVirtualEnvArgument(parser)
    flags.AddExtraArchivesArgument(parser)

  def Run(self, args):
    current_os = platforms.OperatingSystem.Current()
    if current_os is platforms.OperatingSystem.WINDOWS:
      raise exceptions.ToolException('Job creation not supported on Windows.')
    # Determine the job_file input method.
    input_type = GetInputType(args.job)
    # Make sure the job file exists.
    if input_type == 'file://' and not os.path.exists(args.job):
      raise exceptions.InvalidArgumentException(
          'JAR|PY|SQL',
          'Job definition [{0}] does not exist.'.format(args.job),
      )

    # Determine the job type.
    job_type = GetJobType(args.job_type, args.job)

    # Make sure both network arguments are set if at least one is present.
    if args.network and not args.subnetwork:
      raise exceptions.InvalidArgumentException(
          'network-config-subnetwork',
          '--network-config-subnetwork must be set if --network-config-vpc is'
          ' set.',
      )
    if args.subnetwork and not args.network:
      raise exceptions.InvalidArgumentException(
          'network-config-vpc',
          '--network-config-vpc must be set if --network-config-subnetwork is'
          ' set.',
      )

    if args.workload_identity and args.deployment:
      raise exceptions.InvalidArgumentException(
          'workload-identity',
          '--workload-identity cannot be set if --deployment is set.',
      )

    # Validate that the autotuning arguments are consistent.
    flink_backend.ValidateAutotuning(
        args.autotuning_mode,
        args.min_parallelism,
        args.max_parallelism,
        args.parallelism,
    )

    # Validate the staging location.
    if not args.staging_location.startswith('gs://'):
      raise exceptions.InvalidArgumentException(
          'staging-location',
          'Staging location must be of the form gs://<bucket>/<path>.',
      )
    flink_backend.CheckStagingLocation(args.staging_location)

    # Validate the Python virtualenv.
    if job_type == 'python':
      if not args.python_venv:
        raise exceptions.InvalidArgumentException(
            'python-venv',
            'Python virtualenv must be set if the job type is python.',
        )
      if not args.python_venv.startswith('gs://'):
        raise exceptions.InvalidArgumentException(
            'python-venv',
            'Python virtualenv location must be of the form'
            ' gs://<bucket>/<path>.',
        )

    env = {}
    env['CLOUDSDK_MANAGEDFLINK_JOB_TYPE'] = job_type
    if job_type in ('python', 'jar') and args.extra_jars:
      env['HADOOP_CLASSPATH'] = ':'.join(args.extra_jars)

    # If the caller already has HADOOP_CLASSPATH set in the environment,
    # append it so the extra jars do not shadow it.
    if env.get('HADOOP_CLASSPATH') and encoding.GetEncodedValue(
        os.environ, 'HADOOP_CLASSPATH'
    ):
      env['HADOOP_CLASSPATH'] = ':'.join([
          env.get('HADOOP_CLASSPATH'),
          encoding.GetEncodedValue(os.environ, 'HADOOP_CLASSPATH'),
      ])
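    # Illustrative result (assumed values): extra jars ['a.jar', 'b.jar'] plus
    # an inherited HADOOP_CLASSPATH of '/opt/hadoop/*' yield
    # 'a.jar:b.jar:/opt/hadoop/*'.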

    # Dry run
    if args.dry_run:
      env['CLOUDSDK_MANAGEDFLINK_DRY_RUN'] = 'true'
      env['CLOUDSDK_MANAGEDFLINK_ECHO_CMD'] = 'true'

    with files.TemporaryDirectory() as temp_dir:
      jar_path = args.job
      if input_type == 'ar://':
        jar_name, registry = flink_backend.CreateRegistryFromArtifactUri(
            args.job
        )
        log.Print(
            'Downloading {0} file from Artifact Registry...'.format(jar_name)
        )
        jar_path = os.path.join(temp_dir, jar_name.split('/')[-1])
        # Will throw an HTTPError if the file does not exist.
        flink_backend.DownloadJarFromArtifactRegistry(
            dest_path=jar_path, artifact_jar_path=registry.RelativeName()
        )
        log.debug('Successfully downloaded the file to ' + jar_path)
      command_executor = flink_backend.FlinkClientWrapper()
      response = command_executor(
          command='run',
          job_type=job_type,
          jar=jar_path,
          target='gcloud',
          deployment=args.deployment,
          staging_location=args.staging_location,
          autotuning_mode=args.autotuning_mode,
          temp_dir=temp_dir,
          network=args.network,
          subnetwork=args.subnetwork,
          name=args.name,
          location=args.location,
          main_class=args.main_class,
          extra_jars=args.extra_jars,
          extra_args=args.job_args,
          extra_archives=args.archives,
          python_venv=args.python_venv,
          env=flink_backend.GetEnvArgsForCommand(env),
      )
      return self._JobSubmitResponseHandler(response, job_type, temp_dir, args)