# -*- coding: utf-8 -*-
# Copyright 2010 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Base class for gsutil commands.
|
|
|
|
In addition to base class code, this file contains helpers that depend on base
|
|
class state (such as GetAndPrintAcl) In general, functions that depend on
|
|
class state and that are used by multiple commands belong in this file.
|
|
Functions that don't depend on class state belong in util.py, and non-shared
|
|
helpers belong in individual subclasses.
|
|
"""
|
|
|
|
from __future__ import absolute_import
from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

import codecs
from collections import namedtuple
import copy
import getopt
import json
import logging
import os
import signal
import sys
import textwrap
import threading
import time
import traceback

import boto
from boto.storage_uri import StorageUri
import gslib
from gslib.cloud_api import AccessDeniedException
from gslib.cloud_api import ArgumentException
from gslib.cloud_api import ServiceException
from gslib.cloud_api_delegator import CloudApiDelegator
from gslib.cs_api_map import ApiSelector
from gslib.cs_api_map import GsutilApiMapFactory
from gslib.exception import CommandException
from gslib.help_provider import HelpProvider
from gslib.metrics import CaptureThreadStatException
from gslib.metrics import LogPerformanceSummaryParams
from gslib.name_expansion import CopyObjectInfo
from gslib.name_expansion import CopyObjectsIterator
from gslib.name_expansion import NameExpansionIterator
from gslib.name_expansion import NameExpansionResult
from gslib.name_expansion import SeekAheadNameExpansionIterator
from gslib.plurality_checkable_iterator import PluralityCheckableIterator
from gslib.seek_ahead_thread import SeekAheadThread
from gslib.sig_handling import ChildProcessSignalHandler
from gslib.sig_handling import GetCaughtSignals
from gslib.sig_handling import KillProcess
from gslib.sig_handling import MultithreadedMainSignalHandler
from gslib.sig_handling import RegisterSignalHandler
from gslib.storage_url import HaveFileUrls
from gslib.storage_url import HaveProviderUrls
from gslib.storage_url import StorageUrlFromString
from gslib.storage_url import UrlsAreForSingleProvider
from gslib.storage_url import UrlsAreMixOfBucketsAndObjects
from gslib.third_party.storage_apitools import storage_v1_messages as apitools_messages
from gslib.thread_message import FinalMessage
from gslib.thread_message import MetadataMessage
from gslib.thread_message import PerformanceSummaryMessage
from gslib.thread_message import ProducerThreadMessage
from gslib.ui_controller import MainThreadUIQueue
from gslib.ui_controller import UIController
from gslib.ui_controller import UIThread
from gslib.utils.boto_util import GetFriendlyConfigFilePaths
from gslib.utils.boto_util import GetMaxConcurrentCompressedUploads
from gslib.utils.constants import NO_MAX
from gslib.utils.constants import UTF8
import gslib.utils.parallelism_framework_util
from gslib.utils.parallelism_framework_util import AtomicDict
from gslib.utils.parallelism_framework_util import CheckMultiprocessingAvailableAndInit
from gslib.utils.parallelism_framework_util import multiprocessing_context
from gslib.utils.parallelism_framework_util import ProcessAndThreadSafeInt
from gslib.utils.parallelism_framework_util import PutToQueueWithTimeout
from gslib.utils.parallelism_framework_util import SEEK_AHEAD_JOIN_TIMEOUT
from gslib.utils.parallelism_framework_util import ShouldProhibitMultiprocessing
from gslib.utils.parallelism_framework_util import UI_THREAD_JOIN_TIMEOUT
from gslib.utils.parallelism_framework_util import ZERO_TASKS_TO_DO_ARGUMENT
from gslib.utils.rsync_util import RsyncDiffToApply
from gslib.utils.shim_util import GcloudStorageCommandMixin
from gslib.utils.system_util import GetTermLines
from gslib.utils.system_util import IS_WINDOWS
from gslib.utils.translation_helper import AclTranslation
from gslib.utils.translation_helper import GetNonMetadataHeaders
from gslib.utils.translation_helper import PRIVATE_DEFAULT_OBJ_ACL
from gslib.wildcard_iterator import CreateWildcardIterator
from six.moves import queue as Queue

# pylint: disable=g-import-not-at-top
try:
  from Crypto import Random as CryptoRandom
except ImportError:
  CryptoRandom = None
# pylint: enable=g-import-not-at-top

OFFER_GSUTIL_M_SUGGESTION_THRESHOLD = 5
OFFER_GSUTIL_M_SUGGESTION_FREQUENCY = 1000


def CreateOrGetGsutilLogger(command_name):
  """Fetches a logger with the given name that resembles 'print' output.

  Initial Logger Configuration:

  The logger abides by gsutil -d/-D/-DD/-q options. If none of those options
  were specified at invocation, the returned logger will display all messages
  logged with level INFO or above. Log propagation is disabled.

  If a logger with the specified name has already been created and configured,
  it is not reconfigured, e.g.:

    foo = CreateOrGetGsutilLogger('foo')  # Creates and configures Logger "foo".
    foo.setLevel(logging.DEBUG)  # Change level from INFO to DEBUG.
    foo = CreateOrGetGsutilLogger('foo')  # Does not reset level to INFO.

  Args:
    command_name: (str) Command name to create logger for.

  Returns:
    A logging.Logger object.
  """
  log = logging.getLogger(command_name)
  # There are some scenarios (e.g. unit tests, commands like `mv` that call
  # other commands) in which we call this function multiple times. To avoid
  # adding duplicate handlers or overwriting logger attributes set elsewhere,
  # we only configure the logger if it's one we haven't configured before (i.e.
  # one that doesn't have a handler set yet).
  if not log.handlers:
    log.propagate = False
    log.setLevel(logging.root.level)
    log_handler = logging.StreamHandler()
    log_handler.setFormatter(logging.Formatter('%(message)s'))
    log.addHandler(log_handler)
  return log


def _DefaultExceptionHandler(cls, e):
  cls.logger.exception(e)


def _UrlArgChecker(command_instance, url):
  if not command_instance.exclude_symlinks:
    return True
  exp_src_url = url.expanded_storage_url
  if exp_src_url.IsFileUrl() and os.path.islink(exp_src_url.object_name):
    command_instance.logger.info('Skipping symbolic link %s...', exp_src_url)
    return False
  return True


def DummyArgChecker(*unused_args):
  return True


def SetAclFuncWrapper(cls, name_expansion_result, thread_state=None):
  return cls.SetAclFunc(name_expansion_result, thread_state=thread_state)


def SetAclExceptionHandler(cls, e):
  """Exception handler that maintains state about post-completion status."""
  cls.logger.error(str(e))
  cls.everything_set_okay = False


# We will keep this list of all thread- or process-safe queues (except the
# global status queue) ever created by the main thread so that we can
# forcefully kill them upon shutdown. Otherwise, we encounter a Python bug in
# which empty queues block forever on join (which is called as part of the
# Python exit function cleanup) under the impression that they are non-empty.
# However, this also lets us shut down somewhat more cleanly when interrupted.
queues = []


def _CryptoRandomAtFork():
  if CryptoRandom and getattr(CryptoRandom, 'atfork', None):
    # Fixes https://github.com/GoogleCloudPlatform/gsutil/issues/390. The
    # oauth2client module uses Python's Crypto library when pyOpenSSL isn't
    # present; that module requires calling atfork() in both the parent and
    # child process after a new process is forked.
    CryptoRandom.atfork()


def _NewMultiprocessingQueue():
  new_queue = multiprocessing_context.Queue(MAX_QUEUE_SIZE)
  queues.append(new_queue)
  return new_queue


def _NewThreadsafeQueue():
  new_queue = Queue.Queue(MAX_QUEUE_SIZE)
  queues.append(new_queue)
  return new_queue


# The maximum size of a process- or thread-safe queue. Imposing this limit
# prevents us from needing to hold an arbitrary amount of data in memory.
# However, setting this number too high (e.g., >= 32768 on OS X) can cause
# problems on some operating systems.
MAX_QUEUE_SIZE = 32500

# Related to the max queue size above, once we cross this threshold of
# iterated tasks added to the queue, kick off the SeekAheadThread that will
# estimate the total work necessary for the command.
DEFAULT_TASK_ESTIMATION_THRESHOLD = 30000


def _GetTaskEstimationThreshold():
  return boto.config.getint('GSUtil', 'task_estimation_threshold',
                            DEFAULT_TASK_ESTIMATION_THRESHOLD)


# The maximum depth of the tree of recursive calls to command.Apply. This is
# an arbitrary limit put in place to prevent developers from accidentally
# causing problems with infinite recursion, and it can be increased if needed.
MAX_RECURSIVE_DEPTH = 5

# Map from deprecated aliases to the current command and subcommands that
# provide the same behavior.
# TODO: Remove this map and deprecate old commands on 9/9/14.
OLD_ALIAS_MAP = {
    'chacl': ['acl', 'ch'],
    'getacl': ['acl', 'get'],
    'setacl': ['acl', 'set'],
    'getcors': ['cors', 'get'],
    'setcors': ['cors', 'set'],
    'chdefacl': ['defacl', 'ch'],
    'getdefacl': ['defacl', 'get'],
    'setdefacl': ['defacl', 'set'],
    'disablelogging': ['logging', 'set', 'off'],
    'enablelogging': ['logging', 'set', 'on'],
    'getlogging': ['logging', 'get'],
    'getversioning': ['versioning', 'get'],
    'setversioning': ['versioning', 'set'],
    'getwebcfg': ['web', 'get'],
    'setwebcfg': ['web', 'set']
}
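
# Example of the mapping (illustrative): under this table, the legacy
# invocation `gsutil getacl gs://bucket` behaves like `gsutil acl get
# gs://bucket`; _TranslateDeprecatedAliases below prepends the subcommand
# ('get') to the args and warns that the alias is deprecated.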

# Declare all of the module level variables - see
# InitializeMultiprocessingVariables for an explanation of why this is
# necessary.
# pylint: disable=global-at-module-level
global manager, consumer_pools, task_queues, caller_id_lock, caller_id_counter
global total_tasks, call_completed_map, global_return_values_map
global need_pool_or_done_cond, caller_id_finished_count, new_pool_needed
global current_max_recursive_level, shared_vars_map, shared_vars_list_map
global class_map, worker_checking_level_lock, failure_count, thread_stats
global glob_status_queue, ui_controller, concurrent_compressed_upload_lock


def InitializeMultiprocessingVariables():
  """Initializes module-level variables that will be inherited by subprocesses.

  On Windows, a multiprocessing.Manager object should only
  be created within an "if __name__ == '__main__':" block. This function
  must be called; otherwise, every command that calls Command.Apply will fail.

  While multiprocessing variables are initialized at the beginning of
  gsutil execution, new processes and threads are created only by calls
  to Command.Apply. When multiple processes and threads are used,
  the flow of startup/teardown looks like this:

  1. __main__: initializes multiprocessing variables, including any necessary
     Manager processes (here and in gslib.utils.parallelism_framework_util).
  2. __main__: Registers signal handlers for terminating signals responsible
     for cleaning up multiprocessing variables and manager processes upon exit.
  3. Command.Apply registers signal handlers for the main process to kill
     itself after the cleanup handlers registered by __main__ have executed.
  4. If worker processes have not been created for the current level of
     recursive calls, Command.Apply creates those processes.

  ---- Parallel operations start here, so steps are no longer numbered. ----
  - Command.Apply in the main thread starts the ProducerThread.
    - The Producer thread adds task arguments to the global task queue.
    - It optionally starts the SeekAheadThread which estimates total
      work for the Apply call.

  - Command.Apply in the main thread starts the UIThread, which will consume
    messages from the global status queue, process them, and display them to
    the user.

  - Each worker process creates a thread pool to perform work.
    - The worker process registers signal handlers to kill itself in
      response to a terminating signal.
    - The main thread of the worker process moves items from the global
      task queue to the process-local task queue.
    - Worker threads retrieve items from the process-local task queue,
      perform the work, and post messages to the global status queue.
    - Worker threads may themselves call Command.Apply.
      - This creates a new pool of worker subprocesses with the same size
        as the main pool. This pool is shared amongst all Command.Apply calls
        at the given recursion depth.
      - This reuses the global UIThread, global status queue, and global task
        queue.
      - This starts a new ProducerThread.
      - A SeekAheadThread is not started at this level; only one such thread
        exists at the top level, and it provides estimates for top-level work
        only.

  - The ProducerThread runs out of tasks, or the user signals cancellation.
    - The ProducerThread cancels the SeekAheadThread (if it is running) via
      an event.
    - The ProducerThread enqueues special terminating messages on the
      global task queue and global status queue, signaling the UI Thread to
      shut down and the main thread to continue operation.
    - In the termination case, existing processes exit in response to
      terminating signals from the main process.

  ---- Parallel operations end here. ----
  5. Further top-level calls to Command.Apply can be made, which will repeat
     all of the steps made in #4, except that worker processes will be
     reused.
  """
  # This list of global variables must exactly match the above list of
  # declarations.
  # pylint: disable=global-variable-undefined
  global manager, consumer_pools, task_queues, caller_id_lock, caller_id_counter
  global total_tasks, call_completed_map, global_return_values_map, thread_stats
  global need_pool_or_done_cond, caller_id_finished_count, new_pool_needed
  global current_max_recursive_level, shared_vars_map, shared_vars_list_map
  global class_map, worker_checking_level_lock, failure_count, glob_status_queue
  global concurrent_compressed_upload_lock

  manager = multiprocessing_context.Manager()

  # List of ConsumerPools - used during shutdown to clean up child processes.
  consumer_pools = []

  # List of all existing task queues - used by all pools to find the queue
  # that's appropriate for the given recursive_apply_level.
  task_queues = []

  # Used to assign a globally unique caller ID to each Apply call.
  caller_id_lock = manager.Lock()
  caller_id_counter = ProcessAndThreadSafeInt(True)

  # Map from caller_id to total number of tasks to be completed for that ID.
  total_tasks = AtomicDict(manager=manager)

  # Map from caller_id to a boolean which is True iff all its tasks are
  # finished.
  call_completed_map = AtomicDict(manager=manager)

  # Used to keep track of the set of return values for each caller ID.
  global_return_values_map = AtomicDict(manager=manager)

  # Condition used to notify any waiting threads that a task has finished or
  # that a call to Apply needs a new set of consumer processes.
  need_pool_or_done_cond = manager.Condition()

  # Lock used to prevent multiple worker processes from asking the main thread
  # to create a new consumer pool for the same level.
  worker_checking_level_lock = manager.Lock()

  # Map from caller_id to the current number of completed tasks for that ID.
  caller_id_finished_count = AtomicDict(manager=manager)

  # Used as a way for the main thread to distinguish between being woken up
  # by another call finishing and being woken up by a call that needs a new set
  # of consumer processes.
  new_pool_needed = ProcessAndThreadSafeInt(True)

  current_max_recursive_level = ProcessAndThreadSafeInt(True)

  # Map from (caller_id, name) to the value of that shared variable.
  shared_vars_map = AtomicDict(manager=manager)
  shared_vars_list_map = AtomicDict(manager=manager)

  # Map from (process id, thread id) to a _ThreadStat object (see WorkerThread).
  # Used to keep track of thread idle time and execution time.
  thread_stats = AtomicDict(manager=manager)

  # Map from caller_id to calling class.
  class_map = manager.dict()

  # Number of tasks that resulted in an exception in calls to Apply().
  failure_count = ProcessAndThreadSafeInt(True)

  # Central queue for status reporting across multiple processes and threads.
  # It's possible that if many processes and threads are executing small file
  # writes or metadata changes quickly, performance may be bounded by lock
  # contention on the queue. Initial testing conducted with
  # 12 processes * 5 threads per process showed little difference. If this
  # becomes a performance bottleneck in the future, consider creating a queue
  # per-process and having the UI thread poll all of the queues; that approach
  # would need to address:
  # - Queue fairness if one queue grows to be disproportionately large
  # - Reasonable time correlation with events as they occur
  #
  # This queue must be torn down after worker processes/threads and the
  # UI thread have been torn down. Otherwise, these threads may have
  # undefined behavior when trying to interact with a non-existent queue.
  glob_status_queue = manager.Queue(MAX_QUEUE_SIZE)

  # Semaphore lock used to prevent resource exhaustion when running many
  # compressed uploads in parallel.
  concurrent_compressed_upload_lock = manager.BoundedSemaphore(
      GetMaxConcurrentCompressedUploads())


def TeardownMultiprocessingProcesses():
  """Should be called by signal handlers prior to shut down."""
  # Shut down all processes in consumer pools in preparation for exiting.
  ShutDownGsutil()
  # Shut down command and util's multiprocessing.Manager().
  # pylint: disable=global-variable-not-assigned,global-variable-undefined
  global manager
  # pylint: enable=global-variable-not-assigned,global-variable-undefined
  manager.shutdown()
  gslib.utils.parallelism_framework_util.top_level_manager.shutdown()


def InitializeThreadingVariables():
  """Initializes module-level variables used when running multi-threaded.

  When multiprocessing is not available (or on Windows where only 1 process
  is used), thread-safe analogs to the multiprocessing global variables
  must be initialized. This function is the thread-safe analog to
  InitializeMultiprocessingVariables.
  """
  # pylint: disable=global-variable-undefined
  global global_return_values_map, shared_vars_map, failure_count
  global caller_id_finished_count, shared_vars_list_map, total_tasks
  global need_pool_or_done_cond, call_completed_map, class_map, thread_stats
  global task_queues, caller_id_lock, caller_id_counter, glob_status_queue
  global worker_checking_level_lock, current_max_recursive_level
  global concurrent_compressed_upload_lock
  caller_id_counter = ProcessAndThreadSafeInt(False)
  caller_id_finished_count = AtomicDict()
  caller_id_lock = threading.Lock()
  call_completed_map = AtomicDict()
  class_map = AtomicDict()
  current_max_recursive_level = ProcessAndThreadSafeInt(False)
  failure_count = ProcessAndThreadSafeInt(False)
  glob_status_queue = Queue.Queue(MAX_QUEUE_SIZE)
  global_return_values_map = AtomicDict()
  need_pool_or_done_cond = threading.Condition()
  shared_vars_list_map = AtomicDict()
  shared_vars_map = AtomicDict()
  thread_stats = AtomicDict()
  task_queues = []
  total_tasks = AtomicDict()
  worker_checking_level_lock = threading.Lock()
  concurrent_compressed_upload_lock = threading.BoundedSemaphore(
      GetMaxConcurrentCompressedUploads())


# Each subclass of Command must define a property named 'command_spec' that is
# an instance of the following class.
CommandSpec = namedtuple(
    'CommandSpec',
    [
        # Name of command.
        'command_name',
        # Usage synopsis.
        'usage_synopsis',
        # List of command name aliases.
        'command_name_aliases',
        # Min number of args required by this command.
        'min_args',
        # Max number of args required by this command, or NO_MAX.
        'max_args',
        # Getopt-style string specifying acceptable sub args.
        'supported_sub_args',
        # True if file URLs are acceptable for this command.
        'file_url_ok',
        # True if provider-only URLs are acceptable for this command.
        'provider_url_ok',
        # Index in args of first URL arg.
        'urls_start_arg',
        # List of supported APIs
        'gs_api_support',
        # Default API to use for this command
        'gs_default_api',
        # Private arguments (for internal testing)
        'supported_private_args',
        'argparse_arguments',
    ])
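
# Illustrative example (hypothetical values; no real command defines exactly
# this spec): a command named 'foo' accepting one or more URLs and a -r flag
# could declare
#
#   command_spec = CommandSpec(
#       command_name='foo',
#       usage_synopsis='foo [-r] url...',
#       command_name_aliases=[],
#       min_args=1,
#       max_args=NO_MAX,
#       supported_sub_args='r',
#       file_url_ok=False,
#       provider_url_ok=False,
#       urls_start_arg=0,
#       gs_api_support=[ApiSelector.XML],
#       gs_default_api=ApiSelector.XML,
#       supported_private_args=None,
#       argparse_arguments=[])
#
# In practice, subclasses build this via Command.CreateCommandSpec (defined
# below), which fills in these defaults.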


class Command(HelpProvider, GcloudStorageCommandMixin):
  """Base class for all gsutil commands."""

  # Each subclass must override this with an instance of CommandSpec.
  command_spec = None

  _commands_with_subcommands_and_subopts = ('acl', 'defacl', 'iam', 'kms',
                                            'label', 'logging', 'notification',
                                            'retention', 'web')

  # This keeps track of the recursive depth of the current call to Apply.
  recursive_apply_level = 0

  # If the multiprocessing module isn't available, we'll use this to keep track
  # of the caller_id.
  sequential_caller_id = -1

  @staticmethod
  def CreateCommandSpec(command_name,
                        usage_synopsis=None,
                        command_name_aliases=None,
                        min_args=0,
                        max_args=NO_MAX,
                        supported_sub_args='',
                        file_url_ok=False,
                        provider_url_ok=False,
                        urls_start_arg=0,
                        gs_api_support=None,
                        gs_default_api=None,
                        supported_private_args=None,
                        argparse_arguments=None):
    """Creates an instance of CommandSpec, with defaults."""
    return CommandSpec(command_name=command_name,
                       usage_synopsis=usage_synopsis,
                       command_name_aliases=command_name_aliases or [],
                       min_args=min_args,
                       max_args=max_args,
                       supported_sub_args=supported_sub_args,
                       file_url_ok=file_url_ok,
                       provider_url_ok=provider_url_ok,
                       urls_start_arg=urls_start_arg,
                       gs_api_support=gs_api_support or [ApiSelector.XML],
                       gs_default_api=gs_default_api or ApiSelector.XML,
                       supported_private_args=supported_private_args,
                       argparse_arguments=argparse_arguments or [])

  # Define a convenience property for command name, since it's used many places.
  def _GetDefaultCommandName(self):
    return self.command_spec.command_name

  command_name = property(_GetDefaultCommandName)

  def _CalculateUrlsStartArg(self):
    """Calculate the index in args of the first URL arg.

    Returns:
      Index of the first URL arg (according to the command spec).
    """
    return self.command_spec.urls_start_arg

  def _TranslateDeprecatedAliases(self, args):
    """Map deprecated aliases to the corresponding new command, and warn."""
    new_command_args = OLD_ALIAS_MAP.get(self.command_alias_used, None)
    if new_command_args:
      # Prepend any subcommands for the new command. The command name itself
      # is not part of the args, so leave it out.
      args = new_command_args[1:] + args
      self.logger.warn('\n'.join(
          textwrap.wrap(
              ('You are using a deprecated alias, "%(used_alias)s", for the '
               '"%(command_name)s" command. This will stop working on 9/9/2014. '
               'Please use "%(command_name)s" with the appropriate sub-command in '
               'the future. See "gsutil help %(command_name)s" for details.') %
              {
                  'used_alias': self.command_alias_used,
                  'command_name': self.command_name
              })))
    return args

  def __init__(self,
               command_runner,
               args,
               headers,
               debug,
               trace_token,
               parallel_operations,
               bucket_storage_uri_class,
               gsutil_api_class_map_factory,
               logging_filters=None,
               command_alias_used=None,
               perf_trace_token=None,
               user_project=None):
    """Instantiates a Command.

    Args:
      command_runner: CommandRunner (for commands built atop other commands).
      args: Command-line args (arg0 = actual arg, not command name ala bash).
      headers: Dictionary containing optional HTTP headers to pass to boto.
      debug: Debug level to pass in to boto connection (range 0..3).
      trace_token: Trace token to pass to the API implementation.
      parallel_operations: Should command operations be executed in parallel?
      bucket_storage_uri_class: Class to instantiate for cloud StorageUris.
                                Settable for testing/mocking.
      gsutil_api_class_map_factory: Creates map of cloud storage interfaces.
                                    Settable for testing/mocking.
      logging_filters: Optional list of logging.Filters to apply to this
                       command's logger.
      command_alias_used: The alias that was actually used when running this
                          command (as opposed to the "official" command name,
                          which will always correspond to the file name).
      perf_trace_token: Performance measurement trace token to use when making
                        API calls.
      user_project: Project to be billed for this request.

    Implementation note: subclasses shouldn't need to define an __init__
    method, and instead depend on the shared initialization that happens
    here. If you do define an __init__ method in a subclass you'll need to
    explicitly call super().__init__(). But you're encouraged not to do this,
    because it will make changing the __init__ interface more painful.
    """
    # Save class values from constructor params.
    super().__init__()
    self.command_runner = command_runner
    self.unparsed_args = args
    self.headers = headers
    self.debug = debug
    self.trace_token = trace_token
    self.perf_trace_token = perf_trace_token
    self.parallel_operations = parallel_operations
    self.user_project = user_project
    self.bucket_storage_uri_class = bucket_storage_uri_class
    self.gsutil_api_class_map_factory = gsutil_api_class_map_factory
    self.exclude_symlinks = False
    self.recursion_requested = False
    self.all_versions = False
    self.command_alias_used = command_alias_used
    self.seek_ahead_gsutil_api = None
    # pylint: disable=global-variable-not-assigned
    # pylint: disable=global-variable-undefined
    global ui_controller
    # pylint: enable=global-variable-undefined
    # pylint: enable=global-variable-not-assigned
    # Global instance of a threaded logger object.
    self.logger = CreateOrGetGsutilLogger(self.command_name)
    if logging_filters:
      for log_filter in logging_filters:
        self.logger.addFilter(log_filter)

    if self.headers is not None:
      self.non_metadata_headers = GetNonMetadataHeaders(self.headers)
    else:
      self.non_metadata_headers = None

    if self.command_spec is None:
      raise CommandException('"%s" command implementation is missing a '
                             'command_spec definition.' % self.command_name)

    self.quiet_mode = not self.logger.isEnabledFor(logging.INFO)
    ui_controller = UIController(quiet_mode=self.quiet_mode,
                                 dump_status_messages_file=boto.config.get(
                                     'GSUtil', 'dump_status_messages_file',
                                     None))

    # Parse and validate args.
    self.args = self._TranslateDeprecatedAliases(args)
    self.ParseSubOpts()

    # Named tuple public functions start with _
    # pylint: disable=protected-access
    self.command_spec = self.command_spec._replace(
        urls_start_arg=self._CalculateUrlsStartArg())

    if (len(self.args) < self.command_spec.min_args or
        len(self.args) > self.command_spec.max_args):
      self.RaiseWrongNumberOfArgumentsException()

    if self.command_name not in self._commands_with_subcommands_and_subopts:
      self.CheckArguments()

    # Build the support and default maps from the command spec.
    support_map = {
        'gs': self.command_spec.gs_api_support,
        's3': [ApiSelector.XML]
    }
    default_map = {
        'gs': self.command_spec.gs_default_api,
        's3': ApiSelector.XML
    }
    self.gsutil_api_map = GsutilApiMapFactory.GetApiMap(
        self.gsutil_api_class_map_factory, support_map, default_map)

    self.project_id = None
    self.gsutil_api = CloudApiDelegator(self.bucket_storage_uri_class,
                                        self.gsutil_api_map,
                                        self.logger,
                                        MainThreadUIQueue(
                                            sys.stderr, ui_controller),
                                        debug=self.debug,
                                        http_headers=self.non_metadata_headers,
                                        trace_token=self.trace_token,
                                        perf_trace_token=self.perf_trace_token,
                                        user_project=self.user_project)
    # Cross-platform path to run gsutil binary.
    self.gsutil_cmd = ''
    # If running on Windows, invoke python interpreter explicitly.
    if IS_WINDOWS:
      self.gsutil_cmd += 'python '
    # Add full path to gsutil to make sure we test the correct version.
    self.gsutil_path = gslib.GSUTIL_PATH
    self.gsutil_cmd += self.gsutil_path

    # We're treating recursion_requested like it's used by all commands, but
    # only some of the commands accept the -R option.
    if self.sub_opts:
      for o, unused_a in self.sub_opts:
        if o == '-r' or o == '-R':
          self.recursion_requested = True
          break

    self.multiprocessing_is_available = (
        CheckMultiprocessingAvailableAndInit().is_available)

  def RaiseWrongNumberOfArgumentsException(self):
    """Raises exception for wrong number of arguments supplied to command."""
    if len(self.args) < self.command_spec.min_args:
      tail_str = 's' if self.command_spec.min_args > 1 else ''
      message = ('The %s command requires at least %d argument%s.' %
                 (self.command_name, self.command_spec.min_args, tail_str))
    else:
      message = ('The %s command accepts at most %d arguments.' %
                 (self.command_name, self.command_spec.max_args))
    message += ' Usage:\n%s\nFor additional help run:\n  gsutil help %s' % (
        self.command_spec.usage_synopsis, self.command_name)
    raise CommandException(message)

  def RaiseInvalidArgumentException(self):
    """Raises exception for specifying an invalid argument to command."""
    message = ('Incorrect option(s) specified. Usage:\n%s\n'
               'For additional help run:\n  gsutil help %s' %
               (self.command_spec.usage_synopsis, self.command_name))
    raise CommandException(message)

  def ParseSubOpts(self,
                   check_args=False,
                   args=None,
                   should_update_sub_opts_and_args=True):
    """Parses sub-opt args.

    Args:
      check_args: True to have CheckArguments() called after parsing.
      args: List of args. If None, self.args will be used.
      should_update_sub_opts_and_args: True if self.sub_opts and self.args
        should be updated with the values returned after parsing. Else return a
        tuple of sub_opts, args returned by getopt.getopt. This is done
        to allow this method to be called from get_gcloud_storage_args, in
        which case we do not want to update self.sub_opts and self.args.

    Raises:
      RaiseInvalidArgumentException: Invalid args specified.
    """
    if args is None:
      unparsed_args = self.args
    else:
      unparsed_args = args
    try:
      parsed_sub_opts, parsed_args = getopt.getopt(
          unparsed_args, self.command_spec.supported_sub_args,
          self.command_spec.supported_private_args or [])
    except getopt.GetoptError:
      self.RaiseInvalidArgumentException()
    if should_update_sub_opts_and_args:
      self.sub_opts, self.args = parsed_sub_opts, parsed_args
      if check_args:
        self.CheckArguments()
    else:
      if check_args:
        # This is just a sanity check. Only get_gcloud_storage_args will
        # call this method with should_update_sub_opts_and_args=False, and it
        # does not set check_args to True.
        raise TypeError('Requested to check arguments'
                        ' but sub_opts and args have not been updated.')
      return parsed_sub_opts, parsed_args
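
  # Example of the getopt convention used above (hypothetical option string):
  # supported_sub_args='rp:' declares -r as a boolean flag and -p as taking a
  # value, so for args=['-r', '-p', 'proj', 'url'], getopt.getopt returns
  # ([('-r', ''), ('-p', 'proj')], ['url']).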

  def CheckArguments(self):
    """Checks that command line arguments match the command_spec.

    Any commands in self._commands_with_subcommands_and_subopts are responsible
    for calling this method after handling initial parsing of their arguments.
    This prevents commands with sub-commands as well as options from breaking
    the parsing of getopt.

    TODO: Provide a function to parse commands and sub-commands more
    intelligently once we stop allowing the deprecated command versions.

    Raises:
      CommandException if the arguments don't match.
    """

    if (not self.command_spec.file_url_ok and
        HaveFileUrls(self.args[self.command_spec.urls_start_arg:])):
      raise CommandException('"%s" command does not support "file://" URLs. '
                             'Did you mean to use a gs:// URL?' %
                             self.command_name)
    if (not self.command_spec.provider_url_ok and
        HaveProviderUrls(self.args[self.command_spec.urls_start_arg:])):
      raise CommandException('"%s" command does not support provider-only '
                             'URLs.' % self.command_name)

  def WildcardIterator(self, url_string, all_versions=False):
    """Helper to instantiate gslib.WildcardIterator.

    Args are same as gslib.WildcardIterator interface, but this method fills in
    most of the values from instance state.

    Args:
      url_string: URL string naming wildcard objects to iterate.
      all_versions: If true, the iterator yields all versions of objects
                    matching the wildcard. If false, yields just the live
                    object version.

    Returns:
      WildcardIterator for use by caller.
    """
    return CreateWildcardIterator(url_string,
                                  self.gsutil_api,
                                  all_versions=all_versions,
                                  project_id=self.project_id,
                                  logger=self.logger)
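
  # Illustrative usage (sketch): inside a subclass's RunCommand, matching
  # objects can be iterated with, e.g.,
  #   for blr in self.WildcardIterator('gs://bucket/**').IterObjects():
  #     self.logger.info('%s', blr.url_string)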

  def GetSeekAheadGsutilApi(self):
    """Helper to instantiate a Cloud API instance for a seek-ahead iterator.

    This must be separate from the core command.gsutil_api instance for
    thread-safety, since other iterators typically use that instance and the
    SeekAheadIterator operates in parallel.

    Returns:
      Cloud API instance for use by the seek-ahead iterator.
    """
    # This is initialized in Initialize(Multiprocessing|Threading)Variables
    # pylint: disable=global-variable-not-assigned
    # pylint: disable=global-variable-undefined
    global glob_status_queue
    # pylint: enable=global-variable-not-assigned
    # pylint: enable=global-variable-undefined
    if not self.seek_ahead_gsutil_api:
      self.seek_ahead_gsutil_api = CloudApiDelegator(
          self.bucket_storage_uri_class,
          self.gsutil_api_map,
          logging.getLogger('dummy'),
          glob_status_queue,
          debug=self.debug,
          http_headers=self.non_metadata_headers,
          trace_token=self.trace_token,
          perf_trace_token=self.perf_trace_token,
          user_project=self.user_project)
    return self.seek_ahead_gsutil_api

  def RunCommand(self):
    """Abstract function in base class. Subclasses must implement this.

    The return value of this function will be used as the exit status of the
    process, so subclass commands should return an integer exit code (0 for
    success, a value in [1,255] for failure).
    """
    raise CommandException('Command %s is missing its RunCommand() '
                           'implementation' % self.command_name)
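
  # Minimal sketch of a subclass (hypothetical 'foo' command, shown only to
  # illustrate the contract; a real command also provides a help_spec):
  #
  #   class FooCommand(Command):
  #     command_spec = Command.CreateCommandSpec(
  #         'foo', usage_synopsis='foo url...', min_args=1)
  #
  #     def RunCommand(self):
  #       for url_str in self.args:
  #         self.logger.info('Processing %s...', url_str)
  #       return 0  # Exit status 0 = success.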

  ############################################################
  # Shared helper functions that depend on base class state. #
  ############################################################

  # TODO: Refactor ACL functions to a different module and pass the
  # command object as state, as opposed to defining them as member functions
  # of the command class.
  def ApplyAclFunc(self,
                   acl_func,
                   acl_excep_handler,
                   url_strs,
                   object_fields=None):
    """Sets the standard or default object ACL depending on self.command_name.

    Args:
      acl_func: ACL function to be passed to Apply.
      acl_excep_handler: ACL exception handler to be passed to Apply.
      url_strs: URL strings on which to set ACL.
      object_fields: If present, list of object metadata fields to retrieve;
                     if None, default name expansion iterator fields will be
                     used.

    Raises:
      CommandException if an ACL could not be set.
    """
    multi_threaded_url_args = []

    urls = list(map(StorageUrlFromString, url_strs))

    if (UrlsAreMixOfBucketsAndObjects(urls) and not self.recursion_requested):
      raise CommandException('Cannot operate on a mix of buckets and objects.')

    # Handle bucket ACL setting operations single-threaded, because
    # our threading machinery currently assumes it's working with objects
    # (name_expansion_iterator), and normally we wouldn't expect users to need
    # to set ACLs on huge numbers of buckets at once anyway.
    for url in urls:
      if url.IsCloudUrl() and url.IsBucket():
        if self.recursion_requested:
          # If user specified -R option, convert any bucket args to bucket
          # wildcards (e.g., gs://bucket/*), to prevent the operation from
          # being applied to the buckets themselves.
          url.object_name = '*'
          multi_threaded_url_args.append(url.url_string)
        else:
          # Convert to a NameExpansionResult so we can re-use the threaded
          # function for the single-threaded implementation. RefType is unused.
          for blr in self.WildcardIterator(
              url.url_string).IterBuckets(bucket_fields=['id']):
            name_expansion_for_url = NameExpansionResult(
                source_storage_url=url,
                is_multi_source_request=False,
                is_multi_top_level_source_request=False,
                names_container=False,
                expanded_storage_url=blr.storage_url,
                expanded_result=None)
            acl_func(self, name_expansion_for_url)
      else:
        multi_threaded_url_args.append(url.url_string)

    if len(multi_threaded_url_args) >= 1:
      name_expansion_iterator = NameExpansionIterator(
          self.command_name,
          self.debug,
          self.logger,
          self.gsutil_api,
          multi_threaded_url_args,
          self.recursion_requested,
          all_versions=self.all_versions,
          continue_on_error=self.continue_on_error or self.parallel_operations,
          bucket_listing_fields=object_fields)

      seek_ahead_iterator = SeekAheadNameExpansionIterator(
          self.command_name,
          self.debug,
          self.GetSeekAheadGsutilApi(),
          multi_threaded_url_args,
          self.recursion_requested,
          all_versions=self.all_versions)

      # Perform requests in parallel (-m) mode, if requested, using
      # configured number of parallel processes and threads. Otherwise,
      # perform requests with sequential function calls in current process.
      self.Apply(acl_func,
                 name_expansion_iterator,
                 acl_excep_handler,
                 fail_on_error=not self.continue_on_error,
                 seek_ahead_iterator=seek_ahead_iterator)

      if not self.everything_set_okay and not self.continue_on_error:
        raise CommandException('ACLs for some objects could not be set.')

  def SetAclFunc(self, name_expansion_result, thread_state=None):
    """Sets the object ACL for the name_expansion_result provided.

    Args:
      name_expansion_result: NameExpansionResult describing the target object.
      thread_state: If present, use this gsutil Cloud API instance for the set.
    """
    if thread_state:
      assert not self.def_acl
      gsutil_api = thread_state
    else:
      gsutil_api = self.gsutil_api
    op_string = 'default object ACL' if self.def_acl else 'ACL'
    url = name_expansion_result.expanded_storage_url
    self.logger.info('Setting %s on %s...', op_string, url)
    if (gsutil_api.GetApiSelector(url.scheme) == ApiSelector.XML and
        url.scheme != 'gs'):
      # If we are called with a non-google ACL model, we need to use the XML
      # passthrough. acl_arg should either be a canned ACL or an XML ACL.
      self._SetAclXmlPassthrough(url, gsutil_api)
    else:
      # Normal Cloud API path. acl_arg is a JSON ACL or a canned ACL.
      self._SetAclGsutilApi(url, gsutil_api)
    PutToQueueWithTimeout(gsutil_api.status_queue,
                          MetadataMessage(message_time=time.time()))

  def _SetAclXmlPassthrough(self, url, gsutil_api):
    """Sets the ACL for the URL provided using the XML passthrough functions.

    This function assumes that self.def_acl, self.canned,
    and self.continue_on_error are initialized, and that self.acl_arg is
    either an XML string or a canned ACL string.

    Args:
      url: CloudURL to set the ACL on.
      gsutil_api: gsutil Cloud API to use for the ACL set. Must support XML
                  passthrough functions.
    """
    orig_prefer_api = gsutil_api.prefer_api
    try:
      gsutil_api.prefer_api = ApiSelector.XML
      gsutil_api.XmlPassThroughSetAcl(self.acl_arg,
                                      url,
                                      canned=self.canned,
                                      def_obj_acl=self.def_acl,
                                      provider=url.scheme)
    except ServiceException as e:
      if self.continue_on_error:
        self.everything_set_okay = False
        self.logger.error(e)
      else:
        raise
    finally:
      gsutil_api.prefer_api = orig_prefer_api

  def _SetAclGsutilApi(self, url, gsutil_api):
    """Sets the ACL for the URL provided using the gsutil Cloud API.

    This function assumes that self.def_acl, self.canned,
    and self.continue_on_error are initialized, and that self.acl_arg is
    either a JSON string or a canned ACL string.

    Args:
      url: CloudURL to set the ACL on.
      gsutil_api: gsutil Cloud API to use for the ACL set.
    """
    try:
      if url.IsBucket():
        if self.def_acl:
          if self.canned:
            gsutil_api.PatchBucket(url.bucket_name,
                                   apitools_messages.Bucket(),
                                   canned_def_acl=self.acl_arg,
                                   provider=url.scheme,
                                   fields=['id'])
          else:
            def_obj_acl = AclTranslation.JsonToMessage(
                self.acl_arg, apitools_messages.ObjectAccessControl)
            if not def_obj_acl:
              # Use a sentinel value to indicate a private (no entries) default
              # object ACL.
              def_obj_acl.append(PRIVATE_DEFAULT_OBJ_ACL)
            bucket_metadata = apitools_messages.Bucket(
                defaultObjectAcl=def_obj_acl)
            gsutil_api.PatchBucket(url.bucket_name,
                                   bucket_metadata,
                                   provider=url.scheme,
                                   fields=['id'])
        else:
          if self.canned:
            gsutil_api.PatchBucket(url.bucket_name,
                                   apitools_messages.Bucket(),
                                   canned_acl=self.acl_arg,
                                   provider=url.scheme,
                                   fields=['id'])
          else:
            bucket_acl = AclTranslation.JsonToMessage(
                self.acl_arg, apitools_messages.BucketAccessControl)
            bucket_metadata = apitools_messages.Bucket(acl=bucket_acl)
            gsutil_api.PatchBucket(url.bucket_name,
                                   bucket_metadata,
                                   provider=url.scheme,
                                   fields=['id'])
      else:  # url.IsObject()
        if self.canned:
          gsutil_api.PatchObjectMetadata(url.bucket_name,
                                         url.object_name,
                                         apitools_messages.Object(),
                                         provider=url.scheme,
                                         generation=url.generation,
                                         canned_acl=self.acl_arg)
        else:
          object_acl = AclTranslation.JsonToMessage(
              self.acl_arg, apitools_messages.ObjectAccessControl)
          object_metadata = apitools_messages.Object(acl=object_acl)
          gsutil_api.PatchObjectMetadata(url.bucket_name,
                                         url.object_name,
                                         object_metadata,
                                         provider=url.scheme,
                                         generation=url.generation)
    except ArgumentException as e:
      raise
    except ServiceException as e:
      if self.continue_on_error:
        self.everything_set_okay = False
        self.logger.error(e)
      else:
        raise

  def SetAclCommandHelper(self, acl_func, acl_excep_handler):
    """Sets ACLs on the self.args using the passed-in acl function.

    Args:
      acl_func: ACL function to be passed to Apply.
      acl_excep_handler: ACL exception handler to be passed to Apply.
    """
    acl_arg = self.args[0]
    url_args = self.args[1:]
    # Disallow multi-provider setacl requests, because there are differences in
    # the ACL models.
    if not UrlsAreForSingleProvider(url_args):
      raise CommandException('"%s" command spanning providers not allowed.' %
                             self.command_name)

    # Determine whether acl_arg names a file containing XML ACL text vs. the
    # string name of a canned ACL.
    if os.path.isfile(acl_arg):
      with codecs.open(acl_arg, 'r', UTF8) as f:
        acl_arg = f.read()
      self.canned = False
    else:
      # No file exists, so expect a canned ACL string.
      # validate=False because we allow wildcard urls.
      storage_uri = boto.storage_uri(
          url_args[0],
          debug=self.debug,
          validate=False,
          bucket_storage_uri_class=self.bucket_storage_uri_class)

      canned_acls = storage_uri.canned_acls()
      if acl_arg not in canned_acls:
        raise CommandException('Invalid canned ACL "%s".' % acl_arg)
      self.canned = True

    # Used to track if any ACLs failed to be set.
    self.everything_set_okay = True
    self.acl_arg = acl_arg

    self.ApplyAclFunc(acl_func, acl_excep_handler, url_args)
    if not self.everything_set_okay and not self.continue_on_error:
      raise CommandException('ACLs for some objects could not be set.')

  def _WarnServiceAccounts(self):
    """Warns service account users who have received an AccessDenied error.

    When one of the metadata-related commands fails due to AccessDenied, the
    user must ensure that they are listed as an Owner in the API console.
    """
    # Import this here so that the value will be set first in
    # gcs_oauth2_boto_plugin.
    # pylint: disable=g-import-not-at-top
    from gcs_oauth2_boto_plugin.oauth2_plugin import IS_SERVICE_ACCOUNT

    if IS_SERVICE_ACCOUNT:
      # This method is only called when canned ACLs are used, so the warning
      # definitely applies.
      self.logger.warning('\n'.join(
          textwrap.wrap(
              'It appears that your service account has been denied access while '
              'attempting to perform a metadata operation. If you believe that you '
              'should have access to this metadata (i.e., if it is associated with '
              'your account), please make sure that your service account\'s email '
              'address is listed as an Owner in the Permissions tab of the API '
              'console. See "gsutil help creds" for further information.\n')))

  def GetAndPrintAcl(self, url_str):
    """Prints the standard or default object ACL depending on self.command_name.

    Args:
      url_str: URL string to get ACL for.
    """
    blr = self.GetAclCommandBucketListingReference(url_str)
    url = StorageUrlFromString(url_str)
    if (self.gsutil_api.GetApiSelector(url.scheme) == ApiSelector.XML and
        url.scheme != 'gs'):
      # Need to use XML passthrough.
      try:
        acl = self.gsutil_api.XmlPassThroughGetAcl(url,
                                                   def_obj_acl=self.def_acl,
                                                   provider=url.scheme)
        print(acl.to_xml())
      except AccessDeniedException as _:
        self._WarnServiceAccounts()
        raise
    else:
      if self.command_name == 'defacl':
        acl = blr.root_object.defaultObjectAcl
        if not acl:
          self.logger.warn(
              'No default object ACL present for %s. This could occur if '
              'the default object ACL is private, in which case objects '
              'created in this bucket will be readable only by their '
              'creators. It could also mean you do not have OWNER permission '
              'on %s and therefore do not have permission to read the '
              'default object ACL. It could also mean that %s has Bucket '
              'Policy Only enabled and therefore object ACLs and default '
              'object ACLs are disabled (see '
              'https://cloud.google.com/storage/docs/bucket-policy-only).',
              url_str, url_str, url_str)
      else:
        acl = blr.root_object.acl
        # Use the access controls api to check if the acl is actually empty or
        # if the user has 403 access denied or 400 invalid argument.
        if not acl:
          self._ListAccessControlsAcl(url)

      print(AclTranslation.JsonFromMessage(acl))

  def _ListAccessControlsAcl(self, storage_url):
    """Returns either bucket or object access controls for a storage url.

    Args:
      storage_url: StorageUrl object representing the bucket or object.

    Returns:
      BucketAccessControls, ObjectAccessControls, or None if storage_url does
      not represent a cloud bucket or cloud object.

    Raises:
      ServiceException if there was an error in the request.
    """
    if storage_url.IsBucket():
      return self.gsutil_api.ListBucketAccessControls(
          storage_url.bucket_name, provider=storage_url.scheme)
    elif storage_url.IsObject():
      return self.gsutil_api.ListObjectAccessControls(
          storage_url.bucket_name,
          storage_url.object_name,
          provider=storage_url.scheme)
    else:
      return None

  def GetAclCommandBucketListingReference(self, url_str):
    """Gets a single bucket listing reference for an acl get command.

    Args:
      url_str: URL string to get the bucket listing reference for.

    Returns:
      BucketListingReference for the URL string.

    Raises:
      CommandException if string did not result in exactly one reference.
    """
    # We're guaranteed by caller that we have the appropriate type of url
    # string for the call (ex. we will never be called with an object string
    # by getdefacl).
    wildcard_url = StorageUrlFromString(url_str)
    if wildcard_url.IsObject():
      plurality_iter = PluralityCheckableIterator(
          self.WildcardIterator(url_str).IterObjects(
              bucket_listing_fields=['acl']))
    else:
      # Bucket or provider. We call IterBuckets explicitly here to ensure that
      # the root object is populated with the acl.
      if self.command_name == 'defacl':
        bucket_fields = ['defaultObjectAcl']
      else:
        bucket_fields = ['acl']
      plurality_iter = PluralityCheckableIterator(
          self.WildcardIterator(url_str).IterBuckets(
              bucket_fields=bucket_fields))
    if plurality_iter.IsEmpty():
      raise CommandException('No URLs matched')
    if plurality_iter.HasPlurality():
      raise CommandException(
          '%s matched more than one URL, which is not allowed by the %s '
          'command' % (url_str, self.command_name))
    return list(plurality_iter)[0]

  def GetSingleBucketUrlFromArg(self, arg, bucket_fields=None):
    """Gets a single bucket URL based on the command arguments.

    Args:
      arg: String argument to get bucket URL for.
      bucket_fields: Fields to populate for the bucket.

    Returns:
      (StorageUrl referring to a single bucket, Bucket metadata).

    Raises:
      CommandException if args did not match exactly one bucket.
    """
    plurality_checkable_iterator = self.GetBucketUrlIterFromArg(
        arg, bucket_fields=bucket_fields)
    if plurality_checkable_iterator.HasPlurality():
      raise CommandException('%s matched more than one URL, which is not\n'
                             'allowed by the %s command' %
                             (arg, self.command_name))
    blr = list(plurality_checkable_iterator)[0]
    return StorageUrlFromString(blr.url_string), blr.root_object

  def GetBucketUrlIterFromArg(self, arg, bucket_fields=None):
    """Gets a bucket URL iterator based on the command arguments.

    Args:
      arg: String argument to iterate over.
      bucket_fields: Fields to populate for the bucket.

    Returns:
      PluralityCheckableIterator over buckets.

    Raises:
      CommandException if iterator matched no buckets.
    """
    arg_url = StorageUrlFromString(arg)
    if not arg_url.IsCloudUrl() or arg_url.IsObject():
      raise CommandException('"%s" command must specify a bucket' %
                             self.command_name)

    plurality_checkable_iterator = PluralityCheckableIterator(
        self.WildcardIterator(arg).IterBuckets(bucket_fields=bucket_fields))
    if plurality_checkable_iterator.IsEmpty():
      raise CommandException('No URLs matched')
    return plurality_checkable_iterator

  ######################
  # Private functions. #
  ######################

  def _ResetConnectionPool(self):
    # Each OS process needs to establish its own set of connections to
    # the server to avoid writes from different OS processes interleaving
    # onto the same socket (and garbling the underlying SSL session).
    # We ensure each process gets its own set of connections here by
    # reinitializing state that tracks connections.
    connection_pool = StorageUri.provider_pool
    if connection_pool:
      for i in connection_pool:
        connection_pool[i].connection.close()

    StorageUri.provider_pool = {}
    StorageUri.connection = None

  def _GetProcessAndThreadCount(self,
                                process_count,
                                thread_count,
                                parallel_operations_override,
                                print_macos_warning=True):
    """Determines the values of process_count and thread_count.

    These values are used for parallel operations.
    If we're not performing operations in parallel, then ignore
    existing values and use process_count = thread_count = 1.

    Args:
      process_count: A positive integer or None. In the latter case, we read
                     the value from the .boto config file.
      thread_count: A positive integer or None. In the latter case, we read
                    the value from the .boto config file.
      parallel_operations_override: Used to override self.parallel_operations.
                                    This allows the caller to safely override
                                    the top-level flag for a single call.
      print_macos_warning: Print a warning about parallel processing on MacOS
                           if true.

    Returns:
      (process_count, thread_count): The number of processes and threads to use,
                                     respectively.
    """
    # Set OS process and python thread count as a function of options
    # and config.
    if self.parallel_operations or parallel_operations_override:
      if not process_count:
        process_count = boto.config.getint(
            'GSUtil', 'parallel_process_count',
            gslib.commands.config.DEFAULT_PARALLEL_PROCESS_COUNT)
      if process_count < 1:
        raise CommandException('Invalid parallel_process_count "%d".' %
                               process_count)
      if not thread_count:
        thread_count = boto.config.getint(
            'GSUtil', 'parallel_thread_count',
            gslib.commands.config.DEFAULT_PARALLEL_THREAD_COUNT)
      if thread_count < 1:
        raise CommandException('Invalid parallel_thread_count "%d".' %
                               thread_count)
    else:
      # If -m not specified, then assume 1 OS process and 1 Python thread.
      process_count = 1
      thread_count = 1

    should_prohibit_multiprocessing, os_name = ShouldProhibitMultiprocessing()
    if should_prohibit_multiprocessing and process_count > 1:
      raise CommandException('\n'.join(
          textwrap.wrap(
              ('It is not possible to set process_count > 1 on %s. Please '
               'update your config file(s) (located at %s) and set '
               '"parallel_process_count = 1".') %
              (os_name, ', '.join(GetFriendlyConfigFilePaths())))))
    is_main_thread = self.recursive_apply_level == 0
    if print_macos_warning and os_name == 'macOS' and process_count > 1 and is_main_thread:
      self.logger.info(
          'If you experience problems with multiprocessing on MacOS, they '
          'might be related to https://bugs.python.org/issue33725. You can '
          'disable multiprocessing by editing your .boto config or by adding '
          'the following flag to your command: '
          '`-o "GSUtil:parallel_process_count=1"`. Note that multithreading is '
          'still available even if you disable multiprocessing.\n')

    self.logger.debug('process count: %d', process_count)
    self.logger.debug('thread count: %d', thread_count)
    return (process_count, thread_count)
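
  # Worked example (hypothetical config values): with -m in effect and
  # parallel_process_count=12, parallel_thread_count=5 in the .boto config,
  # this returns (12, 5), i.e. up to 60 concurrent workers; without -m it
  # always returns (1, 1).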

  def _SetUpPerCallerState(self):
    """Set up the state for a caller id, corresponding to one Apply call."""
    # pylint: disable=global-variable-undefined,global-variable-not-assigned
    # These variables are initialized in InitializeMultiprocessingVariables or
    # InitializeThreadingVariables.
    global global_return_values_map, shared_vars_map, failure_count
    global caller_id_finished_count, shared_vars_list_map, total_tasks
    global need_pool_or_done_cond, call_completed_map, class_map
    global task_queues, caller_id_lock, caller_id_counter
    # Get a new caller ID.
    with caller_id_lock:
      caller_id_counter.Increment()
      caller_id = caller_id_counter.GetValue()

    # Create a copy of self with an incremented recursive level. This allows
    # the class to report its level correctly if the function called from it
    # also needs to call Apply.
    cls = copy.copy(self)
    cls.recursive_apply_level += 1

    # The thread-safe logger can't be pickled, so we remove it here and
    # recreate it later in the WorkerThread. This is not a problem since any
    # logger with the same name will be treated as a singleton.
    cls.logger = None

    # Likewise, the default API connection(s) can't be pickled, but are unused
    # anyway as each thread gets its own API delegator.
    cls.gsutil_api = None
    cls.seek_ahead_gsutil_api = None

    class_map[caller_id] = cls
    total_tasks[caller_id] = -1  # -1 => the producer hasn't finished yet.
    call_completed_map[caller_id] = False
    caller_id_finished_count[caller_id] = 0
    global_return_values_map[caller_id] = []
    return caller_id
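
  # Illustrative trace (hypothetical IDs): the first top-level Apply call
  # typically receives caller_id 1 and records, under that key:
  #
  #   class_map[1] = <copy of self with recursive_apply_level incremented>
  #   total_tasks[1] = -1               # producer hasn't finished yet
  #   call_completed_map[1] = False
  #   caller_id_finished_count[1] = 0
  #   global_return_values_map[1] = []
  #
  # A nested Apply issued from a worker increments the counter again and gets
  # its own independent bookkeeping entries.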

  def _CreateNewConsumerPool(self, num_processes, num_threads, status_queue):
    """Create a new pool of processes that call _ApplyThreads."""
    processes = []
    task_queue = _NewMultiprocessingQueue()
    task_queues.append(task_queue)

    current_max_recursive_level.Increment()
    if current_max_recursive_level.GetValue() > MAX_RECURSIVE_DEPTH:
      raise CommandException('Recursion depth of Apply calls is too great.')
    for _ in range(num_processes):
      recursive_apply_level = len(consumer_pools)
      p = multiprocessing_context.Process(target=self._ApplyThreads,
                                          args=(num_threads, num_processes,
                                                recursive_apply_level,
                                                status_queue))
      p.daemon = True
      processes.append(p)
      _CryptoRandomAtFork()
      p.start()
    consumer_pool = _ConsumerPool(processes, task_queue)
    consumer_pools.append(consumer_pool)

  class ParallelOverrideReason(object):
    """Enum class to describe purpose of overriding parallel operations."""
    # For the case when we use slice parallelism.
    SLICE = 'slice'
    # For the case when we run a helper Apply call (such as in the
    # _DiffIterator of rsync) and override to make the command go faster.
    SPEED = 'speed'
    # For when we run Apply calls in perfdiag.
    PERFDIAG = 'perfdiag'
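
  # Example (illustrative; mirrors the SPEED case described above): a helper
  # Apply call such as rsync's _DiffIterator can run in parallel regardless
  # of the top-level flag by passing
  #
  #   parallel_operations_override=self.ParallelOverrideReason.SPEED
  #
  # to self.Apply(), which overrides self.parallel_operations for that single
  # call only.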

  def Apply(self,
            func,
            args_iterator,
            exception_handler,
            shared_attrs=None,
            arg_checker=_UrlArgChecker,
            parallel_operations_override=None,
            process_count=None,
            thread_count=None,
            should_return_results=False,
            fail_on_error=False,
            seek_ahead_iterator=None):
    """Calls _Parallel/SequentialApply based on multiprocessing availability.

    Args:
      func: Function to call to process each argument.
      args_iterator: Iterable collection of arguments to be put into the
                     work queue.
      exception_handler: Exception handler for WorkerThread class.
      shared_attrs: List of attributes to manage across sub-processes.
      arg_checker: Used to determine whether we should process the current
                   argument or simply skip it. Also handles any logging that
                   is specific to a particular type of argument.
      parallel_operations_override: A string (see ParallelOverrideReason)
                                    describing the reason to override
                                    self.parallel_operations. This allows the
                                    caller to safely override the top-level
                                    flag for a single call.
      process_count: The number of processes to use. If not specified, then
                     the configured default will be used.
      thread_count: The number of threads per process. If not specified, then
                    the configured default will be used.
      should_return_results: If true, then return the results of all successful
                             calls to func in a list.
      fail_on_error: If true, then raise any exceptions encountered when
                     executing func. This is only applicable in the case of
                     process_count == thread_count == 1.
      seek_ahead_iterator: If present, a seek-ahead iterator that will
          provide an approximation of the total number of tasks and bytes that
          will be iterated by the ProducerThread. Used only if multiple
          processes and/or threads are used.

    Returns:
      Results from spawned threads.
    """
    # This is initialized in Initialize(Multiprocessing|Threading)Variables
    # pylint: disable=global-variable-not-assigned
    # pylint: disable=global-variable-undefined
    global thread_stats
    # pylint: enable=global-variable-not-assigned
    # pylint: enable=global-variable-undefined
    if shared_attrs:
      original_shared_vars_values = {}  # We'll add these back in at the end.
      for name in shared_attrs:
        original_shared_vars_values[name] = getattr(self, name)
        # By setting this to 0, we simplify the logic for computing deltas.
        # We'll add it back after all of the tasks have been performed.
        setattr(self, name, 0)

    (process_count, thread_count) = self._GetProcessAndThreadCount(
        process_count, thread_count, parallel_operations_override)

    is_main_thread = (self.recursive_apply_level == 0 and
                      self.sequential_caller_id == -1)

    if is_main_thread:
      # This initializes the initial performance summary parameters.
      LogPerformanceSummaryParams(num_processes=process_count,
                                  num_threads=thread_count)

    # We don't honor the fail_on_error flag in the case of multiple threads
    # or processes.
    fail_on_error = fail_on_error and (process_count * thread_count == 1)

    # Only check this from the first call in the main thread. Apart from the
    # fact that it's wasteful to try this multiple times in general, it also
    # will never work when called from a subprocess since we use daemon
    # processes, and daemons can't create other processes.
    if (is_main_thread and not self.multiprocessing_is_available and
        process_count > 1):
      # Run the check again and log the appropriate warnings. This was run
      # before, when the Command object was created, in order to calculate
      # self.multiprocessing_is_available, but we don't want to print the
      # warning until we're sure the user actually tried to use multiple
      # threads or processes.
      CheckMultiprocessingAvailableAndInit(logger=self.logger)

    caller_id = self._SetUpPerCallerState()

    # If any shared attributes were passed by the caller, create a dictionary
    # of shared memory variables for every element in the list of shared
    # attributes.
    if shared_attrs:
      shared_vars_list_map[caller_id] = shared_attrs
      for name in shared_attrs:
        shared_vars_map[(caller_id, name)] = 0

    # Make all of the requested function calls.
    usable_processes_count = (process_count
                              if self.multiprocessing_is_available else 1)
    if thread_count * usable_processes_count > 1:
      self._ParallelApply(
          func,
          args_iterator,
          exception_handler,
          caller_id,
          arg_checker,
          usable_processes_count,
          thread_count,
          should_return_results,
          fail_on_error,
          seek_ahead_iterator=seek_ahead_iterator,
          parallel_operations_override=parallel_operations_override)
      if is_main_thread:
        _AggregateThreadStats()
    else:
      self._SequentialApply(func, args_iterator, exception_handler, caller_id,
                            arg_checker, should_return_results, fail_on_error)

    if shared_attrs:
      for name in shared_attrs:
        # This allows us to retain the original value of the shared variable,
        # and simply apply the delta after what was done during the call to
        # apply.
        final_value = (original_shared_vars_values[name] + shared_vars_map.get(
            (caller_id, name)))
        setattr(self, name, final_value)

    if should_return_results:
      return global_return_values_map.get(caller_id)
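
  # Example (illustrative sketch; handler names follow the SetAclFuncWrapper
  # pattern mentioned in WorkerThread.PerformTask below): a typical command
  # calls
  #
  #   self.Apply(_SetAclFuncWrapper,
  #              name_expansion_iterator,
  #              _SetAclExceptionHandler,
  #              fail_on_error=not self.continue_on_error)
  #
  # and can aggregate per-worker counters by passing, e.g.,
  # shared_attrs=['everything_set_okay'], then reading the attribute back
  # from self after Apply returns.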

  def _MaybeSuggestGsutilDashM(self):
    """Outputs a suggestion to the user to use gsutil -m."""
    if not (boto.config.getint('GSUtil', 'parallel_process_count', 0) == 1 and
            boto.config.getint('GSUtil', 'parallel_thread_count', 0) == 1):
      self.logger.info('\n' + textwrap.fill(
          '==> NOTE: You are performing a sequence of gsutil operations that '
          'may run significantly faster if you instead use gsutil -m %s ...\n'
          'Please see the -m section under "gsutil help options" for further '
          'information about when gsutil -m can be advantageous.' %
          self.command_spec.command_name) + '\n')

  # pylint: disable=g-doc-args
  def _SequentialApply(self, func, args_iterator, exception_handler, caller_id,
                       arg_checker, should_return_results, fail_on_error):
    """Performs all function calls sequentially in the current thread.

    No other threads or processes will be spawned. This degraded functionality
    is used when the multiprocessing module is not available or the user
    requests only one thread and one process.
    """
    # Create a WorkerThread to handle all of the logic needed to actually call
    # the function. Note that this thread will never be started, and all work
    # is done in the current thread.
    worker_thread = WorkerThread(None,
                                 False,
                                 headers=self.non_metadata_headers,
                                 perf_trace_token=self.perf_trace_token,
                                 trace_token=self.trace_token,
                                 user_project=self.user_project)
    args_iterator = iter(args_iterator)
    # Count of sequential calls that have been made. Used for producing
    # the suggestion to use gsutil -m.
    sequential_call_count = 0
    while True:

      # Try to get the next argument, handling any exceptions that arise.
      try:
        args = next(args_iterator)
      except StopIteration as e:
        break
      except Exception as e:  # pylint: disable=broad-except
        _IncrementFailureCount()
        if fail_on_error:
          raise
        else:
          try:
            exception_handler(self, e)
          except Exception as _:  # pylint: disable=broad-except
            self.logger.debug(
                'Caught exception while handling exception for %s:\n%s', func,
                traceback.format_exc())
          continue

      sequential_call_count += 1
      if (sequential_call_count == OFFER_GSUTIL_M_SUGGESTION_THRESHOLD or
          sequential_call_count % OFFER_GSUTIL_M_SUGGESTION_FREQUENCY == 0):
        # Output suggestion near beginning of run, so user sees it early, and
        # every so often while the command is executing, so they can ^C and try
        # gsutil -m.
        self._MaybeSuggestGsutilDashM()
      if arg_checker(self, args):
        # Now that we actually have the next argument, perform the task.
        task = Task(func, args, caller_id, exception_handler,
                    should_return_results, arg_checker, fail_on_error)
        worker_thread.PerformTask(task, self)

    lines_since_suggestion_last_printed = (sequential_call_count %
                                           OFFER_GSUTIL_M_SUGGESTION_FREQUENCY)
    if lines_since_suggestion_last_printed >= GetTermLines():
      # Output suggestion at end of long run, in case user missed it at the
      # start and it scrolled off-screen.
      self._MaybeSuggestGsutilDashM()

    PutToQueueWithTimeout(self.gsutil_api.status_queue,
                          FinalMessage(time.time()))

    # If the final iterated argument results in an exception, and that
    # exception modifies shared_attrs, we need to publish the results.
    worker_thread.shared_vars_updater.Update(caller_id, self)

    # Now that all the work is done, log the types of source URLs encountered.
    self._ProcessSourceUrlTypes(args_iterator)

  # pylint: disable=g-doc-args
  def _ParallelApply(self,
                     func,
                     args_iterator,
                     exception_handler,
                     caller_id,
                     arg_checker,
                     process_count,
                     thread_count,
                     should_return_results,
                     fail_on_error,
                     seek_ahead_iterator=None,
                     parallel_operations_override=None):
    r"""Dispatches input arguments across a thread/process pool.

    Pools are composed of parallel OS processes and/or Python threads,
    based on options (-m or not) and settings in the user's config file.

    If only one OS process is requested/available, dispatch requests across
    threads in the current OS process.

    In the multi-process case, we will create one pool of worker processes for
    each level of the tree of recursive calls to Apply. E.g., if A calls
    Apply(B), and B ultimately calls Apply(C) followed by Apply(D), then we
    will only create two sets of worker processes - B will execute in the
    first, and C and D will execute in the second. If C is then changed to
    call Apply(E) and D is changed to call Apply(F), then we will
    automatically create a third set of processes (lazily, when needed) that
    will be used to execute calls to E and F. This might look something like:

    Pool1 Executes:                B
                                  / \
    Pool2 Executes:              C   D
                                /     \
    Pool3 Executes:            E       F

    Apply's parallelism is generally broken up into 4 cases (see the
    illustrative numbers in the comment just below):
    - If process_count == thread_count == 1, then all tasks will be executed
      by _SequentialApply.
    - If process_count > 1 and thread_count == 1, then the main thread will
      create a new pool of processes (if they don't already exist) and each of
      those processes will execute the tasks in a single thread.
    - If process_count == 1 and thread_count > 1, then this process will
      create a new pool of threads to execute the tasks.
    - If process_count > 1 and thread_count > 1, then the main thread will
      create a new pool of processes (if they don't already exist) and each of
      those processes will, upon creation, create a pool of threads to
      execute the tasks.

    Args:
      caller_id: The caller ID unique to this call to command.Apply.
      See command.Apply for description of other arguments.
    """
    # This is initialized in Initialize(Multiprocessing|Threading)Variables
    # pylint: disable=global-variable-not-assigned
    # pylint: disable=global-variable-undefined
    global glob_status_queue, ui_controller
    # pylint: enable=global-variable-not-assigned
    # pylint: enable=global-variable-undefined
    is_main_thread = self.recursive_apply_level == 0

    if (parallel_operations_override == self.ParallelOverrideReason.SLICE and
        self.recursive_apply_level <= 1):
      # The operation uses slice parallelism if the recursive apply level > 0 or
      # if we're executing _ParallelApply without the -m option.
      glob_status_queue.put(PerformanceSummaryMessage(time.time(), True))

    if not IS_WINDOWS and is_main_thread:
      # For multi-thread or multi-process scenarios, the main process must
      # kill itself on a terminating signal, because sys.exit(1) only exits
      # the currently executing thread, leaving orphaned processes. The main
      # thread is responsible for cleaning up multiprocessing variables such
      # as manager processes. Therefore, the main thread's signal handling
      # chain is:
      #   1: __main__._CleanupSignalHandler (clean up processes)
      #   2: MultithreadedMainSignalHandler (kill self)
      for signal_num in (signal.SIGINT, signal.SIGTERM):
        RegisterSignalHandler(signal_num,
                              MultithreadedMainSignalHandler,
                              is_final_handler=True)

    if not task_queues:
      # The process we create will need to access the next recursive level
      # of task queues if it makes a call to Apply, so we always keep around
      # one more queue than we know we need. OTOH, if we don't create a new
      # process, the existing process still needs a task queue to use.
      if process_count > 1:
        task_queues.append(_NewMultiprocessingQueue())
      else:
        task_queue = _NewThreadsafeQueue()
        task_queues.append(task_queue)
        # Create a top-level worker pool since this is the first execution
        # of ParallelApply on the main thread.
        WorkerPool(thread_count,
                   self.logger,
                   task_queue=task_queue,
                   bucket_storage_uri_class=self.bucket_storage_uri_class,
                   gsutil_api_map=self.gsutil_api_map,
                   debug=self.debug,
                   status_queue=glob_status_queue,
                   headers=self.non_metadata_headers,
                   perf_trace_token=self.perf_trace_token,
                   trace_token=self.trace_token,
                   user_project=self.user_project)

    if process_count > 1:  # Handle process pool creation.
      # Check whether this call will need a new set of workers.

      # Each worker must acquire a shared lock before notifying the main thread
      # that it needs a new worker pool, so that at most one worker asks for
      # a new worker pool at once.
      try:
        if not is_main_thread:
          worker_checking_level_lock.acquire()
        if self.recursive_apply_level >= current_max_recursive_level.GetValue():
          with need_pool_or_done_cond:
            # Only the main thread is allowed to create new processes -
            # otherwise, we will run into some Python bugs.
            if is_main_thread:
              self._CreateNewConsumerPool(process_count, thread_count,
                                          glob_status_queue)
            else:
              # Notify the main thread that we need a new consumer pool.
              new_pool_needed.Reset(reset_value=1)
              need_pool_or_done_cond.notify_all()
              # The main thread will notify us when it finishes.
              need_pool_or_done_cond.wait()
      finally:
        if not is_main_thread:
          worker_checking_level_lock.release()
    else:  # Handle new worker thread pool creation.
      if not is_main_thread:
        try:
          worker_checking_level_lock.acquire()
          if self.recursive_apply_level > _GetCurrentMaxRecursiveLevel():
            # We don't have a thread pool for this level of recursive apply
            # calls, so create a pool and corresponding task queue.
            _IncrementCurrentMaxRecursiveLevel()
            task_queue = _NewThreadsafeQueue()
            task_queues.append(task_queue)
            WorkerPool(thread_count,
                       self.logger,
                       task_queue=task_queue,
                       bucket_storage_uri_class=self.bucket_storage_uri_class,
                       gsutil_api_map=self.gsutil_api_map,
                       debug=self.debug,
                       status_queue=glob_status_queue,
                       headers=self.non_metadata_headers,
                       perf_trace_token=self.perf_trace_token,
                       trace_token=self.trace_token,
                       user_project=self.user_project)
        finally:
          worker_checking_level_lock.release()

    task_queue = task_queues[self.recursive_apply_level]

    # Only use the seek-ahead iterator in the main thread to provide an
    # overall estimate of operations.
    if seek_ahead_iterator and not is_main_thread:
      seek_ahead_iterator = None

    # Kick off a producer thread to throw tasks in the global task queue. We
    # do this asynchronously so that the main thread can be free to create new
    # consumer pools when needed (otherwise, any thread with a task that needs
    # a new consumer pool must block until we're completely done producing; in
    # the worst case, every worker blocks on such a call and the producer fills
    # up the task queue before it finishes, so we block forever).
    producer_thread = ProducerThread(
        copy.copy(self),
        args_iterator,
        caller_id,
        func,
        task_queue,
        should_return_results,
        exception_handler,
        arg_checker,
        fail_on_error,
        seek_ahead_iterator=seek_ahead_iterator,
        status_queue=(glob_status_queue if is_main_thread else None))

    # Start the UI thread that is responsible for displaying operation status
    # (aggregated across processes and threads) to the user.
    ui_thread = None
    if is_main_thread:
      ui_thread = UIThread(glob_status_queue, sys.stderr, ui_controller)

    # Wait here until either:
    #   1. We're the main thread in the multi-process case, and someone needs
    #      a new consumer pool - in which case we create one and continue
    #      waiting.
    #   2. Someone notifies us that all of the work we requested is done, in
    #      which case we retrieve the results (if applicable) and stop
    #      waiting.
    # At most one of these can be true, because the main thread is blocked on
    # its call to Apply, and a thread will not ask for a new consumer pool
    # unless it had more work to do.
    while True:
      with need_pool_or_done_cond:
        if call_completed_map[caller_id]:
          break
        elif (process_count > 1 and is_main_thread and
              new_pool_needed.GetValue()):
          new_pool_needed.Reset()
          self._CreateNewConsumerPool(process_count, thread_count,
                                      glob_status_queue)
          need_pool_or_done_cond.notify_all()

        # Note that we must check the above conditions before the wait() call;
        # otherwise, the notification can happen before we start waiting, in
        # which case we'll block forever.
        need_pool_or_done_cond.wait()

    # We've completed all tasks (or excepted), so signal the UI thread to
    # terminate.
    if is_main_thread:
      PutToQueueWithTimeout(glob_status_queue, ZERO_TASKS_TO_DO_ARGUMENT)
      ui_thread.join(timeout=UI_THREAD_JOIN_TIMEOUT)
      # Now that all the work is done, log the types of source URLs
      # encountered.
      self._ProcessSourceUrlTypes(producer_thread.args_iterator)

    # If we encountered an exception from the producer thread before any
    # arguments were enqueued, it wouldn't have been propagated, so we
    # explicitly raise it here.
    if producer_thread.unknown_exception:
      # pylint: disable=raising-bad-type
      raise producer_thread.unknown_exception

    # If we encountered an exception from the producer thread while iterating
    # over the arguments, raise it here if we're meant to fail on error.
    if producer_thread.iterator_exception and fail_on_error:
      # pylint: disable=raising-bad-type
      raise producer_thread.iterator_exception
    if is_main_thread and not parallel_operations_override:
      PutToQueueWithTimeout(glob_status_queue, FinalMessage(time.time()))

  def _ProcessSourceUrlTypes(self, args_iterator):
    """Logs the URL type information to analytics collection."""
    if not isinstance(args_iterator, CopyObjectsIterator):
      return
    LogPerformanceSummaryParams(is_daisy_chain=args_iterator.is_daisy_chain,
                                has_file_src=args_iterator.has_file_src,
                                has_cloud_src=args_iterator.has_cloud_src,
                                provider_types=args_iterator.provider_types)

  def _ApplyThreads(self, thread_count, process_count, recursive_apply_level,
                    status_queue):
    """Assigns the work from the multi-process global task queue.

    Work is assigned to an individual process for later consumption either by
    the WorkerThreads or (if thread_count == 1) this thread.

    Args:
      thread_count: The number of threads used to perform the work. If 1, then
                    perform all work in this thread.
      process_count: The number of processes used to perform the work.
      recursive_apply_level: The depth in the tree of recursive calls to Apply
                             of this thread.
      status_queue: Multiprocessing/threading queue for progress reporting and
                    performance aggregation.
    """
    assert process_count > 1, (
        'Invalid state, calling command._ApplyThreads with only one process.')

    _CryptoRandomAtFork()
    # Separate processes should exit on a terminating signal,
    # but to avoid race conditions only the main process should handle
    # multiprocessing cleanup. Override child processes to use a single signal
    # handler.
    for catch_signal in GetCaughtSignals():
      signal.signal(catch_signal, ChildProcessSignalHandler)

    self._ResetConnectionPool()
    self.recursive_apply_level = recursive_apply_level

    task_queue = task_queues[recursive_apply_level]

    # Ensure fairness across processes by filling our WorkerPool
    # only with as many tasks as it has WorkerThreads. This semaphore is
    # acquired each time that a task is retrieved from the queue and released
    # each time a task is completed by a WorkerThread.
    worker_semaphore = threading.BoundedSemaphore(thread_count)

    # TODO: Presently, this pool gets recreated with each call to Apply. We
    # should be able to do it just once, at process creation time.
    worker_pool = WorkerPool(
        thread_count,
        self.logger,
        worker_semaphore=worker_semaphore,
        bucket_storage_uri_class=self.bucket_storage_uri_class,
        gsutil_api_map=self.gsutil_api_map,
        debug=self.debug,
        status_queue=status_queue,
        headers=self.non_metadata_headers,
        perf_trace_token=self.perf_trace_token,
        trace_token=self.trace_token,
        user_project=self.user_project)

    num_enqueued = 0
    while True:
      while not worker_semaphore.acquire(blocking=False):
        # Because Python signal handlers are only called in between atomic
        # instructions, if we block the main thread on an available worker
        # thread, we won't be able to respond to signals such as a
        # user-initiated CTRL-C until a worker thread completes a task.
        # We poll the semaphore periodically as a compromise between
        # efficiency and user responsiveness.
        time.sleep(0.01)
      task = task_queue.get()

      if task.args != ZERO_TASKS_TO_DO_ARGUMENT:
        # If we have no tasks to do and we're performing a blocking call, we
        # need a special signal to tell us to stop - otherwise, we block on
        # the call to task_queue.get() forever.
        worker_pool.AddTask(task)
        num_enqueued += 1
      else:
        # No tasks remain; since no work was dispatched to a thread, don't
        # block the semaphore on a WorkerThread completion.
        worker_semaphore.release()


# Below here lie classes and functions related to controlling the flow of tasks
# between various threads and processes.
class _ConsumerPool(object):

  def __init__(self, processes, task_queue):
    self.processes = processes
    self.task_queue = task_queue

  def ShutDown(self):
    for process in self.processes:
      KillProcess(process.pid)


class Task(
    namedtuple('Task', (
        'func args caller_id exception_handler should_return_results '
        'arg_checker fail_on_error'))):
  """Task class representing work to be completed.

  Args:
    func: The function to be executed.
    args: The arguments to func.
    caller_id: The globally-unique caller ID corresponding to the Apply call.
    exception_handler: The exception handler to use if the call to func fails.
    should_return_results: True iff the results of this function should be
                           returned from the Apply call.
    arg_checker: Used to determine whether we should process the current
                 argument or simply skip it. Also handles any logging that
                 is specific to a particular type of argument.
    fail_on_error: If true, then raise any exceptions encountered when
                   executing func. This is only applicable in the case of
                   process_count == thread_count == 1.
  """
  pass
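

# Example (illustrative): the ProducerThread below builds one Task per
# argument, e.g.
#
#   cur_task = Task(self.func, args, self.caller_id, self.exception_handler,
#                   self.should_return_results, self.arg_checker,
#                   self.fail_on_error)
#
# and enqueues the sentinel Task(None, ZERO_TASKS_TO_DO_ARGUMENT, caller_id,
# None, None, None, None) when there is no real work, so that blocked
# consumers can wake up and exit.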


# TODO: Refactor the various threading code that doesn't need to depend on
# command.py globals (ProducerThread, UIThread) to different files to aid
# readability and reduce the size of command.py.
def _StartSeekAheadThread(seek_ahead_iterator, seek_ahead_thread_cancel_event):
  """Initializes and runs the seek-ahead thread.

  We defer starting this thread until it is needed, since it is only useful
  when the ProducerThread iterates more results than it can store on the global
  task queue.

  Args:
    seek_ahead_iterator: Iterator that yields SeekAheadResults.
    seek_ahead_thread_cancel_event: threading.Event for signaling the
        seek-ahead thread to terminate.

  Returns:
    The thread object for the initialized thread.
  """
  # This is initialized in Initialize(Multiprocessing|Threading)Variables
  # pylint: disable=global-variable-not-assigned
  # pylint: disable=global-variable-undefined
  global glob_status_queue
  # pylint: enable=global-variable-not-assigned
  # pylint: enable=global-variable-undefined
  return SeekAheadThread(seek_ahead_iterator, seek_ahead_thread_cancel_event,
                         glob_status_queue)


class ProducerThread(threading.Thread):
  """Thread used to enqueue work for other processes and threads."""

  def __init__(self,
               cls,
               args_iterator,
               caller_id,
               func,
               task_queue,
               should_return_results,
               exception_handler,
               arg_checker,
               fail_on_error,
               seek_ahead_iterator=None,
               status_queue=None):
    """Initializes the producer thread.

    Args:
      cls: Instance of Command for which this ProducerThread was created.
      args_iterator: Iterable collection of arguments to be put into the
                     work queue.
      caller_id: Globally-unique caller ID corresponding to this call to Apply.
      func: The function to be called on each element of args_iterator.
      task_queue: The queue into which tasks will be put, to later be consumed
                  by Command._ApplyThreads.
      should_return_results: True iff the results for this call to
                             command.Apply were requested.
      exception_handler: The exception handler to use when errors are
                         encountered during calls to func.
      arg_checker: Used to determine whether we should process the current
                   argument or simply skip it. Also handles any logging that
                   is specific to a particular type of argument.
      fail_on_error: If true, then raise any exceptions encountered when
                     executing func. This is only applicable in the case of
                     process_count == thread_count == 1.
      seek_ahead_iterator: If present, a seek-ahead iterator that will
          provide an approximation of the total number of tasks and bytes that
          will be iterated by the ProducerThread.
      status_queue: Queue used to report task-count estimates; only set when
          calling from the main thread, else None. Even on the main thread,
          estimates are only produced if args is a collection of
          NameExpansionResults, the type that gives us initial information
          about files to be processed; otherwise, nothing is added to the
          queue.
    """
    super(ProducerThread, self).__init__()
    self.func = func
    self.cls = cls
    self.args_iterator = args_iterator
    self.caller_id = caller_id
    self.task_queue = task_queue
    self.arg_checker = arg_checker
    self.exception_handler = exception_handler
    self.should_return_results = should_return_results
    self.fail_on_error = fail_on_error
    self.shared_variables_updater = _SharedVariablesUpdater()
    self.daemon = True
    self.unknown_exception = None
    self.iterator_exception = None
    self.seek_ahead_iterator = seek_ahead_iterator
    self.status_queue = status_queue
    self.start()

  def run(self):
    num_tasks = 0
    cur_task = None
    last_task = None
    task_estimation_threshold = None
    seek_ahead_thread = None
    seek_ahead_thread_cancel_event = None
    seek_ahead_thread_considered = False
    args = None
    try:
      total_size = 0
      self.args_iterator = iter(self.args_iterator)
      while True:
        try:
          args = next(self.args_iterator)
        except StopIteration as e:
          break
        except Exception as e:  # pylint: disable=broad-except
          _IncrementFailureCount()
          if self.fail_on_error:
            self.iterator_exception = e
            raise
          else:
            try:
              self.exception_handler(self.cls, e)
            except Exception as _:  # pylint: disable=broad-except
              self.cls.logger.debug(
                  'Caught exception while handling exception for %s:\n%s',
                  self.func, traceback.format_exc())
            self.shared_variables_updater.Update(self.caller_id, self.cls)
            continue

        if self.arg_checker(self.cls, args):
          num_tasks += 1
          if self.status_queue:
            if not num_tasks % 100:
              # Time to update the total number of tasks.
              if (isinstance(args, NameExpansionResult) or
                  isinstance(args, CopyObjectInfo) or
                  isinstance(args, RsyncDiffToApply)):
                PutToQueueWithTimeout(
                    self.status_queue,
                    ProducerThreadMessage(num_tasks, total_size, time.time()))
            if (isinstance(args, NameExpansionResult) or
                isinstance(args, CopyObjectInfo)):
              if args.expanded_result:
                json_expanded_result = json.loads(args.expanded_result)
                if 'size' in json_expanded_result:
                  total_size += int(json_expanded_result['size'])
            elif isinstance(args, RsyncDiffToApply):
              if args.copy_size:
                total_size += int(args.copy_size)

          if not seek_ahead_thread_considered:
            if task_estimation_threshold is None:
              task_estimation_threshold = _GetTaskEstimationThreshold()
            if task_estimation_threshold <= 0:
              # Disable the seek-ahead thread (never start it).
              seek_ahead_thread_considered = True
            elif num_tasks >= task_estimation_threshold:
              if self.seek_ahead_iterator:
                seek_ahead_thread_cancel_event = threading.Event()
                seek_ahead_thread = _StartSeekAheadThread(
                    self.seek_ahead_iterator, seek_ahead_thread_cancel_event)
                # For integration testing only, force estimation to complete
                # prior to producing further results.
                if boto.config.get('GSUtil', 'task_estimation_force', None):
                  seek_ahead_thread.join(timeout=SEEK_AHEAD_JOIN_TIMEOUT)

              seek_ahead_thread_considered = True

          last_task = cur_task
          cur_task = Task(self.func, args, self.caller_id,
                          self.exception_handler, self.should_return_results,
                          self.arg_checker, self.fail_on_error)
          if last_task:
            self.task_queue.put(last_task)
    except Exception as e:  # pylint: disable=broad-except
      # This will also catch any exception raised due to an error in the
      # iterator when fail_on_error is set, so check that we failed for some
      # other reason before claiming that we had an unknown exception.
      if not self.iterator_exception:
        self.unknown_exception = e
    finally:
      # We need to make sure to update total_tasks[caller_id] before we enqueue
      # the last task. Otherwise, a worker can retrieve the last task and
      # complete it, then check total_tasks and determine that we're not done
      # producing all the tasks before we update total_tasks. This approach
      # forces workers to wait on the last task until after we've updated
      # total_tasks.
      total_tasks[self.caller_id] = num_tasks
      if not cur_task:
        # This happens if there were zero arguments to be put in the queue.
        cur_task = Task(None, ZERO_TASKS_TO_DO_ARGUMENT, self.caller_id, None,
                        None, None, None)
      self.task_queue.put(cur_task)

      # If the seek-ahead thread is still running, cancel it and wait for it
      # to exit, since we've enumerated all of the tasks already. We don't want
      # to delay command completion on an estimate that has become meaningless.
      if seek_ahead_thread is not None:
        seek_ahead_thread_cancel_event.set()
        # It's possible that the seek-ahead thread may attempt to put to the
        # status queue after it has been torn down, for example if the system
        # is overloaded. Because the put uses a timeout, it should never block
        # command termination or signal handling.
        seek_ahead_thread.join(timeout=SEEK_AHEAD_JOIN_TIMEOUT)
      # Send a final ProducerThread message that definitively states
      # the amount of actual work performed.
      if (self.status_queue and
          (isinstance(args, NameExpansionResult) or isinstance(
              args, CopyObjectInfo) or isinstance(args, RsyncDiffToApply))):
        PutToQueueWithTimeout(
            self.status_queue,
            ProducerThreadMessage(num_tasks,
                                  total_size,
                                  time.time(),
                                  finished=True))

      # It's possible that the workers finished before we updated total_tasks,
      # so we need to check here as well.
      _NotifyIfDone(self.caller_id,
                    caller_id_finished_count.get(self.caller_id))


class WorkerPool(object):
  """Pool of worker threads to which tasks can be added."""

  def __init__(self,
               thread_count,
               logger,
               worker_semaphore=None,
               task_queue=None,
               bucket_storage_uri_class=None,
               gsutil_api_map=None,
               debug=0,
               status_queue=None,
               headers=None,
               perf_trace_token=None,
               trace_token=None,
               user_project=None):
    # In the multi-process case, a worker semaphore is required to ensure
    # even work distribution.
    #
    # In the single-process case, the input task queue directly feeds worker
    # threads from the ProducerThread. Since worker threads will consume only
    # one task at a time from the queue, there is no need for a semaphore to
    # ensure even work distribution.
    #
    # Thus, exactly one of task_queue or worker_semaphore must be provided.
    assert (worker_semaphore is None) != (task_queue is None)
    self.headers = headers
    self.perf_trace_token = perf_trace_token
    self.trace_token = trace_token
    self.user_project = user_project

    self.task_queue = task_queue or _NewThreadsafeQueue()
    self.threads = []
    for _ in range(thread_count):
      worker_thread = WorkerThread(
          self.task_queue,
          logger,
          worker_semaphore=worker_semaphore,
          bucket_storage_uri_class=bucket_storage_uri_class,
          gsutil_api_map=gsutil_api_map,
          debug=debug,
          status_queue=status_queue,
          headers=self.headers,
          perf_trace_token=self.perf_trace_token,
          trace_token=self.trace_token,
          user_project=self.user_project)
      self.threads.append(worker_thread)
      worker_thread.start()

  def AddTask(self, task):
    """Adds a task to the task queue; used only in the multi-process case."""
    self.task_queue.put(task)
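

# Usage note (recapping the two call sites above): WorkerPool is constructed
# in exactly one of two modes, matching the assertion in __init__:
#
#   # Single-process case: threads consume ProducerThread tasks directly.
#   WorkerPool(thread_count, logger, task_queue=task_queue, ...)
#
#   # Multi-process case: _ApplyThreads feeds the pool and throttles it with
#   # a BoundedSemaphore.
#   WorkerPool(thread_count, logger, worker_semaphore=worker_semaphore, ...)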


class WorkerThread(threading.Thread):
  """Thread where all the work will be performed.

  This makes the function calls for Apply and takes care of all error handling,
  return value propagation, and shared_vars.

  Note that this thread is NOT started upon instantiation because the function-
  calling logic is also used in the single-threaded case.
  """
  # This is initialized in Initialize(Multiprocessing|Threading)Variables
  # pylint: disable=global-variable-not-assigned
  # pylint: disable=global-variable-undefined
  global thread_stats

  # pylint: enable=global-variable-not-assigned
  # pylint: enable=global-variable-undefined

  def __init__(self,
               task_queue,
               logger,
               worker_semaphore=None,
               bucket_storage_uri_class=None,
               gsutil_api_map=None,
               debug=0,
               status_queue=None,
               headers=None,
               perf_trace_token=None,
               trace_token=None,
               user_project=None):
    """Initializes the worker thread.

    Args:
      task_queue: The thread-safe queue from which this thread should obtain
                  its work.
      logger: Logger to use for this thread.
      worker_semaphore: threading.BoundedSemaphore to be released each time a
                        task is completed, or None for single-threaded
                        execution.
      bucket_storage_uri_class: Class to instantiate for cloud StorageUris.
                                Settable for testing/mocking.
      gsutil_api_map: Map of providers and API selector tuples to api classes
                      which can be used to communicate with those providers.
                      Used when instantiating the CloudApiDelegator class.
      debug: debug level for the CloudApiDelegator class.
      status_queue: Queue for reporting status updates.
      user_project: Project to be billed for this request.
    """
    super(WorkerThread, self).__init__()

    self.pid = os.getpid()
    self.init_time = time.time()
    self.task_queue = task_queue
    self.worker_semaphore = worker_semaphore
    self.daemon = True
    self.cached_classes = {}
    self.shared_vars_updater = _SharedVariablesUpdater()
    self.headers = headers
    self.perf_trace_token = perf_trace_token
    self.trace_token = trace_token
    self.user_project = user_project

    # Note that thread_gsutil_api is not initialized in the sequential
    # case; task functions should use utils.cloud_api_helper.GetCloudApiInstance
    # to retrieve the main thread's CloudApiDelegator in that case.
    self.thread_gsutil_api = None
    if bucket_storage_uri_class and gsutil_api_map:
      self.thread_gsutil_api = CloudApiDelegator(
          bucket_storage_uri_class,
          gsutil_api_map,
          logger,
          status_queue,
          debug=debug,
          http_headers=self.headers,
          perf_trace_token=self.perf_trace_token,
          trace_token=self.trace_token,
          user_project=self.user_project)

  @CaptureThreadStatException
  def _StartBlockedTime(self):
    """Update the thread_stats AtomicDict before task_queue.get() is called."""
    if thread_stats.get((self.pid, self.ident)) is None:
      thread_stats[(self.pid, self.ident)] = _ThreadStat(self.init_time)
    # While this read/modify/write is not an atomic operation on the dict,
    # we are protected since the (process ID, thread ID) tuple is unique
    # to this thread, making this thread the only reader/writer for this key.
    thread_stat = thread_stats[(self.pid, self.ident)]
    thread_stat.StartBlockedTime()
    thread_stats[(self.pid, self.ident)] = thread_stat

  @CaptureThreadStatException
  def _EndBlockedTime(self):
    """Update the thread_stats AtomicDict after task_queue.get() is called."""
    thread_stat = thread_stats[(self.pid, self.ident)]
    thread_stat.EndBlockedTime()
    thread_stats[(self.pid, self.ident)] = thread_stat

  def PerformTask(self, task, cls):
    """Makes the function call for a task.

    Args:
      task: The Task to perform.
      cls: The instance of a class which gives context to the functions called
           by the Task's function. E.g., see SetAclFuncWrapper.
    """
    caller_id = task.caller_id
    try:
      results = task.func(cls, task.args, thread_state=self.thread_gsutil_api)
      if task.should_return_results:
        global_return_values_map.Increment(caller_id, [results],
                                           default_value=[])
    except Exception as e:  # pylint: disable=broad-except
      _IncrementFailureCount()
      if task.fail_on_error:
        raise  # Only happens for single thread and process case.
      else:
        try:
          task.exception_handler(cls, e)
        except Exception as _:  # pylint: disable=broad-except
          # Don't allow callers to raise exceptions here and kill the worker
          # threads.
          cls.logger.debug(
              'Caught exception while handling exception for %s:\n%s', task,
              traceback.format_exc())
    finally:
      if self.worker_semaphore:
        self.worker_semaphore.release()
      self.shared_vars_updater.Update(caller_id, cls)

      # Even if we encounter an exception, we still need to claim that the
      # function finished executing. Otherwise, we won't know when to stop
      # waiting and return results.
      num_done = caller_id_finished_count.Increment(caller_id, 1)
      _NotifyIfDone(caller_id, num_done)

  def run(self):
    while True:
      self._StartBlockedTime()
      task = self.task_queue.get()
      self._EndBlockedTime()
      if task.args == ZERO_TASKS_TO_DO_ARGUMENT:
        # This can happen in the single-process case because worker threads
        # consume ProducerThread tasks directly.
        continue
      caller_id = task.caller_id

      # Get the instance of the command with the appropriate context.
      cls = self.cached_classes.get(caller_id, None)
      if not cls:
        cls = copy.copy(class_map[caller_id])
        cls.logger = CreateOrGetGsutilLogger(cls.command_name)
        self.cached_classes[caller_id] = cls

      self.PerformTask(task, cls)


class _ThreadStat(object):
  """Stores thread idle and execution time statistics."""

  def __init__(self, init_time):
    self.total_idle_time = 0
    # The last time EndBlockedTime was called, which is the last time a
    # task_queue.get() completed or when we initialized the thread.
    self.end_block_time = init_time
    # The last time StartBlockedTime was called, which is the last time a
    # task_queue.get() call started.
    self.start_block_time = time.time()
    # Between now and thread initialization, we were not blocked.
    self.total_execution_time = 0

  def StartBlockedTime(self):
    self.start_block_time = time.time()
    exec_time = self.start_block_time - self.end_block_time
    self.total_execution_time += exec_time

  def EndBlockedTime(self):
    self.end_block_time = time.time()
    idle_time = self.end_block_time - self.start_block_time
    self.total_idle_time += idle_time

  def AggregateStat(self, end_time):
    """Decide final stats upon Apply completion."""
    if self.end_block_time > self.start_block_time:
      # Apply ended before we blocked on task_queue.get(), or there was an
      # exception during StartBlockedTime. In both of these cases, we were not
      # blocked on task_queue.get() and so can add this time to execution time.
      self.total_execution_time += end_time - self.end_block_time
    else:
      # Apply ended while we were blocked on task_queue.get(), or there was an
      # exception during EndBlockedTime. In both of these cases, we were in the
      # midst of or just finishing a task_queue.get() call, and so can add this
      # time to idle time.
      self.total_idle_time += end_time - self.start_block_time
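

# Worked example (hypothetical timestamps) for _ThreadStat: a thread created
# at t=0 that calls task_queue.get() at t=2, receives a task at t=5, and sees
# Apply complete at t=6 accumulates:
#
#   StartBlockedTime() at t=2: total_execution_time += 2 - 0  # -> 2
#   EndBlockedTime()   at t=5: total_idle_time      += 5 - 2  # -> 3
#   AggregateStat(6):          total_execution_time += 6 - 5  # -> 3
#
# The last step takes the execution branch because end_block_time (5) is
# greater than start_block_time (2) when aggregation runs.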


def _AggregateThreadStats():
  """At the end of the top-level Apply call, aggregate the thread stats dict.

  This should only be called in the main process and thread because it logs to
  the MetricsCollector.
  """
  cur_time = time.time()
  total_idle_time = total_execution_time = 0
  for thread_stat in thread_stats.values():
    thread_stat.AggregateStat(cur_time)
    total_idle_time += thread_stat.total_idle_time
    total_execution_time += thread_stat.total_execution_time
  LogPerformanceSummaryParams(thread_idle_time=total_idle_time,
                              thread_execution_time=total_execution_time)


class _SharedVariablesUpdater(object):
  """Used to update shared variables for a class in the global map.

  Note that each thread will have its own instance of the calling class for
  context, and it will also have its own instance of a
  _SharedVariablesUpdater. This is used in the following way:

  1. Before any tasks are performed, each thread will get a copy of the
     calling class, and the globally-consistent value of this shared variable
     will be initialized to whatever it was before the call to Apply began.

  2. After each time a thread performs a task, it will look at the current
     values of the shared variables in its instance of the calling class.

     2.A. For each such variable, it computes the delta of this variable
          between the last known value for this class (which is stored in
          a dict local to this class) and the current value of the variable
          in the class.

     2.B. Using this delta, we update the last known value locally as well
          as the globally-consistent value shared across all classes (the
          globally consistent value is simply increased by the computed
          delta).
  """

  def __init__(self):
    self.last_shared_var_values = {}

  def Update(self, caller_id, cls):
    """Update any shared variables with their deltas."""
    shared_vars = shared_vars_list_map.get(caller_id, None)
    if shared_vars:
      for name in shared_vars:
        key = (caller_id, name)
        last_value = self.last_shared_var_values.get(key, 0)
        # Compute the change made since the last time we updated here. This is
        # calculated by simply subtracting the last known value from the
        # current value in the class instance.
        delta = getattr(cls, name) - last_value
        self.last_shared_var_values[key] = delta + last_value

        # Update the globally-consistent value by simply increasing it by the
        # computed delta.
        shared_vars_map.Increment(key, delta)
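

# Worked example (hypothetical values) for _SharedVariablesUpdater.Update:
# suppose a shared attribute (e.g., a hypothetical 'bytes_transferred'
# counter) was last recorded as 10 for this (caller_id, name) key and the
# thread's class copy now holds 25. Then:
#
#   delta = 25 - 10                              # 15
#   self.last_shared_var_values[key] = 15 + 10   # 25, the new local baseline
#   shared_vars_map.Increment(key, 15)           # global total grows by delta
#
# so concurrent threads each contribute only their own deltas to the shared
# total.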


def _NotifyIfDone(caller_id, num_done):
  """Notify any threads waiting for results that something has finished.

  Each waiting thread will then need to check the call_completed_map to see if
  its work is done.

  Note that num_done could be calculated here, but it is passed in as an
  optimization so that we have one less call to a globally-locked data
  structure.

  Args:
    caller_id: The caller_id of the function whose progress we're checking.
    num_done: The number of tasks currently completed for that caller_id.
  """
  num_to_do = total_tasks[caller_id]
  if num_to_do == num_done and num_to_do >= 0:
    # Notify the Apply call that's sleeping that it's ready to return.
    with need_pool_or_done_cond:
      call_completed_map[caller_id] = True
      need_pool_or_done_cond.notify_all()


# pylint: disable=global-variable-not-assigned,global-variable-undefined
def ShutDownGsutil():
  """Shut down all processes in consumer pools in preparation for exiting."""
  global glob_status_queue
  for q in queues:
    try:
      q.cancel_join_thread()
    except:  # pylint: disable=bare-except
      pass
  for consumer_pool in consumer_pools:
    consumer_pool.ShutDown()
  try:
    glob_status_queue.cancel_join_thread()
  except:  # pylint: disable=bare-except
    pass


def _GetCurrentMaxRecursiveLevel():
  global current_max_recursive_level
  return current_max_recursive_level.GetValue()


def _IncrementCurrentMaxRecursiveLevel():
  global current_max_recursive_level
  current_max_recursive_level.Increment()


def _IncrementFailureCount():
  global failure_count
  failure_count.Increment()


def DecrementFailureCount():
  global failure_count
  failure_count.Decrement()


def GetFailureCount():
  """Returns the number of failures processed during calls to Apply()."""
  global failure_count
  return failure_count.GetValue()


def ResetFailureCount():
  """Resets the failure_count variable to 0 - useful if error is expected."""
  global failure_count
  failure_count.Reset()