877 lines
38 KiB
Python
877 lines
38 KiB
Python
# -*- coding: utf-8 -*-
|
|
# Copyright 2010 Google Inc. All Rights Reserved.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
"""Wildcard iterator class and supporting functions."""
|
|
|
|
from __future__ import absolute_import
|
|
from __future__ import print_function
|
|
from __future__ import division
|
|
from __future__ import unicode_literals
|
|
|
|
import fnmatch
|
|
import glob
|
|
import logging
|
|
import os
|
|
import re
|
|
import textwrap
|
|
|
|
import six
|
|
|
|
from gslib.bucket_listing_ref import BucketListingBucket
|
|
from gslib.bucket_listing_ref import BucketListingObject
|
|
from gslib.bucket_listing_ref import BucketListingPrefix
|
|
from gslib.cloud_api import AccessDeniedException
|
|
from gslib.cloud_api import CloudApi
|
|
from gslib.cloud_api import NotFoundException
|
|
from gslib.exception import CommandException
|
|
from gslib.storage_url import ContainsWildcard
|
|
from gslib.storage_url import GenerationFromUrlAndString
|
|
from gslib.storage_url import StorageUrlFromString
|
|
from gslib.storage_url import StripOneSlash
|
|
from gslib.storage_url import WILDCARD_REGEX
|
|
from gslib.third_party.storage_apitools import storage_v1_messages as apitools_messages
|
|
from gslib.utils.constants import UTF8
|
|
from gslib.utils.text_util import FixWindowsEncodingIfNeeded
|
|
from gslib.utils.text_util import PrintableStr
|
|
|
|
if six.PY3:
|
|
# StandardError was removed, so use the base exception type instead
|
|
StandardError = Exception
|
|
|
|
FLAT_LIST_REGEX = re.compile(r'(?P<before>.*?)\*\*(?P<after>.*)')
|
|
|
|
_UNICODE_EXCEPTION_TEXT = (
|
|
'Invalid Unicode path encountered (%s). gsutil cannot proceed '
|
|
'with such files present. Please remove or rename this file and '
|
|
'try again. NOTE: the path printed above replaces the '
|
|
'problematic characters with a hex-encoded printable '
|
|
'representation. For more details (including how to convert to a '
|
|
'gsutil-compatible encoding) see `gsutil help encoding`.')
|
|
|
|
|
|
class WildcardIterator(object):
|
|
"""Class for iterating over Google Cloud Storage strings containing wildcards.
|
|
|
|
The base class is abstract; you should instantiate using the
|
|
wildcard_iterator() static factory method, which chooses the right
|
|
implementation depending on the base string.
|
|
"""
|
|
|
|
# TODO: Standardize on __str__ and __repr__ here and elsewhere. Define both
|
|
# and make one return the other.
|
|
def __repr__(self):
|
|
"""Returns string representation of WildcardIterator."""
|
|
return 'WildcardIterator(%s)' % self.wildcard_url.url_string
|
|
|
|
|
|
class CloudWildcardIterator(WildcardIterator):
|
|
"""WildcardIterator subclass for buckets, bucket subdirs and objects.
|
|
|
|
Iterates over BucketListingRef matching the Url string wildcard. It's
|
|
much more efficient to first get metadata that's available in the Bucket
|
|
(for example to get the name and size of each object), because that
|
|
information is available in the object list results.
|
|
"""
|
|
|
|
def __init__(self,
|
|
wildcard_url,
|
|
gsutil_api,
|
|
all_versions=False,
|
|
project_id=None,
|
|
logger=None):
|
|
"""Instantiates an iterator that matches the wildcard URL.
|
|
|
|
Args:
|
|
wildcard_url: CloudUrl that contains the wildcard to iterate.
|
|
gsutil_api: Cloud storage interface. Passed in for thread safety, also
|
|
settable for testing/mocking.
|
|
all_versions: If true, the iterator yields all versions of objects
|
|
matching the wildcard. If false, yields just the live
|
|
object version.
|
|
project_id: Project ID to use for bucket listings.
|
|
logger: logging.Logger used for outputting debug messages during
|
|
iteration. If None, the root logger will be used.
|
|
"""
|
|
self.wildcard_url = wildcard_url
|
|
self.all_versions = all_versions
|
|
self.gsutil_api = gsutil_api
|
|
self.project_id = project_id
|
|
self.logger = logger or logging.getLogger()
|
|
|
|
def __iter__(self,
|
|
bucket_listing_fields=None,
|
|
expand_top_level_buckets=False):
|
|
"""Iterator that gets called when iterating over the cloud wildcard.
|
|
|
|
In the case where no wildcard is present, returns a single matching object,
|
|
single matching prefix, or one of each if both exist.
|
|
|
|
Args:
|
|
bucket_listing_fields: Iterable fields to include in bucket listings.
|
|
Ex. ['name', 'acl']. Iterator is
|
|
responsible for converting these to list-style
|
|
format ['items/name', 'items/acl'] as well as
|
|
adding any fields necessary for listing such as
|
|
prefixes. API implementation is responsible for
|
|
adding pagination fields. If this is None,
|
|
all fields are returned.
|
|
expand_top_level_buckets: If true, yield no BUCKET references. Instead,
|
|
expand buckets into top-level objects and
|
|
prefixes.
|
|
|
|
Yields:
|
|
BucketListingRef of type BUCKET, OBJECT or PREFIX.
|
|
"""
|
|
single_version_request = self.wildcard_url.HasGeneration()
|
|
|
|
# For wildcard expansion purposes, we need at a minimum the name of
|
|
# each object and prefix. If we're not using the default of requesting
|
|
# all fields, make sure at least these are requested. The Cloud API
|
|
# tolerates specifying the same field twice.
|
|
get_fields = None
|
|
if bucket_listing_fields:
|
|
get_fields = set()
|
|
for field in bucket_listing_fields:
|
|
get_fields.add(field)
|
|
bucket_listing_fields = self._GetToListFields(
|
|
get_fields=bucket_listing_fields)
|
|
bucket_listing_fields.update(['items/name', 'prefixes'])
|
|
get_fields.update(['name'])
|
|
# If we're making versioned requests, ensure generation and
|
|
# metageneration are also included.
|
|
if single_version_request or self.all_versions:
|
|
bucket_listing_fields.update(
|
|
['items/generation', 'items/metageneration'])
|
|
get_fields.update(['generation', 'metageneration'])
|
|
|
|
# Handle bucket wildcarding, if any, in _ExpandBucketWildcards. Then
|
|
# iterate over the expanded bucket strings and handle any object
|
|
# wildcarding.
|
|
for bucket_listing_ref in self._ExpandBucketWildcards(bucket_fields=['id']):
|
|
bucket_url_string = bucket_listing_ref.url_string
|
|
if self.wildcard_url.IsBucket():
|
|
# IsBucket() guarantees there are no prefix or object wildcards, and
|
|
# thus this is a top-level listing of buckets.
|
|
if expand_top_level_buckets:
|
|
url = StorageUrlFromString(bucket_url_string)
|
|
for obj_or_prefix in self.gsutil_api.ListObjects(
|
|
url.bucket_name,
|
|
delimiter='/',
|
|
all_versions=self.all_versions,
|
|
provider=self.wildcard_url.scheme,
|
|
fields=bucket_listing_fields):
|
|
if obj_or_prefix.datatype == CloudApi.CsObjectOrPrefixType.OBJECT:
|
|
yield self._GetObjectRef(bucket_url_string,
|
|
obj_or_prefix.data,
|
|
with_version=self.all_versions)
|
|
else: # CloudApi.CsObjectOrPrefixType.PREFIX:
|
|
yield self._GetPrefixRef(bucket_url_string, obj_or_prefix.data)
|
|
else:
|
|
yield bucket_listing_ref
|
|
else:
|
|
# By default, assume a non-wildcarded URL is an object, not a prefix.
|
|
# This prevents unnecessary listings (which are slower, more expensive,
|
|
# and also subject to eventual consistency).
|
|
if (not ContainsWildcard(self.wildcard_url.url_string) and
|
|
self.wildcard_url.IsObject() and not self.all_versions):
|
|
try:
|
|
get_object = self.gsutil_api.GetObjectMetadata(
|
|
self.wildcard_url.bucket_name,
|
|
self.wildcard_url.object_name,
|
|
generation=self.wildcard_url.generation,
|
|
provider=self.wildcard_url.scheme,
|
|
fields=get_fields)
|
|
yield self._GetObjectRef(self.wildcard_url.bucket_url_string,
|
|
get_object,
|
|
with_version=(self.all_versions or
|
|
single_version_request))
|
|
return
|
|
except (NotFoundException, AccessDeniedException):
|
|
# It's possible this is a prefix - try to list instead.
|
|
pass
|
|
|
|
# Expand iteratively by building prefix/delimiter bucket listing
|
|
# request, filtering the results per the current level's wildcard
|
|
# (if present), and continuing with the next component of the
|
|
# wildcard. See _BuildBucketFilterStrings() documentation for details.
|
|
if single_version_request:
|
|
url_string = '%s%s#%s' % (bucket_url_string,
|
|
self.wildcard_url.object_name,
|
|
self.wildcard_url.generation)
|
|
else:
|
|
# Rstrip any prefixes to correspond with rstripped prefix wildcard
|
|
# from _BuildBucketFilterStrings().
|
|
url_string = '%s%s' % (
|
|
bucket_url_string, StripOneSlash(self.wildcard_url.object_name) or
|
|
'/') # Cover root object named '/' case.
|
|
urls_needing_expansion = [url_string]
|
|
while urls_needing_expansion:
|
|
url = StorageUrlFromString(urls_needing_expansion.pop(0))
|
|
(prefix, delimiter, prefix_wildcard,
|
|
suffix_wildcard) = (self._BuildBucketFilterStrings(url.object_name))
|
|
regex_patterns = self._GetRegexPatterns(prefix_wildcard)
|
|
|
|
# If we have a suffix wildcard, we only care about listing prefixes.
|
|
listing_fields = (set(['prefixes'])
|
|
if suffix_wildcard else bucket_listing_fields)
|
|
|
|
# List bucket for objects matching prefix up to delimiter.
|
|
for obj_or_prefix in self.gsutil_api.ListObjects(
|
|
url.bucket_name,
|
|
prefix=prefix,
|
|
delimiter=delimiter,
|
|
all_versions=self.all_versions or single_version_request,
|
|
provider=self.wildcard_url.scheme,
|
|
fields=listing_fields):
|
|
for pattern in regex_patterns:
|
|
if obj_or_prefix.datatype == CloudApi.CsObjectOrPrefixType.OBJECT:
|
|
gcs_object = obj_or_prefix.data
|
|
if pattern.match(gcs_object.name):
|
|
if not suffix_wildcard or (StripOneSlash(gcs_object.name)
|
|
== suffix_wildcard):
|
|
if not single_version_request or (
|
|
self._SingleVersionMatches(gcs_object.generation)):
|
|
yield self._GetObjectRef(
|
|
bucket_url_string,
|
|
gcs_object,
|
|
with_version=(self.all_versions or
|
|
single_version_request))
|
|
break
|
|
else: # CloudApi.CsObjectOrPrefixType.PREFIX
|
|
prefix = obj_or_prefix.data
|
|
|
|
if ContainsWildcard(prefix):
|
|
# TODO: Disambiguate user-supplied strings from iterated
|
|
# prefix and object names so that we can better reason
|
|
# about wildcards and handle this case without raising
|
|
# an error.
|
|
raise CommandException(
|
|
'Cloud folder %s%s contains a wildcard; gsutil does '
|
|
'not currently support objects with wildcards in their '
|
|
'name.' % (bucket_url_string, prefix))
|
|
|
|
# If the prefix ends with a slash, remove it. Note that we only
|
|
# remove one slash so that we can successfully enumerate dirs
|
|
# containing multiple slashes.
|
|
rstripped_prefix = StripOneSlash(prefix)
|
|
if pattern.match(rstripped_prefix):
|
|
if suffix_wildcard and rstripped_prefix != suffix_wildcard:
|
|
# There's more wildcard left to expand.
|
|
url_append_string = '%s%s' % (bucket_url_string,
|
|
rstripped_prefix + '/' +
|
|
suffix_wildcard)
|
|
urls_needing_expansion.append(url_append_string)
|
|
else:
|
|
# No wildcard to expand, just yield the prefix.
|
|
yield self._GetPrefixRef(bucket_url_string, prefix)
|
|
break
|
|
|
|
def _GetRegexPatterns(self, wildcard_pattern):
|
|
"""Returns list of regex patterns derived from the wildcard patterns.
|
|
|
|
Args:
|
|
wildcard_pattern (str): A wilcard_pattern to filter the resources.
|
|
|
|
Returns:
|
|
List of compiled regex patterns.
|
|
|
|
This translates the wildcard_pattern and also creates some additional
|
|
patterns so that we can treat ** in a/b/c/**/d.txt as zero or more folders.
|
|
This means, a/b/c/d.txt will also be returned along with a/b/c/e/f/d.txt.
|
|
"""
|
|
# Case 1: The original pattern should always be present.
|
|
wildcard_patterns = [wildcard_pattern]
|
|
if '/**/' in wildcard_pattern:
|
|
# Case 2: Will fetch object gs://bucket/dir1/a.txt if pattern is
|
|
# gs://bucket/dir1/**/a.txt
|
|
updated_pattern = wildcard_pattern.replace('/**/', '/')
|
|
wildcard_patterns.append(updated_pattern)
|
|
else:
|
|
updated_pattern = wildcard_pattern
|
|
|
|
for pattern in (wildcard_pattern, updated_pattern):
|
|
if pattern.startswith('**/'):
|
|
# Case 3 (using wildcard_pattern): Will fetch object gs://bucket/a.txt
|
|
# if pattern is gs://bucket/**/a.txt. Note that '/**/' will match
|
|
# '/a.txt' not 'a.txt'.
|
|
# Case 4:(using updated_pattern) Will fetch gs://bucket/dir1/dir2/a.txt
|
|
# if the pattern is gs://bucket/**/dir1/**/a.txt
|
|
wildcard_patterns.append(pattern[3:])
|
|
return [re.compile(fnmatch.translate(p)) for p in wildcard_patterns]
|
|
|
|
def _BuildBucketFilterStrings(self, wildcard):
|
|
"""Builds strings needed for querying a bucket and filtering results.
|
|
|
|
This implements wildcard object name matching.
|
|
|
|
Args:
|
|
wildcard: The wildcard string to match to objects.
|
|
|
|
Returns:
|
|
(prefix, delimiter, prefix_wildcard, suffix_wildcard)
|
|
where:
|
|
prefix is the prefix to be sent in bucket GET request.
|
|
delimiter is the delimiter to be sent in bucket GET request.
|
|
prefix_wildcard is the wildcard to be used to filter bucket GET results.
|
|
suffix_wildcard is wildcard to be appended to filtered bucket GET
|
|
results for next wildcard expansion iteration.
|
|
For example, given the wildcard gs://bucket/abc/d*e/f*.txt we
|
|
would build prefix= abc/d, delimiter=/, prefix_wildcard=d*e, and
|
|
suffix_wildcard=f*.txt. Using this prefix and delimiter for a bucket
|
|
listing request will then produce a listing result set that can be
|
|
filtered using this prefix_wildcard; and we'd use this suffix_wildcard
|
|
to feed into the next call(s) to _BuildBucketFilterStrings(), for the
|
|
next iteration of listing/filtering.
|
|
|
|
Raises:
|
|
AssertionError if wildcard doesn't contain any wildcard chars.
|
|
"""
|
|
# Generate a request prefix if the object name part of the wildcard starts
|
|
# with a non-wildcard string (e.g., that's true for 'gs://bucket/abc*xyz').
|
|
match = WILDCARD_REGEX.search(wildcard)
|
|
if not match:
|
|
# Input "wildcard" has no wildcard chars, so just return tuple that will
|
|
# cause a bucket listing to match the given input wildcard. Example: if
|
|
# previous iteration yielded gs://bucket/dir/ with suffix_wildcard abc,
|
|
# the next iteration will call _BuildBucketFilterStrings() with
|
|
# gs://bucket/dir/abc, and we will return prefix ='dir/abc',
|
|
# delimiter='/', prefix_wildcard='dir/abc', and suffix_wildcard=''.
|
|
prefix = wildcard
|
|
delimiter = '/'
|
|
prefix_wildcard = wildcard
|
|
suffix_wildcard = ''
|
|
else:
|
|
if match.start() > 0:
|
|
# Wildcard does not occur at beginning of object name, so construct a
|
|
# prefix string to send to server.
|
|
prefix = wildcard[:match.start()]
|
|
wildcard_part = wildcard[match.start():]
|
|
else:
|
|
prefix = None
|
|
wildcard_part = wildcard
|
|
end = wildcard_part.find('/')
|
|
if end != -1:
|
|
wildcard_part = wildcard_part[:end + 1]
|
|
|
|
prefix_wildcard = (prefix or '') + wildcard_part
|
|
if not prefix_wildcard.endswith('**/'):
|
|
# Remove trailing '/' so we will match gs://bucket/abc* as well as
|
|
# gs://bucket/abc*/ with the same wildcard regex. Don't do this for
|
|
# double wildcards because those won't pass a delimiter to the API.
|
|
prefix_wildcard = StripOneSlash(prefix_wildcard)
|
|
|
|
suffix_wildcard = wildcard[match.end():]
|
|
end = suffix_wildcard.find('/')
|
|
if end == -1:
|
|
suffix_wildcard = ''
|
|
else:
|
|
suffix_wildcard = suffix_wildcard[end + 1:]
|
|
# To implement recursive (**) wildcarding, if prefix_wildcard
|
|
# suffix_wildcard starts with '**' don't send a delimiter, and combine
|
|
# suffix_wildcard at end of prefix_wildcard.
|
|
if prefix_wildcard.find('**') != -1:
|
|
delimiter = None
|
|
prefix_wildcard += suffix_wildcard
|
|
suffix_wildcard = ''
|
|
else:
|
|
delimiter = '/'
|
|
# The following debug output is useful for tracing how the algorithm
|
|
# walks through a multi-part wildcard like gs://bucket/abc/d*e/f*.txt
|
|
self.logger.debug(
|
|
'wildcard=%s, prefix=%s, delimiter=%s, '
|
|
'prefix_wildcard=%s, suffix_wildcard=%s\n', PrintableStr(wildcard),
|
|
PrintableStr(prefix), PrintableStr(delimiter),
|
|
PrintableStr(prefix_wildcard), PrintableStr(suffix_wildcard))
|
|
return (prefix, delimiter, prefix_wildcard, suffix_wildcard)
|
|
|
|
def _SingleVersionMatches(self, listed_generation):
|
|
decoded_generation = GenerationFromUrlAndString(self.wildcard_url,
|
|
listed_generation)
|
|
return str(self.wildcard_url.generation) == str(decoded_generation)
|
|
|
|
def _ExpandBucketWildcards(self, bucket_fields=None):
|
|
"""Expands bucket and provider wildcards.
|
|
|
|
Builds a list of bucket url strings that can be iterated on.
|
|
|
|
Args:
|
|
bucket_fields: If present, populate only these metadata fields for
|
|
buckets. Example value: ['acl', 'defaultObjectAcl']
|
|
|
|
Yields:
|
|
BucketListingRefereneces of type BUCKET.
|
|
"""
|
|
bucket_url = StorageUrlFromString(self.wildcard_url.bucket_url_string)
|
|
if (bucket_fields and set(bucket_fields) == set(['id']) and
|
|
not ContainsWildcard(self.wildcard_url.bucket_name)):
|
|
# If we just want the name of a non-wildcarded bucket URL,
|
|
# don't make an RPC.
|
|
yield BucketListingBucket(bucket_url)
|
|
elif (self.wildcard_url.IsBucket() and
|
|
not ContainsWildcard(self.wildcard_url.bucket_name)):
|
|
# If we have a non-wildcarded bucket URL, get just that bucket.
|
|
yield BucketListingBucket(bucket_url,
|
|
root_object=self.gsutil_api.GetBucket(
|
|
self.wildcard_url.bucket_name,
|
|
provider=self.wildcard_url.scheme,
|
|
fields=bucket_fields))
|
|
else:
|
|
regex = fnmatch.translate(self.wildcard_url.bucket_name)
|
|
prog = re.compile(regex)
|
|
|
|
fields = self._GetToListFields(bucket_fields)
|
|
if fields:
|
|
fields.add('items/id')
|
|
for bucket in self.gsutil_api.ListBuckets(
|
|
fields=fields,
|
|
project_id=self.project_id,
|
|
provider=self.wildcard_url.scheme):
|
|
if prog.match(bucket.id):
|
|
url = StorageUrlFromString('%s://%s/' %
|
|
(self.wildcard_url.scheme, bucket.id))
|
|
yield BucketListingBucket(url, root_object=bucket)
|
|
|
|
def _GetToListFields(self, get_fields=None):
|
|
"""Prepends 'items/' to the input fields and converts it to a set.
|
|
|
|
This way field sets requested for GetBucket can be used in ListBucket calls.
|
|
Note that the input set must contain only bucket or object fields; listing
|
|
fields such as prefixes or nextPageToken should be added after calling
|
|
this function.
|
|
|
|
Args:
|
|
get_fields: Iterable fields usable in GetBucket/GetObject calls.
|
|
|
|
Returns:
|
|
Set of fields usable in ListBuckets/ListObjects calls.
|
|
"""
|
|
if get_fields:
|
|
list_fields = set()
|
|
for field in get_fields:
|
|
list_fields.add('items/' + field)
|
|
return list_fields
|
|
|
|
def _GetObjectRef(self, bucket_url_string, gcs_object, with_version=False):
|
|
"""Creates a BucketListingRef of type OBJECT from the arguments.
|
|
|
|
Args:
|
|
bucket_url_string: Wildcardless string describing the containing bucket.
|
|
gcs_object: gsutil_api root Object for populating the BucketListingRef.
|
|
with_version: If true, return a reference with a versioned string.
|
|
|
|
Returns:
|
|
BucketListingRef of type OBJECT.
|
|
"""
|
|
# Generation can be None in test mocks, so just return the
|
|
# live object for simplicity.
|
|
if with_version and gcs_object.generation is not None:
|
|
generation_str = GenerationFromUrlAndString(self.wildcard_url,
|
|
gcs_object.generation)
|
|
object_string = '%s%s#%s' % (bucket_url_string, gcs_object.name,
|
|
generation_str)
|
|
else:
|
|
object_string = '%s%s' % (bucket_url_string, gcs_object.name)
|
|
object_url = StorageUrlFromString(object_string)
|
|
return BucketListingObject(object_url, root_object=gcs_object)
|
|
|
|
def _GetPrefixRef(self, bucket_url_string, prefix):
|
|
"""Creates a BucketListingRef of type PREFIX from the arguments.
|
|
|
|
Args:
|
|
bucket_url_string: Wildcardless string describing the containing bucket.
|
|
prefix: gsutil_api Prefix for populating the BucketListingRef
|
|
|
|
Returns:
|
|
BucketListingRef of type PREFIX.
|
|
"""
|
|
prefix_url = StorageUrlFromString('%s%s' % (bucket_url_string, prefix))
|
|
return BucketListingPrefix(prefix_url, root_object=prefix)
|
|
|
|
def IterBuckets(self, bucket_fields=None):
|
|
"""Iterates over the wildcard, returning refs for each expanded bucket.
|
|
|
|
This ignores the object part of the URL entirely and expands only the
|
|
the bucket portion. It will yield BucketListingRefs of type BUCKET only.
|
|
|
|
Args:
|
|
bucket_fields: Iterable fields to include in bucket listings.
|
|
Ex. ['defaultObjectAcl', 'logging']. This function is
|
|
responsible for converting these to listing-style
|
|
format ['items/defaultObjectAcl', 'items/logging'], as
|
|
well as adding any fields necessary for listing such as
|
|
'items/id'. API implemenation is responsible for
|
|
adding pagination fields. If this is None, all fields are
|
|
returned.
|
|
|
|
Yields:
|
|
BucketListingRef of type BUCKET, or empty iterator if no matches.
|
|
"""
|
|
for blr in self._ExpandBucketWildcards(bucket_fields=bucket_fields):
|
|
yield blr
|
|
|
|
def IterAll(self, bucket_listing_fields=None, expand_top_level_buckets=False):
|
|
"""Iterates over the wildcard, yielding bucket, prefix or object refs.
|
|
|
|
Args:
|
|
bucket_listing_fields: If present, populate only these metadata
|
|
fields for listed objects.
|
|
expand_top_level_buckets: If true and the wildcard expands only to
|
|
Bucket(s), yields the expansion of each bucket
|
|
into a top-level listing of prefixes and objects
|
|
in that bucket instead of a BucketListingRef
|
|
to that bucket.
|
|
|
|
Yields:
|
|
BucketListingRef, or empty iterator if no matches.
|
|
"""
|
|
for blr in self.__iter__(bucket_listing_fields=bucket_listing_fields,
|
|
expand_top_level_buckets=expand_top_level_buckets):
|
|
yield blr
|
|
|
|
def IterObjects(self, bucket_listing_fields=None):
|
|
"""Iterates over the wildcard, yielding only object BucketListingRefs.
|
|
|
|
Args:
|
|
bucket_listing_fields: If present, populate only these metadata
|
|
fields for listed objects.
|
|
|
|
Yields:
|
|
BucketListingRefs of type OBJECT or empty iterator if no matches.
|
|
"""
|
|
for blr in self.__iter__(bucket_listing_fields=bucket_listing_fields,
|
|
expand_top_level_buckets=True):
|
|
if blr.IsObject():
|
|
yield blr
|
|
|
|
|
|
def _GetFileObject(filepath):
|
|
"""Returns an apitools Object class with supported file attributes.
|
|
|
|
To provide size estimates for local to cloud file copies, we need to retrieve
|
|
expose the local file's size.
|
|
|
|
Args:
|
|
filepath: Path to the file.
|
|
|
|
Returns:
|
|
apitools Object that with file name and size attributes filled-in.
|
|
"""
|
|
# TODO: If we are preserving POSIX attributes, we could instead call
|
|
# os.stat() here.
|
|
return apitools_messages.Object(size=os.path.getsize(filepath))
|
|
|
|
|
|
class FileWildcardIterator(WildcardIterator):
|
|
"""WildcardIterator subclass for files and directories.
|
|
|
|
If you use recursive wildcards ('**') only a single such wildcard is
|
|
supported. For example you could use the wildcard '**/*.txt' to list all .txt
|
|
files in any subdirectory of the current directory, but you couldn't use a
|
|
wildcard like '**/abc/**/*.txt' (which would, if supported, let you find .txt
|
|
files in any subdirectory named 'abc').
|
|
"""
|
|
|
|
def __init__(self,
|
|
wildcard_url,
|
|
exclude_tuple=None,
|
|
ignore_symlinks=False,
|
|
logger=None):
|
|
"""Instantiates an iterator over BucketListingRefs matching wildcard URL.
|
|
|
|
Args:
|
|
wildcard_url: FileUrl that contains the wildcard to iterate.
|
|
exclude_tuple: (base_url, exclude_pattern), where base_url is
|
|
top-level URL to list; exclude_pattern is a regex
|
|
of paths to ignore during iteration.
|
|
ignore_symlinks: If True, ignore symlinks during iteration.
|
|
logger: logging.Logger used for outputting debug messages during
|
|
iteration. If None, the root logger will be used.
|
|
"""
|
|
self.wildcard_url = wildcard_url
|
|
self.exclude_tuple = exclude_tuple
|
|
self.ignore_symlinks = ignore_symlinks
|
|
self.logger = logger or logging.getLogger()
|
|
|
|
def __iter__(self, bucket_listing_fields=None):
|
|
"""Iterator that gets called when iterating over the file wildcard.
|
|
|
|
In the case where no wildcard is present, returns a single matching file
|
|
or directory.
|
|
|
|
Args:
|
|
bucket_listing_fields: Iterable fields to include in listings.
|
|
Ex. ['size']. Currently only 'size' is supported.
|
|
If present, will populate yielded BucketListingObject.root_object
|
|
with the file name and size.
|
|
|
|
Raises:
|
|
WildcardException: if invalid wildcard found.
|
|
|
|
Yields:
|
|
BucketListingRef of type OBJECT (for files) or PREFIX (for directories)
|
|
"""
|
|
include_size = (bucket_listing_fields and
|
|
'size' in set(bucket_listing_fields))
|
|
|
|
wildcard = self.wildcard_url.object_name
|
|
match = FLAT_LIST_REGEX.match(wildcard)
|
|
if match:
|
|
# Recursive wildcarding request ('.../**/...').
|
|
# Example input: wildcard = '/tmp/tmp2pQJAX/**/*'
|
|
base_dir = match.group('before')[:-1]
|
|
remaining_wildcard = match.group('after')
|
|
# At this point for the above example base_dir = '/tmp/tmp2pQJAX' and
|
|
# remaining_wildcard = '/*'
|
|
if remaining_wildcard.startswith('*'):
|
|
raise WildcardException('Invalid wildcard with more than 2 consecutive '
|
|
'*s (%s)' % wildcard)
|
|
# If there was no remaining wildcard past the recursive wildcard,
|
|
# treat it as if it were a '*'. For example, file://tmp/** is equivalent
|
|
# to file://tmp/**/*
|
|
if not remaining_wildcard:
|
|
remaining_wildcard = '*'
|
|
# Skip slash(es).
|
|
remaining_wildcard = remaining_wildcard.lstrip(os.sep)
|
|
filepaths = self._IterDir(base_dir, remaining_wildcard)
|
|
else:
|
|
# Not a recursive wildcarding request.
|
|
filepaths = glob.iglob(wildcard)
|
|
for filepath in filepaths:
|
|
expanded_url = StorageUrlFromString(filepath)
|
|
try:
|
|
if self.ignore_symlinks and os.path.islink(filepath):
|
|
if self.logger:
|
|
self.logger.info('Skipping symbolic link %s...', filepath)
|
|
continue
|
|
if os.path.isdir(filepath):
|
|
yield BucketListingPrefix(expanded_url)
|
|
else:
|
|
blr_object = _GetFileObject(filepath) if include_size else None
|
|
yield BucketListingObject(expanded_url, root_object=blr_object)
|
|
except UnicodeEncodeError:
|
|
raise CommandException('\n'.join(
|
|
textwrap.wrap(_UNICODE_EXCEPTION_TEXT % repr(filepath))))
|
|
|
|
def _IterDir(self, directory, wildcard):
|
|
"""An iterator over the specified dir and wildcard.
|
|
|
|
Args:
|
|
directory (unicode): The path of the directory to iterate over.
|
|
wildcard (str): The wildcard characters used for filename pattern
|
|
matching.
|
|
|
|
Yields:
|
|
(str) A string containing the path to a file somewhere under the directory
|
|
hierarchy of `directory`.
|
|
|
|
Raises:
|
|
ComandException: If this method encounters a file path that it cannot
|
|
decode as UTF-8.
|
|
"""
|
|
if os.path.splitdrive(directory)[0] == directory:
|
|
# For Windows-style paths that consist of a drive letter followed by a
|
|
# colon, os.path.join behaves in an odd manner. It intentionally will not
|
|
# join ['c:' and 'foo'] as 'c:\\foo', but rather as 'c:foo'. The latter
|
|
# format is not handled correctly by gsutil, so we check if the path
|
|
# specifies the root of a volume, and if so, append a backslash so that
|
|
# the resulting joined path looks like 'c:\\foo'.
|
|
directory += '\\'
|
|
|
|
# UTF8-encode directory before passing it to os.walk() so if there are
|
|
# non-valid UTF8 chars in the file name (e.g., that can happen if the file
|
|
# originated on Windows) os.walk() will not attempt to decode and then die
|
|
# with a "codec can't decode byte" error, and instead we can catch the error
|
|
# at yield time and print a more informative error message.
|
|
for dirpath, dirnames, filenames in os.walk(six.ensure_text(directory),
|
|
topdown=True):
|
|
filtered_dirnames = []
|
|
|
|
for dirname in dirnames:
|
|
full_dir_path = os.path.join(dirpath, dirname)
|
|
# Removes directories in place to prevent them and their children from
|
|
# being iterated. See https://docs.python.org/3/library/os.html#os.walk
|
|
if not self._ExcludeDir(full_dir_path):
|
|
filtered_dirnames.append(dirname)
|
|
else:
|
|
# If a symlink is excluded above we don't want to print 2 messages.
|
|
continue
|
|
# This only prints a log message as os.walk() will not, by default,
|
|
# walk down into symbolic links that resolve to directories.
|
|
if self.logger and os.path.islink(full_dir_path):
|
|
self.logger.info('Skipping symlink directory "%s"', full_dir_path)
|
|
|
|
dirnames[:] = filtered_dirnames
|
|
|
|
for f in fnmatch.filter(filenames, wildcard):
|
|
try:
|
|
yield os.path.join(dirpath, FixWindowsEncodingIfNeeded(f))
|
|
except UnicodeDecodeError:
|
|
# Note: We considered several ways to deal with this, but each had
|
|
# problems:
|
|
# 1. Raise an exception and try to catch in a higher layer (the
|
|
# gsutil cp command), so we can properly support the gsutil cp -c
|
|
# option. That doesn't work because raising an exception during
|
|
# iteration terminates the generator.
|
|
# 2. Accumulate a list of bad filenames and skip processing each
|
|
# during iteration, then raise at the end, with exception text
|
|
# printing the bad paths. That doesn't work because iteration is
|
|
# wrapped in PluralityCheckableIterator, so it's possible there
|
|
# are not-yet-performed copy operations at the time we reach the
|
|
# end of the iteration and raise the exception - which would cause
|
|
# us to skip copying validly named files. Moreover, the gsutil
|
|
# cp command loops over argv, so if you run the command gsutil cp
|
|
# -rc dir1 dir2 gs://bucket, an invalid unicode name inside dir1
|
|
# would cause dir2 never to be visited.
|
|
# 3. Print the invalid pathname and skip it during iteration. That
|
|
# would work but would mean gsutil cp could exit with status 0
|
|
# even though some files weren't copied.
|
|
# 4. Change the WildcardIterator to include an error status along with
|
|
# the result. That would solve the problem but would be a
|
|
# substantial change (WildcardIterator is used in many parts of
|
|
# gsutil), and we didn't feel that magnitude of change was
|
|
# warranted by this relatively uncommon corner case.
|
|
# Instead we chose to abort when one such file is encountered, and
|
|
# require the user to remove or rename the files and try again.
|
|
raise CommandException('\n'.join(
|
|
textwrap.wrap(_UNICODE_EXCEPTION_TEXT %
|
|
repr(os.path.join(dirpath, f)))))
|
|
|
|
def _ExcludeDir(self, dir):
|
|
"""Check a directory to see if it should be excluded from os.walk.
|
|
|
|
Args:
|
|
dir: String representing the directory to check.
|
|
|
|
Returns:
|
|
True if the directory should be excluded.
|
|
"""
|
|
if self.exclude_tuple is None:
|
|
return False
|
|
(base_url, exclude_dirs, exclude_pattern) = self.exclude_tuple
|
|
if not exclude_dirs:
|
|
return False
|
|
str_to_check = StorageUrlFromString(
|
|
dir).url_string[len(base_url.url_string):]
|
|
if str_to_check.startswith(self.wildcard_url.delim):
|
|
str_to_check = str_to_check[1:]
|
|
if exclude_pattern.match(str_to_check):
|
|
if self.logger:
|
|
self.logger.info('Skipping excluded directory %s...', dir)
|
|
return True
|
|
|
|
# pylint: disable=unused-argument
|
|
def IterObjects(self, bucket_listing_fields=None):
|
|
"""Iterates over the wildcard, yielding only object (file) refs.
|
|
|
|
Args:
|
|
bucket_listing_fields: Iterable fields to include in listings.
|
|
Ex. ['size']. Currently only 'size' is supported.
|
|
If present, will populate yielded BucketListingObject.root_object
|
|
with the file name and size.
|
|
|
|
Yields:
|
|
BucketListingRefs of type OBJECT or empty iterator if no matches.
|
|
"""
|
|
for bucket_listing_ref in self.IterAll(
|
|
bucket_listing_fields=bucket_listing_fields):
|
|
if bucket_listing_ref.IsObject():
|
|
yield bucket_listing_ref
|
|
|
|
# pylint: disable=unused-argument
|
|
def IterAll(self, bucket_listing_fields=None, expand_top_level_buckets=False):
|
|
"""Iterates over the wildcard, yielding BucketListingRefs.
|
|
|
|
Args:
|
|
bucket_listing_fields: Iterable fields to include in listings.
|
|
Ex. ['size']. Currently only 'size' is supported.
|
|
If present, will populate yielded BucketListingObject.root_object
|
|
with the file name and size.
|
|
expand_top_level_buckets: Ignored; filesystems don't have buckets.
|
|
|
|
Yields:
|
|
BucketListingRefs of type OBJECT (file) or PREFIX (directory),
|
|
or empty iterator if no matches.
|
|
"""
|
|
for bucket_listing_ref in self.__iter__(
|
|
bucket_listing_fields=bucket_listing_fields):
|
|
yield bucket_listing_ref
|
|
|
|
def IterBuckets(self, unused_bucket_fields=None):
|
|
"""Placeholder to allow polymorphic use of WildcardIterator.
|
|
|
|
Args:
|
|
unused_bucket_fields: Ignored; filesystems don't have buckets.
|
|
|
|
Raises:
|
|
WildcardException: in all cases.
|
|
"""
|
|
raise WildcardException(
|
|
'Iterating over Buckets not possible for file wildcards')
|
|
|
|
|
|
class WildcardException(StandardError):
|
|
"""Exception raised for invalid wildcard URLs."""
|
|
|
|
def __init__(self, reason):
|
|
StandardError.__init__(self)
|
|
self.reason = reason
|
|
|
|
def __repr__(self):
|
|
return 'WildcardException: %s' % self.reason
|
|
|
|
def __str__(self):
|
|
return 'WildcardException: %s' % self.reason
|
|
|
|
|
|
def CreateWildcardIterator(url_str,
|
|
gsutil_api,
|
|
all_versions=False,
|
|
project_id=None,
|
|
exclude_tuple=None,
|
|
ignore_symlinks=False,
|
|
logger=None):
|
|
"""Instantiate a WildcardIterator for the given URL string.
|
|
|
|
Args:
|
|
url_str: URL string naming wildcard object(s) to iterate.
|
|
gsutil_api: Cloud storage interface. Passed in for thread safety, also
|
|
settable for testing/mocking.
|
|
all_versions: If true, the iterator yields all versions of objects
|
|
matching the wildcard. If false, yields just the live
|
|
object version.
|
|
project_id: Project id to use for bucket listings.
|
|
exclude_tuple: (base_url, exclude_pattern), where base_url is
|
|
top-level URL to list; exclude_pattern is a regex
|
|
of paths to ignore during iteration.
|
|
ignore_symlinks: For FileUrls, ignore symlinks during iteration if true.
|
|
logger: logging.Logger used for outputting debug messages during iteration.
|
|
If None, the root logger will be used.
|
|
|
|
Returns:
|
|
A WildcardIterator that handles the requested iteration.
|
|
"""
|
|
|
|
url = StorageUrlFromString(url_str)
|
|
logger = logger or logging.getLogger()
|
|
if url.IsFileUrl():
|
|
return FileWildcardIterator(url,
|
|
exclude_tuple=exclude_tuple,
|
|
ignore_symlinks=ignore_symlinks,
|
|
logger=logger)
|
|
else: # Cloud URL
|
|
return CloudWildcardIterator(url,
|
|
gsutil_api,
|
|
all_versions=all_versions,
|
|
project_id=project_id)
|