335 lines
9.8 KiB
Python
335 lines
9.8 KiB
Python
# -*- coding: utf-8 -*- #
|
|
# Copyright 2016 Google LLC. All Rights Reserved.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
|
|
"""Utilities for expanding wildcarded GCS pathnames."""
|
|
|
|
from __future__ import absolute_import
|
|
from __future__ import division
|
|
from __future__ import unicode_literals
|
|
|
|
import abc
|
|
import fnmatch
|
|
import os
|
|
import re
|
|
|
|
from googlecloudsdk.api_lib.storage import storage_api
|
|
from googlecloudsdk.api_lib.storage import storage_util
|
|
from googlecloudsdk.core import log
|
|
from googlecloudsdk.core import properties
|
|
|
|
import six
|
|
|
|
|
|
class PathExpander(six.with_metaclass(abc.ABCMeta)):
  """Abstract base class for path wildcard expansion.

  Concrete subclasses supply the filesystem primitives (existence checks,
  directory listing, path joining); this class implements the glob expansion
  algorithm on top of them.
  """

  # Regex character class matching any glob metacharacter: *, ? or [.
  EXPANSION_CHARS = '[*?[]'

  @classmethod
  def ForPath(cls, path):
    """Returns the expander implementation appropriate for the given path."""
    if path.startswith('gs://'):
      return GCSPathExpander()
    return LocalPathExpander()

  def __init__(self, sep):
    self._sep = sep

  @abc.abstractmethod
  def AbsPath(self, path):
    """Converts the given path to an absolute path."""
    pass

  @abc.abstractmethod
  def IsFile(self, path):
    """Returns True if the given path is an existing file."""
    pass

  @abc.abstractmethod
  def IsDir(self, path):
    """Returns True if the given path is an existing directory."""
    pass

  @abc.abstractmethod
  def Exists(self, path):
    """Returns True if the given path exists."""
    pass

  @abc.abstractmethod
  def ListDir(self, path):
    """Returns the names directly contained in the given directory."""
    pass

  @abc.abstractmethod
  def Join(self, path1, path2):
    """Joins two path components with this expander's separator."""
    pass

  @classmethod
  def HasExpansion(cls, path):
    """Returns True if the path contains any wildcard characters."""
    return re.search(PathExpander.EXPANSION_CHARS, path) is not None

  def ExpandPath(self, path):
    """Expand the given path that contains wildcard characters.

    Args:
      path: str, The path to expand.

    Returns:
      ({str}, {str}), A tuple of the sets of files and directories that match
      the wildcard path. All returned paths are absolute.
    """
    files = set()
    dirs = set()
    for match in self._Glob(self.AbsPath(path)):
      bucket = dirs if match.endswith(self._sep) else files
      bucket.add(match)
    if self.IsEndRecursive(path):
      # A trailing /** already matched every file under each matching root,
      # so the matched directories carry no additional information.
      dirs = set()
    return (files, dirs)

  def ExpandPaths(self, paths):
    """Expands each of the given paths and merges the results."""
    all_files = set()
    all_dirs = set()
    for pattern in paths:
      (matched_files, matched_dirs) = self.ExpandPath(pattern)
      if matched_files or matched_dirs:
        all_files |= matched_files
        all_dirs |= matched_dirs
      else:
        log.warning('[{}] does not match any paths.'.format(pattern))
    return all_files, all_dirs

  def IsEndRecursive(self, path):
    """Returns True if the path ends in a recursive (/**) wildcard."""
    return path.endswith(self._sep + '**')

  def IsDirLike(self, path):
    """Returns True if the path ends with the separator."""
    return path.endswith(self._sep)

  def _Glob(self, path):
    """Recursively yields every concrete path matching the wildcard path."""
    if not self.HasExpansion(path):
      # Literal path: it matches iff it actually exists.
      if self.Exists(path):
        yield self._FormatPath(path)
      return

    parent, basename = os.path.split(path)
    basename_is_literal = not self.HasExpansion(basename)
    # Expand the parent first, then resolve the basename under each match.
    for expanded_parent in self._Glob(parent):
      if basename_is_literal:
        candidate = self.Join(expanded_parent, basename)
        if self.Exists(candidate):
          yield self._FormatPath(candidate)
      elif basename == '**':
        for descendant in self._RecursiveDirList(expanded_parent):
          yield self._FormatPath(descendant)
      else:
        for entry in fnmatch.filter(self.ListDir(expanded_parent), basename):
          yield self._FormatPath(self.Join(expanded_parent, entry))

  def _RecursiveDirList(self, dir_path):
    """Yields every path below dir_path, depth-first, parents before children."""
    for entry in self.ListDir(dir_path):
      child = self.Join(dir_path, entry)
      yield child
      for descendant in self._RecursiveDirList(child):
        yield descendant

  def _FormatPath(self, path):
    """Appends the separator to directory paths that do not already end in it."""
    needs_sep = self.IsDir(path) and not path.endswith(self._sep)
    return path + self._sep if needs_sep else path
|
|
|
|
|
|
class LocalPathExpander(PathExpander):
  """Implements path expansion for the local filesystem.

  All primitives delegate straight to the os / os.path standard library,
  using the platform separator (os.sep).
  """

  def __init__(self):
    super(LocalPathExpander, self).__init__(os.sep)

  def AbsPath(self, path):
    """Returns the absolute form of the given local path."""
    return os.path.abspath(path)

  def IsFile(self, path):
    """Returns True if path names an existing regular file."""
    return os.path.isfile(path)

  def IsDir(self, path):
    """Returns True if path names an existing directory."""
    return os.path.isdir(path)

  def Exists(self, path):
    """Returns True if path exists on the local filesystem."""
    return os.path.exists(path)

  def ListDir(self, path):
    """Lists directory entries, treating unreadable/missing paths as empty."""
    try:
      return os.listdir(path)
    except OSError:  # os.error is an alias of OSError.
      return []

  def Join(self, path1, path2):
    """Joins two components with the platform path separator."""
    return os.path.join(path1, path2)
|
|
|
|
|
|
class GCSPathExpander(PathExpander):
  """Implements path expansion for gs:// formatted resource strings.

  The full contents of each touched bucket are listed once and cached in
  memory (see _LoadObjectsIfMissing), so subsequent existence checks and
  directory listings are pure dictionary lookups with no API calls.
  """

  def __init__(self):
    super(GCSPathExpander, self).__init__('/')
    # Client used for the ListBuckets / ListBucket API calls.
    self._client = storage_api.StorageClient()
    # Maps bucket name -> set of object names in that bucket.
    self._objects = {}
    # Maps full 'gs://bucket/name' path -> storage.Object message.
    self._object_details = {}

  def GetSortedObjectDetails(self, object_paths):
    """Gets all the details for the given paths and returns them sorted.

    Args:
      object_paths: [str], A list of gs:// object or directory paths.

    Returns:
      [{path, data}], A list of dicts with the keys path and data. Path is the
      gs:// path to the object or directory. Object paths will not end in a '/'
      and directory paths will. The data is either a storage.Object message (for
      objects) or a storage_util.ObjectReference for directories. The sort
      order is alphabetical with all directories first and then all objects.
    """
    all_data = []
    for path in object_paths:
      is_obj, data = self._GetObjectDetails(path)
      # Normalize directory paths to carry a trailing '/'.
      path = path if is_obj else path + '/'
      all_data.append((is_obj, {'path': path, 'data': data}))

    # False sorts before True, so all directories come first; within each
    # group entries are ordered alphabetically by path.
    all_data = sorted(all_data, key=lambda o: (o[0], o[1]['path']))
    return [d[1] for d in all_data]

  def _GetObjectDetails(self, object_path):
    """Gets the actual object data for a given GCS path.

    Any path not found in the details cache (populated by
    _LoadObjectsIfMissing) is reported as a directory, so callers should only
    pass paths produced by a prior expansion.

    Args:
      object_path: str, The gs:// path to an object or directory.

    Returns:
      (bool, data), Where element 0 is True if the path is an object, False if
      a directory and where data is either a storage.Object message (for
      objects) or a storage_util.ObjectReference for directories.
    """
    details = self._object_details.get(object_path)
    if details:
      return True, details
    else:
      # This isn't an object, must be a "directory" so just return the name
      # data.
      return False, storage_util.ObjectReference.FromUrl(
          object_path, allow_empty_object=True)

  def AbsPath(self, path):
    """Returns the path unchanged; gs:// paths are inherently absolute.

    Raises:
      ValueError: If the path does not start with gs://.
    """
    if not path.startswith('gs://'):
      raise ValueError('GCS paths must be absolute (starting with gs://)')
    return path

  def IsFile(self, path):
    """Returns True if the path names an existing object (not a directory)."""
    exists, is_dir = self._Exists(path)
    return exists and not is_dir

  def IsDir(self, path):
    """Returns True if the path names the root, a bucket, or an object prefix."""
    exists, is_dir = self._Exists(path)
    return exists and is_dir

  def Exists(self, path):
    """Returns True if the path exists as either an object or a directory."""
    exists, _ = self._Exists(path)
    return exists

  def _Exists(self, path):
    """Checks path existence against the cached bucket listings.

    Args:
      path: str, The gs:// path to check.

    Returns:
      (exists, is_dir), A pair of bools. GCS has no real directories, so a
      path counts as a directory when it is the root, a bare bucket, or a
      prefix of some existing object name.
    """
    if self._IsRoot(path):
      # Root of the filesystem always exists
      return True, True

    path = path.rstrip('/')
    obj_ref = storage_util.ObjectReference.FromUrl(
        path, allow_empty_object=True)
    self._LoadObjectsIfMissing(obj_ref.bucket_ref)

    if obj_ref.bucket in self._objects:
      if not obj_ref.name:
        # Just a bucket, and it exists.
        return True, True
      if obj_ref.name in self._objects[obj_ref.bucket]:
        # This is an object and it exists.
        return True, False
      # See if this is a directory prefix of an existing object.
      dir_name = self._GetDirString(obj_ref.name)
      for i in self._objects[obj_ref.bucket]:
        if i.startswith(dir_name):
          return True, True

    return False, False

  def ListDir(self, path):
    """Yields the names directly under the given gs:// directory path.

    Args:
      path: str, Either the root ('gs://', which yields the bucket names of
        the current project) or a gs:// bucket/directory path (which yields
        the immediate child object and pseudo-directory names).

    Yields:
      str, Each child name, without any trailing separator.
    """
    if self._IsRoot(path):
      # The contents of the root filesystem are the buckets in the current
      # project.
      for b in self._client.ListBuckets(
          project=properties.VALUES.core.project.Get(required=True)):
        yield b.name
      return

    obj_ref = storage_util.ObjectReference.FromUrl(
        path, allow_empty_object=True)
    self._LoadObjectsIfMissing(obj_ref.bucket_ref)

    dir_name = self._GetDirString(obj_ref.name)
    parent_dir_length = len(dir_name)

    # NOTE(review): this indexes self._objects directly, so a bucket whose
    # load failed (BucketNotFoundError is swallowed in _LoadObjectsIfMissing)
    # would raise KeyError here — presumably callers only list paths that
    # already passed an Exists() check; confirm.
    seen = set()
    for obj_name in self._objects[obj_ref.bucket]:
      if obj_name.startswith(dir_name):
        # Keep only the first path segment below the parent, deduplicated so
        # each pseudo-directory is yielded exactly once.
        suffix = obj_name[parent_dir_length:]
        result = suffix.split(self._sep)[0]
        if result not in seen:
          seen.add(result)
          yield result

  def Join(self, path1, path2):
    """Joins two components with '/', collapsing duplicate separators."""
    if self._IsRoot(path1):
      # Joining onto the root means path2 is a bucket name.
      return 'gs://' + path2.lstrip(self._sep)
    return path1.rstrip(self._sep) + self._sep + path2.lstrip(self._sep)

  def _IsRoot(self, path):
    """Returns True for the GCS root.

    'gs:' occurs because os.path.split (used by _Glob) strips the trailing
    slashes off 'gs://'.
    """
    return path == 'gs://' or path == 'gs:'

  def _LoadObjectsIfMissing(self, bucket_ref):
    """Lists and caches the bucket's entire contents on first access.

    Populates self._objects[bucket] with every object name and
    self._object_details with each object's full gs:// path. A bucket that
    cannot be found is left uncached, so a later call may retry.

    Args:
      bucket_ref: The bucket to load; must expose a .bucket name attribute
        and be accepted by StorageClient.ListBucket.
    """
    objects = self._objects.get(bucket_ref.bucket)
    if objects is None:
      try:
        objects = self._client.ListBucket(bucket_ref)
        object_names = set()
        for o in objects:
          full_path = 'gs://' + self.Join(bucket_ref.bucket, o.name)
          self._object_details[full_path] = o
          object_names.add(o.name)
        # Only try to set the result after we start iterating because the API
        # call is not actually made until you try to consume the results. If
        # an API error occurs (like the bucket doesn't exist) we don't want
        # to accidentally cache that it was found.
        self._objects.setdefault(bucket_ref.bucket, set()).update(object_names)
      except storage_api.BucketNotFoundError:
        pass

  def _GetDirString(self, path):
    """Returns path with a trailing '/' (empty path is returned unchanged)."""
    if path and not path.endswith(self._sep):
      return path + self._sep
    return path

  def _FormatPath(self, path):
    """Adds a trailing '/' to directories, fixing up the bare root spelling.

    os.path.split reduces 'gs://' to 'gs:'; the base class then appends the
    separator (the root is always a directory), producing 'gs:/', which is
    restored to the canonical 'gs://' here.
    """
    path = super(GCSPathExpander, self)._FormatPath(path)
    return 'gs://' if path == 'gs:/' else path
|