294 lines
9.0 KiB
Python
294 lines
9.0 KiB
Python
# Copyright 2003 Google, Inc.
|
|
# All Rights Reserved.
|
|
|
|
"""Some common string manipulation utilities."""
|
|
import base64
|
|
import binascii
|
|
import re
|
|
import string
|
|
|
|
import six
|
|
|
|
# NOTE: These are re-exported to allow their use within google3 without the need
|
|
# to depend on the visibility-restricted //third_party/py/six target.
|
|
ensure_str = six.ensure_str
|
|
ensure_binary = six.ensure_binary
|
|
|
|
# Matches a single character whose code point lies outside the 7-bit ASCII
# range (octal 000-177). Used by UnicodeEscape to find what needs escaping.
_RE_NONASCII = re.compile(r'[^\000-\177]')
|
|
|
|
# Named escape sequences for Java char and string literals, as listed in
# the Java Language Specification:
# https://docs.oracle.com/javase/tutorial/java/data/characters.html
_JAVA_ESCAPE_MAP = dict([
    ('\b', '\\b'),
    ('\t', '\\t'),
    ('\n', '\\n'),
    ('\f', '\\f'),
    ('\r', '\\r'),
    ('"', '\\"'),
    ("'", "\\'"),
    ('\\', '\\\\'),
])
# Every remaining unprintable ASCII character gets a three-digit octal escape.
#
# stringutil.JavaEscape passes its result through stringutil.UnicodeEscape,
# which takes care of everything outside [0, 128), so only the ASCII range
# has to be filled in here.
for i in range(128):
    c = chr(i)
    if c in _JAVA_ESCAPE_MAP or c in string.printable:
        continue
    _JAVA_ESCAPE_MAP[c] = '\\%03o' % i
# A single alternation that matches any one character present in the map.
_JAVA_ESCAPE_RE = re.compile('|'.join(map(re.escape, _JAVA_ESCAPE_MAP)))
|
|
|
|
# Lower-case spellings accepted as boolean "true" and "false" respectively.
_COMMON_TRUE_STRINGS = frozenset(['true', 't', '1', 'yes', 'y'])
_COMMON_FALSE_STRINGS = frozenset(['false', 'f', '0', 'no', 'n'])
|
|
|
|
|
|
class Base64ValueError(Exception):
    """Illegal Base64-encoded value."""
|
|
|
|
|
|
def UnicodeEscape(s):
    r"""Replaces each non-ASCII character in s with an escape sequence.

    Non-ASCII characters are substituted with their 6-character unicode
    escape sequence \uxxxx, where xxxx is a four-digit hex number. A character
    above U+FFFF does not fit in four hex digits, so it is written as its
    UTF-16 surrogate pair, i.e. two consecutive \uxxxx sequences. The
    resulting string consists entirely of ASCII characters. Existing escape
    sequences are unaffected, i.e., this operation is idempotent.

    Sample usage:
      >>> UnicodeEscape('asdf\xff')
      'asdf\\u00ff'
      >>> UnicodeEscape('\U0001f600')
      '\\ud83d\\ude00'

    This escaping differs from the built-in s.encode('unicode_escape'). The
    built-in escape function uses hex escape sequences (e.g., '\xe9') and
    escapes some control characters in lower ASCII (e.g., '\x00').

    Args:
      s: string to be escaped

    Returns:
      escaped string
    """

    def _EscapeMatch(match):
        """Returns the \\uxxxx form of the single character in match."""
        code_point = ord(match.group(0))
        if code_point <= 0xFFFF:
            return '\\u%04x' % code_point
        # '%04x' would emit five or more digits here, which a \uxxxx reader
        # parses as a different character followed by stray hex digits.
        # Split the code point into its UTF-16 high/low surrogates instead.
        offset = code_point - 0x10000
        return '\\u%04x\\u%04x' % (0xD800 + (offset >> 10),
                                   0xDC00 + (offset & 0x3FF))

    return _RE_NONASCII.sub(_EscapeMatch, s)
|
|
|
|
|
|
def JavaEscape(s):
    r"""Escapes a string for use inside a Java string or char literal.

    Implements the "Escape Sequences for Character and String Literals" rules
    of the Java Language Specification:

      https://docs.oracle.com/javase/tutorial/java/data/characters.html

    Unprintable and non-ASCII characters are escaped, so the result contains
    ASCII characters only.

    This operation is NOT idempotent.

    Sample usage:
      >>> JavaEscape('single\'double"\n\x00')
      'single\\\'double\\"\\n\\000'

    Args:
      s: string to be escaped

    Returns:
      escaped string
    """

    def _LookUp(match):
        """Returns the escape sequence registered for the matched character."""
        return _JAVA_ESCAPE_MAP[match.group(0)]

    ascii_escaped = _JAVA_ESCAPE_RE.sub(_LookUp, s)
    # The escape map only covers the ASCII range, so any character above
    # 0x7f is still unescaped at this point; UnicodeEscape rewrites those as
    # \uxxxx sequences.
    return UnicodeEscape(ascii_escaped)
|
|
|
|
|
|
# FYI, Python 2.4's base64 module has a websafe encode/decode. However:
|
|
#
|
|
# (1) The encode still appends =-padding. Even more annoying,
|
|
# (2) The decode still *requires* that =-padding be present. This makes it
|
|
# incompatible with the C++ or Sawzall (based on the C++) implementations.
|
|
# (3) On decode, the handling of invalid characters varies (both versions ignore
|
|
# whitespace, otherwise the C++ version fails, the Python version ignores
|
|
# invalid characters).
|
|
def WebSafeBase64Escape(unescaped, do_padding):
    """Encodes bytes as web-safe base64, optionally without =-padding.

    Counterpart of the Google C library's WebSafeBase64Escape() (from
    strings/strutil.h), built on Python's base64 module.

    Args:
      unescaped: any data (byte) string (example: b"12345~6")
      do_padding: whether to keep the trailing =-padding (example: false)

    Returns:
      The web-safe base64 encoding of unescaped, with the =-padding kept or
      removed according to do_padding (example: b"MTIzNDV-Ng")
    """
    encoded = base64.urlsafe_b64encode(unescaped)
    # '=' only ever occurs as trailing padding, so rstrip removes exactly it.
    return encoded if do_padding else encoded.rstrip(b'=')
|
|
|
|
# Mapping table to convert web-safe base64 encoding to the standard
# encoding ('-' becomes '+', '_' becomes '/', and other valid base64
# input characters map to themselves). ASCII whitespace maps to a space.
# To maintain compatibility with the C++ library, characters that are
# neither valid base64 input characters nor whitespace are mapped to '!'.
#
# The table is generated rather than spelled out as a 256-character literal
# so that its length cannot silently drift from the 256 entries that
# bytes.translate() requires.
def _BuildBase64DecodeTranslation():
    """Builds the 256-entry translation table for web-safe base64 input."""
    table = bytearray(b'!' * 256)
    for byte in bytearray(b'\t\n\v\f\r '):
        table[byte] = ord(' ')
    for byte in bytearray(b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                          b'abcdefghijklmnopqrstuvwxyz'
                          b'0123456789='):
        table[byte] = byte
    # Only the web-safe spellings are accepted; a literal '+' or '/' in the
    # input stays mapped to '!' and is rejected.
    table[ord('-')] = ord('+')
    table[ord('_')] = ord('/')
    return bytes(table)


_BASE64_DECODE_TRANSLATION = _BuildBase64DecodeTranslation()


def WebSafeBase64Unescape(escaped):
    """Python implementation of the Google C library's WebSafeBase64Unescape().

    Python implementation of the Google C library's WebSafeBase64Unescape()
    (from strings/strutil.h), using Python's base64 API and string
    replacement. ASCII whitespace anywhere in the input is ignored, and
    missing =-padding is added before decoding.

    Args:
      escaped: A base64 binary string using the web-safe encoding
        (example: b"MTIzNDV-Ng")

    Returns:
      The corresponding unescaped string (example: b"12345~6")

    Raises:
      Base64ValueError: Invalid character in encoding of string, escaped, or
        the data cannot be decoded as base64.
    """
    escaped_standard = escaped.translate(_BASE64_DECODE_TRANSLATION)
    if b'!' in escaped_standard:
        raise Base64ValueError(
            '%r: Invalid character in encoded string.' % escaped)

    # All whitespace was translated to b' ' above. Drop it now so that it is
    # not counted when working out how much padding is missing.
    escaped_standard = escaped_standard.replace(b' ', b'')

    # Make the encoded string a multiple of 4 characters long, adding "="
    # characters as padding. This is the format standard base64 expects.
    if not escaped_standard.endswith(b'='):
        escaped_standard += b'=' * (-len(escaped_standard) % 4)

    try:
        return binascii.a2b_base64(escaped_standard)
    except binascii.Error as msg:
        raise Base64ValueError('%r: %s' % (escaped, msg))
|
|
|
|
|
|
def Chunk(value, size, start=0):
    """Splits value into consecutive slices of at most size items.

    Args:
      value: The value to split.
      size: The maximum size of a chunk.
      start: The index of the first item to include (defaults to 0).

    Returns:
      An iterable of slices of value; every slice has length size except
      possibly the last one, which may be shorter.
      Chunk('hello', 2) => 'he', 'll', 'o'

    Raises:
      ValueError: If start < 0 or if size <= 0.
    """
    if start < 0:
        raise ValueError('invalid starting position')
    if size <= 0:
        raise ValueError('invalid chunk size')
    # The offsets are computed up front, so len(value) is read at call time.
    offsets = range(start, len(value), size)
    return (value[offset:offset + size] for offset in offsets)
|
|
|
|
|
|
def ReverseChunk(value, size):
    """Splits value into chunks of a given size, aligned to its end.

    Works like Chunk, except that when the length of value is not a multiple
    of size the short chunk is the first one rather than the last.

    Args:
      value: The value to split.
      size: The maximum size of a chunk.

    Returns:
      An iterable of slices of value; every slice has length size except
      possibly the first one, which may be shorter.
      ReverseChunk('hello', 2) => 'h', 'el', 'lo'

    Raises:
      ValueError: If size <= 0.
    """
    # Validate here rather than inside the generator so that a bad size is
    # reported by the call itself instead of by the first iteration step.
    if size <= 0:
        raise ValueError('invalid chunk size')

    def _Generate():
        """Yields the short leading chunk, if any, then the full-size ones."""
        head = len(value) % size
        if head:
            yield value[:head]
        for offset in range(head, len(value), size):
            yield value[offset:offset + size]

    return _Generate()
|
|
|
|
|
|
def IsCommonTrue(value):
    """Tells whether a string spells a commonly accepted True value.

    Handy when every string should count as False except for a few accepted
    spellings. Surrounding whitespace and letter case are ignored.

    Args:
      value: The string to check for true. Or None.

    Returns:
      True if the string is one of the commonly accepted true values.
      False if value is None or any other string.

    Raises:
      ValueError: when value is something besides a string or None.
    """
    if value is None:
        return False
    if isinstance(value, str):
        # The empty string is answered without consulting the accepted set.
        return bool(value) and value.strip().lower() in _COMMON_TRUE_STRINGS
    raise ValueError('IsCommonTrue() called with %s type. Expected string.'
                     % type(value))
|
|
|
|
|
|
def IsCommonFalse(value):
    """Tells whether a string spells a commonly accepted False value.

    Handy when every string should count as True except for a few accepted
    spellings. Surrounding whitespace and letter case are ignored.

    Args:
      value: The string to check for false. Or None.

    Returns:
      True if the string is one of the commonly accepted false values.
      True if value is None or the empty string. False otherwise.

    Raises:
      ValueError: when value is something besides a string or None.
    """
    if value is None:
        return True
    if isinstance(value, str):
        # The empty string is answered without consulting the accepted set.
        return not value or value.strip().lower() in _COMMON_FALSE_STRINGS
    raise ValueError('IsCommonFalse() called with %s type. Expected string.'
                     % type(value))
|