# Copyright 2003 Google, Inc.
# All Rights Reserved.

"""Some common string manipulation utilities."""
import base64
import binascii
import re
import string

import six

# NOTE: These are re-exported to allow their use within google3 without the need
# to depend on the visibility-restricted //third_party/py/six target.
ensure_str = six.ensure_str
ensure_binary = six.ensure_binary

# Matches a single character outside the 7-bit ASCII range, i.e. any character
# whose code point is above octal 177 (0x7F).  Used by UnicodeEscape().
_RE_NONASCII = re.compile(r'[^\000-\177]')

# Java Language Specification: Escape Sequences for Char and String Literals
# https://docs.oracle.com/javase/tutorial/java/data/characters.html
#
# Maps a single character to the escape sequence that replaces it.  The
# literal below holds the named escapes; the octal escapes are added after it.
_JAVA_ESCAPE_MAP = {
    '\b': '\\b',
    '\t': '\\t',
    '\n': '\\n',
    '\f': '\\f',
    '\r': '\\r',
    '"': '\\"',
    "'": "\\'",
    '\\': '\\\\',
}
# Octal-escape unprintable characters
#
# Since stringutil.JavaEscape calls stringutil.UnicodeEscape for all input
# byte values outside of [0-128), we simply fill the escape map with valid
# ASCII characters (i.e., [0,128)) and rely on UnicodeEscape to handle the
# rest.
#
# "Unprintable" means "not in string.printable".  string.printable includes
# all ASCII whitespace, so vertical tab ('\x0b') is left unescaped.
#
# A comprehension is used (rather than a module-level for loop) so that no
# loop variables are left behind in the module namespace.
_JAVA_ESCAPE_MAP.update({
    chr(code): '\\%03o' % code
    for code in range(128)
    if chr(code) not in _JAVA_ESCAPE_MAP and chr(code) not in string.printable
})
# Compile characters-to-be-escaped into regex for matching.  Every key is a
# single character, so the order of the alternatives does not matter.
_JAVA_ESCAPE_RE = re.compile(
    '|'.join(re.escape(char) for char in _JAVA_ESCAPE_MAP))

# Lower-case spellings recognised by IsCommonTrue() and IsCommonFalse().  Both
# functions strip and lower-case their argument before the membership test.
_COMMON_TRUE_STRINGS = frozenset({'true', 't', '1', 'yes', 'y'})
_COMMON_FALSE_STRINGS = frozenset({'false', 'f', '0', 'no', 'n'})


class Base64ValueError(Exception):
  """Illegal Base64-encoded value.

  Raised by WebSafeBase64Unescape() when its input contains a character that
  is not allowed, or when the underlying base64 decoder rejects the data.
  """


def UnicodeEscape(s):
  r"""Replaces each non-ASCII character in s with an escape sequence.

  Non-ASCII characters are substituted with their 6-character unicode
  escape sequence \uxxxx, where xxxx is a hex number.  A character outside
  the Basic Multilingual Plane (code point above U+FFFF) does not fit in four
  hex digits, so it is written as its UTF-16 surrogate pair, i.e. two
  consecutive \uxxxx sequences.  The resulting string consists entirely of
  ASCII characters.  Existing escape sequences are unaffected, i.e., this
  operation is idempotent.

  Sample usage:
    >>> UnicodeEscape('asdf\xff')
    'asdf\\u00ff'

  This escaping differs from the built-in s.encode('unicode_escape').  The
  built-in escape function uses hex escape sequences (e.g., '\xe9') and escapes
  some control characters in lower ASCII (e.g., '\x00').

  Args:
    s: string to be escaped

  Returns:
    escaped string
  """

  def _EscapeMatch(match):
    """Returns the \\uxxxx form of the single matched character."""
    code_point = ord(match.group(0))
    if code_point <= 0xFFFF:
      return '\\u%04x' % code_point
    # '%04x' would produce five or more hex digits here, which a \uxxxx reader
    # parses as a different character followed by stray digits.  Split the
    # code point into its high/low UTF-16 surrogates instead.
    offset = code_point - 0x10000
    return '\\u%04x\\u%04x' % (0xD800 + (offset >> 10),
                               0xDC00 + (offset & 0x3FF))

  return _RE_NONASCII.sub(_EscapeMatch, s)


def JavaEscape(s):
  r"""Makes a string safe to embed in a Java string or char literal.

  Implements the Java Language Specification rules for "Escape Sequences for
  Character and String Literals":

  https://docs.oracle.com/javase/tutorial/java/data/characters.html

  Unprintable and non-ASCII characters are escaped, so the result contains
  ASCII characters only.

  This operation is NOT idempotent.

  Sample usage:
    >>> JavaEscape('single\'double"\n\x00')
    'single\\\'double\\"\\n\\000'

  Args:
    s: string to be escaped

  Returns:
    escaped string
  """

  def _LookupEscape(match):
    # Every character the regex can match is a key of the escape map.
    return _JAVA_ESCAPE_MAP[match.group(0)]

  # First pass: named and octal escapes for the ASCII characters in the map.
  ascii_escaped = _JAVA_ESCAPE_RE.sub(_LookupEscape, s)
  # Second pass: the map only covers code points below 128, so any character
  # outside ASCII is still unescaped here; UnicodeEscape rewrites those.
  return UnicodeEscape(ascii_escaped)


# FYI, Python 2.4's base64 module has a websafe encode/decode. However:
#
# (1) The encode still appends =-padding.
# (2) Even more annoying, the decode still *requires* that =-padding be
#     present. This makes it incompatible with the C++ or Sawzall (based on
#     the C++) implementations.
# (3) On decode, the handling of invalid characters varies: both versions
#     ignore whitespace; otherwise the C++ version fails, while the Python
#     version ignores invalid characters.
def WebSafeBase64Escape(unescaped, do_padding):
  """Encodes bytes as web-safe base64, optionally without =-padding.

  Python counterpart of the Google C library's WebSafeBase64Escape() (from
  strings/strutil.h), built on Python's base64 API.

  Args:
    unescaped: any data (byte) string (example: b"12345~6")
    do_padding: whether to keep the trailing =-padding (example: false)

  Returns:
    The base64 encoding of unescaped using the web-safe alphabet; trailing
    '=' characters are removed unless do_padding is true
    (example: b"MTIzNDV-Ng")
  """
  encoded = base64.urlsafe_b64encode(unescaped)
  # '=' only ever appears as trailing padding, so rstrip removes exactly that.
  return encoded if do_padding else encoded.rstrip(b'=')

# Mapping table to convert web-safe base64 encoding to the standard
# encoding ('-' becomes '+', '_' becomes '/', and other valid base64
# input characters map to themselves).  ASCII whitespace (tab, LF, VT, FF, CR
# and space) maps to ' '.  To maintain compatibility with
# the C++ library, characters that are neither valid base64 input
# characters nor whitespace are mapped to '!'.  This includes the
# standard-alphabet characters '+' and '/', which are therefore rejected.
#
# The table is indexed by byte value; each row covers 32 consecutive values.

_BASE64_DECODE_TRANSLATION = (
    b'!!!!!!!!!     !!!!!!!!!!!!!!!!!!'
    b' !!!!!!!!!!!!+!!0123456789!!!=!!'
    b'!ABCDEFGHIJKLMNOPQRSTUVWXYZ!!!!/'
    b'!abcdefghijklmnopqrstuvwxyz!!!!!'
    b'!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!'
    b'!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!'
    b'!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!'
    b'!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!')


def WebSafeBase64Unescape(escaped):
  """Python implementation of the Google C library's WebSafeBase64Unescape().

  Python implementation of the Google C library's WebSafeBase64Unescape() (from
  strings/strutil.h), using Python's base64 API and string replacement.

  Trailing =-padding is optional.  ASCII whitespace anywhere in the input is
  ignored and does not count towards the amount of padding required.

  Args:
    escaped: A base64 binary string using the web-safe encoding
        (example: b"MTIzNDV-Ng")

  Returns:
    The corresponding unescaped string (example: b"12345~6")

  Raises:
    Base64ValueError: Invalid character in encoding of string, escaped, or
        the data could not be decoded (e.g. an impossible length).
  """
  escaped_standard = escaped.translate(_BASE64_DECODE_TRANSLATION)
  if b'!' in escaped_standard:
    raise Base64ValueError('%r: Invalid character in encoded string.' % escaped)

  # Make the number of encoded characters a multiple of 4 by adding "="
  # characters as padding.  This is the format standard base64 expects.
  if not escaped_standard.endswith(b'='):
    # Every whitespace byte was translated to b' ' above and is skipped by the
    # decoder, so only the remaining characters determine the padding.
    data_len = len(escaped_standard) - escaped_standard.count(b' ')
    escaped_standard += b'=' * (-data_len % 4)

  try:
    return binascii.a2b_base64(escaped_standard)

  except binascii.Error as msg:
    raise Base64ValueError('%r: %s' % (escaped, msg))


def Chunk(value, size, start=0):
  """Splits a value into consecutive slices of at most `size` items.

  The arguments are validated when Chunk() is called; the slices themselves
  are produced lazily as the result is iterated.

  Args:
    value: The value to split.
    size: The maximum size of a chunk.
    start: The index at which to start (defaults to 0).

  Returns:
    Iterable over string slices of as close to the given size as possible.
    Chunk('hello', 2) => 'he', 'll', 'o'

  Raises:
    ValueError: If start < 0 or if size <= 0.
  """
  if start < 0:
    raise ValueError('invalid starting position')
  if size <= 0:
    raise ValueError('invalid chunk size')
  # Offsets at which each slice begins: start, start + size, ...
  offsets = range(start, len(value), size)
  return (value[begin:begin + size] for begin in offsets)


def ReverseChunk(value, size):
  """Splits a value into chunks of a given size, aligned to the end.

  Works like Chunk, except that when the length is not a multiple of size the
  short chunk is the first one rather than the last.

  Args:
    value: The value to split.
    size: The maximum size of a chunk.

  Returns:
    Iterable over string slices of as close to the given size as possible.
    ReverseChunk('hello', 2) => 'h', 'el', 'lo'

  Raises:
    ValueError: If size <= 0.
  """
  # Validate eagerly so a bad size is reported by this call instead of by the
  # first step of iteration.
  if size <= 0:
    raise ValueError('invalid chunk size')

  def _Generate():
    """Yields the short leading chunk (if any), then the full-size ones."""
    remainder = len(value) % size
    if remainder:
      yield value[:remainder]
    # With no remainder this starts at 0, i.e. it is a plain Chunk().
    for piece in Chunk(value, size, start=remainder):
      yield piece

  return _Generate()


def IsCommonTrue(value):
  """Reports whether a string spells a commonly accepted True value.

  Intended for callers that want every string to count as False except a
  small set of accepted spellings.  Surrounding whitespace and letter case
  are ignored.

  Args:
    value: The string to check for true.  Or None.

  Returns:
    True if the string is one of the commonly accepted true values.
    False if value is None or empty.  False otherwise.

  Raises:
    ValueError: when value is something besides a string or None.
  """
  if value is None:
    return False
  if isinstance(value, str):
    # The empty string is never a true value; anything else is normalised
    # before the lookup.
    return bool(value) and value.strip().lower() in _COMMON_TRUE_STRINGS
  raise ValueError('IsCommonTrue() called with %s type.  Expected string.'
                   % type(value))


def IsCommonFalse(value):
  """Reports whether a string spells a commonly accepted False value.

  Intended for callers that want every string to count as True except a
  small set of accepted spellings.  Surrounding whitespace and letter case
  are ignored.

  Args:
    value: The string to check for false.  Or None.

  Returns:
    True if the string is one of the commonly accepted false values.
    True if value is None or the empty string.  False otherwise; note that
    this includes non-empty strings consisting only of whitespace.

  Raises:
    ValueError: when value is something besides a string or None.
  """
  if value is None:
    return True
  if isinstance(value, str):
    # The empty string counts as false without consulting the lookup set;
    # anything else is normalised before the lookup.
    return not value or value.strip().lower() in _COMMON_FALSE_STRINGS
  raise ValueError('IsCommonFalse() called with %s type.  Expected string.'
                   % type(value))