venv/Lib/site-packages/pip/_vendor/webencodings/__init__.py

   1 # coding: utf8
   2 """
   3
   4     webencodings
   5     ~~~~~~~~~~~~
   6
   7     This is a Python implementation of the `WHATWG Encoding standard
   8     <http://encoding.spec.whatwg.org/>`. See README for details.
   9
  10     :copyright: Copyright 2012 by Simon Sapin
  11     :license: BSD, see LICENSE for details.
  12
  13 """
  14
  15 from __future__ import unicode_literals
  16
  17 import codecs
  18
  19 from .labels import LABELS
  20
  21
  22 VERSION = '0.5'
  23
  24
  25 # Some names in Encoding are not valid Python aliases. Remap these.
  26 PYTHON_NAMES = {
  27     'iso-8859-8-i': 'iso-8859-8',
  28     'x-mac-cyrillic': 'mac-cyrillic',
  29     'macintosh': 'mac-roman',
  30     'windows-874': 'cp874'}
  31
  32 CACHE = {}
  33
  34
  35 def ascii_lower(string):
  36     r"""Transform (only) ASCII letters to lower case: A-Z is mapped to a-z.
  37
  38     :param string: An Unicode string.
  39     :returns: A new Unicode string.
  40
  41     This is used for `ASCII case-insensitive
  42     <http://encoding.spec.whatwg.org/#ascii-case-insensitive>`_
  43     matching of encoding labels.
  44     The same matching is also used, among other things,
  45     for `CSS keywords <http://dev.w3.org/csswg/css-values/#keywords>`_.
  46
  47     This is different from the :meth:`~py:str.lower` method of Unicode strings
  48     which also affect non-ASCII characters,
  49     sometimes mapping them into the ASCII range:
  50
  51         >>> keyword = u'Bac\N{KELVIN SIGN}ground'
  52         >>> assert keyword.lower() == u'background'
  53         >>> assert ascii_lower(keyword) != keyword.lower()
  54         >>> assert ascii_lower(keyword) == u'bac\N{KELVIN SIGN}ground'
  55
  56     """
  57     # This turns out to be faster than unicode.translate()
  58     return string.encode('utf8').lower().decode('utf8')
  59
  60
  61 def lookup(label):
  62     """
  63     Look for an encoding by its label.
  64     This is the spec’s `get an encoding
  65     <http://encoding.spec.whatwg.org/#concept-encoding-get>`_ algorithm.
  66     Supported labels are listed there.
  67
  68     :param label: A string.
  69     :returns:
  70         An :class:`Encoding` object, or :obj:`None` for an unknown label.
  71
  72     """
  73     # Only strip ASCII whitespace: U+0009, U+000A, U+000C, U+000D, and U+0020.
  74     label = ascii_lower(label.strip('\t\n\f\r '))
  75     name = LABELS.get(label)
  76     if name is None:
  77         return None
  78     encoding = CACHE.get(name)
  79     if encoding is None:
  80         if name == 'x-user-defined':
  81             from .x_user_defined import codec_info
  82         else:
  83             python_name = PYTHON_NAMES.get(name, name)
  84             # Any python_name value that gets to here should be valid.
  85             codec_info = codecs.lookup(python_name)
  86         encoding = Encoding(name, codec_info)
  87         CACHE[name] = encoding
  88     return encoding
  89
  90
  91 def _get_encoding(encoding_or_label):
  92     """
  93     Accept either an encoding object or label.
  94
  95     :param encoding: An :class:`Encoding` object or a label string.
  96     :returns: An :class:`Encoding` object.
  97     :raises: :exc:`~exceptions.LookupError` for an unknown label.
  98
  99     """
 100     if hasattr(encoding_or_label, 'codec_info'):
 101         return encoding_or_label
 102
 103     encoding = lookup(encoding_or_label)
 104     if encoding is None:
 105         raise LookupError('Unknown encoding label: %r' % encoding_or_label)
 106     return encoding
 107
 108
 109 class Encoding(object):
 110     """Reresents a character encoding such as UTF-8,
 111     that can be used for decoding or encoding.
 112
 113     .. attribute:: name
 114
 115         Canonical name of the encoding
 116
 117     .. attribute:: codec_info
 118
 119         The actual implementation of the encoding,
 120         a stdlib :class:`~codecs.CodecInfo` object.
 121         See :func:`codecs.register`.
 122
 123     """
 124     def __init__(self, name, codec_info):
 125         self.name = name
 126         self.codec_info = codec_info
 127
 128     def __repr__(self):
 129         return '<Encoding %s>' % self.name
 130
 131
#: The UTF-8 encoding. Should be used for new content and formats.
#: Also the default ``encoding`` argument of the encoding functions below.
UTF8 = lookup('utf-8')

# Private handles on the two UTF-16 byte orders. _detect_bom() returns them
# when the input starts with the corresponding byte order mark.
_UTF16LE = lookup('utf-16le')
_UTF16BE = lookup('utf-16be')
 137
 138
 139 def decode(input, fallback_encoding, errors='replace'):
 140     """
 141     Decode a single string.
 142
 143     :param input: A byte string
 144     :param fallback_encoding:
 145         An :class:`Encoding` object or a label string.
 146         The encoding to use if :obj:`input` does note have a BOM.
 147     :param errors: Type of error handling. See :func:`codecs.register`.
 148     :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
 149     :return:
 150         A ``(output, encoding)`` tuple of an Unicode string
 151         and an :obj:`Encoding`.
 152
 153     """
 154     # Fail early if `encoding` is an invalid label.
 155     fallback_encoding = _get_encoding(fallback_encoding)
 156     bom_encoding, input = _detect_bom(input)
 157     encoding = bom_encoding or fallback_encoding
 158     return encoding.codec_info.decode(input, errors)[0], encoding
 159
 160
 161 def _detect_bom(input):
 162     """Return (bom_encoding, input), with any BOM removed from the input."""
 163     if input.startswith(b'\xFF\xFE'):
 164         return _UTF16LE, input[2:]
 165     if input.startswith(b'\xFE\xFF'):
 166         return _UTF16BE, input[2:]
 167     if input.startswith(b'\xEF\xBB\xBF'):
 168         return UTF8, input[3:]
 169     return None, input
 170
 171
 172 def encode(input, encoding=UTF8, errors='strict'):
 173     """
 174     Encode a single string.
 175
 176     :param input: An Unicode string.
 177     :param encoding: An :class:`Encoding` object or a label string.
 178     :param errors: Type of error handling. See :func:`codecs.register`.
 179     :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
 180     :return: A byte string.
 181
 182     """
 183     return _get_encoding(encoding).codec_info.encode(input, errors)[0]
 184
 185
 186 def iter_decode(input, fallback_encoding, errors='replace'):
 187     """
 188     "Pull"-based decoder.
 189
 190     :param input:
 191         An iterable of byte strings.
 192
 193         The input is first consumed just enough to determine the encoding
 194         based on the precense of a BOM,
 195         then consumed on demand when the return value is.
 196     :param fallback_encoding:
 197         An :class:`Encoding` object or a label string.
 198         The encoding to use if :obj:`input` does note have a BOM.
 199     :param errors: Type of error handling. See :func:`codecs.register`.
 200     :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
 201     :returns:
 202         An ``(output, encoding)`` tuple.
 203         :obj:`output` is an iterable of Unicode strings,
 204         :obj:`encoding` is the :obj:`Encoding` that is being used.
 205
 206     """
 207
 208     decoder = IncrementalDecoder(fallback_encoding, errors)
 209     generator = _iter_decode_generator(input, decoder)
 210     encoding = next(generator)
 211     return generator, encoding
 212
 213
 214 def _iter_decode_generator(input, decoder):
 215     """Return a generator that first yields the :obj:`Encoding`,
 216     then yields output chukns as Unicode strings.
 217
 218     """
 219     decode = decoder.decode
 220     input = iter(input)
 221     for chunck in input:
 222         output = decode(chunck)
 223         if output:
 224             assert decoder.encoding is not None
 225             yield decoder.encoding
 226             yield output
 227             break
 228     else:
 229         # Input exhausted without determining the encoding
 230         output = decode(b'', final=True)
 231         assert decoder.encoding is not None
 232         yield decoder.encoding
 233         if output:
 234             yield output
 235         return
 236
 237     for chunck in input:
 238         output = decode(chunck)
 239         if output:
 240             yield output
 241     output = decode(b'', final=True)
 242     if output:
 243         yield output
 244
 245
 246 def iter_encode(input, encoding=UTF8, errors='strict'):
 247     """
 248     “Pull”-based encoder.
 249
 250     :param input: An iterable of Unicode strings.
 251     :param encoding: An :class:`Encoding` object or a label string.
 252     :param errors: Type of error handling. See :func:`codecs.register`.
 253     :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
 254     :returns: An iterable of byte strings.
 255
 256     """
 257     # Fail early if `encoding` is an invalid label.
 258     encode = IncrementalEncoder(encoding, errors).encode
 259     return _iter_encode_generator(input, encode)
 260
 261
 262 def _iter_encode_generator(input, encode):
 263     for chunck in input:
 264         output = encode(chunck)
 265         if output:
 266             yield output
 267     output = encode('', final=True)
 268     if output:
 269         yield output
 270
 271
 272 class IncrementalDecoder(object):
 273     """
 274     “Push”-based decoder.
 275
 276     :param fallback_encoding:
 277         An :class:`Encoding` object or a label string.
 278         The encoding to use if :obj:`input` does note have a BOM.
 279     :param errors: Type of error handling. See :func:`codecs.register`.
 280     :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
 281
 282     """
 283     def __init__(self, fallback_encoding, errors='replace'):
 284         # Fail early if `encoding` is an invalid label.
 285         self._fallback_encoding = _get_encoding(fallback_encoding)
 286         self._errors = errors
 287         self._buffer = b''
 288         self._decoder = None
 289         #: The actual :class:`Encoding` that is being used,
 290         #: or :obj:`None` if that is not determined yet.
 291         #: (Ie. if there is not enough input yet to determine
 292         #: if there is a BOM.)
 293         self.encoding = None  # Not known yet.
 294
 295     def decode(self, input, final=False):
 296         """Decode one chunk of the input.
 297
 298         :param input: A byte string.
 299         :param final:
 300             Indicate that no more input is available.
 301             Must be :obj:`True` if this is the last call.
 302         :returns: An Unicode string.
 303
 304         """
 305         decoder = self._decoder
 306         if decoder is not None:
 307             return decoder(input, final)
 308
 309         input = self._buffer + input
 310         encoding, input = _detect_bom(input)
 311         if encoding is None:
 312             if len(input) < 3 and not final:  # Not enough data yet.
 313                 self._buffer = input
 314                 return ''
 315             else:  # No BOM
 316                 encoding = self._fallback_encoding
 317         decoder = encoding.codec_info.incrementaldecoder(self._errors).decode
 318         self._decoder = decoder
 319         self.encoding = encoding
 320         return decoder(input, final)
 321
 322
 323 class IncrementalEncoder(object):
 324     """
 325     “Push”-based encoder.
 326
 327     :param encoding: An :class:`Encoding` object or a label string.
 328     :param errors: Type of error handling. See :func:`codecs.register`.
 329     :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
 330
 331     .. method:: encode(input, final=False)
 332
 333         :param input: An Unicode string.
 334         :param final:
 335             Indicate that no more input is available.
 336             Must be :obj:`True` if this is the last call.
 337         :returns: A byte string.
 338
 339     """
 340     def __init__(self, encoding=UTF8, errors='strict'):
 341         encoding = _get_encoding(encoding)
 342         self.encode = encoding.codec_info.incrementalencoder(errors).encode