1 from __future__ import absolute_import, division, unicode_literals
3 from pip._vendor.six import text_type, binary_type
4 from pip._vendor.six.moves import http_client, urllib
9 from pip._vendor import webencodings
11 from .constants import EOF, spaceCharacters, asciiLetters, asciiUppercase
12 from .constants import ReparseException
15 from io import StringIO
18 from io import BytesIO
# Byte-string versions of the pre-parser's character classes (the encoding
# prescan works on raw bytes, not decoded text).
spaceCharactersBytes = frozenset(item.encode("ascii") for item in spaceCharacters)
asciiLettersBytes = frozenset(item.encode("ascii") for item in asciiLetters)
asciiUppercaseBytes = frozenset(item.encode("ascii") for item in asciiUppercase)
spacesAngleBrackets = spaceCharactersBytes | frozenset({b">", b"<"})
# Regex character class of code points that are never legal in a document:
# C0/C1 controls (minus whitespace), noncharacters, and the FFFE/FFFF pair of
# every plane.  Lone surrogates are appended separately below because a
# \uD800-\uDFFF literal is itself illegal on some platforms.
invalid_unicode_no_surrogate = "[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]" # noqa

if _utils.supports_lone_surrogates:
    # Use one extra step of indirection and create surrogates with
    # eval. Not using this indirection would introduce an illegal
    # unicode literal on platforms not supporting such lone
    # The class above must end with a single "]" so the surrogate range can
    # be spliced in just before it.
    assert invalid_unicode_no_surrogate[-1] == "]" and invalid_unicode_no_surrogate.count("]") == 1
    invalid_unicode_re = re.compile(invalid_unicode_no_surrogate[:-1] +
                                    eval('"\\uD800-\\uDFFF"') + # pylint:disable=eval-used
    # NOTE(review): the closing '"]"' argument of the re.compile() call above
    # and the "else:" branch header appear to be missing from this excerpt.
    invalid_unicode_re = re.compile(invalid_unicode_no_surrogate)

# Astral-plane noncharacters, checked explicitly by characterErrorsUCS2 after
# reassembling a surrogate pair (see characterErrorsUCS2 below).
non_bmp_invalid_codepoints = set([0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
                                  0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF,
                                  0x6FFFE, 0x6FFFF, 0x7FFFE, 0x7FFFF, 0x8FFFE,
                                  0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
                                  0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE,
                                  0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE, 0xFFFFF,
# NOTE(review): the final elements and the closing "])" of this set literal
# appear to be missing from this excerpt.

# ASCII whitespace plus ASCII punctuation ranges.
ascii_punctuation_re = re.compile("[\u0009-\u000D\u0020-\u002F\u003A-\u0040\u005B-\u0060\u007B-\u007E]")

# Cache for charsUntil()
class BufferedStream(object):
    """Buffering for streams that do not have buffering of their own
    The buffer is implemented as a list of chunks on the assumption that
    joining many strings will be slow since it is O(n**2)

    # NOTE(review): this excerpt is missing several lines of this class (the
    # docstring terminator, part of __init__, and the tell()/seek() method
    # headers the orphaned statements below belong to); surviving code is
    # kept verbatim.

    def __init__(self, stream):
        self.position = [-1, 0] # chunk number, offset

        # (tell() fragment) sum the fully consumed chunks, then add the
        # offset into the current one.
        for chunk in self.buffer[:self.position[0]]:
        pos += self.position[1]

        # (seek() fragment) translate an absolute position into a
        # (chunk index, offset-within-chunk) pair by walking the chunks.
        assert pos <= self._bufferedBytes()
        while len(self.buffer[i]) < offset:
            offset -= len(self.buffer[i])
        self.position = [i, offset]

    def read(self, bytes):
        # Serve from the wrapped stream when the buffer is empty or fully
        # consumed, otherwise replay buffered data first.
            return self._readStream(bytes)
        elif (self.position[0] == len(self.buffer) and
              self.position[1] == len(self.buffer[-1])):
            return self._readStream(bytes)
            return self._readFromBuffer(bytes)
94 def _bufferedBytes(self):
95 return sum([len(item) for item in self.buffer])
    def _readStream(self, bytes):
        # Pull fresh data from the wrapped stream, remembering it so that a
        # later seek() can replay it, and advance the cursor past it.
        data = self.stream.read(bytes)
        self.buffer.append(data)
        self.position[0] += 1
        self.position[1] = len(data)
        # NOTE(review): the method presumably returns `data`; the return
        # statement is missing from this excerpt.

    def _readFromBuffer(self, bytes):
        # Satisfy a read from already-buffered chunks, falling back to the
        # live stream for any remainder.
        remainingBytes = bytes
        bufferIndex = self.position[0]
        bufferOffset = self.position[1]
        while bufferIndex < len(self.buffer) and remainingBytes != 0:
            assert remainingBytes > 0
            bufferedData = self.buffer[bufferIndex]
            # Either the current chunk can satisfy the rest of the request...
            if remainingBytes <= len(bufferedData) - bufferOffset:
                bytesToRead = remainingBytes
                self.position = [bufferIndex, bufferOffset + bytesToRead]
            # ...or we consume what is left of it and move on (the "else:"
            # header is missing from this excerpt).
                bytesToRead = len(bufferedData) - bufferOffset
                self.position = [bufferIndex, len(bufferedData)]
            rv.append(bufferedData[bufferOffset:bufferOffset + bytesToRead])
            remainingBytes -= bytesToRead

        # Anything still outstanding has to come from the wrapped stream.
            rv.append(self._readStream(remainingBytes))
def HTMLInputStream(source, **kwargs):
    """Factory: wrap *source* in a unicode- or byte-oriented input stream.

    Text sources are routed to HTMLUnicodeInputStream, byte sources to
    HTMLBinaryInputStream (which accepts the *_encoding keyword arguments).
    """
    # Work around Python bug #20007: read(0) closes the connection.
    # http://bugs.python.org/issue20007
    if (isinstance(source, http_client.HTTPResponse) or
            # Also check for addinfourl wrapping HTTPResponse
            (isinstance(source, urllib.response.addbase) and
             isinstance(source.fp, http_client.HTTPResponse))):
    # NOTE(review): the HTTPResponse branch body, the "else:" headers and the
    # "if isUnicode:" structure are missing from this excerpt; surviving
    # statements are kept verbatim.
    elif hasattr(source, "read"):
        isUnicode = isinstance(source.read(0), text_type)
        isUnicode = isinstance(source, text_type)

        # Explicit *_encoding arguments only make sense for byte input.
        encodings = [x for x in kwargs if x.endswith("_encoding")]
        raise TypeError("Cannot set an encoding with a unicode input, set %r" % encodings)
        return HTMLUnicodeInputStream(source, **kwargs)
        return HTMLBinaryInputStream(source, **kwargs)
class HTMLUnicodeInputStream(object):
    """Provides a unicode stream of characters to the HTMLTokenizer.
    This class takes care of character encoding and removing or replacing
    incorrect byte-sequences and also provides column and line tracking.

    # NOTE(review): many lines of this class are missing from this excerpt
    # (docstring terminators, some method headers, branch headers and return
    # statements); all surviving code is kept verbatim.

    # Number of characters fetched per readChunk() call by default.
    _defaultChunkSize = 10240

    def __init__(self, source):
        """Initialises the HTMLInputStream.
        HTMLInputStream(source, [encoding]) -> Normalized stream from source
        source can be either a file-object, local filename or a string.
        The optional encoding parameter must be a string that indicates
        the encoding. If specified, that encoding will be used,
        regardless of any BOM or later declaration (such as in a meta
        # Pick the character-error reporter matching this build's unicode
        # representation (wide vs narrow).
        if not _utils.supports_lone_surrogates:
            # Such platforms will have already checked for such
            # surrogate errors, so no need to do this checking.
            self.reportCharacterErrors = None
        elif len("\U0010FFFF") == 1:
            self.reportCharacterErrors = self.characterErrorsUCS4
            self.reportCharacterErrors = self.characterErrorsUCS2

        # List of where new lines occur
        self.charEncoding = (lookupEncoding("utf-8"), "certain")
        self.dataStream = self.openStream(source)

        # number of (complete) lines in previous chunks
        self.prevNumLines = 0
        # number of columns in the last line of the previous chunk
        # Deal with CR LF and surrogates split over chunk boundaries
        self._bufferedCharacter = None

    def openStream(self, source):
        """Produces a file object from source.
        source can be either a file object, local filename or a string.
        # Already a file object
        if hasattr(source, 'read'):
            stream = StringIO(source)

    def _position(self, offset):
        # Translate *offset* within the current chunk into absolute
        # (line, column) coordinates using the running per-chunk totals.
        nLines = chunk.count('\n', 0, offset)
        positionLine = self.prevNumLines + nLines
        lastLinePos = chunk.rfind('\n', 0, offset)
        if lastLinePos == -1:
            # Still on the line the chunk started on.
            positionColumn = self.prevNumCols + offset
            positionColumn = offset - (lastLinePos + 1)
        return (positionLine, positionColumn)

        # (position() fragment; its def line is missing from this excerpt)
        """Returns (line, col) of the current position in the stream."""
        line, col = self._position(self.chunkOffset)
        return (line + 1, col)

        # (char() fragment; its def line is missing from this excerpt)
        """ Read one character from the stream or queue if available. Return
        EOF when EOF is reached.
        # Read a new chunk from the input stream if necessary
        if self.chunkOffset >= self.chunkSize:
            if not self.readChunk():
        chunkOffset = self.chunkOffset
        char = self.chunk[chunkOffset]
        self.chunkOffset = chunkOffset + 1

    def readChunk(self, chunkSize=None):
        # Refill self.chunk from the decoded stream; callers treat a falsy
        # return as EOF (see the char() fragment above).
        if chunkSize is None:
            chunkSize = self._defaultChunkSize
        # Fold the finished chunk's line/column counts into the totals.
        self.prevNumLines, self.prevNumCols = self._position(self.chunkSize)
        data = self.dataStream.read(chunkSize)

        # Deal with CR LF and surrogates broken across chunks
        if self._bufferedCharacter:
            data = self._bufferedCharacter + data
            self._bufferedCharacter = None

        # We have no more data, bye-bye stream

        # Hold back a trailing CR or lead surrogate so its partner in the
        # next chunk can complete it.
        lastv = ord(data[-1])
        if lastv == 0x0D or 0xD800 <= lastv <= 0xDBFF:
            self._bufferedCharacter = data[-1]

        if self.reportCharacterErrors:
            self.reportCharacterErrors(data)

        # Replace invalid characters
        data = data.replace("\r\n", "\n")
        data = data.replace("\r", "\n")

        self.chunkSize = len(data)
293 def characterErrorsUCS4(self, data):
294 for _ in range(len(invalid_unicode_re.findall(data))):
295 self.errors.append("invalid-codepoint")
    def characterErrorsUCS2(self, data):
        # Narrow-build variant: astral characters arrive as surrogate pairs,
        # so each regex hit has to be re-examined in context.
        # Someone picked the wrong compile option
        # NOTE(review): the assignment of `pos` (the match position) and some
        # branch headers are missing from this excerpt.
        for match in invalid_unicode_re.finditer(data):
            codepoint = ord(match.group())
            # Pretty sure there should be endianness issues here
            if _utils.isSurrogatePair(data[pos:pos + 2]):
                # We have a surrogate pair!
                char_val = _utils.surrogatePairToCodepoint(data[pos:pos + 2])
                if char_val in non_bmp_invalid_codepoints:
                    self.errors.append("invalid-codepoint")
            elif (codepoint >= 0xD800 and codepoint <= 0xDFFF and
                  pos == len(data) - 1):
                # Lone surrogate at the very end of the chunk.
                self.errors.append("invalid-codepoint")
                self.errors.append("invalid-codepoint")

    def charsUntil(self, characters, opposite=False):
        """ Returns a string of characters from the stream up to but not
        including any character in 'characters' or EOF. 'characters' must be
        a container that supports the 'in' method and iteration over its
        # Use a cache of regexps to find the required characters
            chars = charsUntilRegEx[(characters, opposite)]
            # Cache miss: build a character class of the escaped target
            # bytes, negated ("^") when `opposite` is requested.
            regex = "".join(["\\x%02x" % ord(c) for c in characters])
                regex = "^%s" % regex
            chars = charsUntilRegEx[(characters, opposite)] = re.compile("[%s]+" % regex)

        # Accumulate matching spans chunk by chunk into rv.
        # Find the longest matching prefix
        m = chars.match(self.chunk, self.chunkOffset)

        # If nothing matched, and it wasn't because we ran out of chunk,
        if self.chunkOffset != self.chunkSize:

        # If not the whole chunk matched, return everything
        # up to the part that didn't match
        if end != self.chunkSize:
            rv.append(self.chunk[self.chunkOffset:end])
            self.chunkOffset = end

        # If the whole remainder of the chunk matched,
        # use it all and read the next chunk
        rv.append(self.chunk[self.chunkOffset:])
        if not self.readChunk():

    def unget(self, char):
        # Only one character is allowed to be ungotten at once - it must
        # be consumed again before any further call to unget
        if self.chunkOffset == 0:
            # unget is called quite rarely, so it's a good idea to do
            # more work here if it saves a bit of work in the frequently
            # called char and charsUntil.
            # So, just prepend the ungotten character onto the current
            self.chunk = char + self.chunk
            # (else branch:) step the offset back over the consumed char.
            self.chunkOffset -= 1
            assert self.chunk[self.chunkOffset] == char
class HTMLBinaryInputStream(HTMLUnicodeInputStream):
    """Provides a unicode stream of characters to the HTMLTokenizer.
    This class takes care of character encoding and removing or replacing
    incorrect byte-sequences and also provides column and line tracking.

    # NOTE(review): as elsewhere in this excerpt a number of lines are
    # missing (docstring terminators, branch headers, return statements);
    # surviving code is kept verbatim.

    def __init__(self, source, override_encoding=None, transport_encoding=None,
                 same_origin_parent_encoding=None, likely_encoding=None,
                 default_encoding="windows-1252", useChardet=True):
        """Initialises the HTMLInputStream.
        HTMLInputStream(source, [encoding]) -> Normalized stream from source
        source can be either a file-object, local filename or a string.
        The optional encoding parameter must be a string that indicates
        the encoding. If specified, that encoding will be used,
        regardless of any BOM or later declaration (such as in a meta
        # Raw Stream - for unicode objects this will encode to utf-8 and set
        # self.charEncoding as appropriate
        self.rawStream = self.openStream(source)
        HTMLUnicodeInputStream.__init__(self, self.rawStream)

        # Encoding Information
        # Number of bytes to use when looking for a meta element with
        # encoding information
        self.numBytesMeta = 1024
        # Number of bytes to use when using detecting encoding using chardet
        self.numBytesChardet = 100

        # Encoding hints, consulted in priority order by determineEncoding().
        self.override_encoding = override_encoding
        self.transport_encoding = transport_encoding
        self.same_origin_parent_encoding = same_origin_parent_encoding
        self.likely_encoding = likely_encoding
        self.default_encoding = default_encoding

        # Resolve the (encoding, confidence) pair up front.
        self.charEncoding = self.determineEncoding(useChardet)
        assert self.charEncoding[0] is not None

        # (reset() fragment; its def line is missing from this excerpt)
        # rebuild the decoding reader around the detected encoding.
        self.dataStream = self.charEncoding[0].codec_info.streamreader(self.rawStream, 'replace')
        HTMLUnicodeInputStream.reset(self)

    def openStream(self, source):
        """Produces a file object from source.
        source can be either a file object, local filename or a string.
        # Already a file object
        if hasattr(source, 'read'):
            stream = BytesIO(source)

        try:
            stream.seek(stream.tell())
        except: # pylint:disable=bare-except
            # Not seekable: wrap it so encoding detection can rewind.
            stream = BufferedStream(stream)

    def determineEncoding(self, chardet=True):
        # Try each encoding source in decreasing order of authority; the
        # second tuple element records how trustworthy the result is.
        # BOMs take precedence over everything
        # This will also read past the BOM if present
        charEncoding = self.detectBOM(), "certain"
        if charEncoding[0] is not None:

        # If we've been overriden, we've been overriden
        charEncoding = lookupEncoding(self.override_encoding), "certain"
        if charEncoding[0] is not None:

        # Now check the transport layer
        charEncoding = lookupEncoding(self.transport_encoding), "certain"
        if charEncoding[0] is not None:

        # Look for meta elements with encoding information
        charEncoding = self.detectEncodingMeta(), "tentative"
        if charEncoding[0] is not None:

        # Parent document encoding
        charEncoding = lookupEncoding(self.same_origin_parent_encoding), "tentative"
        if charEncoding[0] is not None and not charEncoding[0].name.startswith("utf-16"):

        # Embedder-supplied "likely" hint.
        charEncoding = lookupEncoding(self.likely_encoding), "tentative"
        if charEncoding[0] is not None:

        # Guess with chardet, if available
            from chardet.universaldetector import UniversalDetector
            # Feed the raw stream to chardet in small increments until it is
            # confident, then rewind for the real parse.
            detector = UniversalDetector()
            while not detector.done:
                buffer = self.rawStream.read(self.numBytesChardet)
                assert isinstance(buffer, bytes)
                buffers.append(buffer)
                detector.feed(buffer)
            encoding = lookupEncoding(detector.result['encoding'])
            self.rawStream.seek(0)
            if encoding is not None:
                return encoding, "tentative"

        # Try the default encoding
        charEncoding = lookupEncoding(self.default_encoding), "tentative"
        if charEncoding[0] is not None:

        # Fallback to html5lib's default if even that hasn't worked
        return lookupEncoding("windows-1252"), "tentative"

    def changeEncoding(self, newEncoding):
        # Mid-parse encoding switch (e.g. from a late <meta>); only legal
        # while the current encoding is still tentative.
        assert self.charEncoding[1] != "certain"
        newEncoding = lookupEncoding(newEncoding)
        if newEncoding is None:
        # A requested UTF-16 encoding is replaced by UTF-8 here.
        if newEncoding.name in ("utf-16be", "utf-16le"):
            newEncoding = lookupEncoding("utf-8")
            assert newEncoding is not None
        elif newEncoding == self.charEncoding[0]:
            self.charEncoding = (self.charEncoding[0], "certain")
            # Rewind and signal the parser to start over with the new
            # encoding.
            self.rawStream.seek(0)
            self.charEncoding = (newEncoding, "certain")
            raise ReparseException("Encoding changed from %s to %s" % (self.charEncoding[0], newEncoding))

        # (detectBOM() fragment; its def line and the bomDict assignment
        # header are missing from this excerpt)
        """Attempts to detect at BOM at the start of the stream. If
        an encoding can be determined from the BOM return the name of the
        encoding otherwise return None"""
            codecs.BOM_UTF8: 'utf-8',
            codecs.BOM_UTF16_LE: 'utf-16le', codecs.BOM_UTF16_BE: 'utf-16be',
            codecs.BOM_UTF32_LE: 'utf-32le', codecs.BOM_UTF32_BE: 'utf-32be'

        # Go to beginning of file and read in 4 bytes
        string = self.rawStream.read(4)
        assert isinstance(string, bytes)

        # Try detecting the BOM using bytes from the string
        encoding = bomDict.get(string[:3]) # UTF-8
        # Need to detect UTF-32 before UTF-16
        encoding = bomDict.get(string) # UTF-32
        encoding = bomDict.get(string[:2]) # UTF-16

        # Set the read position past the BOM if one was found, otherwise
        # set it to the start of the stream
        self.rawStream.seek(seek)
        return lookupEncoding(encoding)
        self.rawStream.seek(0)

    def detectEncodingMeta(self):
        """Report the encoding declared by the meta element
        # Prescan a bounded prefix of the raw bytes, then rewind.
        buffer = self.rawStream.read(self.numBytesMeta)
        assert isinstance(buffer, bytes)
        parser = EncodingParser(buffer)
        self.rawStream.seek(0)
        encoding = parser.getEncoding()

        # A meta-declared UTF-16 is coerced to UTF-8.
        if encoding is not None and encoding.name in ("utf-16be", "utf-16le"):
            encoding = lookupEncoding("utf-8")
class EncodingBytes(bytes):
    """String-like object with an associated position and various extra methods
    If the position is ever greater than the string length then an exception is

    # NOTE(review): lines are missing throughout this class in this excerpt
    # (docstring terminator, the __next__/previous method headers, raise
    # statements); surviving code is kept verbatim.

    def __new__(self, value):
        # Lower-case on construction so all subsequent matching is
        # case-insensitive.
        assert isinstance(value, bytes)
        return bytes.__new__(self, value.lower())

    def __init__(self, value):
        # pylint:disable=unused-argument

        # (__next__ fragment) advance the cursor by one byte.
        p = self._position = self._position + 1

        # (fragment; presumably the Py2 next() alias — TODO confirm)
        return self.__next__()

        # (previous() fragment) step the cursor back by one byte.
        self._position = p = p - 1

    def setPosition(self, position):
        if self._position >= len(self):
        self._position = position

    def getPosition(self):
        if self._position >= len(self):
        if self._position >= 0:
            return self._position

    position = property(getPosition, setPosition)
635 def getCurrentByte(self):
636 return self[self.position:self.position + 1]
638 currentByte = property(getCurrentByte)
    def skip(self, chars=spaceCharactersBytes):
        """Skip past a list of characters"""
        p = self.position # use property for the error-checking
        # NOTE(review): the loop body of skip() is missing from this excerpt.

    def skipUntil(self, chars):
        # NOTE(review): body missing from this excerpt; by symmetry with
        # skip() it presumably advances until a byte in *chars* — confirm.

    def matchBytes(self, bytes):
        """Look for a sequence of bytes at the start of a string. If the bytes
        are found return True and advance the position to the byte after the
        match. Otherwise return False and leave the position alone"""
        data = self[p:p + len(bytes)]
        rv = data.startswith(bytes)
        # Only consume input when it actually matched.
            self.position += len(bytes)

    def jumpTo(self, bytes):
        """Look for the next sequence of bytes matching a given sequence. If
        a match is found advance the position to the last byte of the match"""
        newPosition = self[self.position:].find(bytes)
        # XXX: This is ugly, but I can't see a nicer way to fix this.
        if self._position == -1:
        self._position += (newPosition + len(bytes) - 1)
class EncodingParser(object):
    """Mini parser for detecting character encoding from meta elements"""

    def __init__(self, data):
        """string - the data to work on for encoding detection"""
        self.data = EncodingBytes(data)
        # NOTE(review): the initialisation of self.encoding (assigned by
        # handleMeta below) appears to be missing from this excerpt.

    def getEncoding(self):
        # Prefix -> handler dispatch table for the prescan (the assignment
        # header for this tuple is missing from this excerpt).
            (b"<!--", self.handleComment),
            (b"<meta", self.handleMeta),
            (b"</", self.handlePossibleEndTag),
            (b"<!", self.handleOther),
            (b"<?", self.handleOther),
            (b"<", self.handlePossibleStartTag))

        # Walk the byte cursor, dispatching on the first matching prefix.
        for key, method in methodDispatch:
            if self.data.matchBytes(key):
                keepParsing = method()
            except StopIteration:
719 def handleComment(self):
720 """Skip over comments"""
721 return self.data.jumpTo(b"-->")
    def handleMeta(self):
        # Scan a <meta ...> element's attributes for an encoding declaration.
        if self.data.currentByte not in spaceCharactersBytes:
            # if we have <meta not followed by a space so just keep going

        # We have a valid meta element we want to search for attributes
        # NOTE(review): the attribute loop header, `return` statements and
        # several branch headers are missing from this excerpt.
        pendingEncoding = None
            # Try to find the next attribute after the current position
            attr = self.getAttribute()

            if attr[0] == b"http-equiv":
                hasPragma = attr[1] == b"content-type"
                # A pragma seen after a content attribute activates the
                # encoding that was parked in pendingEncoding.
                if hasPragma and pendingEncoding is not None:
                    self.encoding = pendingEncoding
            elif attr[0] == b"charset":
                tentativeEncoding = attr[1]
                codec = lookupEncoding(tentativeEncoding)
                if codec is not None:
                    self.encoding = codec
            elif attr[0] == b"content":
                # content="...charset=x" needs its own mini-parser.
                contentParser = ContentAttrParser(EncodingBytes(attr[1]))
                tentativeEncoding = contentParser.parse()
                if tentativeEncoding is not None:
                    codec = lookupEncoding(tentativeEncoding)
                    if codec is not None:
                        self.encoding = codec
                        pendingEncoding = codec
759 def handlePossibleStartTag(self):
760 return self.handlePossibleTag(False)
    def handlePossibleEndTag(self):
        # NOTE(review): the original presumably consumes one byte (the "/")
        # before delegating; that line is missing from this excerpt.
        return self.handlePossibleTag(True)

    def handlePossibleTag(self, endTag):
        # Shared scanner for start and end tags.
        # NOTE(review): the `data` binding, early returns and loop headers
        # are missing from this excerpt.
        if data.currentByte not in asciiLettersBytes:
            # If the next byte is not an ascii letter either ignore this
            # fragment (possible start tag case) or treat it according to

        c = data.skipUntil(spacesAngleBrackets)
        # return to the first step in the overall "two step" algorithm
        # reprocessing the < byte

        # Read all attributes
        attr = self.getAttribute()
        while attr is not None:
            attr = self.getAttribute()
789 def handleOther(self):
790 return self.data.jumpTo(b">")
    def getAttribute(self):
        """Return a name,value pair for the next attribute in the stream,
        if one is found, or None"""
        # NOTE(review): the `data` binding, the attrName/attrValue list
        # initialisations, the scanning loop headers and several `else`
        # branches are missing from this excerpt; surviving code verbatim.

        # Step 1 (skip chars)
        c = data.skip(spaceCharactersBytes | frozenset([b"/"]))
        assert c is None or len(c) == 1
        # Tag end or EOF: no further attributes.
        if c in (b">", None):

        # Step 4 attribute name
            if c == b"=" and attrName:
            elif c in spaceCharactersBytes:
            elif c in (b"/", b">"):
                return b"".join(attrName), b""
            elif c in asciiUppercaseBytes:
                # Names are normalised to lower case as they are read.
                attrName.append(c.lower())

            return b"".join(attrName), b""

        # Quoted attribute value: collect until the matching quote.
        if c in (b"'", b'"'):
            return b"".join(attrName), b"".join(attrValue)
        elif c in asciiUppercaseBytes:
            attrValue.append(c.lower())

            return b"".join(attrName), b""
        elif c in asciiUppercaseBytes:
            attrValue.append(c.lower())

        # Unquoted value: runs until whitespace or an angle bracket.
        if c in spacesAngleBrackets:
            return b"".join(attrName), b"".join(attrValue)
        elif c in asciiUppercaseBytes:
            attrValue.append(c.lower())
class ContentAttrParser(object):
    # Extracts the charset=... value from a meta "content" attribute body.
    def __init__(self, data):
        assert isinstance(data, bytes)
        # NOTE(review): the `self.data = ...` assignment, the parse() method
        # header and its try: lines are missing from this excerpt.

        # Check if the attr name is charset
        self.data.jumpTo(b"charset")
        self.data.position += 1
        if not self.data.currentByte == b"=":
            # If there is no = sign keep looking for attrs
        self.data.position += 1

        # Look for an encoding between matching quote marks
        if self.data.currentByte in (b'"', b"'"):
            quoteMark = self.data.currentByte
            self.data.position += 1
            oldPosition = self.data.position
            if self.data.jumpTo(quoteMark):
                return self.data[oldPosition:self.data.position]

            # Unquoted value: runs until the next space character.
            oldPosition = self.data.position
            self.data.skipUntil(spaceCharactersBytes)
            return self.data[oldPosition:self.data.position]
        except StopIteration:
            # Return the whole remaining value
            return self.data[oldPosition:]
        except StopIteration:
def lookupEncoding(encoding):
    """Return the python codec name corresponding to an encoding or None if the
    string doesn't correspond to a valid encoding."""
    # NOTE(review): the try: header for the decode, the `return None` lines
    # and the else branch are missing from this excerpt.
    if isinstance(encoding, binary_type):
        # Encoding labels must be ASCII; anything else cannot be valid.
        encoding = encoding.decode("ascii")
        except UnicodeDecodeError:

    if encoding is not None:
        try:
            # Delegate label resolution to the WHATWG encodings registry.
            return webencodings.lookup(encoding)
        except AttributeError: