1 from __future__ import absolute_import, division, unicode_literals
3 from pip._vendor.six import text_type, binary_type
4 from pip._vendor.six.moves import http_client, urllib
9 from pip._vendor import webencodings
11 from .constants import EOF, spaceCharacters, asciiLetters, asciiUppercase
12 from .constants import ReparseException
15 from io import StringIO
18 from io import BytesIO
# Byte-string versions of the pre-parser's character classes (the encoding
# prescan works on raw bytes, not decoded text).
spaceCharactersBytes = frozenset(item.encode("ascii") for item in spaceCharacters)
asciiLettersBytes = frozenset(item.encode("ascii") for item in asciiLetters)
asciiUppercaseBytes = frozenset(item.encode("ascii") for item in asciiUppercase)
spacesAngleBrackets = spaceCharactersBytes | frozenset({b">", b"<"})
# Regex character class of code points that are never legal in a document:
# C0/C1 controls (minus whitespace), noncharacters, and the FFFE/FFFF pair of
# every plane.  Lone surrogates are appended separately below because a
# \uD800-\uDFFF literal is itself illegal on some platforms.
invalid_unicode_no_surrogate = "[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]" # noqa

if _utils.supports_lone_surrogates:
    # Use one extra step of indirection and create surrogates with
    # eval. Not using this indirection would introduce an illegal
    # unicode literal on platforms not supporting such lone
    # The class above must end with a single "]" so the surrogate range can
    # be spliced in just before it.
    assert invalid_unicode_no_surrogate[-1] == "]" and invalid_unicode_no_surrogate.count("]") == 1
    invalid_unicode_re = re.compile(invalid_unicode_no_surrogate[:-1] +
                                    eval('"\\uD800-\\uDFFF"') + # pylint:disable=eval-used
    # NOTE(review): the closing '"]"' argument of the re.compile() call above
    # and the "else:" branch header appear to be missing from this excerpt.
    invalid_unicode_re = re.compile(invalid_unicode_no_surrogate)

# Astral-plane noncharacters, checked explicitly by characterErrorsUCS2 after
# reassembling a surrogate pair (see characterErrorsUCS2 below).
non_bmp_invalid_codepoints = set([0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
                                  0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF,
                                  0x6FFFE, 0x6FFFF, 0x7FFFE, 0x7FFFF, 0x8FFFE,
                                  0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
                                  0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE,
                                  0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE, 0xFFFFF,
# NOTE(review): the final elements and the closing "])" of this set literal
# appear to be missing from this excerpt.

# ASCII whitespace plus ASCII punctuation ranges.
ascii_punctuation_re = re.compile("[\u0009-\u000D\u0020-\u002F\u003A-\u0040\u005B-\u0060\u007B-\u007E]")

# Cache for charsUntil()
class BufferedStream(object):
    """Buffering for streams that do not have buffering of their own
    The buffer is implemented as a list of chunks on the assumption that
    joining many strings will be slow since it is O(n**2)

    # NOTE(review): this excerpt is missing several lines of this class (the
    # docstring terminator, part of __init__, and the tell()/seek() method
    # headers the orphaned statements below belong to); surviving code is
    # kept verbatim.

    def __init__(self, stream):
        self.position = [-1, 0] # chunk number, offset

        # (tell() fragment) sum the fully consumed chunks, then add the
        # offset into the current one.
        for chunk in self.buffer[:self.position[0]]:
        pos += self.position[1]

        # (seek() fragment) translate an absolute position into a
        # (chunk index, offset-within-chunk) pair by walking the chunks.
        assert pos <= self._bufferedBytes()
        while len(self.buffer[i]) < offset:
            offset -= len(self.buffer[i])
        self.position = [i, offset]

    def read(self, bytes):
        # Serve from the wrapped stream when the buffer is empty or fully
        # consumed, otherwise replay buffered data first.
            return self._readStream(bytes)
        elif (self.position[0] == len(self.buffer) and
              self.position[1] == len(self.buffer[-1])):
            return self._readStream(bytes)
            return self._readFromBuffer(bytes)
94 def _bufferedBytes(self):
95 return sum([len(item) for item in self.buffer])
    def _readStream(self, bytes):
        # Pull fresh data from the wrapped stream, remembering it so that a
        # later seek() can replay it, and advance the cursor past it.
        data = self.stream.read(bytes)
        self.buffer.append(data)
        self.position[0] += 1
        self.position[1] = len(data)
        # NOTE(review): the method presumably returns `data`; the return
        # statement is missing from this excerpt.

    def _readFromBuffer(self, bytes):
        # Satisfy a read from already-buffered chunks, falling back to the
        # live stream for any remainder.
        remainingBytes = bytes
        bufferIndex = self.position[0]
        bufferOffset = self.position[1]
        while bufferIndex < len(self.buffer) and remainingBytes != 0:
            assert remainingBytes > 0
            bufferedData = self.buffer[bufferIndex]
            # Either the current chunk can satisfy the rest of the request...
            if remainingBytes <= len(bufferedData) - bufferOffset:
                bytesToRead = remainingBytes
                self.position = [bufferIndex, bufferOffset + bytesToRead]
            # ...or we consume what is left of it and move on (the "else:"
            # header is missing from this excerpt).
                bytesToRead = len(bufferedData) - bufferOffset
                self.position = [bufferIndex, len(bufferedData)]
            rv.append(bufferedData[bufferOffset:bufferOffset + bytesToRead])
            remainingBytes -= bytesToRead

        # Anything still outstanding has to come from the wrapped stream.
            rv.append(self._readStream(remainingBytes))
def HTMLInputStream(source, **kwargs):
    """Factory: wrap *source* in a unicode- or byte-oriented input stream.

    Text sources are routed to HTMLUnicodeInputStream, byte sources to
    HTMLBinaryInputStream (which accepts the *_encoding keyword arguments).
    """
    # Work around Python bug #20007: read(0) closes the connection.
    # http://bugs.python.org/issue20007
    if (isinstance(source, http_client.HTTPResponse) or
            # Also check for addinfourl wrapping HTTPResponse
            (isinstance(source, urllib.response.addbase) and
             isinstance(source.fp, http_client.HTTPResponse))):
    # NOTE(review): the HTTPResponse branch body, the "else:" headers and the
    # "if isUnicode:" structure are missing from this excerpt; surviving
    # statements are kept verbatim.
    elif hasattr(source, "read"):
        isUnicode = isinstance(source.read(0), text_type)
        isUnicode = isinstance(source, text_type)

        # Explicit *_encoding arguments only make sense for byte input.
        encodings = [x for x in kwargs if x.endswith("_encoding")]
        raise TypeError("Cannot set an encoding with a unicode input, set %r" % encodings)
        return HTMLUnicodeInputStream(source, **kwargs)
        return HTMLBinaryInputStream(source, **kwargs)
class HTMLUnicodeInputStream(object):
    """Provides a unicode stream of characters to the HTMLTokenizer.
    This class takes care of character encoding and removing or replacing
    incorrect byte-sequences and also provides column and line tracking.

    # NOTE(review): many lines of this class are missing from this excerpt
    # (docstring terminators, some method headers, branch headers and return
    # statements); all surviving code is kept verbatim.

    # Number of characters fetched per readChunk() call by default.
    _defaultChunkSize = 10240

    def __init__(self, source):
        """Initialises the HTMLInputStream.
        HTMLInputStream(source, [encoding]) -> Normalized stream from source
        source can be either a file-object, local filename or a string.
        The optional encoding parameter must be a string that indicates
        the encoding. If specified, that encoding will be used,
        regardless of any BOM or later declaration (such as in a meta
        # Pick the character-error reporter matching this build's unicode
        # representation (wide vs narrow).
        if not _utils.supports_lone_surrogates:
            # Such platforms will have already checked for such
            # surrogate errors, so no need to do this checking.
            self.reportCharacterErrors = None
        elif len("\U0010FFFF") == 1:
            self.reportCharacterErrors = self.characterErrorsUCS4
            self.reportCharacterErrors = self.characterErrorsUCS2

        # List of where new lines occur
        self.charEncoding = (lookupEncoding("utf-8"), "certain")
        self.dataStream = self.openStream(source)

        # number of (complete) lines in previous chunks
        self.prevNumLines = 0
        # number of columns in the last line of the previous chunk
        # Deal with CR LF and surrogates split over chunk boundaries
        self._bufferedCharacter = None

    def openStream(self, source):
        """Produces a file object from source.
        source can be either a file object, local filename or a string.
        # Already a file object
        if hasattr(source, 'read'):
            stream = StringIO(source)

    def _position(self, offset):
        # Translate *offset* within the current chunk into absolute
        # (line, column) coordinates using the running per-chunk totals.
        nLines = chunk.count('\n', 0, offset)
        positionLine = self.prevNumLines + nLines
        lastLinePos = chunk.rfind('\n', 0, offset)
        if lastLinePos == -1:
            # Still on the line the chunk started on.
            positionColumn = self.prevNumCols + offset
            positionColumn = offset - (lastLinePos + 1)
        return (positionLine, positionColumn)

        # (position() fragment; its def line is missing from this excerpt)
        """Returns (line, col) of the current position in the stream."""
        line, col = self._position(self.chunkOffset)
        return (line + 1, col)

        # (char() fragment; its def line is missing from this excerpt)
        """ Read one character from the stream or queue if available. Return
        EOF when EOF is reached.
        # Read a new chunk from the input stream if necessary
        if self.chunkOffset >= self.chunkSize:
            if not self.readChunk():
        chunkOffset = self.chunkOffset
        char = self.chunk[chunkOffset]
        self.chunkOffset = chunkOffset + 1

    def readChunk(self, chunkSize=None):
        # Refill self.chunk from the decoded stream; callers treat a falsy
        # return as EOF (see the char() fragment above).
        if chunkSize is None:
            chunkSize = self._defaultChunkSize
        # Fold the finished chunk's line/column counts into the totals.
        self.prevNumLines, self.prevNumCols = self._position(self.chunkSize)
        data = self.dataStream.read(chunkSize)

        # Deal with CR LF and surrogates broken across chunks
        if self._bufferedCharacter:
            data = self._bufferedCharacter + data
            self._bufferedCharacter = None

        # We have no more data, bye-bye stream

        # Hold back a trailing CR or lead surrogate so its partner in the
        # next chunk can complete it.
        lastv = ord(data[-1])
        if lastv == 0x0D or 0xD800 <= lastv <= 0xDBFF:
            self._bufferedCharacter = data[-1]

        if self.reportCharacterErrors:
            self.reportCharacterErrors(data)

        # Replace invalid characters
        data = data.replace("\r\n", "\n")
        data = data.replace("\r", "\n")

        self.chunkSize = len(data)
293 def characterErrorsUCS4(self, data):
294 for _ in range(len(invalid_unicode_re.findall(data))):
295 self.errors.append("invalid-codepoint")
    def characterErrorsUCS2(self, data):
        # Narrow-build variant: astral characters arrive as surrogate pairs,
        # so each regex hit has to be re-examined in context.
        # Someone picked the wrong compile option
        # NOTE(review): the assignment of `pos` (the match position) and some
        # branch headers are missing from this excerpt.
        for match in invalid_unicode_re.finditer(data):
            codepoint = ord(match.group())
            # Pretty sure there should be endianness issues here
            if _utils.isSurrogatePair(data[pos:pos + 2]):
                # We have a surrogate pair!
                char_val = _utils.surrogatePairToCodepoint(data[pos:pos + 2])
                if char_val in non_bmp_invalid_codepoints:
                    self.errors.append("invalid-codepoint")
            elif (codepoint >= 0xD800 and codepoint <= 0xDFFF and
                  pos == len(data) - 1):
                # Lone surrogate at the very end of the chunk.
                self.errors.append("invalid-codepoint")
                self.errors.append("invalid-codepoint")

    def charsUntil(self, characters, opposite=False):
        """ Returns a string of characters from the stream up to but not
        including any character in 'characters' or EOF. 'characters' must be
        a container that supports the 'in' method and iteration over its
        # Use a cache of regexps to find the required characters
            chars = charsUntilRegEx[(characters, opposite)]
            # Cache miss: build a character class of the escaped target
            # bytes, negated ("^") when `opposite` is requested.
            regex = "".join(["\\x%02x" % ord(c) for c in characters])
                regex = "^%s" % regex
            chars = charsUntilRegEx[(characters, opposite)] = re.compile("[%s]+" % regex)

        # Accumulate matching spans chunk by chunk into rv.
        # Find the longest matching prefix
        m = chars.match(self.chunk, self.chunkOffset)

        # If nothing matched, and it wasn't because we ran out of chunk,
        if self.chunkOffset != self.chunkSize:

        # If not the whole chunk matched, return everything
        # up to the part that didn't match
        if end != self.chunkSize:
            rv.append(self.chunk[self.chunkOffset:end])
            self.chunkOffset = end

        # If the whole remainder of the chunk matched,
        # use it all and read the next chunk
        rv.append(self.chunk[self.chunkOffset:])
        if not self.readChunk():

    def unget(self, char):
        # Only one character is allowed to be ungotten at once - it must
        # be consumed again before any further call to unget
        if self.chunkOffset == 0:
            # unget is called quite rarely, so it's a good idea to do
            # more work here if it saves a bit of work in the frequently
            # called char and charsUntil.
            # So, just prepend the ungotten character onto the current
            self.chunk = char + self.chunk
            # (else branch:) step the offset back over the consumed char.
            self.chunkOffset -= 1
            assert self.chunk[self.chunkOffset] == char
class HTMLBinaryInputStream(HTMLUnicodeInputStream):
    """Provides a unicode stream of characters to the HTMLTokenizer.
    This class takes care of character encoding and removing or replacing
    incorrect byte-sequences and also provides column and line tracking.

    # NOTE(review): as elsewhere in this excerpt a number of lines are
    # missing (docstring terminators, branch headers, return statements);
    # surviving code is kept verbatim.

    def __init__(self, source, override_encoding=None, transport_encoding=None,
                 same_origin_parent_encoding=None, likely_encoding=None,
                 default_encoding="windows-1252", useChardet=True):
        """Initialises the HTMLInputStream.
        HTMLInputStream(source, [encoding]) -> Normalized stream from source
        source can be either a file-object, local filename or a string.
        The optional encoding parameter must be a string that indicates
        the encoding. If specified, that encoding will be used,
        regardless of any BOM or later declaration (such as in a meta
        # Raw Stream - for unicode objects this will encode to utf-8 and set
        # self.charEncoding as appropriate
        self.rawStream = self.openStream(source)
        HTMLUnicodeInputStream.__init__(self, self.rawStream)

        # Encoding Information
        # Number of bytes to use when looking for a meta element with
        # encoding information
        self.numBytesMeta = 1024
        # Number of bytes to use when using detecting encoding using chardet
        self.numBytesChardet = 100

        # Encoding hints, consulted in priority order by determineEncoding().
        self.override_encoding = override_encoding
        self.transport_encoding = transport_encoding
        self.same_origin_parent_encoding = same_origin_parent_encoding
        self.likely_encoding = likely_encoding
        self.default_encoding = default_encoding

        # Resolve the (encoding, confidence) pair up front.
        self.charEncoding = self.determineEncoding(useChardet)
        assert self.charEncoding[0] is not None

        # (reset() fragment; its def line is missing from this excerpt)
        # rebuild the decoding reader around the detected encoding.
        self.dataStream = self.charEncoding[0].codec_info.streamreader(self.rawStream, 'replace')
        HTMLUnicodeInputStream.reset(self)

    def openStream(self, source):
        """Produces a file object from source.
        source can be either a file object, local filename or a string.
        # Already a file object
        if hasattr(source, 'read'):
            stream = BytesIO(source)

        try:
            stream.seek(stream.tell())
        except: # pylint:disable=bare-except
            # Not seekable: wrap it so encoding detection can rewind.
            stream = BufferedStream(stream)

    def determineEncoding(self, chardet=True):
        # Try each encoding source in decreasing order of authority; the
        # second tuple element records how trustworthy the result is.
        # BOMs take precedence over everything
        # This will also read past the BOM if present
        charEncoding = self.detectBOM(), "certain"
        if charEncoding[0] is not None:

        # If we've been overriden, we've been overriden
        charEncoding = lookupEncoding(self.override_encoding), "certain"
        if charEncoding[0] is not None:

        # Now check the transport layer
        charEncoding = lookupEncoding(self.transport_encoding), "certain"
        if charEncoding[0] is not None:

        # Look for meta elements with encoding information
        charEncoding = self.detectEncodingMeta(), "tentative"
        if charEncoding[0] is not None:

        # Parent document encoding
        charEncoding = lookupEncoding(self.same_origin_parent_encoding), "tentative"
        if charEncoding[0] is not None and not charEncoding[0].name.startswith("utf-16"):

        # Embedder-supplied "likely" hint.
        charEncoding = lookupEncoding(self.likely_encoding), "tentative"
        if charEncoding[0] is not None:

        # Guess with chardet, if available
            from chardet.universaldetector import UniversalDetector
            # Feed the raw stream to chardet in small increments until it is
            # confident, then rewind for the real parse.
            detector = UniversalDetector()
            while not detector.done:
                buffer = self.rawStream.read(self.numBytesChardet)
                assert isinstance(buffer, bytes)
                buffers.append(buffer)
                detector.feed(buffer)
            encoding = lookupEncoding(detector.result['encoding'])
            self.rawStream.seek(0)
            if encoding is not None:
                return encoding, "tentative"

        # Try the default encoding
        charEncoding = lookupEncoding(self.default_encoding), "tentative"
        if charEncoding[0] is not None:

        # Fallback to html5lib's default if even that hasn't worked
        return lookupEncoding("windows-1252"), "tentative"

    def changeEncoding(self, newEncoding):
        # Mid-parse encoding switch (e.g. from a late <meta>); only legal
        # while the current encoding is still tentative.
        assert self.charEncoding[1] != "certain"
        newEncoding = lookupEncoding(newEncoding)
        if newEncoding is None:
        # A requested UTF-16 encoding is replaced by UTF-8 here.
        if newEncoding.name in ("utf-16be", "utf-16le"):
            newEncoding = lookupEncoding("utf-8")
            assert newEncoding is not None
        elif newEncoding == self.charEncoding[0]:
            self.charEncoding = (self.charEncoding[0], "certain")
            # Rewind and signal the parser to start over with the new
            # encoding.
            self.rawStream.seek(0)
            self.charEncoding = (newEncoding, "certain")
            raise ReparseException("Encoding changed from %s to %s" % (self.charEncoding[0], newEncoding))

        # (detectBOM() fragment; its def line and the bomDict assignment
        # header are missing from this excerpt)
        """Attempts to detect at BOM at the start of the stream. If
        an encoding can be determined from the BOM return the name of the
        encoding otherwise return None"""
            codecs.BOM_UTF8: 'utf-8',
            codecs.BOM_UTF16_LE: 'utf-16le', codecs.BOM_UTF16_BE: 'utf-16be',
            codecs.BOM_UTF32_LE: 'utf-32le', codecs.BOM_UTF32_BE: 'utf-32be'

        # Go to beginning of file and read in 4 bytes
        string = self.rawStream.read(4)
        assert isinstance(string, bytes)

        # Try detecting the BOM using bytes from the string
        encoding = bomDict.get(string[:3]) # UTF-8
        # Need to detect UTF-32 before UTF-16
        encoding = bomDict.get(string) # UTF-32
        encoding = bomDict.get(string[:2]) # UTF-16

        # Set the read position past the BOM if one was found, otherwise
        # set it to the start of the stream
        self.rawStream.seek(seek)
        return lookupEncoding(encoding)
        self.rawStream.seek(0)

    def detectEncodingMeta(self):
        """Report the encoding declared by the meta element
        # Prescan a bounded prefix of the raw bytes, then rewind.
        buffer = self.rawStream.read(self.numBytesMeta)
        assert isinstance(buffer, bytes)
        parser = EncodingParser(buffer)
        self.rawStream.seek(0)
        encoding = parser.getEncoding()

        # A meta-declared UTF-16 is coerced to UTF-8.
        if encoding is not None and encoding.name in ("utf-16be", "utf-16le"):
            encoding = lookupEncoding("utf-8")
class EncodingBytes(bytes):
    """String-like object with an associated position and various extra methods
    If the position is ever greater than the string length then an exception is

    # NOTE(review): lines are missing throughout this class in this excerpt
    # (docstring terminator, the __next__/previous method headers, raise
    # statements); surviving code is kept verbatim.

    def __new__(self, value):
        # Lower-case on construction so all subsequent matching is
        # case-insensitive.
        assert isinstance(value, bytes)
        return bytes.__new__(self, value.lower())

    def __init__(self, value):
        # pylint:disable=unused-argument

        # (__next__ fragment) advance the cursor by one byte.
        p = self._position = self._position + 1

        # (fragment; presumably the Py2 next() alias — TODO confirm)
        return self.__next__()

        # (previous() fragment) step the cursor back by one byte.
        self._position = p = p - 1

    def setPosition(self, position):
        if self._position >= len(self):
        self._position = position

    def getPosition(self):
        if self._position >= len(self):
        if self._position >= 0:
            return self._position

    position = property(getPosition, setPosition)
635 def getCurrentByte(self):
636 return self[self.position:self.position + 1]
638 currentByte = property(getCurrentByte)
    def skip(self, chars=spaceCharactersBytes):
        """Skip past a list of characters"""
        p = self.position # use property for the error-checking
        # NOTE(review): the loop body of skip() is missing from this excerpt.

    def skipUntil(self, chars):
        # NOTE(review): body missing from this excerpt; by symmetry with
        # skip() it presumably advances until a byte in *chars* — confirm.

    def matchBytes(self, bytes):
        """Look for a sequence of bytes at the start of a string. If the bytes
        are found return True and advance the position to the byte after the
        match. Otherwise return False and leave the position alone"""
        data = self[p:p + len(bytes)]
        rv = data.startswith(bytes)
        # Only consume input when it actually matched.
            self.position += len(bytes)

    def jumpTo(self, bytes):
        """Look for the next sequence of bytes matching a given sequence. If
        a match is found advance the position to the last byte of the match"""
        newPosition = self[self.position:].find(bytes)
        # XXX: This is ugly, but I can't see a nicer way to fix this.
        if self._position == -1:
        self._position += (newPosition + len(bytes) - 1)
class EncodingParser(object):
    """Mini parser for detecting character encoding from meta elements"""

    def __init__(self, data):
        """string - the data to work on for encoding detection"""
        self.data = EncodingBytes(data)
        # NOTE(review): the initialisation of self.encoding (assigned by
        # handleMeta below) appears to be missing from this excerpt.

    def getEncoding(self):
        # Prefix -> handler dispatch table for the prescan (the assignment
        # header for this tuple is missing from this excerpt).
            (b"<!--", self.handleComment),
            (b"<meta", self.handleMeta),
            (b"</", self.handlePossibleEndTag),
            (b"<!", self.handleOther),
            (b"<?", self.handleOther),
            (b"<", self.handlePossibleStartTag))

        # Walk the byte cursor, dispatching on the first matching prefix.
        for key, method in methodDispatch:
            if self.data.matchBytes(key):
                keepParsing = method()
            except StopIteration:
719 def handleComment(self):
720 """Skip over comments"""
721 return self.data.jumpTo(b"-->")
    def handleMeta(self):
        # Scan a <meta ...> element's attributes for an encoding declaration.
        if self.data.currentByte not in spaceCharactersBytes:
            # if we have <meta not followed by a space so just keep going

        # We have a valid meta element we want to search for attributes
        # NOTE(review): the attribute loop header, `return` statements and
        # several branch headers are missing from this excerpt.
        pendingEncoding = None
            # Try to find the next attribute after the current position
            attr = self.getAttribute()

            if attr[0] == b"http-equiv":
                hasPragma = attr[1] == b"content-type"
                # A pragma seen after a content attribute activates the
                # encoding that was parked in pendingEncoding.
                if hasPragma and pendingEncoding is not None:
                    self.encoding = pendingEncoding
            elif attr[0] == b"charset":
                tentativeEncoding = attr[1]
                codec = lookupEncoding(tentativeEncoding)
                if codec is not None:
                    self.encoding = codec
            elif attr[0] == b"content":
                # content="...charset=x" needs its own mini-parser.
                contentParser = ContentAttrParser(EncodingBytes(attr[1]))
                tentativeEncoding = contentParser.parse()
                if tentativeEncoding is not None:
                    codec = lookupEncoding(tentativeEncoding)
                    if codec is not None:
                        self.encoding = codec
                        pendingEncoding = codec
759 def handlePossibleStartTag(self):
760 return self.handlePossibleTag(False)
    def handlePossibleEndTag(self):
        # NOTE(review): the original presumably consumes one byte (the "/")
        # before delegating; that line is missing from this excerpt.
        return self.handlePossibleTag(True)

    def handlePossibleTag(self, endTag):
        # Shared scanner for start and end tags.
        # NOTE(review): the `data` binding, early returns and loop headers
        # are missing from this excerpt.
        if data.currentByte not in asciiLettersBytes:
            # If the next byte is not an ascii letter either ignore this
            # fragment (possible start tag case) or treat it according to

        c = data.skipUntil(spacesAngleBrackets)
        # return to the first step in the overall "two step" algorithm
        # reprocessing the < byte

        # Read all attributes
        attr = self.getAttribute()
        while attr is not None:
            attr = self.getAttribute()
789 def handleOther(self):
790 return self.data.jumpTo(b">")
    def getAttribute(self):
        """Return a name,value pair for the next attribute in the stream,
        if one is found, or None"""
        # NOTE(review): the `data` binding, the attrName/attrValue list
        # initialisations, the scanning loop headers and several `else`
        # branches are missing from this excerpt; surviving code verbatim.

        # Step 1 (skip chars)
        c = data.skip(spaceCharactersBytes | frozenset([b"/"]))
        assert c is None or len(c) == 1
        # Tag end or EOF: no further attributes.
        if c in (b">", None):

        # Step 4 attribute name
            if c == b"=" and attrName:
            elif c in spaceCharactersBytes:
            elif c in (b"/", b">"):
                return b"".join(attrName), b""
            elif c in asciiUppercaseBytes:
                # Names are normalised to lower case as they are read.
                attrName.append(c.lower())

            return b"".join(attrName), b""

        # Quoted attribute value: collect until the matching quote.
        if c in (b"'", b'"'):
            return b"".join(attrName), b"".join(attrValue)
        elif c in asciiUppercaseBytes:
            attrValue.append(c.lower())

            return b"".join(attrName), b""
        elif c in asciiUppercaseBytes:
            attrValue.append(c.lower())

        # Unquoted value: runs until whitespace or an angle bracket.
        if c in spacesAngleBrackets:
            return b"".join(attrName), b"".join(attrValue)
        elif c in asciiUppercaseBytes:
            attrValue.append(c.lower())
class ContentAttrParser(object):
    # Extracts the charset=... value from a meta "content" attribute body.
    def __init__(self, data):
        assert isinstance(data, bytes)
        # NOTE(review): the `self.data = ...` assignment, the parse() method
        # header and its try: lines are missing from this excerpt.

        # Check if the attr name is charset
        self.data.jumpTo(b"charset")
        self.data.position += 1
        if not self.data.currentByte == b"=":
            # If there is no = sign keep looking for attrs
        self.data.position += 1

        # Look for an encoding between matching quote marks
        if self.data.currentByte in (b'"', b"'"):
            quoteMark = self.data.currentByte
            self.data.position += 1
            oldPosition = self.data.position
            if self.data.jumpTo(quoteMark):
                return self.data[oldPosition:self.data.position]

            # Unquoted value: runs until the next space character.
            oldPosition = self.data.position
            self.data.skipUntil(spaceCharactersBytes)
            return self.data[oldPosition:self.data.position]
        except StopIteration:
            # Return the whole remaining value
            return self.data[oldPosition:]
        except StopIteration:
def lookupEncoding(encoding):
    """Return the python codec name corresponding to an encoding or None if the
    string doesn't correspond to a valid encoding."""
    # NOTE(review): the try: header for the decode, the `return None` lines
    # and the else branch are missing from this excerpt.
    if isinstance(encoding, binary_type):
        # Encoding labels must be ASCII; anything else cannot be valid.
        encoding = encoding.decode("ascii")
        except UnicodeDecodeError:

    if encoding is not None:
        try:
            # Delegate label resolution to the WHATWG encodings registry.
            return webencodings.lookup(encoding)
        except AttributeError: