venv/Lib/site-packages/pip/_vendor/distlib/locators.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # Copyright (C) 2012-2015 Vinay Sajip.
   4 # Licensed to the Python Software Foundation under a contributor agreement.
   5 # See LICENSE.txt and CONTRIBUTORS.txt.
   6 #
   7
   8 import gzip
   9 from io import BytesIO
  10 import json
  11 import logging
  12 import os
  13 import posixpath
  14 import re
  15 try:
  16     import threading
  17 except ImportError:  # pragma: no cover
  18     import dummy_threading as threading
  19 import zlib
  20
  21 from . import DistlibException
  22 from .compat import (urljoin, urlparse, urlunparse, url2pathname, pathname2url,
  23                      queue, quote, unescape, string_types, build_opener,
  24                      HTTPRedirectHandler as BaseRedirectHandler, text_type,
  25                      Request, HTTPError, URLError)
  26 from .database import Distribution, DistributionPath, make_dist
  27 from .metadata import Metadata
  28 from .util import (cached_property, parse_credentials, ensure_slash,
  29                    split_filename, get_project_data, parse_requirement,
  30                    parse_name_and_version, ServerProxy, normalize_name)
  31 from .version import get_scheme, UnsupportedVersionError
  32 from .wheel import Wheel, is_compatible
  33
  34 logger = logging.getLogger(__name__)
  35
  36 HASHER_HASH = re.compile('^(\w+)=([a-f0-9]+)')
  37 CHARSET = re.compile(r';\s*charset\s*=\s*(.*)\s*$', re.I)
  38 HTML_CONTENT_TYPE = re.compile('text/html|application/x(ht)?ml')
  39 DEFAULT_INDEX = 'https://pypi.python.org/pypi'
  40
  41 def get_all_distribution_names(url=None):
  42     """
  43     Return all distribution names known by an index.
  44     :param url: The URL of the index.
  45     :return: A list of all known distribution names.
  46     """
  47     if url is None:
  48         url = DEFAULT_INDEX
  49     client = ServerProxy(url, timeout=3.0)
  50     return client.list_packages()
  51
  52 class RedirectHandler(BaseRedirectHandler):
  53     """
  54     A class to work around a bug in some Python 3.2.x releases.
  55     """
  56     # There's a bug in the base version for some 3.2.x
  57     # (e.g. 3.2.2 on Ubuntu Oneiric). If a Location header
  58     # returns e.g. /abc, it bails because it says the scheme ''
  59     # is bogus, when actually it should use the request's
  60     # URL for the scheme. See Python issue #13696.
  61     def http_error_302(self, req, fp, code, msg, headers):
  62         # Some servers (incorrectly) return multiple Location headers
  63         # (so probably same goes for URI).  Use first header.
  64         newurl = None
  65         for key in ('location', 'uri'):
  66             if key in headers:
  67                 newurl = headers[key]
  68                 break
  69         if newurl is None:
  70             return
  71         urlparts = urlparse(newurl)
  72         if urlparts.scheme == '':
  73             newurl = urljoin(req.get_full_url(), newurl)
  74             if hasattr(headers, 'replace_header'):
  75                 headers.replace_header(key, newurl)
  76             else:
  77                 headers[key] = newurl
  78         return BaseRedirectHandler.http_error_302(self, req, fp, code, msg,
  79                                                   headers)
  80
  81     http_error_301 = http_error_303 = http_error_307 = http_error_302
  82
  83 class Locator(object):
  84     """
  85     A base class for locators - things that locate distributions.
  86     """
  87     source_extensions = ('.tar.gz', '.tar.bz2', '.tar', '.zip', '.tgz', '.tbz')
  88     binary_extensions = ('.egg', '.exe', '.whl')
  89     excluded_extensions = ('.pdf',)
  90
  91     # A list of tags indicating which wheels you want to match. The default
  92     # value of None matches against the tags compatible with the running
  93     # Python. If you want to match other values, set wheel_tags on a locator
  94     # instance to a list of tuples (pyver, abi, arch) which you want to match.
  95     wheel_tags = None
  96
  97     downloadable_extensions = source_extensions + ('.whl',)
  98
  99     def __init__(self, scheme='default'):
 100         """
 101         Initialise an instance.
 102         :param scheme: Because locators look for most recent versions, they
 103                        need to know the version scheme to use. This specifies
 104                        the current PEP-recommended scheme - use ``'legacy'``
 105                        if you need to support existing distributions on PyPI.
 106         """
 107         self._cache = {}
 108         self.scheme = scheme
 109         # Because of bugs in some of the handlers on some of the platforms,
 110         # we use our own opener rather than just using urlopen.
 111         self.opener = build_opener(RedirectHandler())
 112         # If get_project() is called from locate(), the matcher instance
 113         # is set from the requirement passed to locate(). See issue #18 for
 114         # why this can be useful to know.
 115         self.matcher = None
 116         self.errors = queue.Queue()
 117
 118     def get_errors(self):
 119         """
 120         Return any errors which have occurred.
 121         """
 122         result = []
 123         while not self.errors.empty():  # pragma: no cover
 124             try:
 125                 e = self.errors.get(False)
 126                 result.append(e)
 127             except self.errors.Empty:
 128                 continue
 129             self.errors.task_done()
 130         return result
 131
 132     def clear_errors(self):
 133         """
 134         Clear any errors which may have been logged.
 135         """
 136         # Just get the errors and throw them away
 137         self.get_errors()
 138
 139     def clear_cache(self):
 140         self._cache.clear()
 141
 142     def _get_scheme(self):
 143         return self._scheme
 144
 145     def _set_scheme(self, value):
 146         self._scheme = value
 147
 148     scheme = property(_get_scheme, _set_scheme)
 149
 150     def _get_project(self, name):
 151         """
 152         For a given project, get a dictionary mapping available versions to Distribution
 153         instances.
 154
 155         This should be implemented in subclasses.
 156
 157         If called from a locate() request, self.matcher will be set to a
 158         matcher for the requirement to satisfy, otherwise it will be None.
 159         """
 160         raise NotImplementedError('Please implement in the subclass')
 161
 162     def get_distribution_names(self):
 163         """
 164         Return all the distribution names known to this locator.
 165         """
 166         raise NotImplementedError('Please implement in the subclass')
 167
 168     def get_project(self, name):
 169         """
 170         For a given project, get a dictionary mapping available versions to Distribution
 171         instances.
 172
 173         This calls _get_project to do all the work, and just implements a caching layer on top.
 174         """
 175         if self._cache is None:
 176             result = self._get_project(name)
 177         elif name in self._cache:
 178             result = self._cache[name]
 179         else:
 180             self.clear_errors()
 181             result = self._get_project(name)
 182             self._cache[name] = result
 183         return result
 184
 185     def score_url(self, url):
 186         """
 187         Give an url a score which can be used to choose preferred URLs
 188         for a given project release.
 189         """
 190         t = urlparse(url)
 191         basename = posixpath.basename(t.path)
 192         compatible = True
 193         is_wheel = basename.endswith('.whl')
 194         if is_wheel:
 195             compatible = is_compatible(Wheel(basename), self.wheel_tags)
 196         return (t.scheme != 'https', 'pypi.python.org' in t.netloc,
 197                 is_wheel, compatible, basename)
 198
 199     def prefer_url(self, url1, url2):
 200         """
 201         Choose one of two URLs where both are candidates for distribution
 202         archives for the same version of a distribution (for example,
 203         .tar.gz vs. zip).
 204
 205         The current implementation favours https:// URLs over http://, archives
 206         from PyPI over those from other locations, wheel compatibility (if a
 207         wheel) and then the archive name.
 208         """
 209         result = url2
 210         if url1:
 211             s1 = self.score_url(url1)
 212             s2 = self.score_url(url2)
 213             if s1 > s2:
 214                 result = url1
 215             if result != url2:
 216                 logger.debug('Not replacing %r with %r', url1, url2)
 217             else:
 218                 logger.debug('Replacing %r with %r', url1, url2)
 219         return result
 220
 221     def split_filename(self, filename, project_name):
 222         """
 223         Attempt to split a filename in project name, version and Python version.
 224         """
 225         return split_filename(filename, project_name)
 226
 227     def convert_url_to_download_info(self, url, project_name):
 228         """
 229         See if a URL is a candidate for a download URL for a project (the URL
 230         has typically been scraped from an HTML page).
 231
 232         If it is, a dictionary is returned with keys "name", "version",
 233         "filename" and "url"; otherwise, None is returned.
 234         """
 235         def same_project(name1, name2):
 236             return normalize_name(name1) == normalize_name(name2)
 237
 238         result = None
 239         scheme, netloc, path, params, query, frag = urlparse(url)
 240         if frag.lower().startswith('egg='):
 241             logger.debug('%s: version hint in fragment: %r',
 242                          project_name, frag)
 243         m = HASHER_HASH.match(frag)
 244         if m:
 245             algo, digest = m.groups()
 246         else:
 247             algo, digest = None, None
 248         origpath = path
 249         if path and path[-1] == '/':
 250             path = path[:-1]
 251         if path.endswith('.whl'):
 252             try:
 253                 wheel = Wheel(path)
 254                 if is_compatible(wheel, self.wheel_tags):
 255                     if project_name is None:
 256                         include = True
 257                     else:
 258                         include = same_project(wheel.name, project_name)
 259                     if include:
 260                         result = {
 261                             'name': wheel.name,
 262                             'version': wheel.version,
 263                             'filename': wheel.filename,
 264                             'url': urlunparse((scheme, netloc, origpath,
 265                                                params, query, '')),
 266                             'python-version': ', '.join(
 267                                 ['.'.join(list(v[2:])) for v in wheel.pyver]),
 268                         }
 269             except Exception as e:  # pragma: no cover
 270                 logger.warning('invalid path for wheel: %s', path)
 271         elif path.endswith(self.downloadable_extensions):
 272             path = filename = posixpath.basename(path)
 273             for ext in self.downloadable_extensions:
 274                 if path.endswith(ext):
 275                     path = path[:-len(ext)]
 276                     t = self.split_filename(path, project_name)
 277                     if not t:
 278                         logger.debug('No match for project/version: %s', path)
 279                     else:
 280                         name, version, pyver = t
 281                         if not project_name or same_project(project_name, name):
 282                             result = {
 283                                 'name': name,
 284                                 'version': version,
 285                                 'filename': filename,
 286                                 'url': urlunparse((scheme, netloc, origpath,
 287                                                    params, query, '')),
 288                                 #'packagetype': 'sdist',
 289                             }
 290                             if pyver:
 291                                 result['python-version'] = pyver
 292                     break
 293         if result and algo:
 294             result['%s_digest' % algo] = digest
 295         return result
 296
 297     def _get_digest(self, info):
 298         """
 299         Get a digest from a dictionary by looking at keys of the form
 300         'algo_digest'.
 301
 302         Returns a 2-tuple (algo, digest) if found, else None. Currently
 303         looks only for SHA256, then MD5.
 304         """
 305         result = None
 306         for algo in ('sha256', 'md5'):
 307             key = '%s_digest' % algo
 308             if key in info:
 309                 result = (algo, info[key])
 310                 break
 311         return result
 312
 313     def _update_version_data(self, result, info):
 314         """
 315         Update a result dictionary (the final result from _get_project) with a
 316         dictionary for a specific version, which typically holds information
 317         gleaned from a filename or URL for an archive for the distribution.
 318         """
 319         name = info.pop('name')
 320         version = info.pop('version')
 321         if version in result:
 322             dist = result[version]
 323             md = dist.metadata
 324         else:
 325             dist = make_dist(name, version, scheme=self.scheme)
 326             md = dist.metadata
 327         dist.digest = digest = self._get_digest(info)
 328         url = info['url']
 329         result['digests'][url] = digest
 330         if md.source_url != info['url']:
 331             md.source_url = self.prefer_url(md.source_url, url)
 332             result['urls'].setdefault(version, set()).add(url)
 333         dist.locator = self
 334         result[version] = dist
 335
 336     def locate(self, requirement, prereleases=False):
 337         """
 338         Find the most recent distribution which matches the given
 339         requirement.
 340
 341         :param requirement: A requirement of the form 'foo (1.0)' or perhaps
 342                             'foo (>= 1.0, < 2.0, != 1.3)'
 343         :param prereleases: If ``True``, allow pre-release versions
 344                             to be located. Otherwise, pre-release versions
 345                             are not returned.
 346         :return: A :class:`Distribution` instance, or ``None`` if no such
 347                  distribution could be located.
 348         """
 349         result = None
 350         r = parse_requirement(requirement)
 351         if r is None:
 352             raise DistlibException('Not a valid requirement: %r' % requirement)
 353         scheme = get_scheme(self.scheme)
 354         self.matcher = matcher = scheme.matcher(r.requirement)
 355         logger.debug('matcher: %s (%s)', matcher, type(matcher).__name__)
 356         versions = self.get_project(r.name)
 357         if len(versions) > 2:   # urls and digests keys are present
 358             # sometimes, versions are invalid
 359             slist = []
 360             vcls = matcher.version_class
 361             for k in versions:
 362                 if k in ('urls', 'digests'):
 363                     continue
 364                 try:
 365                     if not matcher.match(k):
 366                         logger.debug('%s did not match %r', matcher, k)
 367                     else:
 368                         if prereleases or not vcls(k).is_prerelease:
 369                             slist.append(k)
 370                         else:
 371                             logger.debug('skipping pre-release '
 372                                          'version %s of %s', k, matcher.name)
 373                 except Exception:  # pragma: no cover
 374                     logger.warning('error matching %s with %r', matcher, k)
 375                     pass # slist.append(k)
 376             if len(slist) > 1:
 377                 slist = sorted(slist, key=scheme.key)
 378             if slist:
 379                 logger.debug('sorted list: %s', slist)
 380                 version = slist[-1]
 381                 result = versions[version]
 382         if result:
 383             if r.extras:
 384                 result.extras = r.extras
 385             result.download_urls = versions.get('urls', {}).get(version, set())
 386             d = {}
 387             sd = versions.get('digests', {})
 388             for url in result.download_urls:
 389                 if url in sd:
 390                     d[url] = sd[url]
 391             result.digests = d
 392         self.matcher = None
 393         return result
 394
 395
 396 class PyPIRPCLocator(Locator):
 397     """
 398     This locator uses XML-RPC to locate distributions. It therefore
 399     cannot be used with simple mirrors (that only mirror file content).
 400     """
 401     def __init__(self, url, **kwargs):
 402         """
 403         Initialise an instance.
 404
 405         :param url: The URL to use for XML-RPC.
 406         :param kwargs: Passed to the superclass constructor.
 407         """
 408         super(PyPIRPCLocator, self).__init__(**kwargs)
 409         self.base_url = url
 410         self.client = ServerProxy(url, timeout=3.0)
 411
 412     def get_distribution_names(self):
 413         """
 414         Return all the distribution names known to this locator.
 415         """
 416         return set(self.client.list_packages())
 417
 418     def _get_project(self, name):
 419         result = {'urls': {}, 'digests': {}}
 420         versions = self.client.package_releases(name, True)
 421         for v in versions:
 422             urls = self.client.release_urls(name, v)
 423             data = self.client.release_data(name, v)
 424             metadata = Metadata(scheme=self.scheme)
 425             metadata.name = data['name']
 426             metadata.version = data['version']
 427             metadata.license = data.get('license')
 428             metadata.keywords = data.get('keywords', [])
 429             metadata.summary = data.get('summary')
 430             dist = Distribution(metadata)
 431             if urls:
 432                 info = urls[0]
 433                 metadata.source_url = info['url']
 434                 dist.digest = self._get_digest(info)
 435                 dist.locator = self
 436                 result[v] = dist
 437                 for info in urls:
 438                     url = info['url']
 439                     digest = self._get_digest(info)
 440                     result['urls'].setdefault(v, set()).add(url)
 441                     result['digests'][url] = digest
 442         return result
 443
 444 class PyPIJSONLocator(Locator):
 445     """
 446     This locator uses PyPI's JSON interface. It's very limited in functionality
 447     and probably not worth using.
 448     """
 449     def __init__(self, url, **kwargs):
 450         super(PyPIJSONLocator, self).__init__(**kwargs)
 451         self.base_url = ensure_slash(url)
 452
 453     def get_distribution_names(self):
 454         """
 455         Return all the distribution names known to this locator.
 456         """
 457         raise NotImplementedError('Not available from this locator')
 458
 459     def _get_project(self, name):
 460         result = {'urls': {}, 'digests': {}}
 461         url = urljoin(self.base_url, '%s/json' % quote(name))
 462         try:
 463             resp = self.opener.open(url)
 464             data = resp.read().decode() # for now
 465             d = json.loads(data)
 466             md = Metadata(scheme=self.scheme)
 467             data = d['info']
 468             md.name = data['name']
 469             md.version = data['version']
 470             md.license = data.get('license')
 471             md.keywords = data.get('keywords', [])
 472             md.summary = data.get('summary')
 473             dist = Distribution(md)
 474             dist.locator = self
 475             urls = d['urls']
 476             result[md.version] = dist
 477             for info in d['urls']:
 478                 url = info['url']
 479                 dist.download_urls.add(url)
 480                 dist.digests[url] = self._get_digest(info)
 481                 result['urls'].setdefault(md.version, set()).add(url)
 482                 result['digests'][url] = self._get_digest(info)
 483             # Now get other releases
 484             for version, infos in d['releases'].items():
 485                 if version == md.version:
 486                     continue    # already done
 487                 omd = Metadata(scheme=self.scheme)
 488                 omd.name = md.name
 489                 omd.version = version
 490                 odist = Distribution(omd)
 491                 odist.locator = self
 492                 result[version] = odist
 493                 for info in infos:
 494                     url = info['url']
 495                     odist.download_urls.add(url)
 496                     odist.digests[url] = self._get_digest(info)
 497                     result['urls'].setdefault(version, set()).add(url)
 498                     result['digests'][url] = self._get_digest(info)
 499 #            for info in urls:
 500 #                md.source_url = info['url']
 501 #                dist.digest = self._get_digest(info)
 502 #                dist.locator = self
 503 #                for info in urls:
 504 #                    url = info['url']
 505 #                    result['urls'].setdefault(md.version, set()).add(url)
 506 #                    result['digests'][url] = self._get_digest(info)
 507         except Exception as e:
 508             self.errors.put(text_type(e))
 509             logger.exception('JSON fetch failed: %s', e)
 510         return result
 511
 512
 513 class Page(object):
 514     """
 515     This class represents a scraped HTML page.
 516     """
 517     # The following slightly hairy-looking regex just looks for the contents of
 518     # an anchor link, which has an attribute "href" either immediately preceded
 519     # or immediately followed by a "rel" attribute. The attribute values can be
 520     # declared with double quotes, single quotes or no quotes - which leads to
 521     # the length of the expression.
 522     _href = re.compile("""
 523 (rel\s*=\s*(?:"(?P<rel1>[^"]*)"|'(?P<rel2>[^']*)'|(?P<rel3>[^>\s\n]*))\s+)?
 524 href\s*=\s*(?:"(?P<url1>[^"]*)"|'(?P<url2>[^']*)'|(?P<url3>[^>\s\n]*))
 525 (\s+rel\s*=\s*(?:"(?P<rel4>[^"]*)"|'(?P<rel5>[^']*)'|(?P<rel6>[^>\s\n]*)))?
 526 """, re.I | re.S | re.X)
 527     _base = re.compile(r"""<base\s+href\s*=\s*['"]?([^'">]+)""", re.I | re.S)
 528
 529     def __init__(self, data, url):
 530         """
 531         Initialise an instance with the Unicode page contents and the URL they
 532         came from.
 533         """
 534         self.data = data
 535         self.base_url = self.url = url
 536         m = self._base.search(self.data)
 537         if m:
 538             self.base_url = m.group(1)
 539
 540     _clean_re = re.compile(r'[^a-z0-9$&+,/:;=?@.#%_\\|-]', re.I)
 541
 542     @cached_property
 543     def links(self):
 544         """
 545         Return the URLs of all the links on a page together with information
 546         about their "rel" attribute, for determining which ones to treat as
 547         downloads and which ones to queue for further scraping.
 548         """
 549         def clean(url):
 550             "Tidy up an URL."
 551             scheme, netloc, path, params, query, frag = urlparse(url)
 552             return urlunparse((scheme, netloc, quote(path),
 553                                params, query, frag))
 554
 555         result = set()
 556         for match in self._href.finditer(self.data):
 557             d = match.groupdict('')
 558             rel = (d['rel1'] or d['rel2'] or d['rel3'] or
 559                    d['rel4'] or d['rel5'] or d['rel6'])
 560             url = d['url1'] or d['url2'] or d['url3']
 561             url = urljoin(self.base_url, url)
 562             url = unescape(url)
 563             url = self._clean_re.sub(lambda m: '%%%2x' % ord(m.group(0)), url)
 564             result.add((url, rel))
 565         # We sort the result, hoping to bring the most recent versions
 566         # to the front
 567         result = sorted(result, key=lambda t: t[0], reverse=True)
 568         return result
 569
 570
 571 class SimpleScrapingLocator(Locator):
 572     """
 573     A locator which scrapes HTML pages to locate downloads for a distribution.
 574     This runs multiple threads to do the I/O; performance is at least as good
 575     as pip's PackageFinder, which works in an analogous fashion.
 576     """
 577
 578     # These are used to deal with various Content-Encoding schemes.
 579     decoders = {
 580         'deflate': zlib.decompress,
 581         'gzip': lambda b: gzip.GzipFile(fileobj=BytesIO(d)).read(),
 582         'none': lambda b: b,
 583     }
 584
 585     def __init__(self, url, timeout=None, num_workers=10, **kwargs):
 586         """
 587         Initialise an instance.
 588         :param url: The root URL to use for scraping.
 589         :param timeout: The timeout, in seconds, to be applied to requests.
 590                         This defaults to ``None`` (no timeout specified).
 591         :param num_workers: The number of worker threads you want to do I/O,
 592                             This defaults to 10.
 593         :param kwargs: Passed to the superclass.
 594         """
 595         super(SimpleScrapingLocator, self).__init__(**kwargs)
 596         self.base_url = ensure_slash(url)
 597         self.timeout = timeout
 598         self._page_cache = {}
 599         self._seen = set()
 600         self._to_fetch = queue.Queue()
 601         self._bad_hosts = set()
 602         self.skip_externals = False
 603         self.num_workers = num_workers
 604         self._lock = threading.RLock()
 605         # See issue #45: we need to be resilient when the locator is used
 606         # in a thread, e.g. with concurrent.futures. We can't use self._lock
 607         # as it is for coordinating our internal threads - the ones created
 608         # in _prepare_threads.
 609         self._gplock = threading.RLock()
 610
 611     def _prepare_threads(self):
 612         """
 613         Threads are created only when get_project is called, and terminate
 614         before it returns. They are there primarily to parallelise I/O (i.e.
 615         fetching web pages).
 616         """
 617         self._threads = []
 618         for i in range(self.num_workers):
 619             t = threading.Thread(target=self._fetch)
 620             t.setDaemon(True)
 621             t.start()
 622             self._threads.append(t)
 623
 624     def _wait_threads(self):
 625         """
 626         Tell all the threads to terminate (by sending a sentinel value) and
 627         wait for them to do so.
 628         """
 629         # Note that you need two loops, since you can't say which
 630         # thread will get each sentinel
 631         for t in self._threads:
 632             self._to_fetch.put(None)    # sentinel
 633         for t in self._threads:
 634             t.join()
 635         self._threads = []
 636
 637     def _get_project(self, name):
 638         result = {'urls': {}, 'digests': {}}
 639         with self._gplock:
 640             self.result = result
 641             self.project_name = name
 642             url = urljoin(self.base_url, '%s/' % quote(name))
 643             self._seen.clear()
 644             self._page_cache.clear()
 645             self._prepare_threads()
 646             try:
 647                 logger.debug('Queueing %s', url)
 648                 self._to_fetch.put(url)
 649                 self._to_fetch.join()
 650             finally:
 651                 self._wait_threads()
 652             del self.result
 653         return result
 654
    # Matches a platform tag embedded in a URL, case-insensitively: e.g.
    # "linux-i686", "linux-x86_64", "win32", "win-amd64" or "macosx-10".
    platform_dependent = re.compile(r'\b(linux-(i\d86|x86_64|arm\w+)|'
                                    r'win(32|-amd64)|macosx-?\d+)\b', re.I)
 657
 658     def _is_platform_dependent(self, url):
 659         """
 660         Does an URL refer to a platform-specific download?
 661         """
 662         return self.platform_dependent.search(url)
 663
 664     def _process_download(self, url):
 665         """
 666         See if an URL is a suitable download for a project.
 667
 668         If it is, register information in the result dictionary (for
 669         _get_project) about the specific version it's for.
 670
 671         Note that the return value isn't actually used other than as a boolean
 672         value.
 673         """
 674         if self._is_platform_dependent(url):
 675             info = None
 676         else:
 677             info = self.convert_url_to_download_info(url, self.project_name)
 678         logger.debug('process_download: %s -> %s', url, info)
 679         if info:
 680             with self._lock:    # needed because self.result is shared
 681                 self._update_version_data(self.result, info)
 682         return info
 683
 684     def _should_queue(self, link, referrer, rel):
 685         """
 686         Determine whether a link URL from a referring page and with a
 687         particular "rel" attribute should be queued for scraping.
 688         """
 689         scheme, netloc, path, _, _, _ = urlparse(link)
 690         if path.endswith(self.source_extensions + self.binary_extensions +
 691                          self.excluded_extensions):
 692             result = False
 693         elif self.skip_externals and not link.startswith(self.base_url):
 694             result = False
 695         elif not referrer.startswith(self.base_url):
 696             result = False
 697         elif rel not in ('homepage', 'download'):
 698             result = False
 699         elif scheme not in ('http', 'https', 'ftp'):
 700             result = False
 701         elif self._is_platform_dependent(link):
 702             result = False
 703         else:
 704             host = netloc.split(':', 1)[0]
 705             if host.lower() == 'localhost':
 706                 result = False
 707             else:
 708                 result = True
 709         logger.debug('should_queue: %s (%s) from %s -> %s', link, rel,
 710                      referrer, result)
 711         return result
 712
 713     def _fetch(self):
 714         """
 715         Get a URL to fetch from the work queue, get the HTML page, examine its
 716         links for download candidates and candidates for further scraping.
 717
 718         This is a handy method to run in a thread.
 719         """
 720         while True:
 721             url = self._to_fetch.get()
 722             try:
 723                 if url:
 724                     page = self.get_page(url)
 725                     if page is None:    # e.g. after an error
 726                         continue
 727                     for link, rel in page.links:
 728                         if link not in self._seen:
 729                             self._seen.add(link)
 730                             if (not self._process_download(link) and
 731                                 self._should_queue(link, url, rel)):
 732                                 logger.debug('Queueing %s from %s', link, url)
 733                                 self._to_fetch.put(link)
 734             except Exception as e:  # pragma: no cover
 735                 self.errors.put(text_type(e))
 736             finally:
 737                 # always do this, to avoid hangs :-)
 738                 self._to_fetch.task_done()
 739             if not url:
 740                 #logger.debug('Sentinel seen, quitting.')
 741                 break
 742
 743     def get_page(self, url):
 744         """
 745         Get the HTML for an URL, possibly from an in-memory cache.
 746
 747         XXX TODO Note: this cache is never actually cleared. It's assumed that
 748         the data won't get stale over the lifetime of a locator instance (not
 749         necessarily true for the default_locator).
 750         """
 751         # http://peak.telecommunity.com/DevCenter/EasyInstall#package-index-api
 752         scheme, netloc, path, _, _, _ = urlparse(url)
 753         if scheme == 'file' and os.path.isdir(url2pathname(path)):
 754             url = urljoin(ensure_slash(url), 'index.html')
 755
 756         if url in self._page_cache:
 757             result = self._page_cache[url]
 758             logger.debug('Returning %s from cache: %s', url, result)
 759         else:
 760             host = netloc.split(':', 1)[0]
 761             result = None
 762             if host in self._bad_hosts:
 763                 logger.debug('Skipping %s due to bad host %s', url, host)
 764             else:
 765                 req = Request(url, headers={'Accept-encoding': 'identity'})
 766                 try:
 767                     logger.debug('Fetching %s', url)
 768                     resp = self.opener.open(req, timeout=self.timeout)
 769                     logger.debug('Fetched %s', url)
 770                     headers = resp.info()
 771                     content_type = headers.get('Content-Type', '')
 772                     if HTML_CONTENT_TYPE.match(content_type):
 773                         final_url = resp.geturl()
 774                         data = resp.read()
 775                         encoding = headers.get('Content-Encoding')
 776                         if encoding:
 777                             decoder = self.decoders[encoding]   # fail if not found
 778                             data = decoder(data)
 779                         encoding = 'utf-8'
 780                         m = CHARSET.search(content_type)
 781                         if m:
 782                             encoding = m.group(1)
 783                         try:
 784                             data = data.decode(encoding)
 785                         except UnicodeError:  # pragma: no cover
 786                             data = data.decode('latin-1')    # fallback
 787                         result = Page(data, final_url)
 788                         self._page_cache[final_url] = result
 789                 except HTTPError as e:
 790                     if e.code != 404:
 791                         logger.exception('Fetch failed: %s: %s', url, e)
 792                 except URLError as e:  # pragma: no cover
 793                     logger.exception('Fetch failed: %s: %s', url, e)
 794                     with self._lock:
 795                         self._bad_hosts.add(host)
 796                 except Exception as e:  # pragma: no cover
 797                     logger.exception('Fetch failed: %s: %s', url, e)
 798                 finally:
 799                     self._page_cache[url] = result   # even if None (failure)
 800         return result
 801
    # Captures the text which follows each ``<a href=...>`` opening tag, up
    # to the next ``<``; get_distribution_names() applies it to the page
    # fetched from base_url.
    _distname_re = re.compile('<a href=[^>]*>([^<]+)<')
 803
 804     def get_distribution_names(self):
 805         """
 806         Return all the distribution names known to this locator.
 807         """
 808         result = set()
 809         page = self.get_page(self.base_url)
 810         if not page:
 811             raise DistlibException('Unable to get %s' % self.base_url)
 812         for match in self._distname_re.finditer(page.data):
 813             result.add(match.group(1))
 814         return result
 815
 816 class DirectoryLocator(Locator):
 817     """
 818     This class locates distributions in a directory tree.
 819     """
 820
 821     def __init__(self, path, **kwargs):
 822         """
 823         Initialise an instance.
 824         :param path: The root of the directory tree to search.
 825         :param kwargs: Passed to the superclass constructor,
 826                        except for:
 827                        * recursive - if True (the default), subdirectories are
 828                          recursed into. If False, only the top-level directory
 829                          is searched,
 830         """
 831         self.recursive = kwargs.pop('recursive', True)
 832         super(DirectoryLocator, self).__init__(**kwargs)
 833         path = os.path.abspath(path)
 834         if not os.path.isdir(path):  # pragma: no cover
 835             raise DistlibException('Not a directory: %r' % path)
 836         self.base_dir = path
 837
 838     def should_include(self, filename, parent):
 839         """
 840         Should a filename be considered as a candidate for a distribution
 841         archive? As well as the filename, the directory which contains it
 842         is provided, though not used by the current implementation.
 843         """
 844         return filename.endswith(self.downloadable_extensions)
 845
 846     def _get_project(self, name):
 847         result = {'urls': {}, 'digests': {}}
 848         for root, dirs, files in os.walk(self.base_dir):
 849             for fn in files:
 850                 if self.should_include(fn, root):
 851                     fn = os.path.join(root, fn)
 852                     url = urlunparse(('file', '',
 853                                       pathname2url(os.path.abspath(fn)),
 854                                       '', '', ''))
 855                     info = self.convert_url_to_download_info(url, name)
 856                     if info:
 857                         self._update_version_data(result, info)
 858             if not self.recursive:
 859                 break
 860         return result
 861
 862     def get_distribution_names(self):
 863         """
 864         Return all the distribution names known to this locator.
 865         """
 866         result = set()
 867         for root, dirs, files in os.walk(self.base_dir):
 868             for fn in files:
 869                 if self.should_include(fn, root):
 870                     fn = os.path.join(root, fn)
 871                     url = urlunparse(('file', '',
 872                                       pathname2url(os.path.abspath(fn)),
 873                                       '', '', ''))
 874                     info = self.convert_url_to_download_info(url, None)
 875                     if info:
 876                         result.add(info['name'])
 877             if not self.recursive:
 878                 break
 879         return result
 880
 881 class JSONLocator(Locator):
 882     """
 883     This locator uses special extended metadata (not available on PyPI) and is
 884     the basis of performant dependency resolution in distlib. Other locators
 885     require archive downloads before dependencies can be determined! As you
 886     might imagine, that can be slow.
 887     """
 888     def get_distribution_names(self):
 889         """
 890         Return all the distribution names known to this locator.
 891         """
 892         raise NotImplementedError('Not available from this locator')
 893
 894     def _get_project(self, name):
 895         result = {'urls': {}, 'digests': {}}
 896         data = get_project_data(name)
 897         if data:
 898             for info in data.get('files', []):
 899                 if info['ptype'] != 'sdist' or info['pyversion'] != 'source':
 900                     continue
 901                 # We don't store summary in project metadata as it makes
 902                 # the data bigger for no benefit during dependency
 903                 # resolution
 904                 dist = make_dist(data['name'], info['version'],
 905                                  summary=data.get('summary',
 906                                                   'Placeholder for summary'),
 907                                  scheme=self.scheme)
 908                 md = dist.metadata
 909                 md.source_url = info['url']
 910                 # TODO SHA256 digest
 911                 if 'digest' in info and info['digest']:
 912                     dist.digest = ('md5', info['digest'])
 913                 md.dependencies = info.get('requirements', {})
 914                 dist.exports = info.get('exports', {})
 915                 result[dist.version] = dist
 916                 result['urls'].setdefault(dist.version, set()).add(info['url'])
 917         return result
 918
 919 class DistPathLocator(Locator):
 920     """
 921     This locator finds installed distributions in a path. It can be useful for
 922     adding to an :class:`AggregatingLocator`.
 923     """
 924     def __init__(self, distpath, **kwargs):
 925         """
 926         Initialise an instance.
 927
 928         :param distpath: A :class:`DistributionPath` instance to search.
 929         """
 930         super(DistPathLocator, self).__init__(**kwargs)
 931         assert isinstance(distpath, DistributionPath)
 932         self.distpath = distpath
 933
 934     def _get_project(self, name):
 935         dist = self.distpath.get_distribution(name)
 936         if dist is None:
 937             result = {'urls': {}, 'digests': {}}
 938         else:
 939             result = {
 940                 dist.version: dist,
 941                 'urls': {dist.version: set([dist.source_url])},
 942                 'digests': {dist.version: set([None])}
 943             }
 944         return result
 945
 946
 947 class AggregatingLocator(Locator):
 948     """
 949     This class allows you to chain and/or merge a list of locators.
 950     """
 951     def __init__(self, *locators, **kwargs):
 952         """
 953         Initialise an instance.
 954
 955         :param locators: The list of locators to search.
 956         :param kwargs: Passed to the superclass constructor,
 957                        except for:
 958                        * merge - if False (the default), the first successful
 959                          search from any of the locators is returned. If True,
 960                          the results from all locators are merged (this can be
 961                          slow).
 962         """
 963         self.merge = kwargs.pop('merge', False)
 964         self.locators = locators
 965         super(AggregatingLocator, self).__init__(**kwargs)
 966
 967     def clear_cache(self):
 968         super(AggregatingLocator, self).clear_cache()
 969         for locator in self.locators:
 970             locator.clear_cache()
 971
 972     def _set_scheme(self, value):
 973         self._scheme = value
 974         for locator in self.locators:
 975             locator.scheme = value
 976
 977     scheme = property(Locator.scheme.fget, _set_scheme)
 978
 979     def _get_project(self, name):
 980         result = {}
 981         for locator in self.locators:
 982             d = locator.get_project(name)
 983             if d:
 984                 if self.merge:
 985                     files = result.get('urls', {})
 986                     digests = result.get('digests', {})
 987                     # next line could overwrite result['urls'], result['digests']
 988                     result.update(d)
 989                     df = result.get('urls')
 990                     if files and df:
 991                         for k, v in files.items():
 992                             if k in df:
 993                                 df[k] |= v
 994                             else:
 995                                 df[k] = v
 996                     dd = result.get('digests')
 997                     if digests and dd:
 998                         dd.update(digests)
 999                 else:
1000                     # See issue #18. If any dists are found and we're looking
1001                     # for specific constraints, we only return something if
1002                     # a match is found. For example, if a DirectoryLocator
1003                     # returns just foo (1.0) while we're looking for
1004                     # foo (>= 2.0), we'll pretend there was nothing there so
1005                     # that subsequent locators can be queried. Otherwise we
1006                     # would just return foo (1.0) which would then lead to a
1007                     # failure to find foo (>= 2.0), because other locators
1008                     # weren't searched. Note that this only matters when
1009                     # merge=False.
1010                     if self.matcher is None:
1011                         found = True
1012                     else:
1013                         found = False
1014                         for k in d:
1015                             if self.matcher.match(k):
1016                                 found = True
1017                                 break
1018                     if found:
1019                         result = d
1020                         break
1021         return result
1022
1023     def get_distribution_names(self):
1024         """
1025         Return all the distribution names known to this locator.
1026         """
1027         result = set()
1028         for locator in self.locators:
1029             try:
1030                 result |= locator.get_distribution_names()
1031             except NotImplementedError:
1032                 pass
1033         return result
1034
1035
# We use a legacy scheme simply because most of the dists on PyPI use legacy
# versions which don't conform to PEP 426 / PEP 440.
# The JSON locator is listed before the scraping locator; merge is left at
# its default (False), so the first successful search is the one returned.
default_locator = AggregatingLocator(
                    JSONLocator(),
                    SimpleScrapingLocator('https://pypi.python.org/simple/',
                                          timeout=3.0),
                    scheme='legacy')

# Module-level shortcut for the default locator's locate() method.
locate = default_locator.locate

# Matches 'name (version)' or 'name (== version)' at the end of a string,
# capturing the name as 'name' and the text inside the parentheses (after
# any '==') as 'ver'.
NAME_VERSION_RE = re.compile(r'(?P<name>[\w-]+)\s*'
                             r'\(\s*(==\s*)?(?P<ver>[^)]+)\)$')
1048
1049 class DependencyFinder(object):
1050     """
1051     Locate dependencies for distributions.
1052     """
1053
1054     def __init__(self, locator=None):
1055         """
1056         Initialise an instance, using the specified locator
1057         to locate distributions.
1058         """
1059         self.locator = locator or default_locator
1060         self.scheme = get_scheme(self.locator.scheme)
1061
1062     def add_distribution(self, dist):
1063         """
1064         Add a distribution to the finder. This will update internal information
1065         about who provides what.
1066         :param dist: The distribution to add.
1067         """
1068         logger.debug('adding distribution %s', dist)
1069         name = dist.key
1070         self.dists_by_name[name] = dist
1071         self.dists[(name, dist.version)] = dist
1072         for p in dist.provides:
1073             name, version = parse_name_and_version(p)
1074             logger.debug('Add to provided: %s, %s, %s', name, version, dist)
1075             self.provided.setdefault(name, set()).add((version, dist))
1076
1077     def remove_distribution(self, dist):
1078         """
1079         Remove a distribution from the finder. This will update internal
1080         information about who provides what.
1081         :param dist: The distribution to remove.
1082         """
1083         logger.debug('removing distribution %s', dist)
1084         name = dist.key
1085         del self.dists_by_name[name]
1086         del self.dists[(name, dist.version)]
1087         for p in dist.provides:
1088             name, version = parse_name_and_version(p)
1089             logger.debug('Remove from provided: %s, %s, %s', name, version, dist)
1090             s = self.provided[name]
1091             s.remove((version, dist))
1092             if not s:
1093                 del self.provided[name]
1094
1095     def get_matcher(self, reqt):
1096         """
1097         Get a version matcher for a requirement.
1098         :param reqt: The requirement
1099         :type reqt: str
1100         :return: A version matcher (an instance of
1101                  :class:`distlib.version.Matcher`).
1102         """
1103         try:
1104             matcher = self.scheme.matcher(reqt)
1105         except UnsupportedVersionError:  # pragma: no cover
1106             # XXX compat-mode if cannot read the version
1107             name = reqt.split()[0]
1108             matcher = self.scheme.matcher(name)
1109         return matcher
1110
1111     def find_providers(self, reqt):
1112         """
1113         Find the distributions which can fulfill a requirement.
1114
1115         :param reqt: The requirement.
1116          :type reqt: str
1117         :return: A set of distribution which can fulfill the requirement.
1118         """
1119         matcher = self.get_matcher(reqt)
1120         name = matcher.key   # case-insensitive
1121         result = set()
1122         provided = self.provided
1123         if name in provided:
1124             for version, provider in provided[name]:
1125                 try:
1126                     match = matcher.match(version)
1127                 except UnsupportedVersionError:
1128                     match = False
1129
1130                 if match:
1131                     result.add(provider)
1132                     break
1133         return result
1134
1135     def try_to_replace(self, provider, other, problems):
1136         """
1137         Attempt to replace one provider with another. This is typically used
1138         when resolving dependencies from multiple sources, e.g. A requires
1139         (B >= 1.0) while C requires (B >= 1.1).
1140
1141         For successful replacement, ``provider`` must meet all the requirements
1142         which ``other`` fulfills.
1143
1144         :param provider: The provider we are trying to replace with.
1145         :param other: The provider we're trying to replace.
1146         :param problems: If False is returned, this will contain what
1147                          problems prevented replacement. This is currently
1148                          a tuple of the literal string 'cantreplace',
1149                          ``provider``, ``other``  and the set of requirements
1150                          that ``provider`` couldn't fulfill.
1151         :return: True if we can replace ``other`` with ``provider``, else
1152                  False.
1153         """
1154         rlist = self.reqts[other]
1155         unmatched = set()
1156         for s in rlist:
1157             matcher = self.get_matcher(s)
1158             if not matcher.match(provider.version):
1159                 unmatched.add(s)
1160         if unmatched:
1161             # can't replace other with provider
1162             problems.add(('cantreplace', provider, other,
1163                           frozenset(unmatched)))
1164             result = False
1165         else:
1166             # can replace other with provider
1167             self.remove_distribution(other)
1168             del self.reqts[other]
1169             for s in rlist:
1170                 self.reqts.setdefault(provider, set()).add(s)
1171             self.add_distribution(provider)
1172             result = True
1173         return result
1174
    def find(self, requirement, meta_extras=None, prereleases=False):
        """
        Find a distribution and all distributions it depends on.

        :param requirement: The requirement specifying the distribution to
                            find, or a Distribution instance.
        :param meta_extras: A list of meta extras such as :test:, :build: and
                            so on. The special value ``:*:`` stands for
                            :test:, :build: and :dev: together.
        :param prereleases: If ``True``, allow pre-release versions to be
                            returned - otherwise, don't return prereleases
                            unless they're all that's available.
        :raises DistlibException: if ``requirement`` is not a Distribution
                                  instance and can't be located.

        Return a set of :class:`Distribution` instances and a set of
        problems.

        The distribution which corresponds to the ``requirement`` passed to
        ``find()`` has its :attr:`requested` attribute set to ``True``. Each
        returned distribution has its :attr:`build_time_dependency`
        attribute set: to ``False`` for the requested distribution and for
        distributions located to satisfy the run-time or meta requirements
        of such distributions, and to ``True`` for all others.

        Each problem is a tuple: either the string ``'unsatisfied'`` and the
        requirement which couldn't be satisfied by any distribution known to
        the locator, or a ``'cantreplace'`` tuple added by
        :meth:`try_to_replace`.
        """

        # Start each search with fresh bookkeeping.
        self.provided = {}
        self.dists = {}
        self.dists_by_name = {}
        self.reqts = {}

        meta_extras = set(meta_extras or [])
        if ':*:' in meta_extras:
            meta_extras.remove(':*:')
            # :meta: and :run: are implicitly included
            meta_extras |= set([':test:', ':build:', ':dev:'])

        if isinstance(requirement, Distribution):
            dist = odist = requirement
            logger.debug('passed %s as requirement', odist)
        else:
            dist = odist = self.locator.locate(requirement,
                                               prereleases=prereleases)
            if dist is None:
                raise DistlibException('Unable to locate %r' % requirement)
            logger.debug('located %s', odist)
        dist.requested = True
        problems = set()
        # todo: distributions whose requirements still need to be examined.
        # install_dists: distributions which are not build-time-only.
        todo = set([dist])
        install_dists = set([odist])
        while todo:
            dist = todo.pop()
            name = dist.key     # case-insensitive
            if name not in self.dists_by_name:
                self.add_distribution(dist)
            else:
                # A different distribution is already registered under this
                # name: see if this one can take its place.
                other = self.dists_by_name[name]
                if other != dist:
                    self.try_to_replace(dist, other, problems)

            ireqts = dist.run_requires | dist.meta_requires
            sreqts = dist.build_requires
            ereqts = set()
            # Requirements for the meta extras are only considered for
            # distributions in install_dists.
            if dist in install_dists:
                for key in ('test', 'build', 'dev'):
                    e = ':%s:' % key
                    if e in meta_extras:
                        ereqts |= getattr(dist, '%s_requires' % key)
            all_reqts = ireqts | sreqts | ereqts
            for r in all_reqts:
                providers = self.find_providers(r)
                if not providers:
                    logger.debug('No providers found for %r', r)
                    provider = self.locator.locate(r, prereleases=prereleases)
                    # If no provider is found and we didn't consider
                    # prereleases, consider them now.
                    if provider is None and not prereleases:
                        provider = self.locator.locate(r, prereleases=True)
                    if provider is None:
                        logger.debug('Cannot satisfy %r', r)
                        problems.add(('unsatisfied', r))
                    else:
                        n, v = provider.key, provider.version
                        # Only examine this provider's own requirements if
                        # it isn't already registered.
                        if (n, v) not in self.dists:
                            todo.add(provider)
                        providers.add(provider)
                        if r in ireqts and dist in install_dists:
                            install_dists.add(provider)
                            logger.debug('Adding %s to install_dists',
                                         provider.name_and_version)
                for p in providers:
                    name = p.key
                    if name not in self.dists_by_name:
                        # record which requirement this provider satisfies
                        self.reqts.setdefault(p, set()).add(r)
                    else:
                        other = self.dists_by_name[name]
                        if other != p:
                            # see if other can be replaced by p
                            self.try_to_replace(p, other, problems)

        dists = set(self.dists.values())
        for dist in dists:
            dist.build_time_dependency = dist not in install_dists
            if dist.build_time_dependency:
                logger.debug('%s is a build-time dependency only.',
                             dist.name_and_version)
        logger.debug('find done for %s', odist)
        return dists, problems