# Source code for link_checker.url_utils

"""URL normalization, prefix matching, and classification utilities."""

from __future__ import annotations

from urllib.parse import urlparse, urlunparse

# Lowercase file extensions (leading dot included) that mark an HTML-like page.
HTML_EXTENSIONS: frozenset[str] = frozenset(
    ('.htm', '.html', '.shtml', '.php', '.asp', '.jsp', '.cgi')
)

# Directory-index filenames: a URL ending in one of these is equivalent to
# the URL of its parent directory, e.g. /cassini/index.html → /cassini/
_INDEX_FILENAMES: frozenset[str] = frozenset(
    f'index{extension}' for extension in HTML_EXTENSIONS
)


def _strip_index_filename(path: str) -> str:
    """Drop a trailing directory-index filename from a URL path.

    When the last ``/``-separated segment of *path* is one of the known
    index filenames (``index.html``, ``index.php``, ...; compared
    case-insensitively) that segment is removed, leaving everything up to
    and including the final ``/``.  Any other path, including the empty
    string, comes back untouched.

    Parameters:
        path: The path component of a URL (no scheme, host, query, or
            fragment).

    Returns:
        The path with any trailing index filename removed.
    """
    # rpartition yields ('', '', path) when there is no '/', so ``filename``
    # is always the final segment and ``directory + slash`` is the rest.
    directory, slash, filename = path.rpartition('/')
    if filename.lower() not in _INDEX_FILENAMES:
        return path
    return directory + slash


def normalize_url(url: str) -> tuple[str, str | None]:
    """Normalize a URL to its canonical form.

    Transformations applied:

    - Scheme is preserved (``http`` remains ``http``, ``https`` remains
      ``https``).
    - Host (and port) is lowercased.  Any ``user:password@`` userinfo in
      front of the host is case-sensitive and is left exactly as given.
    - Fragment is stripped (returned separately).
    - Query string is preserved as part of the URL identity.
    - Directory-index filenames (``index.html``, ``index.php``, etc.) are
      stripped so ``/cassini/index.html`` → ``/cassini/``.

    Note that bare directory paths without a trailing slash (e.g.
    ``/cassini``) are left unchanged by this function.  Use
    :func:`add_trailing_slash` to add the trailing slash when you know the
    URL refers to a directory (e.g. for internal crawl targets).

    Parameters:
        url: The URL to normalize.

    Returns:
        A tuple of ``(canonical_url, fragment_or_none)``.  For non-HTTP
        URLs the original string is returned unchanged with no fragment.
    """
    parsed = urlparse(url)
    if parsed.scheme not in ('http', 'https'):
        return url, None

    # Lowercase only the host[:port] part of the netloc.  Lowercasing the
    # whole netloc would also rewrite embedded credentials, which are
    # case-sensitive.  Without an '@', rpartition returns ('', '', netloc).
    userinfo, at_sign, host_port = parsed.netloc.rpartition('@')
    netloc = userinfo + at_sign + host_port.lower()

    normalized = urlunparse(
        (
            parsed.scheme,
            netloc,
            _strip_index_filename(parsed.path),
            parsed.params,
            parsed.query,
            '',  # fragment is reported separately, never kept in the URL
        )
    )
    # An empty fragment is reported as None rather than ''.
    return normalized, parsed.fragment or None
def add_trailing_slash(url: str) -> str:
    """Append ``/`` to a URL whose path ends in an extension-free segment.

    Intended for **internal** URLs (same domain as the crawl root), to
    canonicalize bare directory paths:

    - ``/cassini`` → ``/cassini/``
    - ``/cassini/`` → ``/cassini/`` (unchanged)
    - ``/cassini/page.html`` → ``/cassini/page.html`` (has extension, unchanged)
    - ``/data.csv`` → ``/data.csv`` (has extension, unchanged)
    - ``/path/.htaccess`` → ``/path/.htaccess`` (dotfile, treated as a file)

    Any final segment containing a dot, including leading-dot names such as
    ``.htaccess`` or ``.gitignore``, counts as a file and gets no slash.

    Apply after :func:`normalize_url` so that index-file stripping has
    already run (``/cassini/index.html`` → ``/cassini/`` → unchanged here).

    Parameters:
        url: An already-normalized http/https URL.

    Returns:
        The URL with a trailing slash added to any extension-free path.
        When no slash is needed the input string is returned as-is; when a
        slash is added the URL is rebuilt without a fragment component.
    """
    parts = urlparse(url)
    if parts.path.endswith('/'):
        return url

    final_segment = parts.path.rpartition('/')[2]
    if '.' in final_segment:
        # Looks like a file name (or dotfile): leave the URL alone.
        return url

    return urlunparse(
        (parts.scheme, parts.netloc, parts.path + '/', parts.params, parts.query, '')
    )
def normalize_internal_url(url: str) -> tuple[str, str | None]:
    """Normalize an internal (same-domain) URL to its canonical form.

    Runs :func:`normalize_url` and then, for http/https results, passes the
    canonical URL through :func:`add_trailing_slash` so that directory-like
    paths (no file extension) end with ``/``:

    - ``/cassini`` → ``/cassini/``
    - ``/cassini/`` → ``/cassini/`` (unchanged)
    - ``/cassini/index.html`` → ``/cassini/`` (index stripped + already slash)
    - ``/cassini/page.html`` → ``/cassini/page.html`` (has extension, unchanged)

    Use this for deduplication and request URLs of internal crawl targets.
    For external URLs use :func:`normalize_url` alone to avoid altering
    the request path in ways the server may not expect.

    Parameters:
        url: The URL to normalize.

    Returns:
        A tuple of ``(canonical_url, fragment_or_none)``.
    """
    canonical, fragment = normalize_url(url)
    if is_http_url(canonical):
        canonical = add_trailing_slash(canonical)
    # Non-HTTP URLs fall through exactly as normalize_url returned them.
    return canonical, fragment
def is_same_domain(url: str, root_url: str) -> bool:
    """Report whether *url* and *root_url* share the same network location.

    The two network-location components (host plus any port) are compared
    case-insensitively.  The scheme plays no part in the comparison, and a
    subdomain counts as a different domain.

    Parameters:
        url: The URL to check.
        root_url: The reference URL whose host to compare against.

    Returns:
        True if both URLs share the same host.
    """
    url_netloc = urlparse(url).netloc.lower()
    root_netloc = urlparse(root_url).netloc.lower()
    return url_netloc == root_netloc
def is_under_root(url_path: str, root_path: str) -> bool:
    """Report whether *url_path* lies at or beneath *root_path*.

    Trailing slashes on both paths are ignored.  Containment is decided on
    a segment boundary, so ``/docs`` contains ``/docs/a`` but not
    ``/documents``.  A root that is empty (or only slashes) contains every
    path.

    Parameters:
        url_path: The path component of the URL to test.
        root_path: The root path to test containment against.

    Returns:
        True if *url_path* is the same as or under *root_path*.
    """
    base = root_path.rstrip('/')
    if not base:
        # Site root: everything is underneath it.
        return True

    target = url_path.rstrip('/')
    if target == base:
        return True
    return target.startswith(f'{base}/')
def matches_prefix(candidate_url: str, prefix_url: str) -> bool:
    """Report whether *candidate_url* matches *prefix_url* per spec §5.7 rules.

    Matching rules:

    - Scheme is ignored (http and https are equivalent).
    - Network locations must be equal, compared case-insensitively.
    - With trailing slashes ignored, the candidate path must equal the
      prefix path or start with the prefix path followed by ``/``
      (segment-boundary matching).

    Parameters:
        candidate_url: The URL to test.
        prefix_url: The prefix URL to match against.

    Returns:
        True if *candidate_url* matches *prefix_url*.
    """
    candidate = urlparse(candidate_url)
    prefix = urlparse(prefix_url)

    same_host = candidate.netloc.lower() == prefix.netloc.lower()
    if not same_host:
        return False

    base = prefix.path.rstrip('/')
    target = candidate.path.rstrip('/')
    if target == base:
        return True
    # Requiring the '/' keeps '/docs' from matching '/documents'.
    return target.startswith(base + '/')
def get_file_extension(url: str) -> str | None:
    """Extract the lowercase file extension from a URL's path.

    Only the path component is examined; the query string and fragment are
    ignored.  ``None`` is returned when the path ends in ``/``, when the
    last segment contains no dot, or when its only dot is the leading
    character (e.g. ``.htaccess``).

    Parameters:
        url: The URL to inspect.

    Returns:
        Lowercase extension including the leading dot (e.g. ``'.pdf'``),
        or None if there is no extension.
    """
    path = urlparse(url).path
    if path.endswith('/'):
        return None

    filename = path.rpartition('/')[2]
    # Split on the LAST dot; an empty ``dot`` means the name has no dot at
    # all, and an empty ``stem`` means the dot is the first character.
    stem, dot, suffix = filename.rpartition('.')
    if not dot or not stem:
        return None
    return (dot + suffix).lower()
def is_html_extension(ext: str | None) -> bool:
    """Report whether *ext* denotes an HTML-like page.

    A missing extension (``None``) is treated as HTML-like; otherwise the
    extension is lowercased and looked up in :data:`HTML_EXTENSIONS`.

    Parameters:
        ext: Lowercase file extension including leading dot, or None.

    Returns:
        True for extensions in :data:`HTML_EXTENSIONS` and for None.
    """
    return ext is None or ext.lower() in HTML_EXTENSIONS
def get_depth(url_path: str, root_path: str) -> int:
    """Count how many path segments *url_path* lies below *root_path*.

    Trailing slashes on both paths are ignored.  Depth 0 is the root page
    itself (or its trailing-slash variant); every further ``/``-separated
    segment after the root adds 1.

    Parameters:
        url_path: The path to measure.
        root_path: The root path to measure from.

    Returns:
        Integer depth >= 0.
    """
    base = root_path.rstrip('/')
    target = url_path.rstrip('/')
    if target == base:
        return 0

    # Cut off as many leading characters as the root has, then discard the
    # separator(s) that joined the root to the remainder.
    remainder = target[len(base):].lstrip('/')
    return len(remainder.split('/')) if remainder else 0
def is_http_url(url: str) -> bool:
    """Report whether *url* has an ``http`` or ``https`` scheme.

    Parameters:
        url: The URL string to test.

    Returns:
        True for HTTP/HTTPS URLs, False for everything else.
    """
    scheme = urlparse(url).scheme
    return scheme == 'http' or scheme == 'https'
def is_http_to_https_redirect(original_url: str, final_url: str) -> bool:
    """Return True if *original_url* and *final_url* differ only in scheme upgrade.

    A pure HTTP-to-HTTPS redirect is one where the original URL uses
    ``http`` and the final URL uses ``https``, with the same network
    location (case-insensitive), path, path parameters, and query string.
    Any other difference (different host, port, path, ``;parameters``, or
    query) returns False.  Fragments are not compared.

    Parameters:
        original_url: The URL before the redirect.
        final_url: The URL after the redirect.

    Returns:
        True if the redirect is a simple HTTP-to-HTTPS scheme upgrade.
    """
    orig = urlparse(original_url)
    final = urlparse(final_url)

    if orig.scheme != 'http' or final.scheme != 'https':
        return False

    return (
        orig.netloc.lower() == final.netloc.lower()
        and orig.path == final.path
        # urlparse splits ';params' off the last path segment into its own
        # field, so comparing ``path`` alone would treat '/a;v=1' and
        # '/a;v=2' as the same resource.
        and orig.params == final.params
        and orig.query == final.query
    )