# Source code for link_checker.url_utils

"""URL normalization, prefix matching, and classification utilities."""

from __future__ import annotations

from urllib.parse import urlparse, urlunparse

# File extensions (lowercase, leading dot included) that identify HTML-like pages.
HTML_EXTENSIONS: frozenset[str] = frozenset(
    ('.asp', '.cgi', '.htm', '.html', '.jsp', '.php', '.shtml')
)

# Directory-index filenames: one ``index<ext>`` entry per HTML extension.
# A URL ending in one of these is equivalent to its parent directory URL,
# e.g. /cassini/index.html → /cassini/
_INDEX_FILENAMES: frozenset[str] = frozenset(f'index{ext}' for ext in HTML_EXTENSIONS)


def _strip_index_filename(path: str) -> str:
    """Drop a trailing directory-index filename from a URL path.

    The last ``/``-separated segment of *path* is compared
    case-insensitively against the known index filenames
    (``index.html``, ``index.php``, etc.).  On a match that segment is
    cut off, leaving everything up to and including the preceding ``/``
    (or an empty string when *path* contains no ``/``).  Any other path,
    including the empty string, is returned as-is.

    Parameters:
        path: The path component of a URL (no scheme, host, query, or
            fragment).

    Returns:
        The path with any trailing index filename removed.
    """
    _, _, tail = path.rpartition('/')
    if tail and tail.lower() in _INDEX_FILENAMES:
        # Slice by length so the original casing of the kept prefix is untouched.
        return path[: -len(tail)]
    return path



[docs]
def normalize_url(url: str) -> tuple[str, str | None]:
    """Normalize a URL to its canonical form.

    Transformations applied:

    - Scheme is preserved (``http`` remains ``http``, ``https`` remains ``https``).
    - Host (and port) is lowercased.  A ``user:password@`` prefix in the
      authority, if present, keeps its original case because userinfo is
      case-sensitive.
    - Fragment is stripped (returned separately).
    - Query string is preserved as part of the URL identity.
    - Directory-index filenames (``index.html``, ``index.php``, etc.)
      are stripped so ``/cassini/index.html`` → ``/cassini/``.

    Note that bare directory paths without a trailing slash (e.g.
    ``/cassini``) are left unchanged by this function.  Use
    :func:`add_trailing_slash` to add the trailing slash when you know
    the URL refers to a directory (e.g. for internal crawl targets).

    Parameters:
        url: The URL to normalize.

    Returns:
        A tuple of ``(canonical_url, fragment_or_none)``. For non-HTTP URLs
        the original string is returned unchanged with no fragment.
    """
    parsed = urlparse(url)

    if parsed.scheme not in ('http', 'https'):
        return url, None

    # Lowercasing the whole netloc would also rewrite credentials, which are
    # case-sensitive.  Split at the last '@' and lowercase only host[:port].
    userinfo, at_sign, host_port = parsed.netloc.rpartition('@')
    netloc = f'{userinfo}{at_sign}{host_port.lower()}'

    normalized = urlunparse(
        (
            parsed.scheme,
            netloc,
            _strip_index_filename(parsed.path),
            parsed.params,
            parsed.query,
            '',  # fragment is returned separately, never kept in the URL
        )
    )
    # An empty fragment is reported as None rather than ''.
    return normalized, parsed.fragment or None




[docs]
def add_trailing_slash(url: str) -> str:
    """Append ``/`` to a URL whose path ends in an extension-free segment.

    Intended for **internal** URLs (same domain as the crawl root) so that
    bare directory paths share one canonical spelling:

    - ``/cassini`` → ``/cassini/``
    - ``/cassini/`` → ``/cassini/`` (unchanged)
    - ``/cassini/page.html`` → ``/cassini/page.html`` (has extension, unchanged)
    - ``/data.csv`` → ``/data.csv`` (has extension, unchanged)
    - ``/path/.htaccess`` → ``/path/.htaccess`` (dotfile, treated as a file)

    A final segment containing a dot anywhere, including a leading dot as in
    ``.htaccess`` or ``.gitignore``, counts as a file and gets no slash.

    Call this after :func:`normalize_url` so that index-file stripping has
    already happened (``/cassini/index.html`` → ``/cassini/`` → unchanged here).

    Parameters:
        url: An already-normalized http/https URL.

    Returns:
        The URL with a trailing slash added to any extension-free path.
    """
    parts = urlparse(url)
    if parts.path.endswith('/'):
        return url

    _, _, final_segment = parts.path.rpartition('/')
    if '.' in final_segment:
        # Looks like a file name; hand the input back untouched.
        return url

    # Reassemble with the slash appended.  The fragment slot is left empty,
    # matching the "already-normalized" input contract.
    return urlunparse(
        (parts.scheme, parts.netloc, f'{parts.path}/', parts.params, parts.query, '')
    )




[docs]
def normalize_internal_url(url: str) -> tuple[str, str | None]:
    """Canonicalize an internal (same-domain) URL.

    Runs :func:`normalize_url` and then, for http/https results, passes the
    URL through :func:`add_trailing_slash` so directory-like paths (no file
    extension) end with ``/``:

    - ``/cassini`` → ``/cassini/``
    - ``/cassini/`` → ``/cassini/`` (unchanged)
    - ``/cassini/index.html`` → ``/cassini/`` (index stripped + already slash)
    - ``/cassini/page.html`` → ``/cassini/page.html`` (has extension, unchanged)

    Use this for deduplication and request URLs of internal crawl targets.
    For external URLs use :func:`normalize_url` alone to avoid altering
    the request path in ways the server may not expect.

    Parameters:
        url: The URL to normalize.

    Returns:
        A tuple of ``(canonical_url, fragment_or_none)``.
    """
    canonical, fragment = normalize_url(url)
    # Non-HTTP URLs come back from normalize_url untouched; leave them that way.
    if is_http_url(canonical):
        canonical = add_trailing_slash(canonical)
    return canonical, fragment




[docs]
def is_same_domain(url: str, root_url: str) -> bool:
    """Return True if *url* has the same host (including port) as *root_url*.

    Comparison is case-insensitive. Scheme is ignored. Subdomains are
    considered different domains. Any ``user:password@`` prefix in either
    URL is ignored, since credentials are not part of the host.

    Parameters:
        url: The URL to check.
        root_url: The reference URL whose host to compare against.

    Returns:
        True if both URLs share the same host and port.
    """
    # netloc is "[userinfo@]host[:port]"; keep only what follows the last '@'
    # so embedded credentials cannot make identical hosts compare unequal.
    url_host = urlparse(url).netloc.rpartition('@')[2].lower()
    root_host = urlparse(root_url).netloc.rpartition('@')[2].lower()
    return url_host == root_host




[docs]
def is_under_root(url_path: str, root_path: str) -> bool:
    """Report whether *url_path* equals or sits beneath *root_path*.

    Trailing slashes on both arguments are ignored.  Containment is judged
    on a segment boundary, so ``/cassinix`` is not under ``/cassini``.  A
    root that is empty or consists only of slashes contains every path.

    Parameters:
        url_path: The path component of the URL to test.
        root_path: The root path to test containment against.

    Returns:
        True if *url_path* is the same as or under *root_path*.
    """
    base = root_path.rstrip('/')
    if not base:
        # Site root: everything is inside it.
        return True

    target = url_path.rstrip('/')
    return target == base or target.startswith(f'{base}/')




[docs]
def matches_prefix(candidate_url: str, prefix_url: str) -> bool:
    """Return True if *candidate_url* matches *prefix_url* per spec §5.7 rules.

    Matching rules:

    - Scheme is ignored (http and https are equivalent).
    - Host comparison is case-insensitive (and must be equal).  The port,
      when present, is part of the comparison; a ``user:password@`` prefix
      is not.
    - Path of candidate must equal path of prefix, or start with prefix path
      followed by ``/`` (segment-boundary matching).  Trailing slashes on
      either path are ignored.

    Parameters:
        candidate_url: The URL to test.
        prefix_url: The prefix URL to match against.

    Returns:
        True if *candidate_url* matches *prefix_url*.
    """
    candidate = urlparse(candidate_url)
    prefix = urlparse(prefix_url)

    # netloc is "[userinfo@]host[:port]"; compare only the host[:port] part so
    # embedded credentials do not defeat an otherwise identical host.
    candidate_host = candidate.netloc.rpartition('@')[2].lower()
    prefix_host = prefix.netloc.rpartition('@')[2].lower()
    if candidate_host != prefix_host:
        return False

    prefix_path = prefix.path.rstrip('/')
    candidate_path = candidate.path.rstrip('/')

    # Appending '/' before the startswith check enforces the segment boundary:
    # "/docsx" must not match the prefix "/docs".
    return candidate_path == prefix_path or candidate_path.startswith(prefix_path + '/')




[docs]
def get_file_extension(url: str) -> str | None:
    """Extract the lowercase file extension from a URL's path.

    Only the path component is examined; the query string and fragment are
    ignored.  The result is None when the path ends in ``/``, when the last
    segment contains no dot, or when its only dot is the leading character
    (a dotfile such as ``.htaccess``).

    Parameters:
        url: The URL to inspect.

    Returns:
        Lowercase extension including the leading dot (e.g. ``'.pdf'``),
        or None if there is no extension.
    """
    _, _, filename = urlparse(url).path.rpartition('/')
    # Split on the last dot: an empty separator means "no dot at all", an
    # empty stem means the dot is the first character of the segment.
    stem, dot, suffix = filename.rpartition('.')
    if not dot or not stem:
        return None
    return f'{dot}{suffix}'.lower()




[docs]
def is_html_extension(ext: str | None) -> bool:
    """Tell whether *ext* denotes an HTML-like page.

    A missing extension (None) counts as HTML-like.  Otherwise the
    extension is lowercased and looked up in :data:`HTML_EXTENSIONS`.

    Parameters:
        ext: File extension including leading dot, or None.

    Returns:
        True for extensions in :data:`HTML_EXTENSIONS` and for None.
    """
    return ext is None or ext.lower() in HTML_EXTENSIONS




[docs]
def get_depth(url_path: str, root_path: str) -> int:
    """Count how many path segments *url_path* lies below *root_path*.

    Trailing slashes on both arguments are ignored.  The root itself is at
    depth 0 and each further ``/``-separated segment adds 1.

    No check is made that *url_path* actually starts with *root_path*: the
    first ``len(root)`` characters of the candidate are skipped and whatever
    remains is counted.

    Parameters:
        url_path: The path to measure.
        root_path: The root path to measure from.

    Returns:
        Integer depth >= 0.
    """
    base = root_path.rstrip('/')
    target = url_path.rstrip('/')

    # Whatever follows the root prefix, minus the separating slash(es).
    remainder = target[len(base) :].lstrip('/')
    if not remainder:
        return 0

    # N separators delimit N + 1 segments.
    return remainder.count('/') + 1




[docs]
def is_http_url(url: str) -> bool:
    """Check whether *url* has an ``http`` or ``https`` scheme.

    Parameters:
        url: The URL string to test.

    Returns:
        True for HTTP/HTTPS URLs, False for everything else.
    """
    scheme = urlparse(url).scheme
    return scheme == 'http' or scheme == 'https'




[docs]
def is_http_to_https_redirect(original_url: str, final_url: str) -> bool:
    """Return True if *original_url* and *final_url* differ only in scheme upgrade.

    A pure HTTP-to-HTTPS redirect is one where the original URL uses ``http``
    and the final URL uses ``https``, with the same host (case-insensitive),
    path (including any ``;parameters`` on the last segment), and query
    string.  Any other difference (different host, path, path parameters,
    query, or port) returns False.  Fragments are not compared.

    Parameters:
        original_url: The URL before the redirect.
        final_url: The URL after the redirect.

    Returns:
        True if the redirect is a simple HTTP-to-HTTPS scheme upgrade.
    """
    orig = urlparse(original_url)
    final = urlparse(final_url)
    return (
        orig.scheme == 'http'
        and final.scheme == 'https'
        # The port lives in netloc, so a port change is caught here too.
        and orig.netloc.lower() == final.netloc.lower()
        and orig.path == final.path
        # urlparse splits ";params" off the path; compare it explicitly so
        # "/a;v=1" → "/a;v=2" is not mistaken for a pure scheme upgrade.
        and orig.params == final.params
        and orig.query == final.query
    )