# Source code for link_checker.html_parser

"""HTML link extraction, anchor collection, and base-href handling."""

from __future__ import annotations

from dataclasses import dataclass
from urllib.parse import urljoin

from bs4 import BeautifulSoup, Tag

# Table of the element/attribute pairs that extract_links() scans for URLs.
# Each entry: (element_name, attribute, is_asset)
#   element_name -- tag name to search for
#   attribute    -- attribute on that tag holding the URL ('srcset' values may
#                   hold several URLs and are split by _parse_srcset())
#   is_asset     -- copied onto ExtractedLink.is_asset for links found here
# extract_links() walks the entries in order, so its output is grouped by
# entry in the order listed below.
_LINK_SPEC: tuple[tuple[str, str, bool], ...] = (
    ('a', 'href', False),
    ('img', 'src', True),
    ('img', 'srcset', True),
    ('link', 'href', True),
    ('script', 'src', True),
    ('iframe', 'src', False),
    ('source', 'src', True),
    ('source', 'srcset', True),
    ('video', 'src', True),
    ('video', 'poster', True),
    ('audio', 'src', True),
    ('object', 'data', True),
    ('embed', 'src', True),
    ('form', 'action', False),
)



[docs]
@dataclass(frozen=True)
class ExtractedLink:
    """Immutable record of a single URL reference found in an HTML page.

    Instances are frozen, so they are hashable and compare by value.

    Attributes:
        url: Absolute URL of the link.
        source_element: Name of the HTML element the link came from
            (e.g. ``"a"``, ``"img"``).
        source_attribute: Name of the attribute that held the URL
            (e.g. ``"href"``, ``"src"``).
        is_asset: True if from an asset-only element (never crawled).
    """

    # Where the link points.
    url: str

    # Where in the markup the link was found.
    source_element: str
    source_attribute: str

    # Asset flag for the originating element/attribute pair.
    is_asset: bool




[docs]
def extract_links(
    html: str,
    page_url: str,
    *,
    base_url: str | None = None,
) -> list[ExtractedLink]:
    """Extract all links from *html* and return them as :class:`ExtractedLink` objects.

    Relative URLs are resolved against *base_url* (if given) or the
    ``<base href>`` tag in the document, falling back to *page_url*.
    The chosen base is itself resolved against *page_url* first, so a
    relative base such as ``<base href="/docs/">`` still yields absolute
    results.

    Links are de-duplicated per ``(url, element, attribute)`` combination.
    Empty values and the bare fragment ``"#"`` are skipped.

    Parameters:
        html: Raw HTML string to parse.
        page_url: URL of the page (used for relative URL resolution).
        base_url: Override for relative URL resolution. Supersedes any
            ``<base href>`` found in the document.

    Returns:
        List of :class:`ExtractedLink` with absolute URLs, grouped in the
        order of the entries in ``_LINK_SPEC``.
    """
    soup = BeautifulSoup(html, 'html.parser')

    # The declared base may be relative (e.g. "/docs/" or "../"). Joining it
    # onto page_url makes it absolute; an already-absolute base passes through
    # urljoin() unchanged, so existing callers see the same results.
    declared_base = base_url or _find_base_href_from_soup(soup)
    if declared_base:
        effective_base = urljoin(page_url, declared_base.strip())
    else:
        effective_base = page_url

    results: list[ExtractedLink] = []
    # De-duplication key: the same URL may legitimately appear once per
    # distinct (element, attribute) source.
    seen: set[tuple[str, str, str]] = set()

    for element_name, attr, is_asset in _LINK_SPEC:
        for tag in soup.find_all(element_name):
            if not isinstance(tag, Tag):
                continue
            raw_val = tag.get(attr)
            if not raw_val:
                continue

            # srcset holds a list of candidates; every other attribute is a
            # single URL.
            if attr == 'srcset':
                urls = _parse_srcset(str(raw_val))
            else:
                urls = [str(raw_val).strip()]

            for raw_url in urls:
                # "#" alone is a placeholder link, not a real target.
                if not raw_url or raw_url == '#':
                    continue
                absolute = urljoin(effective_base, raw_url)
                key = (absolute, element_name, attr)
                if key in seen:
                    continue
                seen.add(key)
                results.append(
                    ExtractedLink(
                        url=absolute,
                        source_element=element_name,
                        source_attribute=attr,
                        is_asset=is_asset,
                    )
                )

    return results




[docs]
def extract_anchors(html: str) -> frozenset[str]:
    """Collect every fragment target defined in *html*.

    A target is either the non-empty ``id`` attribute of any element or the
    non-empty ``name`` attribute of an ``<a>`` element.

    Parameters:
        html: Raw HTML string.

    Returns:
        Frozen set of anchor identifier strings.
    """
    document = BeautifulSoup(html, 'html.parser')

    # Any element carrying a non-empty ``id``.
    id_targets = {
        str(element.get('id'))
        for element in document.find_all(id=True)
        if isinstance(element, Tag) and element.get('id')
    }

    # Legacy ``<a name="...">`` anchors.
    name_targets = {
        str(link.get('name'))
        for link in document.find_all('a', attrs={'name': True})
        if isinstance(link, Tag) and link.get('name')
    }

    return frozenset(id_targets | name_targets)




[docs]
def find_base_href(html: str) -> str | None:
    """Parse *html* and return the ``<base href>`` value from its ``<head>``.

    Convenience wrapper that builds the parse tree with ``html.parser`` and
    delegates the lookup to :func:`_find_base_href_from_soup`.

    Parameters:
        html: Raw HTML string.

    Returns:
        The href value, or None if no ``<base href>`` is found.
    """
    return _find_base_href_from_soup(BeautifulSoup(html, 'html.parser'))



def _find_base_href_from_soup(soup: BeautifulSoup) -> str | None:
    """Return the first ``<base href>`` value from the document ``<head>``.

    Searches only within ``<head>`` per the HTML specification, which requires
    ``<base>`` to be a child of ``<head>``. ``<base>`` elements that carry no
    ``href`` attribute (for example ``<base target="_blank">``) are skipped,
    so a later ``<base href>`` is still found.

    Parameters:
        soup: Parsed :class:`~bs4.BeautifulSoup` document.

    Returns:
        The href value, or None if no ``<head>`` or no ``<base href>`` is found
        (an empty ``href`` also yields None).
    """
    head = soup.head
    if head is None:
        return None
    # href=True restricts the match to <base> tags that actually have an href
    # attribute; a plain find('base') would stop at a target-only <base>.
    base_tag = head.find('base', href=True)
    if not isinstance(base_tag, Tag):
        return None
    href = base_tag.get('href')
    return str(href) if href else None


def _parse_srcset(value: str) -> list[str]:
    """Parse a ``srcset`` attribute value into individual URLs.

    Handles both bare URLs and ``url descriptor`` pairs.

    Parameters:
        value: Raw ``srcset`` attribute string.

    Returns:
        List of URL strings (descriptors stripped).
    """
    urls: list[str] = []
    for part in value.split(','):
        tokens = part.strip().split()
        if tokens:
            urls.append(tokens[0])
    return urls