Source code for link_checker.html_parser

"""HTML link extraction, anchor collection, and base-href handling."""

from __future__ import annotations

from dataclasses import dataclass
from urllib.parse import urljoin

from bs4 import BeautifulSoup, Tag

# Table of HTML element/attribute pairs that can carry a URL.
# Each entry: (element_name, attribute, is_asset)
#   element_name -- tag name to match (e.g. 'img').
#   attribute    -- attribute on that tag holding the URL value.
#   is_asset     -- boolean flag; in this table it is False only for
#                   'a', 'iframe' and 'form'.
#                   NOTE(review): presumably separates embedded resources
#                   from navigational targets -- confirm against the code
#                   that consumes this table (not visible in this section).
# 'srcset' attributes hold a comma-separated candidate list rather than a
# single URL; see _parse_srcset for how they are split.
_LINK_SPEC: tuple[tuple[str, str, bool], ...] = (
    ('a', 'href', False),
    ('img', 'src', True),
    ('img', 'srcset', True),
    ('link', 'href', True),
    ('script', 'src', True),
    ('iframe', 'src', False),
    ('source', 'src', True),
    ('source', 'srcset', True),
    ('video', 'src', True),
    ('video', 'poster', True),
    ('audio', 'src', True),
    ('object', 'data', True),
    ('embed', 'src', True),
    ('form', 'action', False),
)










def extract_anchors(html: str) -> frozenset[str]:
    """Collect every fragment target defined in *html*.

    Two sources are considered: the ``id`` attribute of any element and
    the ``name`` attribute of ``<a>`` elements. Empty attribute values
    are ignored.

    Parameters:
        html: Raw HTML string.

    Returns:
        Frozen set of anchor identifier strings.
    """
    document = BeautifulSoup(html, 'html.parser')

    # (matching elements, attribute that holds the anchor name)
    sources = (
        (document.find_all(id=True), 'id'),
        (document.find_all('a', attrs={'name': True}), 'name'),
    )

    found: set[str] = set()
    for elements, attribute in sources:
        for element in elements:
            # find_all is typed to yield more than Tag; keep only real tags.
            if isinstance(element, Tag):
                value = element.get(attribute)
                if value:
                    found.add(str(value))
    return frozenset(found)
def find_base_href(html: str) -> str | None:
    """Parse *html* and look up its ``<base href>`` value.

    The markup is parsed with the ``html.parser`` backend and the lookup
    itself is delegated to ``_find_base_href_from_soup``, which searches
    the document ``<head>``.

    Parameters:
        html: Raw HTML string.

    Returns:
        The href value, or None if no ``<base href>`` is found.
    """
    return _find_base_href_from_soup(BeautifulSoup(html, 'html.parser'))
def _find_base_href_from_soup(soup: BeautifulSoup) -> str | None:
    """Return the ``href`` of the first ``<base href>`` inside ``<head>``.

    Only ``<head>`` is searched, since the HTML specification requires
    ``<base>`` to live there. The lookup targets the first ``<base>``
    element that actually carries an ``href`` attribute, so a leading
    ``<base target="...">`` without ``href`` does not hide a later
    ``<base href="...">``.

    Parameters:
        soup: Parsed :class:`~bs4.BeautifulSoup` document.

    Returns:
        The href value, or None if there is no ``<head>``, no ``<base>``
        with an ``href`` attribute, or that ``href`` is empty.
    """
    head = soup.head
    if head is None:
        return None
    # href=True restricts the match to elements that have the attribute.
    base_tag = head.find('base', href=True)
    if not isinstance(base_tag, Tag):
        return None
    href = base_tag.get('href')
    return str(href) if href else None


def _parse_srcset(value: str) -> list[str]:
    """Parse a ``srcset`` attribute value into individual URLs.

    Follows the candidate-splitting rules of the HTML specification
    instead of splitting blindly on commas:

    * a URL is a run of non-whitespace characters, so commas *inside* a
      URL (``data:`` URIs, CDN transform paths) are preserved;
    * commas trailing a URL terminate that candidate and are stripped;
    * otherwise descriptors (``2x``, ``480w``, ...) are skipped up to the
      next comma that is not inside parentheses.

    Parameters:
        value: Raw ``srcset`` attribute string.

    Returns:
        List of URL strings (descriptors stripped), in document order.
    """
    ascii_whitespace = ' \t\n\f\r'
    urls: list[str] = []
    pos = 0
    end = len(value)

    while True:
        # Skip separators (whitespace and stray commas) before a candidate.
        while pos < end and (value[pos] in ascii_whitespace or value[pos] == ','):
            pos += 1
        if pos >= end:
            break

        # The URL runs up to the next whitespace character.
        start = pos
        while pos < end and value[pos] not in ascii_whitespace:
            pos += 1
        url = value[start:pos]

        if url.endswith(','):
            # "a.jpg, b.jpg": the comma ends this candidate; no descriptors.
            url = url.rstrip(',')
        else:
            # Skip descriptors up to the next top-level comma.
            depth = 0
            while pos < end:
                char = value[pos]
                pos += 1
                if char == '(':
                    depth += 1
                elif char == ')':
                    if depth:
                        depth -= 1
                elif char == ',' and depth == 0:
                    break

        # url is never empty here: leading commas were skipped above.
        urls.append(url)

    return urls