# Source code for link_checker.classifier

"""URL decision tree and asset type classification."""

from __future__ import annotations

from enum import Enum
from urllib.parse import urlparse

from link_checker.config import CrawlConfig
from link_checker.url_utils import (
    add_trailing_slash,
    get_file_extension,
    is_html_extension,
    is_http_url,
    is_same_domain,
    is_under_root,
    matches_prefix,
    normalize_url,
)

# ---------------------------------------------------------------------------
# Asset type classification
# ---------------------------------------------------------------------------

# Extensions reported as AssetType.IMAGE by classify_asset.
_IMAGE_EXTENSIONS: frozenset[str] = frozenset(
    (
        '.avif',
        '.bmp',
        '.gif',
        '.ico',
        '.jpeg',
        '.jpg',
        '.png',
        '.svg',
        '.tif',
        '.tiff',
        '.webp',
    )
)

# Extensions reported as AssetType.DOCUMENT by classify_asset.
_DOCUMENT_EXTENSIONS: frozenset[str] = frozenset(
    (
        '.csv',
        '.doc',
        '.docx',
        '.odp',
        '.ods',
        '.odt',
        '.pdf',
        '.ppt',
        '.pptx',
        '.rtf',
        '.txt',
        '.xls',
        '.xlsx',
    )
)

# Extensions reported as AssetType.DATA by classify_asset.
# NOTE: '.img' is classified as data here, not as an image.
_DATA_EXTENSIONS: frozenset[str] = frozenset(
    (
        '.img',
        '.lbl',
        '.lblx',
        '.tab',
        '.xml',
    )
)

# Extensions reported as AssetType.INFRASTRUCTURE by classify_asset
# (scripts, stylesheets, fonts, source maps, JSON).
_INFRASTRUCTURE_EXTENSIONS: frozenset[str] = frozenset(
    (
        '.css',
        '.eot',
        '.js',
        '.json',
        '.map',
        '.mjs',
        '.otf',
        '.ttf',
        '.woff',
        '.woff2',
    )
)


class AssetType(Enum):
    """Category of a non-HTML asset URL.

    Member values are the capitalised category names as strings.
    """

    IMAGE = 'Image'
    DOCUMENT = 'Document'
    DATA = 'Data'
    INFRASTRUCTURE = 'Infrastructure'
    # Fallback category: classify_asset returns this when the extension is
    # not in any of the known extension sets.
    OTHER = 'Other'
class UrlDisposition(Enum):
    """How the crawler should handle a given URL (spec §6 decision tree).

    Members are declared in the same order in which ``classify_url`` tests
    for them; the first matching rule wins.
    """

    # Step 1: the URL does not use an HTTP scheme.
    NON_HTTP = 'non_http'
    # Step 2: the URL matches an ``ignore_urls`` prefix.
    IGNORED = 'ignored'
    # Step 3: the canonical form of the URL is already in the visited set.
    ALREADY_VISITED = 'already_visited'
    # Step 4: the URL matches a ``no_crawl_urls`` prefix.
    NO_CRAWL = 'no_crawl'
    # Step 5: different domain, or not under the root path.
    EXTERNAL = 'external'
    # Step 6: the URL's depth exceeds the configured ``max_depth``.
    DEPTH_LIMITED = 'depth_limited'
    # Step 7: internal URL whose extension counts as HTML (or is absent).
    INTERNAL_CRAWL = 'internal_crawl'
    # Step 8: any other internal URL.
    INTERNAL_ASSET = 'internal_asset'
def classify_asset(extension: str) -> AssetType:
    """Return the :class:`AssetType` for a file extension.

    The lookup is case-insensitive: *extension* is lowercased before it is
    compared against the known extension sets.

    Parameters:
        extension: File extension including the leading dot
            (e.g. ``'.jpg'`` or ``'.JPG'``).

    Returns:
        The matching :class:`AssetType`, or :attr:`AssetType.OTHER` when the
        extension is not in any of the known sets.
    """
    ext = extension.lower()
    if ext in _IMAGE_EXTENSIONS:
        return AssetType.IMAGE
    if ext in _DOCUMENT_EXTENSIONS:
        return AssetType.DOCUMENT
    if ext in _DATA_EXTENSIONS:
        return AssetType.DATA
    if ext in _INFRASTRUCTURE_EXTENSIONS:
        return AssetType.INFRASTRUCTURE
    return AssetType.OTHER
def classify_url(
    url: str,
    *,
    config: CrawlConfig,
    root_url: str,
    root_path: str,
    visited_set: set[str],
    depth: int,
) -> UrlDisposition:
    """Classify a URL per the spec §6 decision tree.

    Steps (in order; the first matching rule wins):

    1. Non-HTTP scheme → :attr:`UrlDisposition.NON_HTTP`
    2. Matches ``ignore_urls`` → :attr:`UrlDisposition.IGNORED`
    3. Already visited → :attr:`UrlDisposition.ALREADY_VISITED`
    4. Matches ``no_crawl_urls`` → :attr:`UrlDisposition.NO_CRAWL`
    5. External (different domain or above root path)
       → :attr:`UrlDisposition.EXTERNAL`
    6. Exceeds depth limit → :attr:`UrlDisposition.DEPTH_LIMITED`
    7. Internal HTML/no-extension → :attr:`UrlDisposition.INTERNAL_CRAWL`
    8. Internal asset → :attr:`UrlDisposition.INTERNAL_ASSET`

    Parameters:
        url: The URL to classify (absolute, not yet normalized for
            query/fragment).
        config: Current crawl configuration.
        root_url: Normalized root URL.
        root_path: Path component of the root URL.
        visited_set: Set of already-visited canonical URLs.
        depth: Directory depth of *url* relative to root.

    Returns:
        The :class:`UrlDisposition` for the URL.
    """
    # Step 1: non-HTTP schemes are never crawled.
    if not is_http_url(url):
        return UrlDisposition.NON_HTTP

    # Step 2: explicitly ignored prefixes.
    if any(matches_prefix(url, prefix) for prefix in config.ignore_urls):
        return UrlDisposition.IGNORED

    # Step 3: compare against the visited set using the canonical form.
    # Only same-domain URLs get the trailing-slash treatment before lookup.
    canonical, _ = normalize_url(url)
    # Computed once and reused for the external check in step 5.
    same_domain = is_same_domain(url, root_url)
    if same_domain:
        canonical = add_trailing_slash(canonical)
    if canonical in visited_set:
        return UrlDisposition.ALREADY_VISITED

    # Step 4: prefixes that must not be crawled.
    if any(matches_prefix(url, prefix) for prefix in config.no_crawl_urls):
        return UrlDisposition.NO_CRAWL

    # Step 5: a different domain, or a path outside the root, is external.
    # The path check is skipped when the domain already differs.
    if not same_domain or not is_under_root(urlparse(url).path, root_path):
        return UrlDisposition.EXTERNAL

    # Step 6: a max_depth of None means "no depth limit".
    if config.max_depth is not None and depth > config.max_depth:
        return UrlDisposition.DEPTH_LIMITED

    # Steps 7-8: the file extension decides between crawling and asset handling.
    if is_html_extension(get_file_extension(url)):
        return UrlDisposition.INTERNAL_CRAWL
    return UrlDisposition.INTERNAL_ASSET
def is_misplaced_asset(
    url: str,
    *,
    config: CrawlConfig,
    root_url: str,
    root_path: str,
) -> bool:
    """Return True if *url* is a misplaced asset per spec §8.6.

    An asset is misplaced when **all** of the following hold:

    1. ``asset_urls`` is defined and non-empty.
    2. The asset does not fall under any ``asset_urls`` prefix.
    3. The asset is not external (is on the same domain and under root).
    4. The asset is not matched by an ``ignore_urls`` prefix.
    5. The asset is not matched by a ``no_crawl_urls`` prefix.

    Parameters:
        url: The asset URL to test.
        config: Current crawl configuration.
        root_url: Normalized root URL.
        root_path: Path component of the root URL.

    Returns:
        True if the asset is misplaced.
    """
    # Condition 1: with no asset_urls configured, nothing can be misplaced.
    if not config.asset_urls:
        return False

    # Condition 2: an asset under any allowed prefix is correctly placed.
    if any(matches_prefix(url, prefix) for prefix in config.asset_urls):
        return False

    # Condition 3: external assets are out of scope for this check.
    parsed_url = urlparse(url)
    if not is_same_domain(url, root_url) or not is_under_root(parsed_url.path, root_path):
        return False

    # Condition 4: ignored URLs are never reported.
    if any(matches_prefix(url, prefix) for prefix in config.ignore_urls):
        return False

    # Condition 5: misplaced only if no no_crawl_urls prefix matches either.
    return not any(matches_prefix(url, prefix) for prefix in config.no_crawl_urls)