Source code for link_checker.config

"""Configuration dataclass and YAML loading for rms-link-checker."""

from __future__ import annotations

import argparse
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any

import yaml

# Upper-case log level names that the ``log_level`` setting is checked against.
_VALID_LOG_LEVELS: frozenset[str] = frozenset(
    ('DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL')
)

# Top-level keys a YAML config file is allowed to contain; the YAML loader
# reports any key outside this set as an error.
_KNOWN_YAML_KEYS: frozenset[str] = frozenset(
    (
        # crawl target
        'root_url',
        # request behaviour and crawl limits
        'timeout',
        'retries',
        'max_requests',
        'max_depth',
        'max_threads',
        # reporting and logging
        'max_referencing_pages',
        'log_level',
        'output',
        'log_file',
        # URL prefix lists
        'asset_urls',
        'no_crawl_urls',
        'ignore_urls',
        # TLS and redirect handling
        'verify',
        'ignore_http_to_https_redirects',
    )
)


@dataclass(frozen=True)
class CrawlConfig:
    """Frozen bundle of settings describing a single crawl.

    Attributes:
        root_url: URL the crawl starts from.
        timeout: Per-request HTTP timeout, in seconds.
        retries: How many times a transient failure is retried.
        max_requests: Cap on total HTTP requests; ``None`` means unlimited.
        max_depth: Cap on directory depth; ``None`` means unlimited.
        max_threads: Upper bound on concurrent threads.
        max_referencing_pages: Upper bound on referencing pages listed per
            URL in the report.
        log_level: Minimum log level, as a string.
        output: Report file path; ``None`` means stdout.
        log_file: Log message file path; ``None`` means stderr.
        asset_urls: URL prefixes under which asset files are expected.
        no_crawl_urls: URL prefixes that are checked but not crawled.
        ignore_urls: URL prefixes that are skipped entirely.
        verify: TLS certificate verification. ``True`` (the default) uses
            the system CA bundle, ``False`` turns verification off
            (insecure), and a string is taken as the path to a CA-bundle
            file.
        ignore_http_to_https_redirects: When ``True``, a redirect that only
            changes the scheme from ``http`` to ``https`` (same host, path,
            and query) is silently left out of the Redirects section of the
            report. Defaults to ``False``.
    """

    # The only field without a default.
    root_url: str

    # Request behaviour and crawl limits.
    timeout: int = 10
    retries: int = 3
    max_requests: int | None = None
    max_depth: int | None = None
    max_threads: int = 10

    # Reporting and logging.
    max_referencing_pages: int = 10
    log_level: str = 'INFO'
    output: str | None = None
    log_file: str | None = None

    # URL prefix lists; each defaults to an empty tuple.
    asset_urls: tuple[str, ...] = field(default_factory=tuple)
    no_crawl_urls: tuple[str, ...] = field(default_factory=tuple)
    ignore_urls: tuple[str, ...] = field(default_factory=tuple)

    # TLS and redirect handling.
    verify: bool | str = True
    ignore_http_to_https_redirects: bool = False
def load_config(
    cli_namespace: argparse.Namespace,
    *,
    config_path: str | None = None,
) -> CrawlConfig:
    """Build a :class:`CrawlConfig` by merging a YAML file and CLI overrides.

    Precedence (highest to lowest): CLI arguments > YAML file > defaults.
    The three URL-prefix lists (``asset_urls``, ``no_crawl_urls``,
    ``ignore_urls``) are read from the YAML file only.

    Parameters:
        cli_namespace: Parsed argparse namespace. Fields set to ``None`` (or
            absent from the namespace) are treated as "not specified" and do
            not override YAML or defaults.
        config_path: Path to a YAML configuration file. If ``None`` or empty,
            the path is taken from ``cli_namespace.config_file`` when that
            attribute exists; if neither yields a path, no file is loaded.

    Returns:
        A fully populated :class:`CrawlConfig` instance.

    Raises:
        ValueError: If required fields are missing, the config file cannot be
            parsed, or any value fails validation.
    """
    # ``getattr`` keeps namespaces that were built without a ``config_file``
    # attribute working instead of failing with AttributeError.
    effective_path = config_path or getattr(cli_namespace, 'config_file', None)

    yaml_data: dict[str, Any] = {}
    # An empty path counts as "not specified"; ``Path('')`` would otherwise
    # resolve to the current directory.
    if effective_path:
        yaml_data = _load_yaml_file(effective_path)

    cli_values = vars(cli_namespace)

    def _resolve(key: str, default: Any = None) -> Any:
        """Return the CLI value for *key* if set, else the YAML value, else *default*."""
        cli_val = cli_values.get(key)
        if cli_val is not None:
            return cli_val
        return yaml_data.get(key, default)

    root_url = _resolve('root_url')
    # YAML may deliver a non-string scalar (e.g. ``root_url: 123``); report it
    # as a configuration error rather than crashing on ``.strip()``.
    if root_url is not None and not isinstance(root_url, str):
        raise ValueError(f'root_url must be a string, got {root_url!r}')
    if root_url is None or root_url.strip() == '':
        raise ValueError(
            'root_url is required: provide it as a positional argument or in the config file.'
        )

    timeout = _coerce_int(_resolve('timeout', 10), 'timeout')
    retries = _coerce_int(_resolve('retries', 3), 'retries')

    # ``None`` means "unlimited" for both limits, so only coerce real values.
    max_requests_raw = _resolve('max_requests')
    max_requests = (
        _coerce_int(max_requests_raw, 'max_requests') if max_requests_raw is not None else None
    )
    max_depth_raw = _resolve('max_depth')
    max_depth = _coerce_int(max_depth_raw, 'max_depth') if max_depth_raw is not None else None

    max_threads = _coerce_int(_resolve('max_threads', 10), 'max_threads')
    max_referencing_pages = _coerce_int(
        _resolve('max_referencing_pages', 10), 'max_referencing_pages'
    )
    log_level = str(_resolve('log_level', 'INFO')).upper()
    output = _resolve('output')
    log_file = _resolve('log_file')

    # URL-prefix lists come from the YAML file only.
    asset_urls = _coerce_url_list(yaml_data.get('asset_urls'), 'asset_urls')
    no_crawl_urls = _coerce_url_list(yaml_data.get('no_crawl_urls'), 'no_crawl_urls')
    ignore_urls = _coerce_url_list(yaml_data.get('ignore_urls'), 'ignore_urls')

    verify: bool | str = _coerce_verify(_resolve('verify', True))
    ignore_http_to_https_redirects = _coerce_bool(
        _resolve('ignore_http_to_https_redirects', False),
        'ignore_http_to_https_redirects',
    )

    _validate(
        timeout=timeout,
        retries=retries,
        max_requests=max_requests,
        max_depth=max_depth,
        max_threads=max_threads,
        max_referencing_pages=max_referencing_pages,
        log_level=log_level,
    )

    return CrawlConfig(
        root_url=root_url,
        timeout=timeout,
        retries=retries,
        max_requests=max_requests,
        max_depth=max_depth,
        max_threads=max_threads,
        max_referencing_pages=max_referencing_pages,
        log_level=log_level,
        output=output,
        log_file=log_file,
        asset_urls=asset_urls,
        no_crawl_urls=no_crawl_urls,
        ignore_urls=ignore_urls,
        verify=verify,
        ignore_http_to_https_redirects=ignore_http_to_https_redirects,
    )
def _load_yaml_file(path: str) -> dict[str, Any]:
    """Load and parse a YAML config file.

    Parameters:
        path: Filesystem path to the YAML file.

    Returns:
        Parsed YAML contents as a dict (empty dict for empty files).

    Raises:
        ValueError: If the file does not exist, cannot be read, is not valid
            YAML, is not a mapping at the top level, or contains unknown keys.
    """
    # Read first and translate failures, rather than checking ``exists()``
    # beforehand: directories, permission problems and undecodable bytes are
    # all reported as ValueError, matching the documented contract.
    try:
        content = Path(path).read_text(encoding='utf-8')
    except (FileNotFoundError, NotADirectoryError):
        raise ValueError(f'Config file not found: {path!r}') from None
    except (OSError, UnicodeDecodeError) as exc:
        raise ValueError(f'Cannot read config file {path!r}: {exc}') from exc

    try:
        data = yaml.safe_load(content)
    except yaml.YAMLError as exc:
        raise ValueError(f'Invalid YAML in config file {path!r}: {exc}') from exc

    if data is None:
        # Empty file (or only comments): nothing configured.
        return {}
    if not isinstance(data, dict):
        raise ValueError(f'Config file {path!r} must contain a YAML mapping, got {type(data)}')

    # YAML keys need not be strings (e.g. ``1: x``); stringify them before
    # sorting/joining so the report itself cannot fail with TypeError.
    unknown = sorted(str(key) for key in data if key not in _KNOWN_YAML_KEYS)
    if unknown:
        unknown_text = ', '.join(unknown)
        valid_text = ', '.join(sorted(_KNOWN_YAML_KEYS))
        raise ValueError(
            f'Unknown key(s) in config file {path!r}: {unknown_text}. '
            f'Valid keys are: {valid_text}'
        )
    return data


def _coerce_verify(value: Any) -> bool | str:
    """Coerce *value* to a TLS verification setting.

    Parameters:
        value: Raw value from CLI or YAML. Accepts ``True``/``False`` (bool),
            ``'true'``/``'false'`` (case-insensitive strings), or a non-empty
            string path to a CA-bundle file.

    Returns:
        ``True``, ``False``, or a CA-bundle path string.

    Raises:
        ValueError: If *value* is not a bool or a non-empty string.
    """
    if isinstance(value, bool):
        return value
    if isinstance(value, str):
        low = value.lower()
        if low == 'true':
            return True
        if low == 'false':
            return False
        if value.strip() == '':
            raise ValueError('verify must be true, false, or a CA-bundle path, got empty string')
        # Any other non-blank string is taken to be a CA-bundle path.
        return value
    raise ValueError(f'verify must be true, false, or a CA-bundle path, got {value!r}')


def _coerce_bool(value: Any, field_name: str) -> bool:
    """Coerce *value* to a boolean.

    Accepts Python booleans directly, and the strings ``'true'`` / ``'false'``
    (case-insensitive).

    Parameters:
        value: Raw value from CLI or YAML.
        field_name: Field name used in the error message.

    Returns:
        ``True`` or ``False``.

    Raises:
        ValueError: If *value* cannot be interpreted as a boolean.
    """
    if isinstance(value, bool):
        return value
    if isinstance(value, str):
        low = value.lower()
        if low == 'true':
            return True
        if low == 'false':
            return False
    raise ValueError(f'{field_name} must be true or false, got {value!r}')


def _coerce_url_list(value: Any, field_name: str) -> tuple[str, ...]:
    """Coerce a YAML value to a tuple of strings, or raise on bad input.

    Parameters:
        value: Raw value from YAML (expected to be a list or absent/None).
        field_name: Field name used in the error message.

    Returns:
        Tuple of strings (empty if *value* is None).

    Raises:
        ValueError: If *value* is present but not a list, or if any item of
            the list is not a string.
    """
    if value is None:
        return ()
    if not isinstance(value, list):
        raise ValueError(
            f'{field_name} must be a list in the config file, got {type(value).__name__!r}'
        )
    for i, item in enumerate(value):
        if not isinstance(item, str):
            raise ValueError(
                f'{field_name}[{i}] must be a string, got {type(item).__name__!r}: {item!r}'
            )
    return tuple(value)


def _coerce_int(value: Any, field_name: str) -> int:
    """Coerce *value* to int, raising :exc:`ValueError` with a clear message on failure.

    Integers, integral floats (``10.0``) and integer-like strings (``'10'``)
    are accepted. Booleans and non-integral floats are rejected instead of
    being silently turned into ``0``/``1`` or truncated.

    Parameters:
        value: The raw value to coerce.
        field_name: The field name, used in the error message.

    Returns:
        Integer representation of *value*.

    Raises:
        ValueError: If *value* cannot be converted to an integer without loss.
    """
    message = f'{field_name} must be an integer, got {value!r}'
    # ``bool`` is a subclass of ``int``; ``int(True)`` would quietly yield 1.
    if isinstance(value, bool):
        raise ValueError(message)
    # Covers 1.5 as well as inf/nan, for which ``is_integer()`` is False.
    if isinstance(value, float) and not value.is_integer():
        raise ValueError(message)
    try:
        return int(value)
    except (TypeError, ValueError, OverflowError):
        raise ValueError(message) from None


def _validate(
    *,
    timeout: int,
    retries: int,
    max_requests: int | None,
    max_depth: int | None,
    max_threads: int,
    max_referencing_pages: int,
    log_level: str,
) -> None:
    """Validate config values, raising ValueError on failure.

    Checks run in the order of the parameters below; the first failing check
    raises.

    Parameters:
        timeout: Request timeout in seconds (must be > 0).
        retries: Retry count (must be >= 0).
        max_requests: Maximum HTTP requests (must be > 0 if set).
        max_depth: Maximum crawl depth (must be >= 0 if set).
        max_threads: Thread pool size (must be >= 1).
        max_referencing_pages: Max referencing pages in report (must be >= 1).
        log_level: Log level string (must be one of the standard levels).

    Raises:
        ValueError: If any value is invalid.
    """
    if timeout <= 0:
        raise ValueError(f'timeout must be > 0, got {timeout}')
    if retries < 0:
        raise ValueError(f'retries must be >= 0, got {retries}')
    if max_requests is not None and max_requests <= 0:
        raise ValueError(f'max_requests must be > 0, got {max_requests}')
    if max_depth is not None and max_depth < 0:
        raise ValueError(f'max_depth must be >= 0, got {max_depth}')
    if max_threads < 1:
        raise ValueError(f'max_threads must be >= 1, got {max_threads}')
    if max_referencing_pages < 1:
        raise ValueError(f'max_referencing_pages must be >= 1, got {max_referencing_pages}')
    if log_level not in _VALID_LOG_LEVELS:
        raise ValueError(f'log_level must be one of {sorted(_VALID_LOG_LEVELS)}, got {log_level!r}')