# Source code for link_checker.cli

"""Command-line entry point for rms-link-checker."""

from __future__ import annotations

import argparse
import logging
import sys

from link_checker import __version__
from link_checker.config import load_config
from link_checker.crawler import Crawler
from link_checker.progress import ProgressReporter
from link_checker.report import generate_report

# Process exit statuses passed to ``sys.exit`` by ``main``.
# Crawl finished and ``results.has_problems()`` was false.
_EXIT_OK = 0
# Crawl finished and ``results.has_problems()`` was true.
_EXIT_PROBLEMS = 1
# ``load_config`` raised ``ValueError``.
_EXIT_CONFIG_ERROR = 2
# Crawl was cut short by ``KeyboardInterrupt``.
# NOTE(review): presumably chosen to match the shell convention 128 + SIGINT (2) — confirm.
_EXIT_INTERRUPTED = 130


def _build_parser() -> argparse.ArgumentParser:
    """Create the command-line parser used by the ``link_check`` program.

    Every value-carrying option (including the ``store_true`` flag) is given
    an explicit default of ``None``.
    NOTE(review): presumably this lets the configuration loader distinguish
    "not given on the command line" from an explicit value — confirm against
    ``load_config``.

    Returns:
        A fully populated :class:`argparse.ArgumentParser`.
    """
    cli = argparse.ArgumentParser(
        prog='link_check',
        description='Crawl a website, check all links, and generate a report.',
    )
    add = cli.add_argument

    # Optional positional: the crawl's starting point.
    add(
        'root_url',
        nargs='?',
        default=None,
        help='Root URL to begin crawling (may also be set in the config file).',
    )

    # Report and logging destinations.
    add(
        '-o',
        '--output',
        metavar='PATH',
        default=None,
        help='File path for the final plain-text report (default: stdout).',
    )
    add(
        '--log-file',
        metavar='PATH',
        default=None,
        help='File path for log messages (default: stderr).',
    )
    add(
        '--log-level',
        metavar='LEVEL',
        default=None,
        # Upper-case the value before it is checked against ``choices`` so
        # that lower-case spellings are accepted too.
        type=str.upper,
        choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'],
        help='Minimum log level (default: INFO).',
    )

    # Integer-valued tuning knobs: (flag, metavar, help text), registered in
    # this order so the ``--help`` listing keeps its layout.
    integer_options = (
        (
            '--timeout',
            'SECONDS',
            'Timeout in seconds for each HTTP request (default: 10).',
        ),
        (
            '--retries',
            'N',
            'Number of retry attempts for transient failures (default: 3).',
        ),
        (
            '--max-requests',
            'N',
            'Maximum total HTTP requests to issue (default: unlimited).',
        ),
        (
            '--max-depth',
            'N',
            'Maximum directory depth to crawl (default: unlimited).',
        ),
        (
            '--max-threads',
            'N',
            'Maximum number of concurrent threads (default: 10).',
        ),
        (
            '--max-referencing-pages',
            'N',
            'Maximum referencing pages listed per URL in report (default: 10).',
        ),
    )
    for flag, placeholder, description in integer_options:
        add(flag, type=int, default=None, metavar=placeholder, help=description)

    verify_help = (
        'TLS certificate verification. '
        'Pass "false" to disable (insecure, e.g. for self-signed certs), '
        '"true" to enable (default), or a path to a CA-bundle file.'
    )
    add('--verify', metavar='BOOL_OR_PATH', default=None, help=verify_help)

    redirect_help = (
        'Suppress redirects where only the scheme changes from http to https '
        '(same host, path, and query) from the Redirects section of the report.'
    )
    add(
        '--ignore-http-to-https-redirects',
        dest='ignore_http_to_https_redirects',
        action='store_true',
        default=None,
        help=redirect_help,
    )

    add(
        '--config-file',
        metavar='PATH',
        default=None,
        help='Path to a YAML configuration file.',
    )
    add(
        '--version',
        action='version',
        version=f'%(prog)s {__version__}',
    )
    return cli


def _setup_logging(log_file: str | None, log_level: str) -> None:
    """Configure the ``link_checker`` logger.

    Handlers previously attached to the logger are closed and removed, so the
    logger ends up with exactly one handler. Propagation to ancestor loggers
    is switched off.

    Parameters:
        log_file: File path for log output. If None, logs go to stderr.
        log_level: Log level name (e.g. ``'INFO'``), matched
            case-insensitively. A name that does not resolve to a numeric
            level of the :mod:`logging` module falls back to ``INFO``.
    """
    numeric_level = getattr(logging, log_level.upper(), None)
    # ``logging`` also exposes upper-case attributes that are not levels
    # (e.g. the ``BASIC_FORMAT`` string); passing one to ``setLevel`` would
    # raise, so anything that is not an integer is treated as unknown.
    if not isinstance(numeric_level, int):
        numeric_level = logging.INFO

    # Build the new handler first: if opening the log file fails, the
    # logger's existing handlers are left untouched.
    handler: logging.Handler
    if log_file is not None:
        handler = logging.FileHandler(log_file, encoding='utf-8')
    else:
        handler = logging.StreamHandler(sys.stderr)
    handler.setLevel(numeric_level)
    formatter = logging.Formatter('%(asctime)s %(name)s %(levelname)s %(message)s')
    handler.setFormatter(formatter)

    log = logging.getLogger('link_checker')
    log.setLevel(numeric_level)
    # Close old handlers before dropping them so open log files are released.
    for h in log.handlers:
        h.close()
    log.handlers.clear()
    log.addHandler(handler)
    log.propagate = False


def main() -> None:
    """Entry point for the link_check CLI command.

    Parses the command line, loads the configuration, sets up logging, runs
    the crawl and emits the report, either to ``config.output`` or to stdout.

    This function always terminates the process through :func:`sys.exit`:

    * ``_EXIT_CONFIG_ERROR`` if ``load_config`` raises :class:`ValueError`
      (the message is printed to stderr and nothing is crawled);
    * ``_EXIT_INTERRUPTED`` if the crawl is cut short by
      :class:`KeyboardInterrupt` (a report is still generated from
      ``crawler.results``);
    * ``_EXIT_PROBLEMS`` if ``results.has_problems()`` is true;
    * ``_EXIT_OK`` otherwise.
    """
    parser = _build_parser()
    args = parser.parse_args()

    try:
        config = load_config(args, config_path=args.config_file)
    except ValueError as exc:
        print(f'Error: {exc}', file=sys.stderr)
        sys.exit(_EXIT_CONFIG_ERROR)

    _setup_logging(config.log_file, config.log_level)

    progress = ProgressReporter()
    crawler = Crawler(config, progress=progress)

    progress.start()
    interrupted = False
    try:
        results = crawler.crawl()
    except KeyboardInterrupt:
        interrupted = True
        print('\nInterrupted — waiting for in-flight requests to finish...', file=sys.stderr)
        crawler.abort()
        # Report on whatever the crawler has accumulated so far.
        results = crawler.results
    finally:
        # Stop the progress reporter on every path, including errors that
        # propagate out of the crawl.
        progress.stop()

    if interrupted:
        print('Generating partial report.', file=sys.stderr)

    report = generate_report(results, config)

    if config.output:
        with open(config.output, 'w', encoding='utf-8') as fh:
            fh.write(report)
    else:
        # ``end=''`` so that nothing is appended to the report text.
        print(report, end='')

    sys.exit(
        _EXIT_INTERRUPTED
        if interrupted
        else (_EXIT_PROBLEMS if results.has_problems() else _EXIT_OK)
    )