# Source code for link_checker.report

"""Plain-text report generator for the 11 report sections."""

from __future__ import annotations

import http
import time
from collections import defaultdict

from link_checker.config import CrawlConfig
from link_checker.results import CrawlResults

# Number of bytes in one megabyte (binary: 1024 * 1024). Used as the divisor
# when the statistics section reports the downloaded byte count in MB.
_BYTES_PER_MB = 1024 * 1024


def generate_report(results: CrawlResults, config: CrawlConfig) -> str:
    """Assemble the complete plain-text report for a finished crawl.

    The report is made of 11 sections, emitted in this order: configuration,
    statistics, broken links, broken anchors, non-200 responses, redirects,
    misplaced assets, ignored URLs, non-HTTP links, SSL warnings, and
    unvalidated anchors.

    Parameters:
        results: Completed crawl results.
        config: Crawl configuration used.

    Returns:
        The full report as a multi-line string: sections separated by one
        blank line, terminated by a single newline.
    """
    # The first two renderers take a single argument each; every other
    # renderer shares the (results, config) signature.
    rendered = [
        _section_config_summary(config),
        _section_statistics(results),
    ]
    remaining_renderers = (
        _section_broken_links,
        _section_broken_anchors,
        _section_non200_responses,
        _section_redirects,
        _section_misplaced_assets,
        _section_ignore_matches,
        _section_non_http_links,
        _section_ssl_warnings,
        _section_unvalidated_anchors,
    )
    for render in remaining_renderers:
        rendered.append(render(results, config))

    report_body = '\n\n'.join(rendered)
    return report_body + '\n'
# ---------------------------------------------------------------------------
# §10.1 Configuration Summary
# ---------------------------------------------------------------------------


def _section_config_summary(config: CrawlConfig) -> str:
    """Render the Configuration Summary section.

    Optional settings (output file, http→https suppression, and the three
    URL-prefix lists) are only listed when they are set / non-empty.

    Parameters:
        config: Crawl configuration.

    Returns:
        Formatted section string.
    """
    lines = ['=== Configuration Summary ===']
    lines.append(f'Root URL: {config.root_url}')
    lines.append(f'Timeout: {config.timeout}s')
    lines.append(f'Retries: {config.retries}')
    lines.append(f'Max threads: {config.max_threads}')

    # ``None`` means "no limit" for both depth and request count.
    max_depth_str = str(config.max_depth) if config.max_depth is not None else 'unlimited'
    lines.append(f'Max depth: {max_depth_str}')
    max_req_str = str(config.max_requests) if config.max_requests is not None else 'unlimited'
    lines.append(f'Max requests: {max_req_str}')
    lines.append(f'Max ref. pages: {config.max_referencing_pages}')

    if config.output is not None:
        lines.append(f'Output file: {config.output}')
    if config.ignore_http_to_https_redirects:
        lines.append('Ignore http→https redirects: yes')

    # Each non-empty prefix list gets a blank line, a title and one bullet
    # per prefix.
    prefix_groups = (
        ('Asset URL prefixes:', config.asset_urls),
        ('No-crawl URL prefixes:', config.no_crawl_urls),
        ('Ignore URL prefixes:', config.ignore_urls),
    )
    for title, urls in prefix_groups:
        if urls:
            lines.append('')
            lines.append(title)
            lines.extend(f' - {url}' for url in urls)

    return '\n'.join(lines)


# ---------------------------------------------------------------------------
# §10.2 Statistics Summary
# ---------------------------------------------------------------------------


def _section_statistics(results: CrawlResults) -> str:
    """Render the Statistics Summary section.

    Elapsed time is measured from ``results.statistics.start_time`` up to the
    moment this function is called.

    Parameters:
        results: Completed crawl results.

    Returns:
        Formatted section string.
    """
    stats = results.statistics
    # time.time() is wall-clock time and may step backwards (e.g. NTP
    # adjustment); clamp so the report never shows a negative duration.
    elapsed = max(0.0, time.time() - stats.start_time)
    minutes = int(elapsed // 60)
    seconds = int(elapsed % 60)
    rps = stats.total_requests / elapsed if elapsed > 0 else 0.0
    mb = stats.bytes_downloaded / _BYTES_PER_MB

    lines = ['=== Statistics Summary ===']
    lines.append(f'Elapsed time: {minutes}m {seconds}s')
    lines.append(f'Total HTTP requests: {stats.total_requests:,}')
    lines.append(f'Requests per second: {rps:.1f}')
    lines.append(f'Total bytes downloaded: {mb:.1f} MB')
    lines.append('')
    lines.append(f'Internal pages crawled: {stats.pages_crawled}')
    lines.append(f'Internal pages checked: {stats.pages_checked}')
    lines.append(f'External links checked: {stats.external_checked}')
    lines.append('')

    broken_count = len(results.broken_links)
    anchor_count = len(results.broken_anchors)
    unval_count = len(results.unvalidated_anchors)
    redirect_count = len(results.redirects)
    misplaced_count = len(results.misplaced_assets)
    ssl_count = len(results.ssl_warnings)
    non_http_count = len(results.non_http_links)

    lines.append(f'Broken links: {broken_count}')
    lines.append(f'Broken anchors: {anchor_count}')
    lines.append(f'Unvalidated anchors: {unval_count}')
    lines.append(f'Redirects encountered: {redirect_count}')
    lines.append(f'Misplaced assets: {misplaced_count}')
    lines.append(f'SSL warnings: {ssl_count} domain{"s" if ssl_count != 1 else ""}')
    lines.append('')

    if non_http_count:
        lines.append(f'Non-HTTP scheme links: {non_http_count}')
        scheme_counts: dict[str, int] = defaultdict(int)
        for lk in results.non_http_links:
            scheme_counts[lk.scheme] += 1
        # Most frequent scheme first; ties keep first-seen order (stable sort).
        for scheme, count in sorted(scheme_counts.items(), key=lambda x: -x[1]):
            lines.append(f' {scheme}: {count}')
        lines.append('')

    if stats.per_domain_requests:
        lines.append('Per-domain request breakdown:')
        # Busiest domain first.
        sorted_domains = sorted(stats.per_domain_requests.items(), key=lambda x: -x[1])
        for domain, count in sorted_domains:
            lines.append(f' {domain}: {count:,}')

    return '\n'.join(lines)


# ---------------------------------------------------------------------------
# §10.3 Broken Links
# ---------------------------------------------------------------------------


def _section_broken_links(results: CrawlResults, config: CrawlConfig) -> str:
    """Render the Broken Links section.

    Broken links are grouped by the page that references them; pages and the
    links under each page are listed in sorted order.

    Parameters:
        results: Completed crawl results.
        config: Crawl configuration. Not read by this section; accepted so
            that all section renderers share the same signature.

    Returns:
        Formatted section string.
    """
    items = results.broken_links
    lines = [f'=== Broken Links ({len(items)}) ===']
    if not items:
        return '\n'.join(lines)

    # Invert the url -> pages relation into page -> [(url, error), ...].
    by_page: dict[str, list[tuple[str, str]]] = defaultdict(list)
    for bl in items:
        for page in bl.referencing_pages:
            by_page[page].append((bl.url, bl.error))

    for page in sorted(by_page):
        lines.append('')
        lines.append(f'Page: {page}')
        for url, error in sorted(by_page[page]):
            # Separator keeps the URL and the error text from running together.
            lines.append(f' - {url} — {error}')

    return '\n'.join(lines)


# ---------------------------------------------------------------------------
# §10.4 Broken Anchors
# ---------------------------------------------------------------------------


def _section_broken_anchors(results: CrawlResults, config: CrawlConfig) -> str:
    """Render the Broken Anchors section.

    Parameters:
        results: Completed crawl results.
        config: Crawl configuration (used for ``max_referencing_pages``).

    Returns:
        Formatted section string.
    """
    items = results.broken_anchors
    lines = [f'=== Broken Anchors ({len(items)}) ===']
    for ba in items:
        lines.append('')
        lines.append(f'Target: {ba.target_url}')
        lines.extend(_referenced_by_lines(ba.referencing_pages, config.max_referencing_pages))
    return '\n'.join(lines)


# ---------------------------------------------------------------------------
# §10.5 Non-200 Responses
# ---------------------------------------------------------------------------


def _section_non200_responses(results: CrawlResults, config: CrawlConfig) -> str:
    """Render the Non-200 Responses section.

    Responses are grouped by status code (ascending); URLs within a status
    group are sorted alphabetically.

    Parameters:
        results: Completed crawl results.
        config: Crawl configuration (used for ``max_referencing_pages``).

    Returns:
        Formatted section string.
    """
    items = results.non200_responses
    lines = [f'=== Non-200 Responses ({len(items)}) ===']

    by_status: dict[int, list[tuple[str, list[str]]]] = defaultdict(list)
    for r in items:
        by_status[r.status_code].append((r.url, r.referencing_pages))

    for status in sorted(by_status):
        reason = _http_reason(status)
        lines.append('')
        # Non-standard codes have no reason phrase; avoid emitting "799 :".
        lines.append(f'{status} {reason}:' if reason else f'{status}:')
        for url, refs in sorted(by_status[status], key=lambda x: x[0]):
            lines.append('')
            lines.append(f' - {url}')
            lines.extend(_referenced_by_lines(refs, config.max_referencing_pages))

    return '\n'.join(lines)


# ---------------------------------------------------------------------------
# §10.6 Redirects
# ---------------------------------------------------------------------------


def _section_redirects(results: CrawlResults, config: CrawlConfig) -> str:
    """Render the Redirects section.

    Parameters:
        results: Completed crawl results.
        config: Crawl configuration (used for ``max_referencing_pages`` and
            for the http→https suppression note in the header).

    Returns:
        Formatted section string.
    """
    items = results.redirects
    header = f'=== Redirects ({len(items)}) ==='
    if config.ignore_http_to_https_redirects:
        header += ' [http→https upgrades suppressed]'
    lines = [header]

    for r in items:
        lines.append('')
        # Arrow separates source and destination so the two URLs stay readable.
        lines.append(f'{r.original_url} → {r.final_url} ({r.status_code})')
        lines.extend(_referenced_by_lines(r.referencing_pages, config.max_referencing_pages))

    return '\n'.join(lines)


# ---------------------------------------------------------------------------
# §10.7 Misplaced Assets
# ---------------------------------------------------------------------------

_ASSET_TYPE_ORDER = ['Image', 'Document', 'Data', 'Infrastructure', 'Other']


def _section_misplaced_assets(results: CrawlResults, config: CrawlConfig) -> str:
    """Render the Misplaced Assets section.

    Assets are grouped by type. The types in ``_ASSET_TYPE_ORDER`` are always
    listed, in that order (with ``(none)`` when empty). Any other asset type
    present in the results is listed afterwards so that every asset counted
    in the header also appears in the body. Within each type, entries are
    sorted alphabetically by filename and separated by blank lines.

    Parameters:
        results: Completed crawl results.
        config: Crawl configuration (used for ``max_referencing_pages``).

    Returns:
        Formatted section string.
    """
    items = results.misplaced_assets
    lines = [f'=== Misplaced Assets ({len(items)}) ===']

    by_type: dict[str, list[tuple[str, list[str]]]] = defaultdict(list)
    for a in items:
        by_type[a.asset_type].append((a.url, a.referencing_pages))

    # Types outside the canonical order would otherwise be silently dropped.
    known_types = set(_ASSET_TYPE_ORDER)
    unknown_types = sorted((t for t in by_type if t not in known_types), key=str)

    for asset_type in [*_ASSET_TYPE_ORDER, *unknown_types]:
        lines.append('')
        lines.append(f'{asset_type}:')
        entries = by_type.get(asset_type, [])
        if not entries:
            lines.append(' (none)')
            continue
        # Sort by the last path component (the filename), not the full URL.
        ordered = sorted(entries, key=lambda x: x[0].rsplit('/', 1)[-1])
        for i, (url, refs) in enumerate(ordered):
            if i > 0:
                lines.append('')
            filename = url.rsplit('/', 1)[-1]
            lines.append(f' {filename} ({url})')
            lines.extend(_referenced_by_lines(refs, config.max_referencing_pages))

    return '\n'.join(lines)


# ---------------------------------------------------------------------------
# §10.9 Ignore URL Matches
# ---------------------------------------------------------------------------


def _section_ignore_matches(results: CrawlResults, config: CrawlConfig) -> str:
    """Render the Ignore URL Matches section.

    Parameters:
        results: Completed crawl results.
        config: Crawl configuration (used for ``max_referencing_pages``).

    Returns:
        Formatted section string.
    """
    items = results.ignore_matches
    lines = [f'=== Ignore URL Matches ({len(items)}) ===']
    for m in items:
        lines.append('')
        lines.append(m.url)
        lines.extend(_referenced_by_lines(m.referencing_pages, config.max_referencing_pages))
    return '\n'.join(lines)


# ---------------------------------------------------------------------------
# §10.10 Non-HTTP Scheme Links
# ---------------------------------------------------------------------------


def _section_non_http_links(results: CrawlResults, config: CrawlConfig) -> str:
    """Render the Non-HTTP Scheme Links section.

    Parameters:
        results: Completed crawl results.
        config: Crawl configuration (used for ``max_referencing_pages``).

    Returns:
        Formatted section string.
    """
    items = results.non_http_links
    lines = [f'=== Non-HTTP Scheme Links ({len(items)}) ===']
    for lk in items:
        lines.append('')
        lines.append(lk.url)
        lines.extend(_referenced_by_lines(lk.referencing_pages, config.max_referencing_pages))
    return '\n'.join(lines)


# ---------------------------------------------------------------------------
# §10.11 SSL Warnings
# ---------------------------------------------------------------------------


def _section_ssl_warnings(results: CrawlResults, config: CrawlConfig) -> str:
    """Render the SSL Warnings section.

    Each warning lists the domain with its error, followed by every affected
    URL and the pages referencing that URL.

    Parameters:
        results: Completed crawl results.
        config: Crawl configuration (used for ``max_referencing_pages``).

    Returns:
        Formatted section string.
    """
    items = results.ssl_warnings
    lines = [f'=== SSL Warnings ({len(items)} domain{"s" if len(items) != 1 else ""}) ===']
    for sw in items:
        lines.append('')
        # Separator keeps the domain and the error text from running together.
        lines.append(f'{sw.domain} — {sw.error}')
        lines.append(' Affected URLs:')
        for url, refs in sw.affected_urls:
            lines.append(f' - {url}')
            lines.extend(_referenced_by_lines(refs, config.max_referencing_pages))
    return '\n'.join(lines)


# ---------------------------------------------------------------------------
# §10.12 Unvalidated Anchors
# ---------------------------------------------------------------------------


def _section_unvalidated_anchors(results: CrawlResults, config: CrawlConfig) -> str:
    """Render the Unvalidated Anchors section.

    Parameters:
        results: Completed crawl results.
        config: Crawl configuration (used for ``max_referencing_pages``).

    Returns:
        Formatted section string.
    """
    items = results.unvalidated_anchors
    lines = [f'=== Unvalidated Anchors ({len(items)}) ===']
    for ua in items:
        lines.append('')
        lines.append(f'{ua.target_url} ({ua.reason})')
        lines.extend(_referenced_by_lines(ua.referencing_pages, config.max_referencing_pages))
    return '\n'.join(lines)


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------


def _truncated_pages(pages: list[str], max_pages: int) -> list[str]:
    """Return at most *max_pages* items from *pages*.

    A negative *max_pages* is treated as zero; a plain ``pages[:max_pages]``
    would instead drop items from the end of the list.

    Parameters:
        pages: List of page URLs.
        max_pages: Maximum number to return.

    Returns:
        Truncated list.
    """
    return pages[:max(max_pages, 0)]


def _referenced_by_lines(pages: list[str], max_pages: int) -> list[str]:
    """Build the "Referenced by" block shared by most report sections.

    Parameters:
        pages: Referencing page URLs.
        max_pages: Maximum number of pages to list before summarising.

    Returns:
        The heading line, one bullet per listed page, and a trailing
        "... and N more referencing pages" line when pages were omitted.
    """
    limit = max(max_pages, 0)
    block = [' Referenced by:']
    block.extend(f' - {page}' for page in _truncated_pages(pages, limit))
    hidden = len(pages) - limit
    if hidden > 0:
        block.append(f' ... and {hidden} more referencing pages')
    return block


def _http_reason(status_code: int) -> str:
    """Return the HTTP reason phrase for *status_code*, or an empty string.

    Parameters:
        status_code: HTTP status code integer.

    Returns:
        Reason phrase string (e.g. ``'Not Found'``); ``''`` when the code is
        not a member of ``http.HTTPStatus``.
    """
    try:
        return http.HTTPStatus(status_code).phrase
    except ValueError:
        return ''