"""HTTP client with retry, redirect, SSL, and User-Agent support."""
from __future__ import annotations
import logging
import time as _time_module
from collections.abc import Callable
from dataclasses import dataclass, field
from urllib.parse import urljoin, urlparse
import requests
import requests.exceptions
# Module-wide logger; every warning emitted by this client goes to 'link_checker'.
logger = logging.getLogger('link_checker')
# Maximum number of redirect hops followed for one request before it is
# reported as "Too many redirects".
_MAX_REDIRECTS = 10
# HTTP status codes treated as transient failures and therefore retried:
# 429 (Too Many Requests) and 503 (Service Unavailable).
_TRANSIENT_STATUSES: frozenset[int] = frozenset({429, 503})
@dataclass
class RedirectHop:
    """One step of a redirect chain: the URL requested and the status it answered with.

    Attributes:
        url: URL that was requested at this step.
        status_code: HTTP status code the server returned for ``url``.
    """

    url: str
    status_code: int
@dataclass
class RequestResult:
    """Outcome of one logical HTTP request, redirects included.

    Attributes:
        final_url: URL reached once every redirect has been followed.
        status_code: HTTP status code of the last response.
        headers: Headers of the last response.
        body: Decoded response text; only filled in for GET requests.
        content_type: ``Content-Type`` header value, or ``None`` when absent.
        redirect_chain: Redirect hops traversed before ``final_url``, in order.
        error: Human-readable failure description, or ``None`` on success.
        bytes_downloaded: Approximate number of bytes received.
    """

    # Required fields, in positional order.
    final_url: str
    status_code: int
    headers: dict[str, str]
    body: str | None
    content_type: str | None
    # Optional fields; the factory gives each instance its own list.
    redirect_chain: list[RedirectHop] = field(default_factory=list)
    error: str | None = None
    bytes_downloaded: int = 0
class HttpClient:
    """HTTP client wrapping :mod:`requests` with retry, redirect, and SSL handling.

    Redirects are followed manually, one request per hop, so that every hop
    can be recorded in the result.  Transient failures are retried with a
    pause between attempts.  The client owns a :class:`~requests.Session`;
    call :meth:`close` or use the client as a context manager to release it.

    Parameters:
        timeout: Timeout in seconds for each individual request attempt.
        retries: Maximum number of retry attempts for transient failures.
        user_agent: User-Agent header value to send with every request.
        verify: TLS certificate verification. Pass ``False`` to disable
            (e.g. for internal environments with self-signed certificates),
            or a path to a CA bundle. Defaults to ``True``.
        sleep: Callable used to pause between retry attempts. Defaults to
            :func:`time.sleep`. Pass a no-op (e.g. ``lambda _: None``) in
            tests to avoid real waits.
    """

    def __init__(
        self,
        *,
        timeout: int,
        retries: int,
        user_agent: str,
        verify: bool | str = True,
        sleep: Callable[[float], None] | None = None,
    ) -> None:
        """Initialise the HTTP client.

        Parameters:
            timeout: Timeout in seconds per request attempt.
            retries: Max retry attempts for transient errors.
            user_agent: User-Agent string.
            verify: TLS verification flag or CA-bundle path.
            sleep: Optional callable for inter-retry pauses. Defaults to
                :func:`time.sleep`.
        """
        self._timeout = timeout
        self._retries = retries
        self._user_agent = user_agent
        self._verify: bool | str = verify
        self._sleep: Callable[[float], None] = (
            sleep if sleep is not None else _time_module.sleep
        )
        self._session = self._make_session()
        self._ssl_warned_domains: set[str] = set()

    def __enter__(self) -> HttpClient:
        """Return the client itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, *exc_info: object) -> None:
        """Close the session when the ``with`` block exits; exceptions propagate."""
        self.close()

    def close(self) -> None:
        """Close the underlying session, releasing its pooled connections."""
        self._session.close()

    @property
    def ssl_warned_domains(self) -> set[str]:
        """Domains that have already triggered an SSL warning.

        Returns:
            A copy of the internal set, so callers cannot mutate client state.
        """
        return set(self._ssl_warned_domains)

    def _make_session(self) -> requests.Session:
        """Create a new :class:`~requests.Session` with a cleared cookie jar and User-Agent set.

        Returns:
            Configured :class:`~requests.Session` instance.
        """
        session = requests.Session()
        session.cookies.clear()
        session.headers.update({'User-Agent': self._user_agent})
        return session

    def request(self, url: str, *, method: str = 'HEAD') -> RequestResult:
        """Issue an HTTP request, following redirects manually and retrying on transient errors.

        If *method* is ``'HEAD'`` and the server returns 405, automatically
        retries with ``'GET'``.

        Parameters:
            url: The URL to request.
            method: HTTP method string (``'HEAD'`` or ``'GET'``); matched
                case-insensitively.

        Returns:
            A :class:`RequestResult` describing the outcome.
        """
        # Normalise once so 'head'/'get' take the same code paths as 'HEAD'/'GET'.
        method = method.upper()
        result = self._request_with_retry(url, method=method)
        if method == 'HEAD' and result.status_code == 405:
            # The server rejected HEAD; repeat the check with GET.
            result = self._request_with_retry(url, method='GET')
        return result

    @staticmethod
    def _error_result(url: str, error: str) -> RequestResult:
        """Build the result used for a request that failed without a response.

        Parameters:
            url: URL that was being requested.
            error: Human-readable failure description.

        Returns:
            :class:`RequestResult` with status code ``0`` and *error* set.
        """
        return RequestResult(
            final_url=url,
            status_code=0,
            headers={},
            body=None,
            content_type=None,
            error=error,
        )

    def _request_with_retry(self, url: str, *, method: str) -> RequestResult:
        """Attempt a request up to ``retries + 1`` times on transient failures.

        SSL errors and malformed-URL errors are never retried: the former are
        reported at once (with one warning per host), the latter are
        deterministic and would fail identically on every attempt.

        Parameters:
            url: Target URL.
            method: HTTP method string.

        Returns:
            :class:`RequestResult` from the final attempt.
        """
        last_result: RequestResult | None = None
        attempts = self._retries + 1
        for attempt in range(attempts):
            is_last_attempt = attempt >= attempts - 1
            try:
                result = self._do_request(url, method=method)
            except requests.exceptions.SSLError as exc:
                # Warn only once per host (netloc of the requested URL).
                domain = urlparse(url).netloc
                if domain not in self._ssl_warned_domains:
                    self._ssl_warned_domains.add(domain)
                    logger.warning('SSL error for %s: %s', url, exc)
                return self._error_result(url, str(exc))
            except (
                requests.exceptions.InvalidURL,
                requests.exceptions.InvalidSchema,
                requests.exceptions.MissingSchema,
            ) as exc:
                # A malformed or unsupported URL is not transient; retrying
                # would only add sleeps before the same failure.
                return self._error_result(url, str(exc))
            except requests.exceptions.RequestException as exc:
                last_result = self._error_result(url, str(exc))
                if is_last_attempt:
                    return last_result
                logger.warning(
                    'Transient error for %s (attempt %d): %s', url, attempt + 1, exc
                )
                # NOTE(review): the pause reuses the request timeout as its
                # duration — confirm this is the intended backoff.
                self._sleep(self._timeout)
                continue

            if self._is_transient(result.status_code) and not is_last_attempt:
                last_result = result
                logger.warning(
                    'Transient status %d for %s (attempt %d), retrying',
                    result.status_code,
                    url,
                    attempt + 1,
                )
                self._sleep(self._timeout)
                continue
            # Either a non-transient response, or a transient one on the last
            # attempt: hand it back as-is.
            return result

        # Only reached when no attempt ran at all (``retries`` below zero).
        return last_result or self._error_result(url, 'All retry attempts exhausted')

    def _is_transient(self, status_code: int) -> bool:
        """Return True if *status_code* indicates a transient failure.

        Parameters:
            status_code: HTTP status code integer.

        Returns:
            True for codes in :data:`_TRANSIENT_STATUSES`.
        """
        return status_code in _TRANSIENT_STATUSES

    def _do_request(self, url: str, method: str) -> RequestResult:
        """Perform a single HTTP request, manually following redirects up to the limit.

        Parameters:
            url: Starting URL.
            method: HTTP method string; matched case-insensitively.

        Returns:
            :class:`RequestResult` for the final response, or an error result
            (status code ``0``) when more than :data:`_MAX_REDIRECTS`
            redirects were encountered.

        Raises:
            :exc:`requests.exceptions.RequestException`: On network-level failures.
        """
        method = method.upper()
        redirect_chain: list[RedirectHop] = []
        current_url = url
        # One initial request plus at most _MAX_REDIRECTS follow-ups.
        for _ in range(_MAX_REDIRECTS + 1):
            # ``stream=True`` defers the body download; the ``with`` block
            # releases the connection on every exit path, including ``continue``.
            with self._session.request(
                method,
                current_url,
                allow_redirects=False,
                timeout=self._timeout,
                stream=True,
                verify=self._verify,
            ) as response:
                if response.is_redirect:
                    location = response.headers.get('Location', '')
                    redirect_chain.append(
                        RedirectHop(url=current_url, status_code=response.status_code)
                    )
                    # Location may be relative; resolve it against the current hop.
                    current_url = urljoin(current_url, location)
                    continue

                body: str | None = None
                bytes_downloaded = 0
                if method == 'GET':
                    # Only GET downloads the body; HEAD results keep body=None.
                    body = response.text
                    bytes_downloaded = len(response.content)
                return RequestResult(
                    final_url=current_url,
                    status_code=response.status_code,
                    headers=dict(response.headers),
                    body=body,
                    content_type=response.headers.get('Content-Type'),
                    redirect_chain=redirect_chain,
                    bytes_downloaded=bytes_downloaded,
                )

        # Every response in the loop was a redirect: give up and report the chain.
        return RequestResult(
            final_url=current_url,
            status_code=0,
            headers={},
            body=None,
            content_type=None,
            redirect_chain=redirect_chain,
            error=f'Too many redirects (>{_MAX_REDIRECTS}) starting from {url}',
        )