Source code for cladetime.cladetime

"""Class for clade time traveling."""

import tempfile
import warnings
from datetime import datetime, timezone
from pathlib import Path

import polars as pl
import structlog

from cladetime import Tree, sequence
from cladetime.clade import Clade
from cladetime.exceptions import (
    CladeTimeDataUnavailableError,
    CladeTimeDateWarning,
    CladeTimeInvalidURLError,
    CladeTimeSequenceWarning,
)
from cladetime.util.config import Config
from cladetime.util.reference import _get_clade_assignments, _get_date, _get_nextclade_dataset, _get_s3_object_url

logger = structlog.get_logger()



[docs]
class CladeTime:
    """Interface for Nextstrain SARS-CoV-2 genome sequences and clades.

    The CladeTime class is instantiated with two optional arguments that
    specify the point in time at which to access genome sequences/metadata
    as well as the reference tree used for clade assignment. CladeTime
    interacts with GenBank-based data provided by the Nextstrain project.

    Important
    ---------
    Historical data availability is constrained by Nextstrain's infrastructure:

    - sequence_as_of: Must be >= 2025-09-29 (Nextstrain S3 90 day retention)
    - tree_as_of: Must be >= 2024-10-09 (variant-nowcast-hub archive availability)

    These constraints reflect Nextstrain's October 2025 implementation of a
    90 day retention policy for S3 versioned objects. Dates outside these
    windows will raise CladeTimeDataUnavailableError. See GitHub issue #185
    for details and potential workarounds.

    Note: These limitations may change as Nextstrain's infrastructure evolves.

    Parameters
    ----------
    sequence_as_of : datetime.datetime | str | None
        Sets the versions of Nextstrain SARS-CoV-2 genome sequence and
        sequence metadata files that will be used by CladeTime
        properties and methods. Can be a datetime object or a
        string in YYYY-MM-DD format, both of which will be treated as
        UTC. The default value is the current UTC time. Dates passed
        as YYYY-MM-DD strings will be set to 11:59:59 PM UTC.
        Must be >= 2025-09-29.
    tree_as_of : datetime.datetime | str | None
        Sets the version of the Nextstrain reference tree that will be
        used by CladeTime. Can be a datetime object or a string in
        YYYY-MM-DD format, both of which will be treated as UTC.
        The default value is :any:`sequence_as_of<sequence_as_of>`.
        Dates passed as YYYY-MM-DD strings will be set to 11:59:59 PM UTC.
        Must be >= 2024-10-09.

    Attributes
    ----------
    url_ncov_metadata : str
        S3 URL to metadata from the Nextstrain pipeline run that
        generated the sequence clade assignments in
        :any:`url_sequence_metadata<url_sequence_metadata>`
    url_sequence : str
        S3 URL to the Nextstrain Sars-CoV-2 sequence file (zst-compressed
        .fasta) that was current at the date specified in
        :any:`sequence_as_of<sequence_as_of>`
    url_sequence_metadata : str
        S3 URL to the Nextstrain Sars-CoV-2 sequence metadata file
        (zst-compressed tsv) that was current at the date specified in
        :any:`sequence_as_of<sequence_as_of>`
    """

    def __init__(self, sequence_as_of=None, tree_as_of=None):
        """CladeTime constructor."""
        self._config = self._get_config()
        self.sequence_as_of = sequence_as_of
        self.tree_as_of = tree_as_of
        self._ncov_metadata = {}
        self._sequence_metadata = pl.LazyFrame()

        self.url_sequence = _get_s3_object_url(
            self._config.nextstrain_ncov_bucket, self._config.nextstrain_genome_sequence_key, self.sequence_as_of
        )[1]
        self.url_sequence_metadata = _get_s3_object_url(
            self._config.nextstrain_ncov_bucket, self._config.nextstrain_genome_metadata_key, self.sequence_as_of
        )[1]

        # Nextstrain began publishing ncov pipeline metadata starting on 2024-08-01
        if self.sequence_as_of >= self._config.nextstrain_min_ncov_metadata_date:
            try:
                self.url_ncov_metadata = _get_s3_object_url(
                    self._config.nextstrain_ncov_bucket, self._config.nextstrain_ncov_metadata_key, self.sequence_as_of
                )[1]
            except ValueError as e:
                # S3 doesn't have historical metadata - will use Hub fallback when fetching
                logger.warn(
                    "Nextstrain S3 metadata not available, will use Hub fallback",
                    date=self.sequence_as_of.strftime("%Y-%m-%d"),
                    error=str(e),
                )
                # Set to empty string so fallback will be triggered
                self.url_ncov_metadata = ""
        else:
            self.url_ncov_metadata = None

    @property
    def sequence_as_of(self) -> datetime:
        """
        datetime.datetime : The date and time (UTC) used to retrieve NextStrain sequences
        and sequence metadata. :any:`url_sequence<url_sequence>` and
        :any:`url_sequence_metadata<url_sequence_metadata>` link to
        Nextstrain files that were current as of this date.
        """
        return self._sequence_as_of

    @sequence_as_of.setter
    def sequence_as_of(self, date) -> None:
        min_sequence_date = self._config.nextstrain_min_seq_date
        date_warning = False
        utc_now = datetime.now(timezone.utc)

        try:
            sequence_as_of = _get_date(date)
        except ValueError:
            sequence_as_of = utc_now
            date_warning = True

        # Check if date is before data availability window - raise error
        if sequence_as_of < min_sequence_date:
            raise CladeTimeDataUnavailableError(
                f"\nSequence data is not available before {min_sequence_date.strftime('%Y-%m-%d')}. "
                f"Nextstrain S3 only retains up to 90 days of historical versions. "
                f"Requested date: {sequence_as_of.strftime('%Y-%m-%d')}. "
                f"\nNote: This limitation is due to Nextstrain's data retention policy, "
                f"which may change over time. See GitHub issue #185 for more details."
            )
        elif sequence_as_of > utc_now:
            sequence_as_of = utc_now
            date_warning = True

        if date_warning:
            msg = (
                "\nSequence as_of cannot be in the future, defaulting to "
                f"current date: {sequence_as_of.strftime('%Y-%m-%d')}"
            )
            warnings.warn(msg, category=CladeTimeDateWarning)

        self._sequence_as_of = sequence_as_of

    @property
    def tree_as_of(self) -> datetime:
        """
        datetime.datetime : The date and time (UTC) used to retrieve the NextStrain
        reference tree.
        """
        return self._tree_as_of

    @tree_as_of.setter
    def tree_as_of(self, date) -> None:
        min_tree_date = self._config.nextstrain_min_ncov_metadata_date
        date_warning = False

        if date is None:
            tree_as_of = self.sequence_as_of
        else:
            try:
                tree_as_of = _get_date(date)
            except ValueError:
                date_warning = True
                default_field = "sequence_as_of"
                tree_as_of = self.sequence_as_of

        utc_now = datetime.now(timezone.utc)

        # Check if date is before reference tree metadata availability - raise error
        if tree_as_of < min_tree_date:
            raise CladeTimeDataUnavailableError(
                f"\nReference tree metadata is not available before {min_tree_date.strftime('%Y-%m-%d')}. "
                f"Historical metadata is provided by variant-nowcast-hub archives starting from this date. "
                f"Requested date: {tree_as_of.strftime('%Y-%m-%d')}. "
                f"\nNote: This limitation is due to hub archive availability, which may expand over time. "
                f"See GitHub issue #185 for more details."
            )
        elif tree_as_of > utc_now:
            default_field = "current date"
            date_warning = True
            tree_as_of = utc_now

        if date_warning:
            msg = (
                "\nTree as_of cannot be in the future, defaulting to "
                f"{default_field}: {tree_as_of.strftime('%Y-%m-%d')}"
            )
            warnings.warn(msg, category=CladeTimeDateWarning)

        self._tree_as_of = tree_as_of

    @property
    def ncov_metadata(self):
        return self._ncov_metadata

    @ncov_metadata.getter
    def ncov_metadata(self) -> dict:
        """
        dict : Metadata for the reference tree that was used for SARS-CoV-2
        clade assignments as of :any:`tree_as_of<tree_as_of>`.
        This property will be empty for dates before 2024-08-01, when
        Nextstrain began publishing ncov pipeline metadata.
        """
        if self.url_ncov_metadata is not None:
            # Pass sequence_as_of date for Hub fallback support
            # Note: empty string "" is valid here - it triggers fallback in _get_ncov_metadata
            metadata = sequence._get_ncov_metadata(self.url_ncov_metadata, as_of_date=self.sequence_as_of)
            return metadata
        else:
            metadata = {}
        return metadata

    @property
    def sequence_metadata(self):
        return self._sequence_metadata

    @sequence_metadata.getter
    def sequence_metadata(self) -> pl.LazyFrame:
        """
        :external+polars:std:doc:`polars.LazyFrame<reference/lazyframe/index>` : A Polars LazyFrame that references
        :any:`url_sequence_metadata<url_sequence_metadata>`
        """
        if self.url_sequence_metadata:
            sequence_metadata = sequence.get_metadata(metadata_url=self.url_sequence_metadata)
            return sequence_metadata
        else:
            raise CladeTimeInvalidURLError("CladeTime is missing url_sequence_metadata")

    def __repr__(self):
        return f"CladeTime(sequence_as_of={self.sequence_as_of}, tree_as_of={self.tree_as_of})"

    def __str__(self):
        return f"Work with Nextstrain Sara-CoV-2 sequences as of {self.sequence_as_of} and Nextclade clade assignments as of {self.tree_as_of}"

    def _get_config(self) -> Config:
        """Return a config object."""
        config = Config()

        return config


[docs]
    def assign_clades(self, sequence_metadata: pl.LazyFrame, output_file: Path | str | None = None) -> Clade:
        """Assign clades to a specified set of sequences.

        For each sequence in a sequence file (.fasta), assign a Nextstrain
        clade using the Nextclade reference tree that corresponds to the
        tree_as_of date. The earliest available tree_as_of date is 2024-08-01,
        when Nextstrain began publishing the pipeline metadata that Cladetime
        uses to retrieve past reference trees.

        Parameters
        ----------
        sequence_metadata : polars.LazyFrame
            A Polars LazyFrame of the Nexstrain
            :external+ncov:doc:`sequence metadata<reference/metadata-fields>`
            to use for clade assignment.
        output_file : str | None
            The full path (including a .tsv filename) to where the clade
            assignment output file will be saved. The default value is
            <home_dir>/cladetime/clade_assignments.tsv.

        Returns
        -------
        :class:`cladetime.clade.Clade`
            A Clade object that contains detailed and summarized information
            about clades assigned to the sequences in sequence_metadata.

        Raises
        -------
        CladeTimeSequenceWarning
            If sequence_metadata is empty, the clade assignment process
            will be stopped.

        Example
        -------
        >>> import polars as pl
        >>>
        >>> from cladetime import CladeTime, sequence
        >>> ct = CladeTime(sequence_as_of="2024-11-15", tree_as_of="2024-09-01")
        >>>
        >>> filtered_metadata = sequence.filter_metadata(
        >>>     ct.sequence_metadata,
        >>>     collection_min_date = "2024-10-01",
        >>> )
        >>> clade_assignments = ct.assign_clades(filtered_metadata)
        >>>
        >>> clade_assignment_summary = clade_assignments.summary
        >>> clade_assignment_summary.select(
        >>>     ["location", "date", "clade_nextstrain", "count"])
        >>>     .sort("count", descending=True)
        >>>     .collect(stream=True).head()
        ┌──────────┬────────────┬──────────────────┬───────┐
        │ location ┆ date       ┆ clade_nextstrain ┆ count │
        │ ---      ┆ ---        ┆ ---              ┆ ---   │
        │ str      ┆ date       ┆ str              ┆ u32   │
        ╞══════════╪════════════╪══════════════════╪═══════╡
        │ NY       ┆ 2024-10-01 ┆ 24C              ┆ 15    │
        │ NY       ┆ 2024-10-15 ┆ 24C              ┆ 15    │
        │ NY       ┆ 2024-10-03 ┆ 24C              ┆ 14    │
        │ NY       ┆ 2024-10-14 ┆ 24C              ┆ 14    │
        │ NJ       ┆ 2024-10-16 ┆ 24C              ┆ 12    │
        └──────────┴────────────┴──────────────────┴───────┘
        """
        assignment_date = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M")
        if output_file is not None:
            output_file = Path(output_file)
        else:
            output_file = Path.home() / "cladetime" / "clade_assignments.tsv"

        logger.info(
            "Starting clade assignment pipeline", sequence_as_of=self.sequence_as_of, tree_as_of=self.tree_as_of
        )

        # drop any clade-related columns from sequence_metadata (if any exists, it will be replaced
        # by the results of the clade assignment)
        logger.info("Removing current sequence assignments from metadata")
        sequence_metadata = sequence_metadata.drop(
            [
                col
                for col in sequence_metadata.collect_schema().names()
                if col not in self._config.nextstrain_standard_metadata_fields
            ]
        )

        # from the sequence metadata, derive a set of sequence IDs (the "strain")
        # column for use when filtering sequences in the .fasta file
        logger.info("Collecting sequence IDs from metadata")
        ids: set = sequence.get_metadata_ids(sequence_metadata)
        sequence_count = len(ids)

        # if there are no sequences in the filtered metadata, stop the clade assignment
        if sequence_count == 0:
            msg = "Sequence_metadata is empty or missing 'strain' columns \n" "Stopping clade assignment...."
            warnings.warn(
                msg,
                category=CladeTimeSequenceWarning,
            )
            return Clade(meta={}, detail=pl.LazyFrame(), summary=pl.LazyFrame())
        else:
            logger.info("Sequence count complete", sequence_count=sequence_count)

        # if there are many sequences in the filtered metadata, warn that clade assignment will
        # take a long time and require a lot of resources
        if sequence_count > self._config.clade_assignment_warning_threshold:
            msg = (
                f"About to assign clades to {sequence_count} sequences. \n" 
                "The assignment process is resource intensive. \n"
                "Depending on the limitations of your machine, \n"
                "you may want to use a smaller subset of sequences."
            )
            warnings.warn(
                msg,
                category=CladeTimeSequenceWarning,
            )

        tree = Tree(self.tree_as_of, self.url_sequence)

        with tempfile.TemporaryDirectory() as tmpdir:
            filtered_sequences = sequence.filter(ids, self.url_sequence, Path(tmpdir))
            nextclade_dataset = _get_nextclade_dataset(
                tree.ncov_metadata.get("nextclade_version_num", ""),
                tree.ncov_metadata.get("nextclade_dataset_name", "").lower(),
                tree.ncov_metadata.get("nextclade_dataset_version", ""),
                Path(tmpdir),
            )
            logger.info(
                "Assigning clades",
                sequences_to_assign=len(ids),
                nextclade_dataset_version=tree.ncov_metadata.get("nextclade_dataset_version"),
            )
            assignments = _get_clade_assignments(
                tree.ncov_metadata.get("nextclade_version_num", ""), filtered_sequences, nextclade_dataset, output_file
            )
            assigned_clades_df = pl.read_csv(assignments, separator="\t", infer_schema_length=100000)
            # get a count of non-null clade_nextstrain values
            # (this is the number of sequences that were assigned to a clade)
            assigned_sequence_count = assigned_clades_df.select(pl.count("clade_nextstrain")).to_series().to_list()[0]

            logger.info(
                "Nextclade assignments done",
                sequences_to_assign=sequence_count,
                sequences_assigned=assigned_sequence_count,
                assignment_file=assignments,
                nextclade_dataset=tree.ncov_metadata.get("nextclade_dataset_version"),
            )

        # join the assigned clades with the original sequence metadata, create a summarized LazyFrame
        # of clade counts by location, date, and host, and return both (along with metadata) in a
        # Clade object
        assigned_clades = sequence_metadata.join(
            assigned_clades_df.lazy(), left_on="strain", right_on="seqName", how="left"
        )
        summarized_clades = sequence.summarize_clades(
            assigned_clades, group_by=["location", "date", "host", "clade_nextstrain", "country"]
        )

        metadata = {
            "sequences_to_assign": sequence_count,
            "sequences_assigned": assigned_sequence_count,
            "sequence_as_of": self.sequence_as_of,
            "tree_as_of": self.tree_as_of,
            "nextclade_dataset_version": tree.ncov_metadata.get("nextclade_dataset_version"),
            "nextclade_dataset_name": tree.ncov_metadata.get("nextclade_dataset_name"),
            "nextclade_version_num": tree.ncov_metadata.get("nextclade_version_num"),
            "assignment_as_of": assignment_date,
        }
        metadata_clades = Clade(meta=metadata, detail=assigned_clades, summary=summarized_clades)

        return metadata_clades