ThomasTheMaker committed on
Commit 4d7adb3 · verified · 1 Parent(s): 84cb578

Delete src
src/checkpointing/__init__.py DELETED
@@ -1,23 +0,0 @@
-"""
-Pico Checkpointing Package
-
-We subdivide the checkpointing into training, evaluation, and learning_dynamics. Training
-checkpoints store the model, optimizer, and learning rate scheduler. Evaluation checkpoints store
-the evaluation results on the defined metrics. Learning dynamics checkpoints store activations and
-gradients used for learning dynamics analysis.
-"""
-
-from .evaluation import save_evaluation_results
-from .learning_dynamics import (
-    compute_learning_dynamics_states,
-    save_learning_dynamics_states,
-)
-from .training import load_checkpoint, save_checkpoint
-
-__all__ = [
-    "compute_learning_dynamics_states",
-    "load_checkpoint",
-    "save_checkpoint",
-    "save_evaluation_results",
-    "save_learning_dynamics_states",
-]
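Note: for orientation, the sketch below shows how a training loop would presumably drive this public API. The surrounding objects (configs, fabric, model, optimizer, lr_scheduler, tokenizer, batch_dataset, eval_results) are assumed to come from the trainer and are not defined in this package; the sketch is illustrative, not part of the commit.

# Hypothetical checkpointing step inside a training loop.
from src.checkpointing import (
    compute_learning_dynamics_states,
    save_checkpoint,
    save_evaluation_results,
    save_learning_dynamics_states,
)


def checkpoint_everything(
    configs, step, fabric, model, optimizer, lr_scheduler, tokenizer, batch_dataset, eval_results
):
    # Training checkpoint: model, optimizer, and LR scheduler states.
    save_checkpoint(configs, step, fabric, model, optimizer, lr_scheduler, tokenizer)

    # Learning dynamics checkpoint: activations, weights, and gradients for analysis.
    states = compute_learning_dynamics_states(
        configs["checkpointing"], fabric, model, batch_dataset, compute_gradients=True
    )
    save_learning_dynamics_states(
        configs["checkpointing"], step, "train", fabric, states, batch_dataset, tokenizer
    )

    # Evaluation checkpoint: metric results gathered on rank 0.
    save_evaluation_results(configs["checkpointing"], step, fabric, eval_results)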
 
src/checkpointing/evaluation.py DELETED
@@ -1,68 +0,0 @@
-"""
-Utilities for checkpointing evaluation-related states (i.e. evaluation results, etc.)
-
-We save the evaluation results in a JSON file at the step-specific evaluation results directory.
-"""
-
-import json
-import os
-from typing import Any, Dict
-
-from huggingface_hub import upload_folder
-from lightning.fabric import Fabric
-from lightning.fabric.utilities.rank_zero import rank_zero_only
-
-from src.config import CheckpointingConfig
-from src.training.utils.io import use_backoff
-
-
-@rank_zero_only
-@use_backoff()
-def save_evaluation_results(
-    checkpointing_config: CheckpointingConfig,
-    checkpoint_step: int,
-    fabric: Fabric,
-    evaluation_results: Dict[str, Any],
-) -> None:
-    """Save evaluation results to disk and optionally to HuggingFace Hub.
-
-    The evaluation results are saved in the following directory structure:
-        {checkpointing_config.runs_dir}/
-        └── {checkpointing_config.run_name}/
-            └── {checkpointing_config.eval_results_dir}/
-                └── step_{checkpoint_step}.json
-
-    NOTE: this function is only called on rank 0 to avoid conflicts; assumes that the evaluation
-    results are gathered on rank 0.
-
-    Args:
-        checkpointing_config: Configuration object containing checkpoint settings
-        checkpoint_step: Current training checkpoint step (i.e. number of learning steps taken)
-        fabric: Lightning Fabric instance
-        evaluation_results: Dictionary containing evaluation metrics
-    """
-
-    run_dir = os.path.join(checkpointing_config.runs_dir, checkpointing_config.run_name)
-    eval_results_dir = os.path.join(
-        run_dir, checkpointing_config.evaluation.eval_results_dir
-    )
-
-    os.makedirs(eval_results_dir, exist_ok=True)
-
-    curr_eval_results_path = os.path.join(
-        eval_results_dir, f"step_{checkpoint_step}.json"
-    )
-
-    # save out as json
-    with open(curr_eval_results_path, "w") as f:
-        json.dump(evaluation_results, f)
-
-    if checkpointing_config.save_to_hf:
-        upload_folder(
-            folder_path=eval_results_dir,
-            path_in_repo=checkpointing_config.evaluation.eval_results_dir,
-            repo_id=checkpointing_config.hf_checkpoint.repo_id,
-            commit_message=f"Saving Evaluation Results -- Step {checkpoint_step}",
-            revision=checkpointing_config.run_name,
-            token=os.getenv("HF_TOKEN"),
-        )
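Note: with the default constants (runs_dir = "runs", eval_results_dir = "eval_results"), a call like the sketch below would presumably land at runs/{run_name}/eval_results/step_1000.json. The step number and metric value are illustrative only, and checkpointing_config and fabric are assumed to come from the trainer setup.

save_evaluation_results(
    checkpointing_config=checkpointing_config,
    checkpoint_step=1000,
    fabric=fabric,
    evaluation_results={"paloma": 42.7},  # metric name -> value (value is made up)
)
# Expected location on disk: runs/{run_name}/eval_results/step_1000.json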
 
src/checkpointing/learning_dynamics.py DELETED
@@ -1,424 +0,0 @@
-"""
-Utilities for checkpointing learning dynamics-related states (i.e. activations, weights, grads, etc.)
-
-We save the learning dynamics states in a subdirectory of the checkpointing directory.
-"""
-
-import os
-import re
-from typing import Dict, Optional
-
-import deepspeed
-import torch
-import torch.nn as nn
-import torch.optim as optim
-from datasets import Dataset
-from huggingface_hub import upload_folder
-from lightning.fabric import Fabric
-from lightning.fabric.strategies import DeepSpeedStrategy
-from lightning.fabric.utilities.rank_zero import rank_zero_only
-from torch.nn import functional as F
-from torch.utils.data import DataLoader
-from transformers import PreTrainedTokenizerBase
-
-from src.config import CheckpointingConfig
-from src.config.checkpointing_config import LearningDynamicsCheckpointingConfig
-from src.training.utils.initialization import initialize_model
-from src.training.utils.io import use_backoff
-
-
-# NOTE: DeepSpeed requires a dummy optimizer to be passed in to the setup function
-class DummyOptimizer(optim.Optimizer):
-    def __init__(self, params):
-        super().__init__(params, defaults={})
-
-
-class CheckpointStateExtractor:
-    """
-    Class to extract and save the states of a model at a given checkpoint step for learning
-    dynamics research.
-    """
-
-    def __init__(
-        self,
-        learning_dynamics_config: LearningDynamicsCheckpointingConfig,
-        fabric: Fabric,
-        model: nn.Module,
-    ):
-        self.learning_dynamics_config = learning_dynamics_config
-        self.fabric = fabric
-        self.model = model
-
-    def extract_states(self, dataloader, compute_gradients: bool = False):
-        """Extracts model states (activations, weights, and optionally gradients).
-
-        Given a dataloader, this function will perform a forward pass of the model on each batch,
-        and save the activations and weights at each layer. If compute_gradients is True, it will
-        also compute the gradients of the model parameters.
-
-        Args:
-            dataloader: The dataloader containing the dataset to extract states from.
-            compute_gradients: Whether to compute the gradients of the model parameters.
-
-        Returns:
-            A dictionary containing the activations, weights, and optionally gradients of the model.
-        """
-        checkpoint_activations = {}
-        checkpoint_weights = {}
-
-        # NOTE: to extract activations and weights, we need to setup forward hooks on the layers
-        # of the model that we are interested in. This is a good intro to forward hooks if you
-        # are not familiar: https://web.stanford.edu/~nanbhas/blog/forward-hooks-pytorch/
-        forward_hooks = self._setup_forward_hooks(
-            checkpoint_activations,
-            checkpoint_weights,
-        )
-
-        ########################################################
-        #
-        # Forward Pass: Extract activations and weights; and compute gradients
-        #
-        ########################################################
-
-        for sub_batch in dataloader:
-            _input_ids = torch.tensor(sub_batch["input_ids"], device=self.fabric.device)
-
-            if compute_gradients:
-                if "labels" in sub_batch:
-                    input_ids = _input_ids
-                    labels = torch.tensor(
-                        sub_batch["labels"], device=self.fabric.device
-                    )
-                else:
-                    input_ids = _input_ids[:, :-1]
-                    labels = _input_ids[:, 1:]
-            else:
-                input_ids = _input_ids
-                labels = None
-
-            if labels is None:
-                # we can throw away the outputs, we are only interested in the hidden states
-                with torch.no_grad():
-                    _ = self.model(input_ids)
-            else:
-                # NOTE: if we are computing gradients, calling backwards will compute the gradients
-                # of the model parameters.
-                outputs, _ = self.model(input_ids)
-                outputs = outputs.transpose(1, 2)
-                loss = F.cross_entropy(outputs, labels)
-                self.fabric.backward(loss, model=self.model)
-
-        # cleanup forward hooks
-        # NOTE this is not strictly necessary, since self.model is a deepcopy of the original model
-        # but it is good practice to remove the hooks after the forward pass is complete.
-        for hook in forward_hooks:
-            hook.remove()
-
-        ########################################################
-        #
-        # Extract gradients from the target tensors of the model
-        #
-        ########################################################
-
-        layer_suffixes = self.learning_dynamics_config.layer_suffixes
-        checkpoint_gradients = {}
-        if compute_gradients:
-            for name, param in self.model.named_parameters():
-                # only do this for the weight matrix of the layer_suffixes
-                if (
-                    any(layer_suffix in name for layer_suffix in layer_suffixes)
-                    and "weight" in name
-                ):
-                    if isinstance(self.fabric.strategy, DeepSpeedStrategy):
-                        _grad = deepspeed.utils.safe_get_full_grad(param)
-                    else:
-                        _grad = param.grad
-
-                    assert _grad is not None, f"Gradient is None for layer: {name}"
-                    name = re.sub(r"\.weight", "", name)
-                    checkpoint_gradients[name] = _grad.detach().cpu()
-
-            # zero out the gradients
-            self.model.zero_grad()
-
-        return checkpoint_activations, checkpoint_weights, checkpoint_gradients
-
-    ########################################################
-    #
-    # Setup forward hooks to save activations and weights at each layer
-    #
-    ########################################################
-
-    def _setup_forward_hooks(self, checkpoint_activations, checkpoint_weights):
-        """Setup forward hooks for the model to save activations and weights at each layer.
-
-        This function will setup forward hooks on the layers of the model that we are interested in.
-        The forward hooks will save the activations and weights at each layer whenever the forward pass
-        is performed.
-
-        Args:
-            checkpoint_activations: A dictionary to store the activations at each layer.
-            checkpoint_weights: A dictionary to store the weights at each layer.
-
-        Returns:
-            A list of forward hooks. We do this so that we can remove the hooks after the forward pass
-            is complete.
-        """
-
-        forward_hooks = []
-        layer_suffixes = self.learning_dynamics_config.layer_suffixes
-
-        for name, module in self.model.named_modules():
-            if any(layer_suffix in name for layer_suffix in layer_suffixes):
-                _forward_hook = module.register_forward_hook(
-                    self._get_forward_hook(
-                        name, checkpoint_activations, checkpoint_weights
-                    )
-                )
-                forward_hooks.append(_forward_hook)
-        return forward_hooks
-
-    def _get_forward_hook(
-        self, module_name, checkpoint_activations, checkpoint_weights
-    ):
-        """Get a forward hook for a given module.
-
-        This function is called by the _setup_forward_hooks function to setup a forward hook for a given
-        module. This functions is a closure that captures the module_name, checkpoint_activations, and
-        checkpoint_weights.
-
-        Args:
-            module_name: The name of the module to setup a forward hook for.
-            checkpoint_activations: A dictionary to store the activations at each layer.
-            checkpoint_weights: A dictionary to store the weights at each layer.
-
-        Returns:
-            A forward hook for the given module.
-        """
-
-        def _forward_hook(module, _, module_out):
-            sequence_idx = self.learning_dynamics_config.sequence_idx
-
-            local_activations = module_out[:, sequence_idx, :].detach()
-
-            # Gather activations from all processes using fabric
-            gathered_activations = self.fabric.all_gather(local_activations)
-
-            # Reshape from [num_processes, batch_size, hidden_dim] to [total_batch_size, hidden_dim]
-            # NOTE: transposing allows us to interleave the activations from each process so that
-            # they are in the correct order. (i.e. activation N is from data sample N)
-            gathered_activations = gathered_activations.transpose(0, 1).reshape(
-                -1, gathered_activations.shape[-1]
-            )
-
-            # check if there is already a key for the module name
-            if module_name not in checkpoint_activations:
-                # if there is no key, then we create a new key and store the hidden states
-                checkpoint_activations[module_name] = (
-                    gathered_activations.detach().cpu()
-                )
-
-                # extract the weight matrix just once
-                weight_matrix = module.weight.detach().cpu()
-                checkpoint_weights[module_name] = weight_matrix
-            else:
-                # if there is already a key, then we concatenate the new hidden states to the existing ones
-                checkpoint_activations[module_name] = torch.cat(
-                    (
-                        checkpoint_activations[module_name],
-                        gathered_activations.detach().cpu(),
-                    )
-                )
-
-        return _forward_hook
-
-
-def compute_learning_dynamics_states(
-    checkpointing_config: CheckpointingConfig,
-    fabric: Fabric,
-    model: nn.Module,
-    dataset: Dataset,
-    compute_gradients: bool = False,
-) -> Dict[str, torch.Tensor]:
-    """Computes the learning dynamics metrics for a given checkpoint step.
-
-    Uses the CheckpointStateExtractor to extract the activations, weights, and optionally gradients
-    of the model at a given checkpoint step.
-
-    Args:
-        checkpointing_config: The configuration object for checkpointing.
-        fabric: The Fabric instance for distributed training.
-        model: The model to extract states from.
-        dataset: The dataset to extract states from.
-        compute_gradients: Whether to compute the gradients of the model parameters.
-
-    Returns:
-        A dictionary containing the activations, weights, and optionally gradients of the model.
-    """
-
-    # NOTE: Synchronizing processes for fabric dataloader setup
-    fabric.barrier()
-    model.to("cpu")  # Offloading model to CPU
-
-    # Setting up Dataloader for learning dynamics
-    def _collate_fn(batch):
-        return {"input_ids": [entry["input_ids"] for entry in batch]}
-
-    batch_size = checkpointing_config.learning_dynamics.batch_size
-    sub_batch_size = batch_size // fabric.world_size
-
-    # NOTE: Make sure to set drop_last to False, otherwise the last batch will be dropped
-    # and we will not have a complete set of activations for the last sample. Also,
-    # we need to set shuffle to False, otherwise the activations will be shuffled across
-    # processes and we will not be able to interleave them correctly.
-    extractor_dataloader = DataLoader(
-        dataset,
-        batch_size=sub_batch_size,
-        shuffle=False,
-        collate_fn=_collate_fn,
-        drop_last=False,
-    )
-    extractor_dataloader = fabric.setup_dataloaders(
-        extractor_dataloader, use_distributed_sampler=True
-    )
-
-    # Create a new model instance with same parameters but zero gradients
-    _model = initialize_model(model.config)
-    _model.load_state_dict(model.state_dict())
-
-    if isinstance(fabric.strategy, DeepSpeedStrategy):
-        _model, _ = fabric.setup(_model, DummyOptimizer(_model.parameters()))
-    else:
-        _model = fabric.setup(_model)
-
-    _model.zero_grad()
-
-    # setup forward hooks for the model to save activations and weights at each layer
-    state_extractor = CheckpointStateExtractor(
-        checkpointing_config.learning_dynamics, fabric, _model
-    )
-
-    checkpoint_activations, checkpoint_weights, checkpoint_gradients = (
-        state_extractor.extract_states(
-            extractor_dataloader, compute_gradients=compute_gradients
-        )
-    )
-
-    del _model
-    torch.cuda.empty_cache()
-
-    # NOTE: Synchronizing processes for model setup
-    fabric.barrier()
-
-    model.to(fabric.device)
-
-    # NOTE: Trimming down the activations to match the dataset size;
-    # This is because the DataSampler might add extra samples to the dataset to make it evenly divisible
-    # by the number of processes. We need to remove these extra samples.
-    for layer_name, layer_activations in checkpoint_activations.items():
-        if len(layer_activations) > len(dataset):
-            checkpoint_activations[layer_name] = layer_activations[: len(dataset)]
-        elif len(layer_activations) < len(dataset):
-            raise ValueError(
-                f"Number of activations ({len(layer_activations)}) in layer {layer_name} does not match number of samples in dataset ({len(dataset)})"
-            )
-
-    return {
-        "activations": checkpoint_activations,
-        "weights": checkpoint_weights,
-        "gradients": checkpoint_gradients,
-    }
-
-
-@rank_zero_only
-@use_backoff()
-def save_learning_dynamics_states(
-    checkpointing_config: CheckpointingConfig,
-    checkpoint_step: int,
-    prefix: str,
-    fabric: Fabric,
-    learning_dynamics_states: Dict[str, torch.Tensor],
-    learning_dynamics_dataset: Optional[Dataset] = None,
-    tokenizer: Optional[PreTrainedTokenizerBase] = None,
-) -> None:
-    """Save the learning dynamics metrics to the checkpointing directory.
-
-    By default only the learning dynamics states are saved. If the learning dynamics dataset
-    is provided, it is also saved; if a tokenizer is provided, the dataset is also detokenized
-    (i.e. a new column with the text is added to the dataset).
-
-    The learning dynamics dataset is saved in the checkpointing directory as a HuggingFace
-    dataset.
-
-    Creates a versioned checkpoint directory with the following structure:
-
-    {checkpointing_config.runs_dir}/
-    └── {checkpointing_config.run_name}/
-        └── {checkpointing_config.checkpoints_dir}/
-            ├── step_{checkpoint_step}/
-            │   └── {checkpointing_config.learning_dynamics_dir}/  # Learning Dynamics files
-            │       ├── {prefix}_activations.pt
-            │       ├── {prefix}_weights.pt
-            │       └── {prefix}_gradients.pt
-            │       └── {prefix}_data/  # if learning_dynamics_dataset is provided
-            └── latest -> step_{checkpoint_step}/
-
-    NOTE: this function is only called on rank 0
-
-    Args:
-        checkpointing_config: The configuration object for checkpointing.
-        checkpoint_step: The checkpoint step at which the learning dynamics states were computed.
-        prefix: The prefix for the learning dynamics states.
-        fabric: The Fabric instance for distributed training.
-        learning_dynamics_states: The learning dynamics states to save.
-        learning_dynamics_dataset: The dataset containing learning dynamics data,
-            including input IDs that need to be decoded. (optional)
-        tokenizer: The tokenizer used to decode input IDs into text. (optional)
-    """
-
-    runs_dir = checkpointing_config.runs_dir
-    run_name = checkpointing_config.run_name
-    checkpoints_dir = checkpointing_config.checkpoints_dir
-    learning_dynamics_dir = checkpointing_config.learning_dynamics_dir
-
-    run_path = os.path.join(runs_dir, run_name)
-    root_checkpoint_path = os.path.join(run_path, checkpoints_dir)
-    checkpoint_path = os.path.join(root_checkpoint_path, f"step_{checkpoint_step}")
-    learning_dynamics_path = os.path.join(checkpoint_path, learning_dynamics_dir)
-    os.makedirs(learning_dynamics_path, exist_ok=True)
-
-    # save the learning dynamics states
-    for key, value in learning_dynamics_states.items():
-        if value is not None and len(value) > 0:
-            torch.save(
-                value, os.path.join(learning_dynamics_path, f"{prefix}_{key}.pt")
-            )
-
-    if learning_dynamics_dataset is not None:
-        if tokenizer is not None:
-            # go through dataset and decode the input ids; and add back into dataset
-            detokenized_dataset = {"input_ids": [], "text": []}
-
-            for entry in learning_dynamics_dataset:
-                input_ids = entry["input_ids"]
-                decoded_text = tokenizer.decode(input_ids, skip_special_tokens=True)
-                detokenized_dataset["input_ids"].append(input_ids)
-                detokenized_dataset["text"].append(decoded_text)
-
-            learning_dynamics_dataset = Dataset.from_dict(detokenized_dataset)
-
-        learning_dynamics_dataset_path = os.path.join(
-            learning_dynamics_path, f"{prefix}_data"
-        )
-        learning_dynamics_dataset.save_to_disk(learning_dynamics_dataset_path)
-
-    if checkpointing_config.save_to_hf:
-        # Upload the HF model
-        upload_folder(
-            folder_path=learning_dynamics_path,
-            path_in_repo=learning_dynamics_dir,
-            repo_id=checkpointing_config.hf_checkpoint.repo_id,
-            commit_message=f"Saving Learning Dynamics Data ({prefix}) -- Step {checkpoint_step}",
-            revision=checkpointing_config.run_name,
-            token=os.getenv("HF_TOKEN"),
-        )
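Note: the heart of CheckpointStateExtractor is PyTorch's register_forward_hook. The self-contained toy below (plain PyTorch, using a throwaway nn.Sequential rather than the Pico model) shows the same capture-then-remove pattern in isolation.

import torch
import torch.nn as nn

model = nn.Sequential(nn.Linear(8, 16), nn.ReLU(), nn.Linear(16, 8))
captured = {}


def make_hook(name):
    def hook(module, inputs, output):
        # Store activations on CPU, detached from the graph (as the extractor does).
        captured[name] = output.detach().cpu()
    return hook


handles = [
    module.register_forward_hook(make_hook(name))
    for name, module in model.named_modules()
    if isinstance(module, nn.Linear)  # stand-in for matching `layer_suffixes`
]

with torch.no_grad():
    model(torch.randn(4, 8))

for handle in handles:  # always remove hooks once the pass is done
    handle.remove()

print({name: tensor.shape for name, tensor in captured.items()})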
 
src/checkpointing/training.py DELETED
@@ -1,287 +0,0 @@
-"""
-Utilities for checkpointing training-related states (i.e. model, optimizer, lr_scheduler, etc.)
-
-We save both a HuggingFace model and a Fabric-specific checkpoint. The HuggingFace model is
-saved at the step-specific checkpoint directory, while the Fabric-specific checkpoint is saved
-in a subdirectory. This is done to facilitate easier versioning of the HuggingFace model files
-(which are what gets uploaded to the Hub).
-"""
-
-import os
-from dataclasses import asdict
-from typing import Any, Dict, Tuple, Union
-
-import yaml
-from huggingface_hub import upload_file, upload_folder
-from lightning.fabric import Fabric
-from lightning.fabric.strategies import DeepSpeedStrategy
-from lightning.fabric.utilities.seed import _collect_rng_states, _set_rng_states
-from torch import nn
-from torch.optim import Optimizer
-from torch.optim.lr_scheduler import LRScheduler
-from transformers import PreTrainedTokenizerBase
-
-from src.config import CheckpointingConfig
-from src.training.utils.io import use_backoff
-
-
-@use_backoff()
-def load_checkpoint(
-    checkpointing_config: CheckpointingConfig,
-    checkpoint_step: Union[str, int],
-    fabric: Fabric,
-    model: nn.Module,
-    optimizer: Optimizer,
-    lr_scheduler: LRScheduler,
-) -> Tuple[nn.Module, Optimizer, LRScheduler, int]:
-    """Load model checkpoint and associated states from a given step.
-
-    Args:
-        checkpointing_config: Configuration object containing checkpoint settings
-        checkpoint_step: The step at which to load the checkpoint
-        fabric: Lightning Fabric instance for distributed training support
-        model: The model instance to load weights into
-        optimizer: The optimizer instance to load states into
-        lr_scheduler: The learning rate scheduler to load states into
-
-    Returns:
-        Tuple containing the model, optimizer, lr_scheduler, and checkpoint step.
-        Returns None if no checkpoint is found.
-    """
-
-    if isinstance(checkpoint_step, int):
-        checkpoint_step = f"step_{checkpoint_step}"
-
-    checkpoint_path = os.path.join(
-        checkpointing_config.runs_dir,
-        checkpointing_config.run_name,
-        checkpointing_config.checkpoints_dir,
-        checkpoint_step,
-    )
-
-    if not os.path.exists(checkpoint_path):
-        return None
-
-    # Load from specified fabric checkpoint subdirectory
-    fabric_checkpoint_path = os.path.join(
-        checkpoint_path, checkpointing_config.fabric_checkpoint_dir
-    )
-
-    checkpoint_state = {
-        "_model": model,
-        "_optimizer": optimizer,
-        "_lr_scheduler": lr_scheduler,
-    }
-
-    if not isinstance(fabric.strategy, DeepSpeedStrategy):
-        fabric_load_file = os.path.join(
-            fabric_checkpoint_path, checkpointing_config.fabric_checkpoint_filename
-        )
-    else:
-        # Deepspeed checkpoints create sub-directory with distributed checkpoint file
-        fabric_load_file = fabric_checkpoint_path
-
-    extra_state = fabric.load(os.path.join(fabric_load_file), state=checkpoint_state)
-
-    # NOTE: extra_state will contain any additional states that were saved in the checkpoint
-    checkpoint_step = extra_state["_checkpoint_step"]
-
-    if "_rng_states" in extra_state:
-        _rng_states = extra_state["_rng_states"]
-        _set_rng_states(_rng_states)
-
-    return model, optimizer, lr_scheduler, checkpoint_step
-
-
-@use_backoff()
-def save_checkpoint(
-    configs: Dict[str, Any],
-    checkpoint_step: int,
-    fabric: Fabric,
-    model: nn.Module,
-    optimizer: Optimizer,
-    lr_scheduler: LRScheduler,
-    tokenizer: PreTrainedTokenizerBase,
-    upload_logs: bool = False,
-) -> None:
-    """Save training checkpoint and associated states to disk and optionally to HuggingFace Hub.
-
-    We save the following files:
-        - HuggingFace model files (config.json, pytorch_model.bin)
-        - Tokenizer files (vocab.json, merges.txt)
-        - Fabric-specific files - fabric state of the model, optimizer, and lr_scheduler. If using
-          DeepSpeed, the checkpoint is saved in a subdirectory, otherwise it is saved in a single file.
-
-    Note that the HuggingFace model files are saved at the step-specific checkpoint directory, while the
-    Fabric-specific files are saved in a subdirectory. This is done to facilitate easier
-    versioning of the HuggingFace model files (which are what gets uploaded to the Hub).
-
-    NOTE: Why do we save a HF model at all? We do this because it makes it easier to load the model
-    in a separate script for evaluation and to play nicely with the HuggingFace Hub.
-
-    Creates a versioned checkpoint directory with the following structure:
-
-    {checkpointing_config.runs_dir}/
-    └── {checkpointing_config.run_name}/
-        └── training_config.yaml  # Training config
-        └── {checkpointing_config.checkpoints_dir}/
-            ├── step_{checkpoint_step}/
-            │   ├── config.json  # HuggingFace model config
-            │   ├── model.safetensors  # HuggingFace model weights
-            │   ├── pico_{model_type}.py  # HuggingFace custom model class
-            │   ├── tokenizer.json  # Tokenizer vocab
-            │   ├── tokenizer_config.json  # Tokenizer config
-            │   └── {checkpointing_config.fabric_checkpoint_dir}/  # Fabric-specific files
-            │       └── checkpoint/  # Distributed model checkpoint files (if using DeepSpeed)
-            │       OR
-            │       └── checkpoint.pt  # Single checkpoint file (if using other strategies)
-            └── latest -> step_{checkpoint_step}/
-
-    Args:
-        configs: A dictionary containing the initialized configuration objects.
-        checkpoint_step: The current training checkpoint step (i.e. number of learning steps taken)
-        fabric: Lightning Fabric instance for distributed training support
-        model: The model instance to save
-        optimizer: The optimizer instance to save
-        lr_scheduler: The learning rate scheduler to save
-        tokenizer: The tokenizer to save
-        upload_logs: Whether to upload training logs to HF Hub (default: False)
-
-    """
-
-    checkpointing_config = configs["checkpointing"]
-
-    # Get the directories from the training config
-    runs_dir = checkpointing_config.runs_dir
-    checkpoints_dir = checkpointing_config.checkpoints_dir
-    fabric_checkpoint_dir = checkpointing_config.fabric_checkpoint_dir
-    logs_dir = checkpointing_config.logs_dir
-
-    run_path = os.path.join(runs_dir, checkpointing_config.run_name)
-    root_checkpoint_path = os.path.join(run_path, checkpoints_dir)
-    checkpoint_path = os.path.join(root_checkpoint_path, f"step_{checkpoint_step}")
-
-    # Create directories
-    os.makedirs(checkpoint_path, exist_ok=True)
-
-    ########################################################
-    #
-    # Save HuggingFace files
-    #
-    ########################################################
-
-    # NOTE: we convert the Pico model to a HuggingFace model before saving it. See `model.py`
-    # for more details.
-    if fabric.global_rank == 0:
-        hf_model = model.convert_to_hf_model()
-        hf_model.save_pretrained(checkpoint_path)
-        tokenizer.save_pretrained(checkpoint_path)
-
-    ########################################################
-    #
-    # Save Fabric-specific files
-    #
-    ########################################################
-
-    # Create fabric-specific subdirectory
-    fabric_checkpoint_path = os.path.join(checkpoint_path, fabric_checkpoint_dir)
-    os.makedirs(fabric_checkpoint_path, exist_ok=True)
-
-    # Save model states (use underscore to avoid conflicts with third-party libraries)
-    checkpoint_state = {
-        "_model": model,
-        "_optimizer": optimizer,
-        "_lr_scheduler": lr_scheduler,
-        "_checkpoint_step": checkpoint_step,
-    }
-
-    if not isinstance(fabric.strategy, DeepSpeedStrategy):
-        checkpoint_state["_rng_states"] = _collect_rng_states()
-        fabric_save_file = os.path.join(
-            fabric_checkpoint_path, checkpointing_config.fabric_checkpoint_filename
-        )
-    else:
-        # Deepspeed checkpoints create sub-directory with distributed checkpoint file
-        fabric_save_file = fabric_checkpoint_path
-
-    fabric.save(fabric_save_file, checkpoint_state)
-
-    if fabric.global_rank == 0:
-        # Save config in fabric directory
-        config_path = os.path.join(run_path, "training_config.yaml")
-        if not os.path.exists(config_path):
-            # Converting dataclasses to joined dicts and saving to file
-            _training_config = {}
-            for config_name, config in configs.items():
-                _training_config[config_name] = asdict(config)
-            with open(config_path, "w") as f:
-                yaml.dump(_training_config, f)
-
-        # Update latest symlink
-        latest_symlink_path = os.path.join(root_checkpoint_path, "latest")
-        if os.path.lexists(latest_symlink_path):
-            os.remove(latest_symlink_path)
-        os.symlink(
-            f"step_{checkpoint_step}", latest_symlink_path, target_is_directory=True
-        )
-
-    ########################################################
-    #
-    # Push to HuggingFace Hub (if configured)
-    #
-    ########################################################
-
-    if fabric.global_rank == 0:
-        # Push only on rank zero thread
-
-        if checkpointing_config.save_to_hf:
-            repo_id = checkpointing_config.hf_checkpoint.repo_id
-
-            # Upload the HF model
-            hf_model.push_to_hub(
-                repo_id=repo_id,
-                commit_message=f"Saving HF Model -- Step {checkpoint_step}",
-                revision=checkpointing_config.run_name,
-                token=os.getenv("HF_TOKEN"),
-            )
-
-            if checkpoint_step == 0:
-                # Uploading Tokenizer during first step since it never changes
-                tokenizer.push_to_hub(
-                    repo_id=repo_id,
-                    commit_message=f"Saving Tokenizer -- Step {checkpoint_step}",
-                    revision=checkpointing_config.run_name,
-                    token=os.getenv("HF_TOKEN"),
-                )
-
-                # Upload training config, also only in first step
-                upload_file(
-                    path_or_fileobj=config_path,
-                    path_in_repo="training_config.yaml",
-                    repo_id=repo_id,
-                    commit_message=f"Saving Training Config -- Step {checkpoint_step}",
-                    revision=checkpointing_config.run_name,
-                    token=os.getenv("HF_TOKEN"),
-                )
-
-            # Upload the fabric checkpoint directory
-            upload_folder(
-                folder_path=fabric_checkpoint_path,
-                path_in_repo=fabric_checkpoint_dir,
-                repo_id=repo_id,
-                commit_message=f"Saving Fabric Checkpoint -- Step {checkpoint_step}",
-                revision=checkpointing_config.run_name,
-                token=os.getenv("HF_TOKEN"),
-            )
-
-            # Upload logs if requested
-            if upload_logs:
-                logs_path = os.path.join(run_path, logs_dir)
-                upload_folder(
-                    folder_path=logs_path,
-                    path_in_repo=logs_dir,
-                    repo_id=repo_id,
-                    commit_message=f"Saving Logs -- Step {checkpoint_step}",
-                    revision=checkpointing_config.run_name,
-                    token=os.getenv("HF_TOKEN"),
-                )
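Note: a hedged sketch of how auto-resume (see TrainingCheckpointingConfig.auto_resume) could be built on load_checkpoint. Passing the "latest" symlink name as the string step is an assumption based on the directory layout above, and the surrounding trainer objects are assumed to exist.

# Hypothetical auto-resume logic (not part of this commit).
resumed = load_checkpoint(
    checkpointing_config, "latest", fabric, model, optimizer, lr_scheduler
)
if resumed is None:
    start_step = 0  # nothing on disk yet; train from scratch
else:
    model, optimizer, lr_scheduler, start_step = resumed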
 
src/config/__init__.py DELETED
@@ -1,31 +0,0 @@
-"""
-Pico Config Package
-
-The modules of this package are where you can specify the hyperparameters for the Pico model,
-the dataset, the training process, evaluation, etc.
-
-As with anything else in Pico, we've designed for the configuration setup to be as flexible
-as possible. By default the configs are implemented as vanilla dataclasses -- this makes it easy to
-switch to different config management systems if you want, like hydra.
-
-Some things to NOTE:
-- All hyperparameters are initialized with default values, which can be overridden.
-- The default vocab size is set to the size of the OLMo tokenizer.
-"""
-
-# For convenience, we export the config classes here
-from .checkpointing_config import CheckpointingConfig
-from .data_config import DataConfig
-from .evaluation_config import EvaluationConfig
-from .model_config import ModelConfig
-from .monitoring_config import MonitoringConfig
-from .training_config import TrainingConfig
-
-__all__ = [
-    "CheckpointingConfig",
-    "DataConfig",
-    "EvaluationConfig",
-    "ModelConfig",
-    "MonitoringConfig",
-    "TrainingConfig",
-]
 
src/config/_constants.py DELETED
@@ -1,18 +0,0 @@
-"""
-Constants used throughout the codebase
-"""
-
-# Basic Training Constants used throughout the codebase
-VOCAB_SIZE = 50304
-MAX_SEQ_LEN = 2048
-BATCH_SIZE = 1024
-GRADIENT_ACCUMULATION_STEPS = 128
-
-# Directories used to store training runs, checkpoints, logs, and evaluation results
-RUNS_DIR = "runs"
-CHECKPOINTS_DIR = "checkpoints"
-LOGS_DIR = "logs"
-FABRIC_CHECKPOINT_DIR = "fabric_state"
-FABRIC_CHECKPOINT_FILENAME = "checkpoint.pt"
-LEARNING_DYNAMICS_DIR = "learning_dynamics"
-EVAL_RESULTS_DIR = "eval_results"
 
src/config/checkpointing_config.py DELETED
@@ -1,97 +0,0 @@
-"""
-Checkpointing Config
-
-Specifies the hyperparameters for the checkpointing process; checkpointing is used to save
-the model and optimizer states, as well as the learning dynamics metrics.
-"""
-
-from dataclasses import dataclass, field
-from typing import List, Optional
-
-from ._constants import (
-    CHECKPOINTS_DIR,
-    EVAL_RESULTS_DIR,
-    FABRIC_CHECKPOINT_DIR,
-    FABRIC_CHECKPOINT_FILENAME,
-    LEARNING_DYNAMICS_DIR,
-    LOGS_DIR,
-    RUNS_DIR,
-)
-
-
-@dataclass
-class TrainingCheckpointingConfig:
-    # Automatically resume training from the most recent checkpoint
-    auto_resume: bool = True
-
-
-@dataclass
-class EvaluationCheckpointingConfig:
-    # Directory in which evaluation results are saved
-    eval_results_dir: str = EVAL_RESULTS_DIR
-
-
-@dataclass
-class LearningDynamicsCheckpointingConfig:
-    # Suffixes of the layers to compute learning dynamics for
-    layer_suffixes: List[str] = field(
-        default_factory=lambda: [
-            "attention.v_proj",
-            "attention.o_proj",
-            "swiglu.w_2",
-        ]
-    )
-
-    # Sequence index at which to extract hidden states; by default, we extract the hidden states
-    # at the last token of the sequence (-1)
-    sequence_idx: int = -1
-
-    # size of the sub-batch used for extracting learning dynamics states
-    batch_size: int = 8
-
-    # Path to evaluation dataset - used across learning dynamics checkpointing for consistency
-    # NOTE: set to None to disable extracting learning dynamics states for an eval_batch
-    # NOTE: this dataset should be small, ideally just a batch of additional data
-    eval_data: Optional[str] = "pico-lm/pretokenized-paloma-tinsy"
-
-
-@dataclass
-class HuggingFaceCheckpointingConfig:
-    # Should be in the format of <(username or organization name)>/<repo_name>, e.g. pico-lm/demo
-    repo_id: str = ""
-
-    # HuggingFace Collection Slug (specifies a tag for the run)
-    collection_slug: Optional[str] = None
-
-
-@dataclass
-class CheckpointingConfig:
-    # Assign a name to the run
-    run_name: Optional[str] = None
-
-    # Defining checkpointing directories
-    runs_dir: str = RUNS_DIR
-    checkpoints_dir: str = CHECKPOINTS_DIR
-    logs_dir: str = LOGS_DIR
-    fabric_checkpoint_dir: str = FABRIC_CHECKPOINT_DIR
-    fabric_checkpoint_filename: str = FABRIC_CHECKPOINT_FILENAME
-    learning_dynamics_dir: str = LEARNING_DYNAMICS_DIR
-
-    # How often to save checkpoints
-    save_every_n_steps: int = 1000
-
-    # Whether to save checkpoints to HuggingFace
-    save_to_hf: Optional[bool] = False
-    hf_checkpoint: HuggingFaceCheckpointingConfig = field(
-        default_factory=HuggingFaceCheckpointingConfig
-    )
-
-    training: TrainingCheckpointingConfig = field(
-        default_factory=TrainingCheckpointingConfig
-    )
-    evaluation: EvaluationCheckpointingConfig = field(
-        default_factory=EvaluationCheckpointingConfig
-    )
-    learning_dynamics: LearningDynamicsCheckpointingConfig = field(
-        default_factory=LearningDynamicsCheckpointingConfig
-    )
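Note: as stated in src/config/__init__.py, these are plain dataclasses, so overriding defaults is just keyword arguments. A small illustrative construction follows; the run name is made up and the repo id reuses the pico-lm/demo example from the comment above.

config = CheckpointingConfig(
    run_name="pico-demo-run",
    save_every_n_steps=500,
    save_to_hf=True,
    hf_checkpoint=HuggingFaceCheckpointingConfig(repo_id="pico-lm/demo"),
)
assert config.learning_dynamics.batch_size == 8  # nested defaults still apply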
 
src/config/data_config.py DELETED
@@ -1,36 +0,0 @@
-"""
-Data Config
-
-Specifies the hyperparameters for the dataset, dataloader, and tokenizer.
-"""
-
-from dataclasses import dataclass, field
-
-from ._constants import BATCH_SIZE, VOCAB_SIZE
-
-
-@dataclass
-class DatasetConfig:
-    # Defines the HuggingFace name of a dataset
-    name: str = "pico-lm/pretokenized-dolma"
-
-
-@dataclass
-class DataLoaderConfig:
-    # NOTE: You should only change these values jointly with the training config; so that the
-    # sub-batch size is consistent with the gradient accumulation steps
-    batch_size: int = BATCH_SIZE
-
-
-@dataclass
-class TokenizerConfig:
-    # Specify a tokenizer to use
-    name: str = "allenai/OLMo-7B-0724-hf"
-    vocab_size: int = VOCAB_SIZE
-
-
-@dataclass
-class DataConfig:
-    dataset: DatasetConfig = field(default_factory=DatasetConfig)
-    dataloader: DataLoaderConfig = field(default_factory=DataLoaderConfig)
-    tokenizer: TokenizerConfig = field(default_factory=TokenizerConfig)
 
src/config/evaluation_config.py DELETED
@@ -1,28 +0,0 @@
-"""
-Evaluation Config
-
-Specifies the hyperparameters for the evaluation process, i.e. what metrics to compute, etc.
-"""
-
-from dataclasses import dataclass, field
-from typing import List, Optional
-
-from src.config._constants import MAX_SEQ_LEN
-
-
-@dataclass
-class PalomaEvaluationConfig:
-    dataset_name: str = "pico-lm/pretokenized-paloma-tinsy"
-    dataset_split: str = "val"
-    max_length: int = MAX_SEQ_LEN
-    batch_size: int = 16
-
-
-@dataclass
-class EvaluationConfig:
-    # Evaluation metrics to compute: by default, we compute the perplexity of the model on the paloma dataset
-    metrics: Optional[List[str]] = field(default_factory=lambda: ["paloma"])
-
-    # NOTE: Add other evaluation configs here
-    # Each evaluation metric should have its own config
-    paloma: PalomaEvaluationConfig = field(default_factory=PalomaEvaluationConfig)
 
src/config/model_config.py DELETED
@@ -1,33 +0,0 @@
-"""
-Model Config
-
-Specifies the hyperparameters for the Pico model/model architecture.
-"""
-
-from dataclasses import dataclass
-from typing import Optional
-
-from ._constants import BATCH_SIZE, MAX_SEQ_LEN, VOCAB_SIZE
-
-
-@dataclass
-class ModelConfig:
-    model_type: str = "pico_decoder"
-
-    # Pico Decoder default hyperparameters
-
-    d_model: int = 768
-    n_layers: int = 12
-
-    vocab_size: int = VOCAB_SIZE
-    batch_size: int = BATCH_SIZE
-    max_seq_len: int = MAX_SEQ_LEN
-
-    attention_n_heads: int = 12
-    attention_n_kv_heads: Optional[int] = 4
-
-    activation_hidden_dim: int = 3072
-
-    norm_eps: float = 1e-6
-
-    position_emb_theta: float = 10000.0
 
src/config/monitoring_config.py DELETED
@@ -1,29 +0,0 @@
-"""
-Monitoring Config
-
-Specifies the monitoring process, e.g. how to log metrics and keep track of training progress.
-"""
-
-from dataclasses import dataclass, field
-
-
-@dataclass
-class LoggingConfig:
-    log_level: str = "INFO"
-    log_every_n_steps: int = 100
-
-
-@dataclass
-class WandbConfig:
-    # configure logging to Weights and Biases
-    project: str = ""
-    entity: str = ""
-
-
-@dataclass
-class MonitoringConfig:
-    logging: LoggingConfig = field(default_factory=LoggingConfig)
-
-    # Weights and Biases
-    save_to_wandb: bool = False
-    wandb: WandbConfig = field(default_factory=WandbConfig)
 
src/config/training_config.py DELETED
@@ -1,40 +0,0 @@
-"""
-Training Config
-
-Specifies the hyperparameters for the training process, i.e. the optimizer, learning rate, etc.
-"""
-
-from dataclasses import dataclass, field
-
-from ._constants import GRADIENT_ACCUMULATION_STEPS
-
-
-@dataclass
-class FabricConfig:
-    # Configure nodes/devices for parallelised training
-    num_nodes: int = 1
-    num_devices: int = 1
-    precision: str = "bf16-mixed"
-    # Hardware accelerator to use, can be cpu/cuda/mps etc.
-    accelerator: str = "cuda"
-
-
-@dataclass
-class OptimizationConfig:
-    # Optimizer
-    optimizer: str = "adamw"
-    lr: float = 3e-4
-
-    # Learning Rate Scheduler
-    lr_scheduler: str = "linear_with_warmup"
-    lr_warmup_steps: int = 2500
-
-    # Define number of gradient accumulation steps
-    gradient_accumulation_steps: int = GRADIENT_ACCUMULATION_STEPS
-
-
-@dataclass
-class TrainingConfig:
-    fabric: FabricConfig = field(default_factory=FabricConfig)
-    optimization: OptimizationConfig = field(default_factory=OptimizationConfig)
-    max_steps: int = 200_000
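Note: taken together with _constants.py (BATCH_SIZE = 1024, GRADIENT_ACCUMULATION_STEPS = 128) and the single-device FabricConfig defaults, the per-forward-pass sub-batch presumably works out to 1024 / 128 = 8 sequences; the relationship below is an assumption inferred from the comment in data_config.py, not spelled out in this commit.

BATCH_SIZE = 1024                  # global batch size (from _constants.py)
GRADIENT_ACCUMULATION_STEPS = 128  # from _constants.py
num_devices = 1                    # FabricConfig default

# Assumed relationship: each optimizer step accumulates GRADIENT_ACCUMULATION_STEPS
# forward passes per device, so the sub-batch per forward pass is:
sub_batch_size = BATCH_SIZE // (GRADIENT_ACCUMULATION_STEPS * num_devices)
print(sub_batch_size)  # 8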
 
src/evaluation/__init__.py DELETED
@@ -1,103 +0,0 @@
-"""
-Pico Evaluation Package
-
-This package implements the evaluation pipeline for the Pico language model. It provides
-functionality to evaluate model performance using various metrics and handles the complete
-evaluation workflow.
-
-We recommend that each evaluation metric should have its own config, and should be
-implemented as a module in the `evaluation/tasks` directory that exposes a `run_<metric_name>` function.
-
-NOTE: Out of the box we only support Paloma, but the structure is designed to be flexible and
-you are meant to add whatever metrics you want. One of the main reasons we store out
-the model in the HuggingFace format is so that its easy to use third-party evaluation
-libraries/frameworks.
-"""
-
-import os
-
-import torch
-from lightning.fabric import Fabric
-from torch import nn
-
-from src.config import CheckpointingConfig, EvaluationConfig
-
-from .tasks.paloma import run_paloma_evaluation
-
-
-def run_evaluation(
-    evaluation_config: EvaluationConfig,
-    checkpointing_config: CheckpointingConfig,
-    fabric: Fabric,
-    model: nn.Module,
-) -> None:
-    """Run model evaluation using specified metrics in `evaluation_config`.
-
-    This function orchestrates the complete evaluation pipeline by:
-        1. Resolving the model checkpoint path (either specified or latest) to load the model from;
-           during training, this is the path to the latest checkpoint in the run directory.
-        2. Iterating over each evaluation metric, and running the corresponding evaluation function.
-           NOTE: we suggest you follow the pattern of the Paloma evaluation function, and implement
-           your own evaluation function for each metric in the `evaluation/tasks` directory.
-        3. Aggregating results across all metrics in a dictionary, and returning it.
-
-    Args:
-        evaluation_config (EvaluationConfig): Configuration object containing:
-            - metrics (List[str]): Metrics to evaluate; each metric should have its
-              own config. Currently supported: ["paloma"];
-            - paloma (PalomaConfig): Configuration for Paloma evaluation
-                - max_length (int): Maximum sequence length
-                - limit_eval_examples (Optional[int]): Number of examples to evaluate
-        checkpointing_config (CheckpointingConfig): Configuration object containing:
-        fabric (Fabric): Lightning Fabric instance
-        model (nn.Module): Original model instance
-
-    Returns:
-        Dict[str, float]: Dictionary mapping metric names to their values
-            Example: {"paloma": 3.45}
-
-    Raises:
-        ValueError: If an unsupported evaluation metric is requested
-
-    Example:
-        results = run_evaluation(
-            EvaluationConfig(
-                run_name="experiment_1",
-                metrics=["paloma"],
-                paloma=PalomaConfig(max_length=2048, batch_size=16)
-            )
-        )

-    """
-
-    fabric.barrier()
-
-    model.to("cpu")  # Offloading model to CPU
-
-    evaluation_results = {}
-
-    # NOTE: Evaluation is only run on first processes to enable third-party evaluation libraries
-    # to determine how to handle distributed evaluation.
-    if fabric.global_rank == 0:
-        run_name = checkpointing_config.run_name
-        model_path = f"{os.getcwd()}/{checkpointing_config.runs_dir}/{run_name}/{checkpointing_config.checkpoints_dir}/latest"
-        os.makedirs(model_path, exist_ok=True)
-
-        for metric in evaluation_config.metrics:
-            # NOTE: add your own metrics here
-            if metric == "paloma":
-                evaluation_result = run_paloma_evaluation(
-                    model_path, evaluation_config.paloma
-                )
-            else:
-                raise ValueError(f"Metric {metric} not supported")
-
-            evaluation_results[metric] = evaluation_result
-
-    torch.cuda.empty_cache()
-
-    fabric.barrier()
-
-    model.to(fabric.device)
-
-    return evaluation_results
 
src/evaluation/tasks/paloma.py DELETED
@@ -1,52 +0,0 @@
-"""
-Paloma is a comprehensive evaluation benchmark for large language models (LLMs) that focuses
-on measuring perplexity across diverse text domains.
-
-To evaluate on Paloma, we use the huggingface evaluation framework.
-
-For more details, see: https://huggingface.co/datasets/allenai/paloma
-"""
-
-import evaluate
-from datasets import load_dataset
-from datasets.utils.logging import disable_progress_bar, enable_progress_bar
-
-from src.config.evaluation_config import PalomaEvaluationConfig
-
-
-def run_paloma_evaluation(
-    model_path: str,
-    paloma_config: PalomaEvaluationConfig,
-) -> None:
-    """Run Perplexity evaluation on the Paloma evaluation dataset.
-
-    We use the HuggingFace evaluate library to load in and compute the perplexity metric.
-
-    Args:
-        model_path (str): Path to the model checkpoint to be evaluated
-        paloma_config (PalomaEvaluationConfig): Configuration for Paloma evaluation
-    """
-
-    disable_progress_bar()
-
-    # load custom evaluation space, see https://huggingface.co/spaces/pico-lm/perplexity
-    perplexity = evaluate.load("pico-lm/perplexity")
-
-    dataset = load_dataset(
-        paloma_config.dataset_name, split=paloma_config.dataset_split
-    )["text"]
-
-    # compute perplexity score on Paloma dataset
-    perplexity_result = perplexity.compute(
-        model_id=model_path,
-        predictions=dataset,
-        add_start_token=False,
-        max_length=paloma_config.max_length,
-        batch_size=paloma_config.batch_size,
-        trust_remote_code=True,
-    )
-
-    mean_perplexity = perplexity_result["mean_perplexity"]
-
-    enable_progress_bar()
-    return mean_perplexity
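Note: an illustrative invocation; the checkpoint path is a made-up local path that mirrors the layout produced by save_checkpoint, and the batch size override is arbitrary.

from src.config.evaluation_config import PalomaEvaluationConfig

mean_perplexity = run_paloma_evaluation(
    model_path="runs/pico-demo-run/checkpoints/latest",
    paloma_config=PalomaEvaluationConfig(batch_size=8),
)
print(f"Paloma mean perplexity: {mean_perplexity:.2f}")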
 
src/model/__init__.py DELETED
@@ -1,12 +0,0 @@
-"""
-Model Package
-
-This Package contains Pico models (currently only the Pico Decoder). We plan to implement other
-architectures in the future.
-
-If you have other models you'd like to implement, we recommend you add modules to this package.
-"""
-
-from .pico_decoder import PicoDecoder
-
-__all__ = ["PicoDecoder"]
 
src/model/pico_decoder.py DELETED
@@ -1,911 +0,0 @@
1
- """
2
- Pico Decoder: A Lightweight Causal Transformer Language Model
3
-
4
- Pico Decoder uses a simple LLAMA-style transformer architecture, written for clarity and educational purposes.
5
-
6
- Everything is written with a modular design for easy modification and experimentation.
7
-
8
- Key features:
9
- - RMSNorm for layer normalization
10
- - Rotary Positional Embeddings (RoPE)
11
- - Multi-head attention with KV-cache support
12
- - SwiGLU activation function
13
- - Residual connections throughout
14
-
15
- - KV-cache for faster autoregressive generation
16
-
17
- References:
18
- - RoPE: https://arxiv.org/abs/2104.09864
19
- - SwiGLU: https://arxiv.org/abs/2002.05202
20
- - LLAMA: https://arxiv.org/abs/2302.13971
21
-
22
- Adapted from:
23
- - OLMO: https://github.com/allenai/OLMo
24
- - LLAMA: https://github.com/meta/llama
25
- """
26
-
27
- from dataclasses import asdict
28
- from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Union
29
-
30
- import torch
31
- import torch.nn as nn
32
- import torch.nn.functional as F
33
-
34
- # Handle PyTorch version compatibility for attention backend
35
- try:
36
- from torch.nn.attention import SDPBackend, sdpa_kernel
37
-
38
- HAS_TORCH_ATTENTION = True
39
- except ImportError:
40
- # Fallback for older PyTorch versions
41
- HAS_TORCH_ATTENTION = False
42
- SDPBackend = None
43
- sdpa_kernel = None
44
-
45
- from transformers import GenerationMixin, PretrainedConfig, PreTrainedModel
46
- from transformers.generation import GenerationConfig
47
- from transformers.modeling_outputs import CausalLMOutput, CausalLMOutputWithPast
48
-
49
- try:
50
- if TYPE_CHECKING:
51
- # We need to do this to avoid importing these when creating the HF-compatible models
52
- from src.config import ModelConfig
53
- except ImportError:
54
- pass
55
-
56
- ########################################################
57
- #
58
- # Layer Normalization
59
- #
60
- ########################################################
61
-
62
-
63
- class RMSNorm(torch.nn.Module):
64
- """Root Mean Square Layer Normalization.
65
-
66
- A variant of Layer Normalization that uses RMS statistics instead of mean/variance,
67
- resulting in improved stability and performance.
68
-
69
- Args:
70
- config (Union[ModelConfig, PicoHFConfig]): Configuration object containing normalization parameters
71
- - config.norm_eps: Small constant for numerical stability
72
- - config.d_model: Model dimension for the weight parameter
73
-
74
- References:
75
- https://arxiv.org/abs/1910.07467
76
- """
77
-
78
- def __init__(self, config: Union["ModelConfig", "PicoDecoderHFConfig"]):
79
- super().__init__()
80
- self.eps = config.norm_eps
81
- self.weight = nn.Parameter(torch.ones(config.d_model))
82
-
83
- def _norm(self, x: torch.Tensor) -> torch.Tensor:
84
- """
85
- Normalizes the input tensor by its RMS value.
86
- """
87
- return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
88
-
89
- def forward(self, x: torch.Tensor) -> torch.Tensor:
90
- """
91
- Applies RMS normalization to the input tensor and scales it by the weight parameter.
92
- """
93
- output = self._norm(x.float()).type_as(x)
94
- return output * self.weight
95
-
96
-
97
- ########################################################
98
- #
99
- # Positional Embedding
100
- #
101
- ########################################################
102
-
103
-
104
- class RoPE(nn.Module):
105
- """Rotary Positional Embeddings (RoPE).
106
-
107
- Implements position-dependent rotation of keys and queries in attention mechanism,
108
- allowing better modeling of relative positions in sequences. Uses complex number
109
- operations for efficient rotation.
110
-
111
- Args:
112
- config (Union[ModelConfig, PicoHFConfig]): Model configuration containing:
113
- - config.position_emb_theta: Base for frequency computation
114
- - config.d_model: Model dimension
115
- - config.attention_n_heads: Number of attention heads
116
- - config.max_seq_len: Maximum sequence length
117
-
118
- References:
119
- https://arxiv.org/abs/2104.09864
120
- """
121
-
122
- _freqs_cis_tensor: torch.Tensor | None = None
123
-
124
- def __init__(self, config: Union["ModelConfig", "PicoDecoderHFConfig"]):
125
- super().__init__()
126
-
127
- self.theta = config.position_emb_theta
128
- self.dim = config.d_model // config.attention_n_heads
129
-
130
- max_seq_len = config.max_seq_len
131
-
132
- # only gets set once, and then reused for all RoPE instances
133
- if RoPE._freqs_cis_tensor is None:
134
- RoPE._freqs_cis_tensor = self._setup_freqs_cis(
135
- max_seq_len, self.theta, self.dim
136
- )
137
-
138
- # register _freqs_cis buffer
139
- # can be easily recomputed so persistent=False
140
- self.register_buffer("_freqs_cis", self._freqs_cis_tensor, persistent=False)
141
-
142
- @classmethod
143
- def _setup_freqs_cis(cls, seq_len: int, theta: float, dim: int) -> torch.Tensor:
144
- """Setup Frequency Tensor for RoPE Embeddings
145
-
146
- Initializes the complex frequency tensor that is used to compute the RoPE embeddings.
147
-
148
- Note other implementations will use cos and sin directly, but using the complex
149
- number representation is (probably) more efficient:
150
-
151
- e^(theta * i * t) = cos(theta * t) + i * sin(theta * t) [Euler's formula]
152
- """
153
- _freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim))
154
- positions = torch.arange(seq_len)
155
- freqs = torch.outer(positions, _freqs)
156
- return torch.polar(torch.ones_like(freqs), freqs) # complex64
157
-
158
- def get_freqs_cis(
159
- self, input_shape: torch.Size, start_pos: int, end_pos: int
160
- ) -> torch.Tensor:
161
- """Reshape Frequency Tensor for RoPE Embeddings
162
-
163
- Makes the frequency tensor broadcastable with the input tensor.
164
- """
165
- _freqs_cis = self._freqs_cis[start_pos:end_pos]
166
- ndim = len(input_shape)
167
- assert 0 <= 1 < ndim
168
- assert _freqs_cis.shape == (input_shape[1], input_shape[-1])
169
-
170
- # TODO: Check whether this is correct (might be able to remove this)
171
- shape = [d if i == 1 or i == ndim - 1 else 1 for i, d in enumerate(input_shape)]
172
- return _freqs_cis.view(*shape)
173
-
174
- def forward(
175
- self,
176
- queries: torch.Tensor,
177
- keys: torch.Tensor,
178
- start_pos: int = 0,
179
- ) -> Tuple[torch.Tensor, torch.Tensor]:
180
- """Apply RoPE Embeddings to Queries and Keys
181
-
182
- Applies the rotary positional embeddings to the input tensors via complex num multiplication
183
-
184
- NOTE: The start_pos is used if we want to use the kv_cache in the attention mechanism.
185
- """
186
- queries_ = torch.view_as_complex(
187
- queries.float().reshape(*queries.shape[:-1], -1, 2)
188
- )
189
- keys_ = torch.view_as_complex(keys.float().reshape(*keys.shape[:-1], -1, 2))
190
-
191
- input_shape = (
192
- queries_.shape
193
- ) # same as keys: (batch_size, seq_len, n_heads, head_dim/2)
194
- freqs_start_pos = start_pos
195
- freqs_end_pos = freqs_start_pos + queries_.shape[1]
196
-
197
- freqs_cis = self.get_freqs_cis(input_shape, freqs_start_pos, freqs_end_pos)
198
-
199
- queries_rotated = torch.view_as_real(queries_ * freqs_cis).flatten(3)
200
- keys_rotated = torch.view_as_real(keys_ * freqs_cis).flatten(3)
201
- return queries_rotated.type_as(queries), keys_rotated.type_as(keys)
202
-
203
-
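A small sanity check of the complex-number rotation used above, applied to a single query vector at one position (all values are illustrative; the real module builds the full frequency table once and reuses it):

import torch

dim, theta, position = 8, 10000.0, 3
freqs = 1.0 / (theta ** (torch.arange(0, dim, 2).float() / dim))  # (dim / 2,)
angles = position * freqs
freqs_cis = torch.polar(torch.ones_like(angles), angles)          # e^(i * angle), complex64

q = torch.randn(dim)
q_complex = torch.view_as_complex(q.reshape(-1, 2))               # pair up adjacent dims
q_rotated = torch.view_as_real(q_complex * freqs_cis).flatten()   # back to real, shape (dim,)

# A pure rotation preserves the norm of every pair, hence of the whole vector.
print(torch.allclose(q.norm(), q_rotated.norm(), atol=1e-5))      # True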
204
- ########################################################
205
- #
206
- # Attention
207
- #
208
- ########################################################
209
-
210
-
211
- class Attention(nn.Module):
212
- """Multi-head Attention with Grouped Query Attention (GQA) support.
213
-
214
- Implements scaled dot-product attention and supports:
215
- - Grouped Query Attention (GQA)
216
- - Key-Value caching for efficient inference
217
- - RoPE integration
218
-
219
- Args:
220
- config (Union[ModelConfig, PretrainedConfig]): Configuration containing:
221
- - config.attention_n_heads: Number of attention heads
222
- - config.attention_n_kv_heads: Number of key/value heads
223
- - config.d_model: Model dimension
224
- - config.batch_size: Maximum batch size
225
- - config.max_seq_len: Maximum sequence length
226
-
227
- Shape:
228
- - Input: (batch_size, seq_len, d_model)
229
- - Output: (batch_size, seq_len, d_model)
230
- """
231
-
232
- def __init__(
233
- self,
234
- config: Union["ModelConfig", "PicoDecoderHFConfig"],
235
- ):
236
- super().__init__()
237
-
238
- self.n_heads = config.attention_n_heads
239
- self.n_kv_heads = config.attention_n_kv_heads
240
-
241
- self.batch_size = config.batch_size
242
- self.max_seq_len = config.max_seq_len
243
-
244
- d_model = config.d_model
245
- self.head_dim = d_model // self.n_heads
246
-
247
- self.n_rep = self.n_heads // self.n_kv_heads
248
-
249
- self.q_proj = nn.Linear(d_model, self.n_heads * self.head_dim, bias=False)
250
- self.k_proj = nn.Linear(d_model, self.n_kv_heads * self.head_dim, bias=False)
251
- self.v_proj = nn.Linear(d_model, self.n_kv_heads * self.head_dim, bias=False)
252
- self.o_proj = nn.Linear(self.n_heads * self.head_dim, d_model, bias=False)
253
-
254
- self.rope = RoPE(config)
255
-
256
- def forward(
257
- self,
258
- input: torch.Tensor,
259
- mask: Optional[torch.Tensor] = None,
260
- past_key_values: Optional[Tuple[torch.Tensor, ...]] = None,
261
- use_cache: bool = False,
262
- ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]:
263
- """Forward pass for the attention mechanism.
264
-
265
- Computes queries, keys, and values for the attention mechanism. Applies rotary positional
266
- embeddings to the queries and keys, and then computes attention scores and outputs.
267
-
268
- For an introduction to the attention mechanism, see:
269
- https://arxiv.org/abs/1706.03762
270
-
271
- A few things to note:
272
- - The past_key_values is used to implement the KV cache, which is used to speed up
273
- generation by caching the KV pairs from previous forward passes. This is useful when doing
274
- tasks that require generating multiple tokens conditioned on previous tokens (e.g. language
275
- modeling, text generation, etc.). The way the KV cache is implemented is that each layer has
276
- its own KV cache - this KV cache is implemented as a tuple.
277
- """
278
- bsz, seq_len, _ = input.shape
279
- _queries, _keys, _values = (
280
- self.q_proj(input),
281
- self.k_proj(input),
282
- self.v_proj(input),
283
- )
284
-
285
- # Reshaping for multi-head attention
286
- queries = _queries.view(bsz, seq_len, self.n_heads, self.head_dim)
287
- keys = _keys.view(bsz, seq_len, self.n_kv_heads, self.head_dim)
288
- values = _values.view(bsz, seq_len, self.n_kv_heads, self.head_dim)
289
-
290
- # The start position is used to apply the RoPE embeddings to only the new tokens
291
- # when using the kv_cache in the attention mechanism.
292
- # We want to start from the last position in the cache.
293
- start_pos = 0
294
- if past_key_values is not None and past_key_values[0] is not None:
295
- start_pos = past_key_values[0].shape[1]
296
-
297
- # apply rotary positional embeddings
298
- queries, keys = self.rope(queries, keys, start_pos)
299
-
300
- if (
301
- past_key_values is not None
302
- and past_key_values[0] is not None
303
- and past_key_values[1] is not None
304
- ):
305
- keys = torch.cat([past_key_values[0], keys], dim=1)
306
- values = torch.cat([past_key_values[1], values], dim=1)
307
-
308
- if use_cache:
309
- cached_keys = keys
310
- cached_values = values
311
- else:
312
- cached_keys = None
313
- cached_values = None
314
-
315
- queries = queries.transpose(1, 2)
316
- keys = keys.transpose(1, 2)
317
- values = values.transpose(1, 2)
318
-
319
- apply_gqa = self.n_rep > 1
320
- if apply_gqa and queries.device.type == "mps":
321
- # NOTE: MPS does not support GQA in the SDPA kernel, but we can repeat the keys and values
322
- # outside of the kernel to get the same effect.
323
- # See: https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html
324
- keys = keys.repeat_interleave(self.n_rep, dim=-3)
325
- values = values.repeat_interleave(self.n_rep, dim=-3)
326
- apply_gqa = False
327
-
328
- if HAS_TORCH_ATTENTION:
329
- backends = [SDPBackend.CUDNN_ATTENTION, SDPBackend.MATH]
330
- with sdpa_kernel(backends=backends):
331
- attn_output = F.scaled_dot_product_attention(
332
- queries.contiguous(),
333
- keys.contiguous(),
334
- values.contiguous(),
335
- attn_mask=mask.to(queries.dtype) if mask is not None else None,
336
- enable_gqa=apply_gqa,
337
- )
338
- else:
339
- # Fallback for older PyTorch versions - use default backend
340
- attn_output = F.scaled_dot_product_attention(
341
- queries.contiguous(),
342
- keys.contiguous(),
343
- values.contiguous(),
344
- attn_mask=mask.to(queries.dtype) if mask is not None else None,
345
- enable_gqa=apply_gqa,
346
- )
347
-
348
- attn_output = attn_output.transpose(1, 2).contiguous().view(bsz, seq_len, -1)
349
- output = self.o_proj(attn_output)
350
-
351
- return output, (cached_keys, cached_values)
352
-
353
-
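A toy sketch of the MPS fallback above: with more query heads than key/value heads, each KV head is shared by n_rep query heads, and repeat_interleave makes that sharing explicit (illustrative shapes only, not the real module):

import torch

n_heads, n_kv_heads, head_dim, seq_len, bsz = 8, 2, 16, 5, 1
n_rep = n_heads // n_kv_heads                            # 4 query heads per KV head

keys = torch.randn(bsz, n_kv_heads, seq_len, head_dim)   # layout after transpose(1, 2)
expanded = keys.repeat_interleave(n_rep, dim=-3)         # repeat along the head axis
print(expanded.shape)                                    # torch.Size([1, 8, 5, 16])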
354
- ########################################################
355
- #
356
- # SwiGLU (Combines MLP and Activation)
357
- #
358
- ########################################################
359
-
360
-
361
- class SwiGLU(nn.Module):
362
- """SwiGLU Activation Function with Linear Projections.
363
-
364
- Implements the SwiGLU activation function combined with linear transformations,
365
- serving as the feed-forward network in transformer blocks.
366
-
367
- Args:
368
- config (Union[ModelConfig, PicoDecoderHFConfig]): Configuration containing:
369
- - config.d_model: Model dimension
370
- - config.activation_hidden_dim: Hidden dimension (typically 4 * d_model)
371
-
372
- References:
373
- https://arxiv.org/abs/2002.05202
374
- """
375
-
376
- def __init__(self, config: Union["ModelConfig", "PicoDecoderHFConfig"]):
377
- super().__init__()
378
-
379
- model_dim = config.d_model
380
- act_hidden_dim = config.activation_hidden_dim # usually 4 * d_model
381
-
382
- self.w_0 = nn.Linear(model_dim, act_hidden_dim, bias=False)
383
- self.w_1 = nn.Linear(model_dim, act_hidden_dim, bias=False)
384
- self.w_2 = nn.Linear(act_hidden_dim, model_dim, bias=False)
385
-
386
- def forward(self, x: torch.Tensor) -> torch.Tensor:
387
- return self.w_2(F.silu(self.w_0(x)) * self.w_1(x))
388
-
389
-
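The same computation written out with plain tensors, as a functional sketch (toy dimensions; the real module uses bias-free nn.Linear layers):

import torch
import torch.nn.functional as F

d_model, hidden = 16, 64
w_0 = torch.randn(hidden, d_model)   # gate projection
w_1 = torch.randn(hidden, d_model)   # up projection
w_2 = torch.randn(d_model, hidden)   # down projection

x = torch.randn(2, d_model)
out = (F.silu(x @ w_0.T) * (x @ w_1.T)) @ w_2.T
print(out.shape)                     # torch.Size([2, 16])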
390
- ########################################################
391
- #
392
- # PicoDecoderBlock
393
- #
394
- ########################################################
395
-
396
-
397
- class PicoDecoderBlock(nn.Module):
398
- """Single Transformer Block with Attention and Feed-forward layers.
399
-
400
- Implements a standard transformer block with:
401
- - Multi-head attention with normalization and residual connection
402
- - SwiGLU feed-forward network with normalization and residual connection
403
-
404
- Args:
405
- config (Union[ModelConfig, PicoDecoderHFConfig]): Model configuration; either a dataclass or
406
- a HuggingFace PicoDecoderHFConfig
407
- """
408
-
409
- def __init__(
410
- self,
411
- config: Union["ModelConfig", "PicoDecoderHFConfig"],
412
- ):
413
- super().__init__()
414
-
415
- self.attention = Attention(config)
416
- self.swiglu = SwiGLU(config)
417
- self.attention_norm = RMSNorm(config)
418
- self.swiglu_norm = RMSNorm(config)
419
-
420
- def forward(
421
- self,
422
- input: torch.Tensor,
423
- mask: Optional[torch.Tensor] = None,
424
- past_key_values: Optional[Tuple[torch.Tensor]] = None,
425
- use_cache: bool = False,
426
- ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]:
427
- attention_output, cached_key_values = self.attention(
428
- self.attention_norm(input),
429
- mask=mask,
430
- past_key_values=past_key_values,
431
- use_cache=use_cache,
432
- )
433
- # NOTE: cached_key_values is None if use_cache is False
434
-
435
- h = input + attention_output
436
- out = h + self.swiglu(self.swiglu_norm(h))
437
- return out, cached_key_values
438
-
439
-
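The block above follows the standard pre-norm residual pattern. A toy sketch of just that wiring, with stand-in sub-layers (LayerNorm and Linear are placeholders for the RMSNorm, Attention, and SwiGLU modules defined in this file):

import torch

d_model = 16
norm1, norm2 = torch.nn.LayerNorm(d_model), torch.nn.LayerNorm(d_model)
attn = torch.nn.Linear(d_model, d_model)   # stand-in for the attention sub-layer
mlp = torch.nn.Linear(d_model, d_model)    # stand-in for the SwiGLU sub-layer

x = torch.randn(2, 5, d_model)
h = x + attn(norm1(x))                     # attention sub-layer + residual
out = h + mlp(norm2(h))                    # feed-forward sub-layer + residual
print(out.shape)                           # torch.Size([2, 5, 16])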
440
- ########################################################
441
- #
442
- # Pico Decoder (Causal Transformer Model)
443
- #
444
- ########################################################
445
-
446
-
447
- class PicoDecoder(nn.Module):
448
- """
449
- Pico Decoder: combines the embedding, causal decoder blocks, and output projection into a
450
- single autoregressive model.
451
-
452
- For more information on the model, see the classes for the modules that make up the model.
453
- """
454
-
455
- def __init__(
456
- self,
457
- model_config: Union["ModelConfig", "PicoDecoderHFConfig"],
458
- ):
459
- super().__init__()
460
- self.config = model_config
461
-
462
- self.embedding_proj = nn.Embedding(self.config.vocab_size, self.config.d_model)
463
- self.layers = nn.ModuleList(
464
- [PicoDecoderBlock(self.config) for _ in range(self.config.n_layers)]
465
- )
466
- self.output_norm = RMSNorm(self.config)
467
- self.de_embedding_proj = nn.Linear(
468
- self.config.d_model, self.config.vocab_size, bias=False
469
- )
470
-
471
- def convert_to_hf_model(self) -> "PicoDecoderHF":
472
- """Convert the Lightning model to a HuggingFace model."""
473
- # Create HF config without fabric-specific settings
474
- hf_config = PicoDecoderHFConfig.from_dataclass(self.config)
475
-
476
- # Create new HF model
477
- hf_model = PicoDecoderHF(hf_config)
478
-
479
- # Copy state dict, excluding fabric-specific keys
480
- hf_model.load_state_dict(self.state_dict(prefix="pico_decoder."))
481
-
482
- return hf_model
483
-
484
- def forward(
485
- self,
486
- input_ids: torch.Tensor,
487
- past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
488
- use_cache: bool = False,
489
- ) -> Tuple[torch.Tensor, Optional[Tuple[Tuple[torch.Tensor, torch.Tensor]]]]:
490
- """
491
- This is the forward pass for the entire Pico model. It boils down to:
492
- - Embedding the input ids
493
- - Creating a causal mask
494
- - Processing through the pico layers
495
- - Projecting the output to logits
496
-
497
- NOTE: One feature that might be confusing is the KV cache. The KV cache is used to speed up
498
- generation by caching the KV pairs from previous forward passes. This is useful when doing
499
- tasks that require generating multiple tokens conditioned on previous tokens (e.g. language
500
- modeling, text generation, etc.). The way the KV cache is implemented is that each layer has
501
- its own KV cache which is stored as a tuple. The whole model then stores a tuple of these
502
- KV caches (so a tuple of tuples).
503
- """
504
-
505
- seq_len = input_ids.shape[-1]
506
- h = self.embedding_proj(input_ids)
507
-
508
- # Calculate start position from past cached KV pairs. Remember that each layer has its
509
- # own KV Cache. So when we index past_key_values, we need to index into the KV pairs for the
510
- # correct layer and then for either the keys or values.
511
- start_pos = 0
512
- if (
513
- past_key_values is not None
514
- and past_key_values[0] is not None
515
- and past_key_values[0][0] is not None
516
- ):
517
- start_pos = past_key_values[0][0].shape[1]
518
-
519
- # Create causal mask for current sequence
520
- mask = None
521
- if seq_len > 1:
522
- mask = torch.full((seq_len, seq_len), float("-inf"))
523
- mask = torch.triu(mask, diagonal=1)
524
-
525
- # If using KV cache, extend mask to cover cached sequence length
526
- if past_key_values is not None:
527
- # Add zeros for cached tokens (we can attend to all of them)
528
- mask = torch.hstack([torch.zeros((seq_len, start_pos)), mask])
529
-
530
- mask = mask.to(h.device)
531
-
532
- # NOTE: If we are using the cache, we need to store the cached KV pairs for each layer
533
- # in a tuple. Each layer will have its own cached KV pair which we aggregate in a tuple.
534
- cached_key_values = () if use_cache else None
535
-
536
- # Process through transformer blocks
537
- for idx, layer in enumerate(self.layers):
538
- layer_past_key_values = None
539
- if past_key_values is not None:
540
- try:
541
- # Handle both tuple-based cache and HuggingFace cache objects
542
- if hasattr(past_key_values, "__getitem__") and idx < len(
543
- past_key_values
544
- ):
545
- layer_past_key_values = past_key_values[idx]
546
- except (KeyError, IndexError, TypeError):
547
- # If we can't access the cache properly, just skip it
548
- layer_past_key_values = None
549
-
550
- h, layer_cached_key_values = layer(
551
- h, mask=mask, past_key_values=layer_past_key_values, use_cache=use_cache
552
- )
553
-
554
- if use_cache:
555
- cached_key_values += (layer_cached_key_values,)
556
-
557
- # Final norm and projection
558
- h = self.output_norm(h)
559
- logits = self.de_embedding_proj(h).float()
560
-
561
- return logits, cached_key_values
562
-
563
-
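A small sketch of the mask construction in the forward pass above: the new tokens get a causal (upper-triangular -inf) mask, while every previously cached position stays fully visible (toy sizes):

import torch

seq_len, start_pos = 3, 2                        # 3 new tokens, 2 cached tokens
mask = torch.full((seq_len, seq_len), float("-inf"))
mask = torch.triu(mask, diagonal=1)              # causal part for the new tokens
mask = torch.hstack([torch.zeros((seq_len, start_pos)), mask])
print(mask)
# tensor([[0., 0., 0., -inf, -inf],
#         [0., 0., 0., 0., -inf],
#         [0., 0., 0., 0., 0.]])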
564
- ########################################################
565
- #
566
- # HuggingFace Wrapper for the Pico Decoder model.
567
- #
568
- ########################################################
569
-
570
-
571
- class PicoDecoderHFConfig(PretrainedConfig):
572
- """Config class for the Pico Decoder HuggingFace wrapper."""
573
-
574
- model_type = "pico_decoder"
575
-
576
- @classmethod
577
- def from_dict(cls, config_dict: Dict[str, Any], **kwargs) -> "PicoDecoderHFConfig":
578
- """
579
- Initialize config from a dictionary. Note that no kwargs are passed to the constructor --
580
- this is because with some kwargs special handling is required and can make this class
581
- brittle.
582
- """
583
- pico_config = cls(**config_dict)
584
-
585
- return_unused_kwargs = kwargs.pop("return_unused_kwargs", False)
586
- unused_kwargs = {
587
- key: value for key, value in kwargs.items() if not hasattr(pico_config, key)
588
- }
589
-
590
- if return_unused_kwargs:
591
- return pico_config, unused_kwargs
592
- return pico_config
593
-
594
- @classmethod
595
- def from_dataclass(cls, model_config: "ModelConfig"):
596
- """Initialise from our custom config dataclass."""
597
- return cls.from_dict(asdict(model_config))
598
-
599
-
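A hedged usage sketch of the config class above, assuming it is importable; the field values are hypothetical and only reuse attribute names referenced elsewhere in this file:

config_dict = {
    "d_model": 96,
    "n_layers": 12,
    "vocab_size": 50304,
    "max_seq_len": 2048,
    "batch_size": 1,
    "attention_n_heads": 12,
    "attention_n_kv_heads": 4,
    "activation_hidden_dim": 384,
    "position_emb_theta": 10000.0,
}
hf_config = PicoDecoderHFConfig.from_dict(config_dict)
print(hf_config.d_model)  # 96

# Unknown kwargs are surfaced instead of being silently attached to the config:
hf_config, unused = PicoDecoderHFConfig.from_dict(
    config_dict, return_unused_kwargs=True, some_made_up_flag=True
)
print(unused)  # {'some_made_up_flag': True}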
600
- class PicoDecoderHF(PreTrainedModel, GenerationMixin):
601
- """
602
- HuggingFace wrapper for the Pico model with generation support.
603
-
604
- Many evaluation frameworks require a model to be set up as a HuggingFace model, so we provide a simple
605
- wrapper that does just that. When we save checkpoints of the Pico model, we save both the normal
606
- Pico model as well as the model wrapped in this HuggingFace class.
607
-
608
- This also lets you do cool things like:
609
-
610
- `model = AutoModelForCausalLM.from_pretrained("path/to/checkpoint")`
611
- """
612
-
613
- config_class = PicoDecoderHFConfig
614
- _no_split_modules = ["PicoBlock", "Attention", "SwiGLU", "RMSNorm"]
615
- main_input_name = "input_ids"
616
-
617
- def __init__(self, config: PicoDecoderHFConfig):
618
- super().__init__(config)
619
- self.pico_decoder = PicoDecoder(config)
620
- # Initialize generation config with defaults
621
- self.generation_config = GenerationConfig()
622
- # Set some reasonable defaults for the model
623
- if hasattr(config, "max_position_embeddings"):
624
- self.generation_config.max_length = config.max_position_embeddings
625
- if hasattr(config, "vocab_size"):
626
- self.generation_config.vocab_size = config.vocab_size
627
-
628
- def forward(
629
- self,
630
- input_ids: torch.Tensor,
631
- past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
632
- use_cache: bool = False,
633
- **kwargs,
634
- ) -> Union[CausalLMOutput, CausalLMOutputWithPast]:
635
- """HuggingFace forward pass wrapper.
636
-
637
- Forward pass for the HuggingFace version of the Pico model. Basic wrapper around the
638
- Pico model's forward pass, and returns the output as a HuggingFace CausalLMOutput.
639
- """
640
- logits, past_key_values = self.pico_decoder(
641
- input_ids, past_key_values, use_cache
642
- )
643
- if use_cache:
644
- return CausalLMOutputWithPast(
645
- logits=logits,
646
- past_key_values=past_key_values,
647
- )
648
- else:
649
- return CausalLMOutput(
650
- logits=logits,
651
- )
652
-
653
- def prepare_inputs_for_generation(
654
- self,
655
- input_ids: torch.LongTensor,
656
- past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
657
- attention_mask: Optional[torch.LongTensor] = None,
658
- **kwargs,
659
- ) -> Dict[str, Any]:
660
- """
661
- Prepare inputs for generation.
662
-
663
- Args:
664
- input_ids: Input token IDs
665
- past_key_values: Cached key-value pairs from previous forward passes
666
- attention_mask: Attention mask for the input
667
- **kwargs: Additional arguments
668
-
669
- Returns:
670
- Dictionary containing prepared inputs
671
- """
672
- # If we have past_key_values, we only need the last token
673
- if past_key_values is not None:
674
- input_ids = input_ids[:, -1:]
675
-
676
- return {
677
- "input_ids": input_ids,
678
- "past_key_values": past_key_values,
679
- "use_cache": True,
680
- }
681
-
682
- def get_input_embeddings(self):
683
- """Get the input embeddings layer."""
684
- return self.pico_decoder.embedding_proj
685
-
686
- def set_input_embeddings(self, value):
687
- """Set the input embeddings layer."""
688
- self.pico_decoder.embedding_proj = value
689
-
690
- def get_output_embeddings(self):
691
- """Get the output embeddings layer."""
692
- return self.pico_decoder.de_embedding_proj
693
-
694
- def set_output_embeddings(self, value):
695
- """Set the output embeddings layer."""
696
- self.pico_decoder.de_embedding_proj = value
697
-
698
- def get_lm_head(self):
699
- """Get the language model head."""
700
- return self.pico_decoder.de_embedding_proj
701
-
702
- def can_generate(self) -> bool:
703
- """Check if the model can generate text."""
704
- return True
705
-
706
- @property
707
- def is_encoder_decoder(self) -> bool:
708
- """Check if the model is an encoder-decoder model."""
709
- return False
710
-
711
- @property
712
- def can_use_cache(self) -> bool:
713
- """Check if the model can use KV cache."""
714
- return True
715
-
716
- def resize_token_embeddings(
717
- self, new_num_tokens: Optional[int] = None
718
- ) -> torch.nn.Embedding:
719
- """Resize token embeddings."""
720
- old_embeddings = self.get_input_embeddings()
721
- if new_num_tokens is None:
722
- new_num_tokens = old_embeddings.num_embeddings
723
-
724
- new_embeddings = torch.nn.Embedding(
725
- new_num_tokens, old_embeddings.embedding_dim
726
- )
727
- new_embeddings.weight.data[: old_embeddings.num_embeddings] = (
728
- old_embeddings.weight.data
729
- )
730
-
731
- self.pico_decoder.embedding_proj = new_embeddings
732
- self.pico_decoder.de_embedding_proj = torch.nn.Linear(
733
- old_embeddings.embedding_dim, new_num_tokens, bias=False
734
- )
735
-
736
- return new_embeddings
737
-
738
-
739
- # Register for auto classes
740
- PicoDecoderHFConfig.register_for_auto_class()
741
- PicoDecoderHF.register_for_auto_class("AutoModel")
742
- PicoDecoderHF.register_for_auto_class("AutoModelForCausalLM")
743
-
744
-
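With the auto-class registrations above, a saved checkpoint can be loaded through the standard HuggingFace entry points. A hedged usage sketch (the checkpoint path is a placeholder):

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

checkpoint = "path/to/checkpoint"  # placeholder for a saved run
model = AutoModelForCausalLM.from_pretrained(checkpoint, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

input_ids = tokenizer("Once upon a time", return_tensors="pt").input_ids
with torch.no_grad():
    output_ids = model.generate(input_ids, max_new_tokens=20, do_sample=False)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))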
745
- ########################################################
746
- #
747
- # New PicoDecoderForCausalLM class for generation support
748
- #
749
- ########################################################
750
-
751
-
752
- class PicoDecoderForCausalLM(PreTrainedModel, GenerationMixin):
753
- """
754
- PicoDecoderForCausalLM: A HuggingFace-compatible model that properly supports generation.
755
-
756
- This class is designed to work with existing checkpoints and provides full generation support.
757
- It inherits from the right base classes that HuggingFace expects for text generation.
758
- """
759
-
760
- config_class = PicoDecoderHFConfig
761
- _no_split_modules = ["PicoBlock", "Attention", "SwiGLU", "RMSNorm"]
762
- main_input_name = "input_ids"
763
-
764
- def __init__(self, config: PicoDecoderHFConfig):
765
- super().__init__(config)
766
- self.pico_decoder = PicoDecoder(config)
767
- # Initialize generation config with defaults
768
- self.generation_config = GenerationConfig()
769
- # Set some reasonable defaults for the model
770
- if hasattr(config, "max_position_embeddings"):
771
- self.generation_config.max_length = config.max_position_embeddings
772
- if hasattr(config, "vocab_size"):
773
- self.generation_config.vocab_size = config.vocab_size
774
-
775
- def forward(
776
- self,
777
- input_ids: torch.Tensor,
778
- past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
779
- use_cache: bool = False,
780
- **kwargs,
781
- ) -> Union[CausalLMOutput, CausalLMOutputWithPast]:
782
- """Forward pass for text generation."""
783
- logits, past_key_values = self.pico_decoder(
784
- input_ids, past_key_values, use_cache
785
- )
786
- if use_cache:
787
- return CausalLMOutputWithPast(
788
- logits=logits,
789
- past_key_values=past_key_values,
790
- )
791
- else:
792
- return CausalLMOutput(
793
- logits=logits,
794
- )
795
-
796
- def prepare_inputs_for_generation(
797
- self,
798
- input_ids: torch.LongTensor,
799
- past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
800
- attention_mask: Optional[torch.LongTensor] = None,
801
- **kwargs,
802
- ) -> Dict[str, Any]:
803
- """Prepare inputs for generation."""
804
- # If we have past_key_values, we only need the last token
805
- if past_key_values is not None:
806
- input_ids = input_ids[:, -1:]
807
-
808
- return {
809
- "input_ids": input_ids,
810
- "past_key_values": past_key_values,
811
- "use_cache": True,
812
- }
813
-
814
- def get_input_embeddings(self):
815
- """Get the input embeddings layer."""
816
- return self.pico_decoder.embedding_proj
817
-
818
- def set_input_embeddings(self, value):
819
- """Set the input embeddings layer."""
820
- self.pico_decoder.embedding_proj = value
821
-
822
- def get_output_embeddings(self):
823
- """Get the output embeddings layer."""
824
- return self.pico_decoder.de_embedding_proj
825
-
826
- def set_output_embeddings(self, value):
827
- """Set the output embeddings layer."""
828
- self.pico_decoder.de_embedding_proj = value
829
-
830
- def get_lm_head(self):
831
- """Get the language model head."""
832
- return self.pico_decoder.de_embedding_proj
833
-
834
- def can_generate(self) -> bool:
835
- """Check if the model can generate text."""
836
- return True
837
-
838
- @property
839
- def is_encoder_decoder(self) -> bool:
840
- """Check if the model is an encoder-decoder model."""
841
- return False
842
-
843
- @property
844
- def can_use_cache(self) -> bool:
845
- """Check if the model can use KV cache."""
846
- return True
847
-
848
- def resize_token_embeddings(
849
- self, new_num_tokens: Optional[int] = None
850
- ) -> torch.nn.Embedding:
851
- """Resize token embeddings."""
852
- old_embeddings = self.get_input_embeddings()
853
- if new_num_tokens is None:
854
- new_num_tokens = old_embeddings.num_embeddings
855
-
856
- new_embeddings = torch.nn.Embedding(
857
- new_num_tokens, old_embeddings.embedding_dim
858
- )
859
- new_embeddings.weight.data[: old_embeddings.num_embeddings] = (
860
- old_embeddings.weight.data
861
- )
862
-
863
- self.pico_decoder.embedding_proj = new_embeddings
864
- self.pico_decoder.de_embedding_proj = torch.nn.Linear(
865
- old_embeddings.embedding_dim, new_num_tokens, bias=False
866
- )
867
-
868
- return new_embeddings
869
-
870
- @classmethod
871
- def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
872
- """
873
- Load a pretrained model from a checkpoint.
874
-
875
- This method handles loading from both the old PicoDecoderHF format and the new format.
876
- """
877
- # First try to load with the new class
878
- try:
879
- return super().from_pretrained(
880
- pretrained_model_name_or_path, *model_args, **kwargs
881
- )
882
- except Exception as e:
883
- print(f"Failed to load with new class: {e}")
884
- print("Attempting to load with legacy class and convert...")
885
-
886
- # Try to load with the old class and convert
887
- try:
888
- from transformers import AutoModel
889
-
890
- old_model = AutoModel.from_pretrained(
891
- pretrained_model_name_or_path,
892
- trust_remote_code=True,
893
- *model_args,
894
- **kwargs,
895
- )
896
-
897
- # Create new model instance
898
- new_model = cls(old_model.config)
899
-
900
- # Copy state dict
901
- new_model.load_state_dict(old_model.state_dict(), strict=False)
902
-
903
- return new_model
904
-
905
- except Exception as e2:
906
- print(f"Failed to convert from legacy format: {e2}")
907
- raise e
908
-
909
-
910
- # Register the new class
911
- PicoDecoderForCausalLM.register_for_auto_class("AutoModelForCausalLM")
 
src/training/trainer.py DELETED
@@ -1,753 +0,0 @@
1
- """
2
- Pico Language Model Trainer
3
-
4
- This Trainer implements a minimalistic end-to-end training pipeline for the Pico language model, with
5
- distributed training support via Lightning Fabric. It provides a modular and configurable training
6
- pipeline with the following features:
7
-
8
- - Configuration Management: YAML-based configuration for all aspects of training
9
- - Distributed Training: Multi-GPU support via Lightning Fabric
10
- - Checkpointing: Regular model saving and training state recovery
11
- - Evaluation: Periodic model evaluation on validation datasets
12
- - Logging: Comprehensive metric tracking and experiment monitoring
13
- - Optimization: Support for gradient accumulation, clipping, and LR scheduling
14
- """
15
-
16
- import logging
17
- import os
18
- import platform
19
- from typing import Any, Dict
20
-
21
- import lightning as L
22
- import psutil
23
- import torch
24
- import torch.nn.functional as F
25
- import yaml
26
- from datasets import Dataset, load_dataset
27
- from lightning.fabric.utilities.rank_zero import rank_zero_only
28
-
29
- from src.checkpointing import (
30
- compute_learning_dynamics_states,
31
- load_checkpoint,
32
- save_checkpoint,
33
- save_evaluation_results,
34
- save_learning_dynamics_states,
35
- )
36
- from src.evaluation import run_evaluation
37
- from src.training.utils import (
38
- initialize_configuration,
39
- initialize_dataloader,
40
- initialize_dataset,
41
- initialize_fabric,
42
- initialize_hf_checkpointing,
43
- initialize_logging,
44
- initialize_lr_scheduler,
45
- initialize_model,
46
- initialize_optimizer,
47
- initialize_run_dir,
48
- initialize_tokenizer,
49
- initialize_wandb,
50
- )
51
- from src.training.utils.logging import pretty_print_yaml_config
52
-
53
-
54
- class Trainer:
55
- def __init__(self, config_path: str):
56
- """
57
- Initializes the Trainer class. This Trainer class implements a `train` method, which is the
58
- main entry point for training the Pico model. Before calling `train`, the Trainer class
59
- initializes the following:
60
-
61
- - Configuration loading and validation
62
- - Model, optimizer, and dataset setup
63
- - Logging and experiment tracking setup
64
- - Checkpoint management
65
-
66
- Args:
67
- config_path (str): Path to the YAML configuration file containing any overrides.
68
- """
69
-
70
- ########################################################
71
- #
72
- # Basic Initialization of Configs, Fabric, Model, Optimizer, etc.
73
- #
74
- ########################################################
75
-
76
- # Setup Config
77
- self.configs = initialize_configuration(config_path)
78
-
79
- # Setup Run Directory (i.e. where we store checkpoints, logs, etc.)
80
- initialize_run_dir(checkpointing_config=self.configs["checkpointing"])
81
-
82
- # Setup Logger
83
- if self.configs["monitoring"].save_to_wandb:
84
- wandb_logger = initialize_wandb(
85
- monitoring_config=self.configs["monitoring"],
86
- checkpointing_config=self.configs["checkpointing"],
87
- )
88
- else:
89
- wandb_logger = None
90
-
91
- # Setup Fabric
92
- self.fabric = initialize_fabric(
93
- training_config=self.configs["training"],
94
- wandb_logger=wandb_logger,
95
- )
96
- L.seed_everything(42, verbose=False)
97
-
98
- # Optimize for Tensor Cores on RTX 5090
99
- if self.fabric.device.type == "cuda":
100
- torch.set_float32_matmul_precision(
101
- "high"
102
- ) # Best performance for Tensor Cores
103
- print(
104
- "Enabled Tensor Core optimization: torch.set_float32_matmul_precision('high')"
105
- )
106
-
107
- # Set up logging
108
- self.logger = initialize_logging(
109
- monitoring_config=self.configs["monitoring"],
110
- checkpointing_config=self.configs["checkpointing"],
111
- fabric=self.fabric,
112
- )
113
-
114
- # Setup Model, Optimizer, and Dataloaders
115
- self.model = initialize_model(model_config=self.configs["model"])
116
- self.optimizer = initialize_optimizer(
117
- training_config=self.configs["training"], model=self.model
118
- )
119
- self.lr_scheduler = initialize_lr_scheduler(
120
- training_config=self.configs["training"], optimizer=self.optimizer
121
- )
122
-
123
- # Wrap model and optimizer with Fabric
124
- self.model, self.optimizer = self.fabric.setup(self.model, self.optimizer)
125
-
126
- # Setup HuggingFace Checkpointing
127
- if self.configs["checkpointing"].save_to_hf:
128
- initialize_hf_checkpointing(
129
- checkpointing_config=self.configs["checkpointing"], fabric=self.fabric
130
- )
131
-
132
- ########################################################
133
- #
134
- # Boilerplate to deal with loading/resuming from checkpoints
135
- #
136
- ########################################################
137
-
138
- self.should_load_checkpoint = self.configs["checkpointing"].training.auto_resume
139
-
140
- # Possibly load a checkpoint
141
- if self.should_load_checkpoint:
142
- resume_checkpoint = load_checkpoint(
143
- checkpointing_config=self.configs["checkpointing"],
144
- checkpoint_step="latest",
145
- fabric=self.fabric,
146
- model=self.model,
147
- optimizer=self.optimizer,
148
- lr_scheduler=self.lr_scheduler,
149
- )
150
-
151
- if resume_checkpoint:
152
- (
153
- self.model,
154
- self.optimizer,
155
- self.lr_scheduler,
156
- self.initial_batch_step,
157
- ) = resume_checkpoint
158
- else:
159
- self.initial_batch_step = 0
160
- else:
161
- self.initial_batch_step = 0
162
-
163
- ########################################################
164
- #
165
- # Initialization of Dataset & DataLoader (possibly fast-forwarding to correct batch)
166
- #
167
- ########################################################
168
-
169
- self.train_dataset, fast_forward_steps = initialize_dataset(
170
- data_config=self.configs["data"],
171
- fabric=self.fabric,
172
- initial_batch_step=self.initial_batch_step,
173
- return_fast_forward_steps=True,
174
- )
175
-
176
- self.train_dataloader = initialize_dataloader(
177
- data_config=self.configs["data"],
178
- training_config=self.configs["training"],
179
- fabric=self.fabric,
180
- dataset=self.train_dataset,
181
- )
182
- self.train_dataloader = self.fabric.setup_dataloaders(
183
- self.train_dataloader, use_distributed_sampler=False
184
- )
185
-
186
- self.tokenizer = initialize_tokenizer(data_config=self.configs["data"])
187
-
188
- # NOTE: We may need to fast-forward the iterator to the correct step so that we can
189
- # continue from the correct batch of data we would have seen had training not
190
- # previously stopped.
191
- train_iterator = iter(self.train_dataloader)
192
- if fast_forward_steps > 0:
193
- fast_forward_sub_steps = (
194
- fast_forward_steps
195
- * self.configs["training"].optimization.gradient_accumulation_steps
196
- )
197
- for _ in range(fast_forward_sub_steps):
198
- next(train_iterator)
199
-
200
- self.train_iterator = train_iterator
201
-
202
- # NOTE: Synchronizing processes after fast-forwarding the iterator
203
- self.fabric.barrier()
204
-
205
- ########################################################
206
- #
207
- # Helper flags used during training for checkpointing and evaluation
208
- #
209
- ########################################################
210
-
211
- # Helper flag to determine if we should evaluate the model
212
- self.should_evaluate = (
213
- self.configs["evaluation"].metrics is not None
214
- and len(self.configs["evaluation"].metrics) > 0
215
- )
216
-
217
- self.should_compute_learning_dynamics = (
218
- self.configs["checkpointing"].learning_dynamics.layer_suffixes is not None
219
- and len(self.configs["checkpointing"].learning_dynamics.layer_suffixes) > 0
220
- )
221
-
222
- if self.should_compute_learning_dynamics:
223
- if self.configs["checkpointing"].learning_dynamics.eval_data is not None:
224
- self.learning_dynamics_eval_dataset = load_dataset(
225
- self.configs["checkpointing"].learning_dynamics.eval_data,
226
- split="val",
227
- )
228
- else:
229
- self.learning_dynamics_eval_dataset = None
230
-
231
- def train(self) -> None:
232
- """Execute the main training pipeline.
233
-
234
- This method orchestrates the complete training process by:
235
- 1. Creating an initial checkpoint to save the starting state and evaluate the model as a
236
- baseline
237
- 2. Running the main training loop via `_training_loop`
238
- 3. Handling final checkpointing and evaluation
239
-
240
- The training progress is tracked through checkpoints and evaluations
241
- at intervals specified in the configuration.
242
- """
243
-
244
- ########################################################
245
- #
246
- # Initial Checkpointing and Evaluation
247
- #
248
- ########################################################
249
-
250
- # Save Initial Checkpoint -- If the checkpoint already exists, this performs a no-op
251
- save_checkpoint(
252
- configs=self.configs,
253
- checkpoint_step=self.initial_batch_step,
254
- fabric=self.fabric,
255
- model=self.model,
256
- optimizer=self.optimizer,
257
- lr_scheduler=self.lr_scheduler,
258
- tokenizer=self.tokenizer,
259
- )
260
-
261
- # Save Initial Evaluation Results
262
- if self.should_evaluate:
263
- if self.initial_batch_step == 0:
264
- evaluation_results = run_evaluation(
265
- evaluation_config=self.configs["evaluation"],
266
- checkpointing_config=self.configs["checkpointing"],
267
- fabric=self.fabric,
268
- model=self.model,
269
- )
270
- self._log_evaluation_results(
271
- evaluation_results, self.initial_batch_step
272
- )
273
- save_evaluation_results(
274
- checkpointing_config=self.configs["checkpointing"],
275
- fabric=self.fabric,
276
- evaluation_results=evaluation_results,
277
- checkpoint_step=self.initial_batch_step,
278
- )
279
- else:
280
- # NOTE: If the run crashed while evaluating, we need to restart the evaluation
281
- eval_results_path = os.path.join(
282
- self.configs["checkpointing"].evaluation.eval_results_dir,
283
- f"step_{self.initial_batch_step}.json",
284
- )
285
- if not os.path.exists(eval_results_path):
286
- evaluation_results = run_evaluation(
287
- evaluation_config=self.configs["evaluation"],
288
- checkpointing_config=self.configs["checkpointing"],
289
- fabric=self.fabric,
290
- model=self.model,
291
- )
292
- self._log_evaluation_results(
293
- evaluation_results, self.initial_batch_step
294
- )
295
- save_evaluation_results(
296
- checkpointing_config=self.configs["checkpointing"],
297
- fabric=self.fabric,
298
- evaluation_results=evaluation_results,
299
- checkpoint_step=self.initial_batch_step,
300
- )
301
-
302
- ########################################################
303
- #
304
- # Main Training Loop (see `_training_loop` for details)
305
- #
306
- ########################################################
307
-
308
- if self.initial_batch_step < self.configs["training"].max_steps:
309
- self._log_training_configuration()
310
- final_step = self._training_loop()
311
- else:
312
- final_step = self.initial_batch_step
313
-
314
- ########################################################
315
- #
316
- # Final Checkpointing and Evaluation
317
- #
318
- ########################################################
319
-
320
- # Save Learning Dynamics States
321
- if self.should_compute_learning_dynamics:
322
- if self.learning_dynamics_eval_dataset is not None:
323
- self.log(f"Step {final_step} -- 📈 Saving Learning Dynamics")
324
- learning_dynamics_val_states = compute_learning_dynamics_states(
325
- checkpointing_config=self.configs["checkpointing"],
326
- fabric=self.fabric,
327
- model=self.model,
328
- dataset=self.learning_dynamics_eval_dataset,
329
- compute_gradients=True,
330
- )
331
- save_learning_dynamics_states(
332
- checkpointing_config=self.configs["checkpointing"],
333
- fabric=self.fabric,
334
- learning_dynamics_states=learning_dynamics_val_states,
335
- checkpoint_step=final_step,
336
- prefix="val",
337
- )
338
-
339
- # Handle checkpointing and final evaluation
340
- if final_step % self.configs["checkpointing"].save_every_n_steps != 0:
341
- self.log(f"Step {final_step} -- 💾 Saving Final Checkpoint")
342
- save_checkpoint(
343
- configs=self.configs,
344
- checkpoint_step=final_step,
345
- fabric=self.fabric,
346
- model=self.model,
347
- optimizer=self.optimizer,
348
- lr_scheduler=self.lr_scheduler,
349
- tokenizer=self.tokenizer,
350
- )
351
-
352
- # Final evaluation
353
- if self.should_evaluate:
354
- evaluation_results = run_evaluation(
355
- evaluation_config=self.configs["evaluation"],
356
- checkpointing_config=self.configs["checkpointing"],
357
- fabric=self.fabric,
358
- model=self.model,
359
- )
360
- self._log_evaluation_results(evaluation_results, final_step)
361
- save_evaluation_results(
362
- checkpointing_config=self.configs["checkpointing"],
363
- checkpoint_step=final_step,
364
- fabric=self.fabric,
365
- evaluation_results=evaluation_results,
366
- )
367
-
368
- self.log(f"🎉 Training complete! Final step: {final_step}")
369
-
370
- if final_step < self.configs["training"].max_steps:
371
- self.log(
372
- f"\t Note: Training stopped before max steps ({self.configs['training'].max_steps})",
373
- level=logging.WARNING,
374
- )
375
-
376
- # Cleanup distributed training
377
- self.fabric.barrier()
378
- if torch.cuda.is_available():
379
- torch.cuda.empty_cache()
380
- if torch.distributed.is_initialized():
381
- torch.distributed.destroy_process_group()
382
-
383
- del self.train_dataloader # NOTE: shutting down worker nodes
384
-
385
- self.fabric.barrier()
386
-
387
- def _training_loop(self) -> int:
388
- """Execute the main training loop.
389
-
390
- This method orchestrates the core training loop and includes the following features:
391
- - Gradient accumulation
392
- - Gradient clipping
393
- - Periodic model evaluation and checkpointing
394
- - Learning Dynamics Checkpointing
395
- - Learning rate scheduling
396
- - Logging of training metrics including loss and learning rate
397
- - Handling of infinite/NaN losses
398
-
399
- Returns:
400
- int: The final step count reached during training.
401
- NOTE: A complete training run should match the configured max_steps.
402
- """
403
- # Setup training loop variables
404
- batch_step = self.initial_batch_step
405
-
406
- # NOTE: these are used to compute the average loss over a training interval.
407
- # This is more accurate than using the loss at the end of the interval.
408
- interval_loss = torch.tensor(0.0, device=self.fabric.device)
409
- interval_steps = torch.tensor(0, device=self.fabric.device)
410
- interval_inf_or_nan_count = torch.tensor(0, device=self.fabric.device)
411
-
412
- if self.should_compute_learning_dynamics:
413
- # NOTE: we basically re-construct the full batch here so that we can compute learning dynamics
414
- training_batch = {"input_ids": []}
415
-
416
- # NOTE: determine what sub-batch we should start from
417
- initial_sub_batch_step = (
418
- batch_step
419
- * self.configs["training"].optimization.gradient_accumulation_steps
420
- )
421
-
422
- ###############################################################
423
- #
424
- # Core loop starts here
425
- # NOTE: the ratio between sub_batch_step and batch_step
426
- # is the configured number of gradient_accumulation_steps
427
- # i.e. with 32 configured gradient accumulation steps,
428
- # there are 32 sub_batch_steps for each batch_step
429
- #
430
- ###############################################################
431
-
432
- for sub_batch_step, sub_batch in enumerate(
433
- self.train_iterator, start=initial_sub_batch_step
434
- ):
435
- # NOTE: We want to store the entire training batch whenever we are computing learning dynamics
436
- # and we are at a checkpointing step.
437
- should_store_training_batch = self.should_compute_learning_dynamics and (
438
- batch_step % self.configs["checkpointing"].save_every_n_steps == 0
439
- )
440
-
441
- ########################################################
442
- #
443
- # Forward Pass
444
- #
445
- ########################################################
446
-
447
- _input_ids = torch.tensor(sub_batch["input_ids"], device=self.fabric.device)
448
- input_ids = _input_ids[:, :-1]
449
- labels = _input_ids[:, 1:]
450
-
451
- if should_store_training_batch:
452
- gathered_input_ids = self.fabric.all_gather(_input_ids)
453
-
454
- # NOTE: On multi-GPU, we need to reshape the input_ids to be a 2D tensor; on
455
- # a single GPU, the input_ids are already a 2D tensor.
456
- if self.fabric.world_size > 1:
457
- gathered_input_ids = gathered_input_ids.reshape(
458
- -1, *gathered_input_ids.shape[2:]
459
- )
460
-
461
- training_batch["input_ids"].extend(gathered_input_ids.tolist())
462
-
463
- # Forward pass
464
- model_output, _ = self.model(input_ids)
465
- model_output = model_output.transpose(1, 2)
466
-
467
- ########################################################
468
- #
469
- # Gradient accumulation
470
- #
471
- ########################################################
472
-
473
- should_accumulate_gradients = (sub_batch_step + 1) % self.configs[
474
- "training"
475
- ].optimization.gradient_accumulation_steps != 0
476
-
477
- with self.fabric.no_backward_sync(
478
- self.model, enabled=should_accumulate_gradients
479
- ):
480
- loss = F.cross_entropy(model_output, labels)
481
- self.fabric.backward(
482
- loss
483
- / self.configs["training"].optimization.gradient_accumulation_steps,
484
- model=self.model,
485
- )
486
-
487
- if torch.isnan(loss) or torch.isinf(loss):
488
- interval_inf_or_nan_count += 1
489
- else:
490
- interval_loss += loss.item()
491
- interval_steps += 1
492
-
493
- # NOTE: while we are still accumulating gradients, skip the logging and optimization steps
494
- if should_accumulate_gradients:
495
- continue
496
-
497
- ########################################################
498
- #
499
- # Logging
500
- #
501
- ########################################################
502
-
503
- if batch_step % self.configs["monitoring"].logging.log_every_n_steps == 0:
504
- self._log_training_metrics(
505
- interval_loss=interval_loss,
506
- interval_steps=interval_steps,
507
- interval_inf_or_nan_count=interval_inf_or_nan_count,
508
- batch_step=batch_step,
509
- )
510
- interval_loss = torch.tensor(0.0, device=self.fabric.device)
511
- interval_steps = torch.tensor(0, device=self.fabric.device)
512
- interval_inf_or_nan_count = torch.tensor(0, device=self.fabric.device)
513
-
514
- ########################################################
515
- #
516
- # Learning Dynamics Checkpointing
517
- #
518
- ########################################################
519
-
520
- if batch_step % self.configs["checkpointing"].save_every_n_steps == 0:
521
- if self.should_compute_learning_dynamics:
522
- self.log(f"Step {batch_step} -- 📈 Saving Learning Dynamics")
523
-
524
- # Training Batch Learning Dynamics
525
- training_batch_dataset = Dataset.from_dict(training_batch)
526
-
527
- learning_dynamics_train_states = compute_learning_dynamics_states(
528
- checkpointing_config=self.configs["checkpointing"],
529
- fabric=self.fabric,
530
- model=self.model,
531
- dataset=training_batch_dataset,
532
- compute_gradients=True,
533
- )
534
-
535
- save_learning_dynamics_states(
536
- checkpointing_config=self.configs["checkpointing"],
537
- checkpoint_step=batch_step,
538
- prefix="train",
539
- fabric=self.fabric,
540
- learning_dynamics_states=learning_dynamics_train_states,
541
- learning_dynamics_dataset=training_batch_dataset,
542
- tokenizer=self.tokenizer,
543
- )
544
- training_batch = {
545
- "input_ids": []
546
- } # Resetting training_batch for next training batch
547
-
548
- # Validation Data Learning Dynamics
549
- if self.learning_dynamics_eval_dataset is not None:
550
- learning_dynamics_val_states = compute_learning_dynamics_states(
551
- checkpointing_config=self.configs["checkpointing"],
552
- fabric=self.fabric,
553
- model=self.model,
554
- dataset=self.learning_dynamics_eval_dataset,
555
- compute_gradients=True,
556
- )
557
- save_learning_dynamics_states(
558
- checkpointing_config=self.configs["checkpointing"],
559
- checkpoint_step=batch_step,
560
- prefix="val",
561
- fabric=self.fabric,
562
- learning_dynamics_states=learning_dynamics_val_states,
563
- )
564
-
565
- ########################################################
566
- #
567
- # Optimization step
568
- #
569
- ########################################################
570
-
571
- self.optimizer.step()
572
- self.optimizer.zero_grad()
573
- self.lr_scheduler.step()
574
-
575
- batch_step += 1
576
-
577
- ########################################################
578
- #
579
- # Training Checkpointing and evaluation
580
- #
581
- ########################################################
582
-
583
- if batch_step % self.configs["checkpointing"].save_every_n_steps == 0:
584
- self.log(f"Step {batch_step} -- 💾 Saving Checkpoint")
585
- save_checkpoint(
586
- configs=self.configs,
587
- checkpoint_step=batch_step,
588
- fabric=self.fabric,
589
- model=self.model,
590
- optimizer=self.optimizer,
591
- lr_scheduler=self.lr_scheduler,
592
- tokenizer=self.tokenizer,
593
- )
594
-
595
- if self.should_evaluate:
596
- evaluation_results = run_evaluation(
597
- evaluation_config=self.configs["evaluation"],
598
- checkpointing_config=self.configs["checkpointing"],
599
- fabric=self.fabric,
600
- model=self.model,
601
- )
602
- if evaluation_results is not None:
603
- self._log_evaluation_results(evaluation_results, batch_step)
604
- save_evaluation_results(
605
- checkpointing_config=self.configs["checkpointing"],
606
- fabric=self.fabric,
607
- evaluation_results=evaluation_results,
608
- checkpoint_step=batch_step,
609
- )
610
-
611
- # Break if we've reached training steps
612
- if batch_step >= self.configs["training"].max_steps:
613
- break
614
-
615
- return batch_step
616
-
617
- ########################################################
618
- #
619
- # Trainer Logging Functionalities
620
- #
621
- ########################################################
622
-
623
- def _log_training_metrics(
624
- self,
625
- interval_loss: torch.Tensor,
626
- interval_steps: torch.Tensor,
627
- interval_inf_or_nan_count: torch.Tensor,
628
- batch_step: int,
629
- ):
630
- """
631
- Gathers together the training metrics computed across all processes in distributed training
632
- and logs them in a tree-style format.
633
- """
634
- gathered_interval_loss = self.fabric.all_reduce(
635
- interval_loss, reduce_op="sum"
636
- ).item()
637
- gathered_interval_inf_or_nan_count = self.fabric.all_reduce(
638
- interval_inf_or_nan_count, reduce_op="sum"
639
- ).item()
640
- gathered_interval_steps = self.fabric.all_reduce(
641
- interval_steps, reduce_op="sum"
642
- ).item()
643
-
644
- avg_loss = (
645
- gathered_interval_loss / gathered_interval_steps
646
- if gathered_interval_steps > 0
647
- else float("inf")
648
- )
649
-
650
- self.fabric.log("train/loss", avg_loss, step=batch_step)
651
- self.fabric.log(
652
- "trainer/inf_or_nan_count",
653
- gathered_interval_inf_or_nan_count,
654
- step=batch_step,
655
- )
656
- self.fabric.log(
657
- "trainer/learning_rate",
658
- self.lr_scheduler.get_last_lr()[0],
659
- step=batch_step,
660
- )
661
-
662
- # Log to console in tree format
663
- self.log(f"Step {batch_step} -- 🔄 Training Metrics")
664
- self.log(f"├── Loss: {avg_loss:.4f}")
665
- self.log(f"├── Learning Rate: {self.lr_scheduler.get_last_lr()[0]:.2e}")
666
- self.log(f"└── Inf/NaN count: {gathered_interval_inf_or_nan_count}")
667
-
668
- def _log_evaluation_results(
669
- self, evaluation_results: Dict[str, Any], batch_step: int
670
- ):
671
- """Log model evaluation metrics to experiment tracking system and console."""
672
- self.log(f"Step {batch_step} -- 📊 Evaluation Results")
673
- for i, (metric, result) in enumerate(evaluation_results.items()):
674
- prefix = "└──" if i == len(evaluation_results) - 1 else "├──"
675
- self.log(f"{prefix} {metric}: {result}")
676
- self.fabric.log(f"eval/{metric}", result, step=batch_step)
677
-
678
- def _log_training_configuration(self):
679
- """
680
- Log training configuration details as well as runtime information about the hardware,
681
- software, and batch settings.
682
-
683
- This function is called at the beginning of the training loop to provide a summary of the
684
- training configuration.
685
- """
686
-
687
- total_params = sum(p.numel() for p in self.model.parameters())
688
- trainable_params = sum(
689
- p.numel() for p in self.model.parameters() if p.requires_grad
690
- )
691
- global_batch_size = self.configs["data"].dataloader.batch_size
692
- per_device_batch_size = self.train_dataloader.batch_size
693
- gradient_accumulation_steps = self.configs[
694
- "training"
695
- ].optimization.gradient_accumulation_steps
696
-
697
- device_type = ""
698
- fabric_device = str(self.fabric.device)
699
- if torch.cuda.is_available() and "cuda" in fabric_device:
700
- device_type = torch.cuda.get_device_name(self.fabric.device)
701
- elif torch.backends.mps.is_available() and "mps" in fabric_device:
702
- device_type = "MPS (Apple Silicon)"
703
- else:
704
- device_type = "CPU"
705
-
706
- training_config_path = os.path.join(
707
- self.configs["checkpointing"].runs_dir,
708
- self.configs["checkpointing"].run_name,
709
- "training_config.yaml",
710
- )
711
- if os.path.exists(training_config_path):
712
- self.log("=" * 50)
713
- self.log("✨ Training Configuration")
714
- self.log("=" * 50)
715
- training_config = yaml.safe_load(open(training_config_path, "r"))
716
- pretty_print_yaml_config(self.logger, training_config)
717
-
718
- self.log("=" * 50)
719
- self.log("⛭ Runtime Summary:")
720
- self.log("=" * 50)
721
- self.log(f"Starting from step: {self.initial_batch_step}")
722
-
723
- self.log("Model Setup:")
724
- self.log(f"└─ Total Parameters: {total_params:,}")
725
- self.log(f"└─ Trainable Parameters: {trainable_params:,}")
726
-
727
- self.log("Distributed Setup:")
728
- self.log(f"└─ Number of Devices: {self.fabric.world_size}")
729
- self.log(f"└─ Device Type: {device_type}")
730
- self.log(
731
- f"└─ Available Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB"
732
- if torch.cuda.is_available()
733
- else f"└─ Available Memory: {psutil.virtual_memory().total / 1e9:.2f} GB"
734
- )
735
-
736
- self.log("Software Setup:")
737
- self.log(f"└─ Python Version: {platform.python_version()}")
738
- self.log(f"└─ PyTorch Version: {torch.__version__}")
739
- self.log(
740
- f"└─ CUDA Version: {torch.version.cuda if torch.cuda.is_available() else 'N/A'}"
741
- )
742
- self.log(f"└─ Operating System: {platform.system()} {platform.release()}")
743
-
744
- self.log("Batch Size Configuration:")
745
- self.log(f"└─ Global Batch Size: {global_batch_size}")
746
- self.log(f"└─ Per Device Batch Size: {per_device_batch_size}")
747
- self.log(f"└─ Gradient Accumulation Steps: {gradient_accumulation_steps}")
748
- self.log("=" * 50)
749
-
750
- @rank_zero_only
751
- def log(self, msg: str, level: int = logging.INFO) -> None:
752
- """NOTE: Log messages only from rank zero process."""
753
- self.logger.log(level, msg)
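A minimal sketch of the gradient-accumulation bookkeeping used in _training_loop above, stripped of Fabric and the real dataloader (toy model and data; the loss scaling mirrors the division in the training loop):

import torch

model = torch.nn.Linear(8, 8)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
gradient_accumulation_steps = 4
batch_step = 0

for sub_batch_step in range(12):                      # 12 sub-batches -> 3 optimizer steps
    x = torch.randn(2, 8)
    loss = model(x).pow(2).mean()
    (loss / gradient_accumulation_steps).backward()   # scale so the accumulated gradient is an average

    still_accumulating = (sub_batch_step + 1) % gradient_accumulation_steps != 0
    if still_accumulating:
        continue                                      # keep accumulating; no optimizer step yet

    optimizer.step()
    optimizer.zero_grad()
    batch_step += 1

print(batch_step)  # 3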
 
src/training/utils/__init__.py DELETED
@@ -1,34 +0,0 @@
- """
- Utility package that contains functions for the training process, e.g. initialization, logging, etc.
- """
-
- # For convenience, we export the initialization functions here
- from .initialization import (
-     initialize_configuration,
-     initialize_dataloader,
-     initialize_dataset,
-     initialize_fabric,
-     initialize_hf_checkpointing,
-     initialize_logging,
-     initialize_lr_scheduler,
-     initialize_model,
-     initialize_optimizer,
-     initialize_run_dir,
-     initialize_tokenizer,
-     initialize_wandb,
- )
-
- __all__ = [
-     "initialize_configuration",
-     "initialize_dataloader",
-     "initialize_dataset",
-     "initialize_fabric",
-     "initialize_hf_checkpointing",
-     "initialize_logging",
-     "initialize_lr_scheduler",
-     "initialize_model",
-     "initialize_optimizer",
-     "initialize_run_dir",
-     "initialize_tokenizer",
-     "initialize_wandb",
- ]
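These re-exports let a training entry point pull everything from one place. A rough usage sketch, assuming the package layout above is importable and that `configs/demo.yaml` (a hypothetical path) holds the overrides:

```python
from src.training.utils import (
    initialize_configuration,
    initialize_fabric,
    initialize_model,
    initialize_optimizer,
)

configs = initialize_configuration("configs/demo.yaml")  # hypothetical override file
fabric = initialize_fabric(configs["training"])
model = initialize_model(configs["model"])
optimizer = initialize_optimizer(configs["training"], model)
```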
src/training/utils/data.py DELETED
@@ -1,35 +0,0 @@
- """
- Utilities for data loading and processing.
- """
-
- from torch.utils.data import IterableDataset
-
-
- class ShardedIterableDataset(IterableDataset):
-     """
-     A super simple implementation of a sharded iterable dataset that enables data parallelism
-     across multiple workers. Ensures that each worker gets a unique shard of the dataset.
-
-     NOTE: Also works fine if there is only one worker.
-     """
-
-     def __init__(self, dataset, rank, world_size):
-         self.dataset = dataset
-         self.rank = rank
-         self.world_size = world_size
-
-     def __iter__(self):
-         iterator = iter(self.dataset)
-         # NOTE: Start by skipping to this worker's shard
-         for _ in range(self.rank):
-             next(iterator)
-
-         # NOTE: Yield every world_size-th item
-         while True:
-             try:
-                 yield next(iterator)
-                 # Skip other workers' samples
-                 for _ in range(self.world_size - 1):
-                     next(iterator)
-             except StopIteration:
-                 break
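The interleaving above is easiest to see with a tiny iterable: each rank skips ahead to its own offset and then takes every `world_size`-th item. A minimal, self-contained sketch of the same round-robin logic, written as a plain generator (no torch dependency) rather than the deleted class:

```python
def shard(iterable, rank, world_size):
    """Yield the items of `iterable` that belong to this rank's shard."""
    it = iter(iterable)
    for _ in range(rank):  # skip to this worker's first sample
        next(it)
    while True:
        try:
            yield next(it)
            for _ in range(world_size - 1):  # skip samples owned by other workers
                next(it)
        except StopIteration:
            break


print(list(shard(range(10), rank=0, world_size=2)))  # [0, 2, 4, 6, 8]
print(list(shard(range(10), rank=1, world_size=2)))  # [1, 3, 5, 7, 9]
```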
src/training/utils/initialization.py DELETED
@@ -1,702 +0,0 @@
- """
- Utilities for initializing components of the training process.
-
- Here, we initialize all of the components that are part of the learning process. From logging
- and checkpointing to the optimizer, the dataset, and the dataloader, this file contains the
- logic for setting up the classes and functions that are used in the training loop.
-
- As always, this code is meant to be basic. We hard-code the obvious defaults, and leave the
- more experimental stuff to you.
- """
-
- import logging
- import math
- import os
- import warnings
- from dataclasses import fields, is_dataclass
- from datetime import datetime
- from typing import Dict, Optional, Union
-
- import lightning as L
- import torch
- import yaml
- from datasets import Dataset, DownloadConfig, load_dataset
- from datasets import config as datasets_config
- from huggingface_hub import add_collection_item, create_branch, create_repo
- from lightning.fabric.loggers import Logger as FabricLogger
- from lightning.fabric.utilities.rank_zero import rank_zero_only
- from torch.utils.data import DataLoader
- from transformers import AutoTokenizer
-
- import wandb
- from src.config import (
-     CheckpointingConfig,
-     DataConfig,
-     EvaluationConfig,
-     ModelConfig,
-     MonitoringConfig,
-     TrainingConfig,
- )
- from src.model import PicoDecoder
- from src.training.utils.io import use_backoff
- from wandb.integration.lightning.fabric import WandbLogger
-
- warnings.filterwarnings(
-     "ignore",
-     message=".*This integration is tested and supported for lightning Fabric.*",
- )
- warnings.filterwarnings(
-     "ignore",
-     message=".*Please report any issues to.*",
- )
-
- ########################################################
- #
- # Basic Initialization
- #
- ########################################################
-
-
- def _apply_config_overrides(config, overrides: dict):
-     """Recursively apply configuration overrides to a dataclass config object.
-
-     Args:
-         config: Base configuration object (must be a dataclass)
-         overrides: Dictionary of override values matching config structure
-
-     Returns:
-         Modified config object with the overrides applied.
-     """
-     for field in fields(config):
-         field_value = getattr(config, field.name)
-         if is_dataclass(field_value):
-             _apply_config_overrides(field_value, overrides.get(field.name, {}))
-         else:
-             if field.name in overrides:
-                 setattr(config, field.name, overrides[field.name])
-     return config
-
-
- def initialize_configuration(
-     config_path: Optional[str] = None,
- ) -> Dict[
-     str,
-     Union[
-         DataConfig,
-         ModelConfig,
-         TrainingConfig,
-         EvaluationConfig,
-         MonitoringConfig,
-         CheckpointingConfig,
-     ],
- ]:
-     """Initialize configuration objects with optional overrides from a YAML file.
-
-     This function initializes all of the configuration objects, and then applies
-     any overrides from the config_path file. If no config_path is provided,
-     the function will use the default configuration objects.
-
-     Args:
-         config_path: Path to a YAML file containing configuration overrides.
-
-     Returns:
-         A dictionary containing the initialized configuration objects.
-     """
-     data_config = DataConfig()
-     model_config = ModelConfig()
-     training_config = TrainingConfig()
-     evaluation_config = EvaluationConfig()
-     monitoring_config = MonitoringConfig()
-     checkpointing_config = CheckpointingConfig()
-
-     if config_path:
-         overrides = yaml.safe_load(open(config_path, "r"))
-         data_config = _apply_config_overrides(data_config, overrides.get("data", {}))
-         model_config = _apply_config_overrides(model_config, overrides.get("model", {}))
-         training_config = _apply_config_overrides(
-             training_config, overrides.get("training", {})
-         )
-         evaluation_config = _apply_config_overrides(
-             evaluation_config, overrides.get("evaluation", {})
-         )
-         monitoring_config = _apply_config_overrides(
-             monitoring_config, overrides.get("monitoring", {})
-         )
-         checkpointing_config = _apply_config_overrides(
-             checkpointing_config, overrides.get("checkpointing", {})
-         )
-
-     configs = {
-         "data": data_config,
-         "model": model_config,
-         "training": training_config,
-         "evaluation": evaluation_config,
-         "monitoring": monitoring_config,
-         "checkpointing": checkpointing_config,
-     }
-
-     return configs
-
-
- def initialize_run_dir(checkpointing_config: CheckpointingConfig) -> str:
-     """Initialize a directory for the current training run.
-
-     Creates a unique directory for storing training, evaluation, and logging artifacts.
-     If no run name is specified in the config, generates a timestamp-based name.
-
-     Args:
-         checkpointing_config: Configuration object containing run settings.
-             NOTE: Must have a 'run_name' attribute that can be None, in which case
-             a timestamp-based name will be generated.
-
-     Returns:
-         str: The path to the run directory.
-     """
-     run_name = checkpointing_config.run_name
-     if run_name is None:
-         run_name = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
-         checkpointing_config.run_name = run_name
-
-     run_dir = os.path.join(checkpointing_config.runs_dir, run_name)
-
-     os.makedirs(run_dir, exist_ok=True)
-     return run_dir
-
-
- def initialize_fabric(
-     training_config: TrainingConfig, wandb_logger: Optional[FabricLogger] = None
- ):
-     """Initialize Lightning Fabric for distributed training.
-
-     Sets up a Lightning Fabric instance with the specified configuration for
-     handling distributed training, mixed precision, and logging.
-
-     Args:
-         training_config: Configuration object containing fabric settings
-             (accelerator, precision, devices, etc.).
-         wandb_logger: Optional Weights and Biases logger instance for experiment tracking.
-
-     Returns:
-         L.Fabric: Initialized Lightning Fabric instance.
-
-     Example:
-         >>> fabric = initialize_fabric(training_config, wandb_logger)
-     """
-
-     total_devices = (
-         training_config.fabric.num_devices * training_config.fabric.num_nodes
-     )
-
-     if total_devices > 1:
-         strategy = "deepspeed_stage_2"
-     else:
-         strategy = "auto"  # Sets up SingleDevice Strategy by default
-
-     # NOTE: The strategy is set to use either DeepSpeed (Zero Stage 2) on multi-GPU,
-     # or the SingleDevice Strategy on single-GPU setups. If you'd like to use a different strategy,
-     # you can change the strategy flag in the fabric initialization, but be aware that this might
-     # cause issues with checkpointing, evaluation, etc.
-
-     fabric = L.Fabric(
-         accelerator=training_config.fabric.accelerator,
-         precision=training_config.fabric.precision,
-         devices=training_config.fabric.num_devices,
-         num_nodes=training_config.fabric.num_nodes,
-         loggers=[wandb_logger] if wandb_logger is not None else None,
-         strategy=strategy,
-     )
-
-     fabric.launch()
-
-     return fabric
-
-
- ########################################################
- #
- # Dataset and Tokenization Initialization
- #
- ########################################################
-
-
- @use_backoff(max_retries=20)
- def initialize_dataset(
-     data_config: DataConfig,
-     fabric: L.Fabric,
-     initial_batch_step: Optional[int] = 0,
-     return_fast_forward_steps: bool = False,
- ):
-     """Initialize the dataset based on the given config.
-
-     This function will return a dataset object, and optionally a fast_forward_steps value.
-
-     The fast_forward_steps value is the number of steps that we need to fast-forward an iterator by,
-     so that we can continue from a certain batch of data we would have seen had training not previously
-     stopped. Depending on how the dataset is loaded, the number of steps to fast-forward may be
-     different from the initial_batch_step value.
-
-     NOTE: This functionality is primarily useful for streaming datasets (which for large
-     datasets is most of the time).
-
-     Args:
-         data_config: Configuration object containing dataset settings.
-         fabric: A Lightning Fabric instance.
-         initial_batch_step: The initial batch step to fast-forward to.
-         return_fast_forward_steps: Whether to return the fast-forward steps value.
-
-     Returns:
-         Dataset: Initialized dataset object.
-         Optional[int]: Number of steps to fast-forward the iterator by, if return_fast_forward_steps is True.
-     """
-
-     datasets_config.STREAMING_READ_MAX_RETRIES = 40  # default is 20
-     datasets_config.STREAMING_READ_RETRY_INTERVAL = 10  # default is 5
-     download_config = DownloadConfig(
-         max_retries=20,  # default is 1 and can lead to premature HTTPS errors
-     )
-
-     fast_forward_steps = 0
-
-     if data_config.dataset.name == "pico-lm/pretokenized-dolma":
-         # NOTE: We know that the dataset is sharded into 10,000 shards, so we can easily compute
-         # the data file that we need to load in that contains the batch of data at
-         # initial_batch_step.
-
-         if initial_batch_step is not None:
-             examples_per_shard = 20_480
-             total_shards = 10_000
-             batches_per_shard = examples_per_shard // data_config.dataloader.batch_size
-             shard_idx = initial_batch_step // batches_per_shard
-
-             data_files = [
-                 f"data/train-{str(_shard_idx).zfill(5)}-of-{total_shards}.parquet"
-                 for _shard_idx in range(shard_idx, total_shards)
-             ]
-
-             fast_forward_steps = initial_batch_step % batches_per_shard
-         else:
-             data_files = None
-
-         base_dataset = load_dataset(
-             data_config.dataset.name,
-             split="train",
-             streaming=True,
-             data_files=data_files,
-             download_config=download_config,
-         )
-     else:
-         # NOTE: For other datasets, you might want to add some custom loading logic, especially
-         # to help with loading or fast-forwarding to the correct batch.
-
-         base_dataset = load_dataset(
-             data_config.dataset.name,
-             split="train",
-             streaming=True,
-             download_config=download_config,
-         )
-
-     if data_config.dataset.name == "pico-lm/pretokenized-dolma":
-         from .data import ShardedIterableDataset
-
-         # NOTE: We wrap the dataset in a ShardedIterableDataset, which is a custom class that
-         # allows us to shard an iterable dataset across multiple processes. This is useful for
-         # distributed training, where we want data-parallelism.
-         dataset = ShardedIterableDataset(
-             base_dataset, fabric.global_rank, fabric.world_size
-         )
-     else:
-         dataset = base_dataset
-
-     if return_fast_forward_steps:
-         return dataset, fast_forward_steps
-     else:
-         return dataset
-
-
- def initialize_tokenizer(data_config: DataConfig):
-     """Initialize the tokenizer for text processing.
-
-     This function can be extended to include custom tokenization logic.
-
-     Args:
-         data_config: Configuration object containing tokenizer settings.
-
-     Returns:
-         AutoTokenizer: A HuggingFace tokenizer instance.
-     """
-
-     return AutoTokenizer.from_pretrained(data_config.tokenizer.name)
-
-
- def initialize_dataloader(
-     data_config: DataConfig,
-     training_config: TrainingConfig,
-     fabric: L.Fabric,
-     dataset: Dataset,
- ):
-     """Initialize the DataLoader for efficient batch processing.
-
-     Creates a PyTorch DataLoader that handles batching and data loading for training.
-     Configured specifically for streaming tokenized text datasets.
-
-     You might also want to extend this function to add a sampler, or some sort of custom
-     collate function. For the default dataset, we don't need any of this, because the data are
-     pre-shuffled and pre-tokenized.
-
-     Args:
-         data_config: Configuration object containing dataloader settings.
-         training_config: Configuration object containing training settings.
-         fabric: A Lightning Fabric instance.
-         dataset: A HuggingFace Dataset object containing tokenized text data.
-             Expected to have 'input_ids' field in its items.
-
-     Returns:
-         DataLoader: PyTorch DataLoader instance configured for the dataset.
-     """
-
-     def _collate_fn(batch):
-         return {"input_ids": [entry["input_ids"] for entry in batch]}
-
-     sub_batch_size = data_config.dataloader.batch_size // (
-         fabric.world_size * training_config.optimization.gradient_accumulation_steps
-     )
-
-     # NOTE: We use the sub-batch size for the dataloader, which is the full batch size
-     # divided by the world size and the gradient accumulation steps. This ensures that the
-     # effective (global) batch size is correct.
-
-     return DataLoader(
-         dataset,
-         batch_size=sub_batch_size,
-         shuffle=False,  # Keep sequential for streaming datasets
-         pin_memory=True,  # Speeds up transfer to GPU
-         collate_fn=_collate_fn,
-     )
-
-
- ########################################################
- #
- # Model Initialization
- #
- ########################################################
-
-
- def initialize_model(model_config: ModelConfig):
-     """Initialize the model for training.
-
-     Loads in a given model implemented in the `src.model` package and returns it.
-
-     NOTE: out of the box we currently only support the PicoDecoder model (a causal transformer
-     language model). If you'd like to implement your own model, you can do so by adding a new
-     model class in the `src.model` package, and then adding a new entry here.
-
-     Args:
-         model_config: Configuration object containing model settings.
-
-     Returns:
-         PyTorch model instance.
-
-     """
-     if model_config.model_type == "pico_decoder":
-         return PicoDecoder(model_config)
-     else:
-         raise ValueError(f"Invalid model type: {model_config.model_type}")
-
-
- ########################################################
- #
- # Optimizer and Scheduler
- #
- ########################################################
-
-
- def initialize_optimizer(training_config: TrainingConfig, model: torch.nn.Module):
-     """Initialize the optimizer for model training.
-
-     Creates an optimizer instance based on the configuration settings.
-
-     Add whatever other optimizers you want here.
-
-     Args:
-         training_config: Configuration object containing optimizer settings.
-             Must have:
-                 - optimization.optimizer (str): Name of the optimizer ("adamw")
-                 - optimization.lr (float): Learning rate for the optimizer
-         model: PyTorch model whose parameters will be optimized.
-
-     Returns:
-         torch.optim.Optimizer: Configured optimizer instance.
-
-     """
-
-     if training_config.optimization.optimizer == "adamw":
-         optimizer = torch.optim.AdamW(
-             model.parameters(), lr=training_config.optimization.lr
-         )
-     else:
-         raise ValueError(f"Invalid optimizer: {training_config.optimization.optimizer}")
-
-     return optimizer
-
-
- def initialize_lr_scheduler(
-     training_config: TrainingConfig, optimizer: torch.optim.Optimizer
- ):
-     """Initialize a learning rate scheduler with warmup and decay.
-
-     The default is a learning rate scheduler that implements a linear warmup followed by
-     linear decay. The learning rate increases linearly from 0 to the initial lr
-     during warmup, then decreases linearly to 0 during the remaining steps.
-
-     Add other types of learning rate schedulers here.
-
-     Args:
-         training_config: Configuration object containing optimizer and scheduler settings.
-         optimizer: PyTorch optimizer whose learning rate will be scheduled.
-
-     Returns:
-         torch.optim.lr_scheduler.LambdaLR: Learning rate scheduler instance.
-     """
-
-     if training_config.optimization.lr_scheduler == "linear_with_warmup":
-         # Credit where credit is due:
-         # https://github.com/huggingface/transformers/blob/e71a01a104dd663c730e494eb0b6467bb51df357/src/transformers/optimization.py#L102
-         def _lr_lambda(curr_step, num_warmup_steps, max_steps):
-             if curr_step < num_warmup_steps:
-                 return float(curr_step) / float(max(1, num_warmup_steps))
-             else:
-                 return max(
-                     0.0,
-                     float(max_steps - curr_step)
-                     / float(max(1, max_steps - num_warmup_steps)),
-                 )
-
-         lr_lambda = lambda step: _lr_lambda(  # noqa: E731
-             step,
-             training_config.optimization.lr_warmup_steps,
-             training_config.max_steps,
-         )
-         lr_scheduler = torch.optim.lr_scheduler.LambdaLR(
-             optimizer,
-             lr_lambda,
-         )
-     elif training_config.optimization.lr_scheduler == "cosine":
-         # Cosine decay with warmup: linear warmup followed by cosine decay.
-         # This provides sustained learning over long training runs.
-         def _cosine_lr_lambda(curr_step, num_warmup_steps, max_steps):
-             if curr_step < num_warmup_steps:
-                 # Linear warmup
-                 return float(curr_step) / float(max(1, num_warmup_steps))
-             else:
-                 # Cosine decay to 0.1 * initial_lr (not to 0)
-                 progress = float(curr_step - num_warmup_steps) / float(
-                     max(1, max_steps - num_warmup_steps)
-                 )
-                 return max(0.1, 0.5 * (1.0 + math.cos(math.pi * progress)))
-
-         lr_lambda = lambda step: _cosine_lr_lambda(  # noqa: E731
-             step,
-             training_config.optimization.lr_warmup_steps,
-             training_config.max_steps,
-         )
-         lr_scheduler = torch.optim.lr_scheduler.LambdaLR(
-             optimizer,
-             lr_lambda,
-         )
-     else:
-         raise ValueError(
-             f"Invalid learning rate scheduler: {training_config.optimization.lr_scheduler}"
-         )
-
-     return lr_scheduler
-
-
- ########################################################
- #
- # Experiment Monitoring (Logging, Experiment Tracking, etc.)
- #
- ########################################################
-
-
- def _initialize_log_file(checkpointing_config: CheckpointingConfig) -> str:
-     """Create and initialize a timestamped log file in the run's log directory.
-
-     Sets up a log file with a unique timestamp in the run's logging directory.
-     Creates the necessary directory structure if it doesn't exist.
-
-     Directory Structure:
-         {checkpointing_config.runs_dir}/
-         └── {checkpointing_config.run_name}/
-             └── {checkpointing_config.logs_dir}/
-                 └── log_YYYYMMDD_HHMMSS.log
-
-     Args:
-         checkpointing_config: Configuration object containing checkpointing settings.
-
-     Returns:
-         str: Absolute path to the created log file.
-
-     """
-
-     run_dir = os.path.join(checkpointing_config.runs_dir, checkpointing_config.run_name)
-     logs_dir = os.path.join(run_dir, checkpointing_config.logs_dir)
-     os.makedirs(logs_dir, exist_ok=True)
-
-     # datetime stamp
-     timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-     log_file_name = f"log_{timestamp}.log"
-     log_file_path = os.path.join(logs_dir, log_file_name)
-
-     open(log_file_path, "w").close()  # Create an empty log file
-
-     return log_file_path
-
-
- @use_backoff()
- def initialize_wandb(
-     monitoring_config: MonitoringConfig, checkpointing_config: CheckpointingConfig
- ):
-     """Initialize Weights and Biases.
-
-     This function initializes Weights and Biases based on the configuration settings.
-
-     Args:
-         monitoring_config: Configuration object containing monitoring settings.
-         checkpointing_config: Configuration object containing checkpointing settings.
-
-     Returns:
-         Optional[WandbLogger]: An experiment tracker instance.
-     """
-
-     assert (
-         monitoring_config.wandb.project is not None
-         and monitoring_config.wandb.project != ""
-     ), "Wandb project must be provided if wandb is to be used."
-     assert (
-         monitoring_config.wandb.entity is not None
-         and monitoring_config.wandb.entity != ""
-     ), "Wandb entity must be provided if wandb is to be used."
-
-     _run_id = None
-     if checkpointing_config.training.auto_resume:
-         # If we are loading a checkpoint, we can try to find the run id of the previous run
-         previous_runs = wandb.Api().runs(
-             path=f"{monitoring_config.wandb.entity}/{monitoring_config.wandb.project}",
-             filters={"display_name": checkpointing_config.run_name},
-         )
-         try:
-             if len(previous_runs) == 1:
-                 _run_id = previous_runs[0].id
-         except ValueError:
-             pass
-
-     wandb_logger = WandbLogger(
-         project=monitoring_config.wandb.project,
-         entity=monitoring_config.wandb.entity,
-         id=_run_id,
-         name=checkpointing_config.run_name,
-     )
-
-     return wandb_logger
-
-
- @rank_zero_only
- def initialize_logging(
-     monitoring_config: MonitoringConfig,
-     checkpointing_config: CheckpointingConfig,
-     fabric: L.Fabric,
- ):
-     """Initialize the logging system with default logging to file and console.
-
-     The default logging system uses a file handler and a stream handler.
-
-     NOTE: this function is only called on rank 0.
-
-     Args:
-         monitoring_config: Configuration object containing monitoring settings.
-         checkpointing_config: Configuration object containing checkpointing settings.
-
-     Returns:
-         logger: Standard Python logger configured for file and console output
-     """
-
-     # ---- Standard Local Logger ---- #
-     logger = logging.getLogger("pico-train")
-     logger.setLevel(logging.INFO)
-
-     # Create file handler
-     log_file_path = _initialize_log_file(checkpointing_config)
-     file_handler = logging.FileHandler(log_file_path, encoding="utf-8")
-     file_handler.setLevel(monitoring_config.logging.log_level)
-
-     # Create formatter and add it to the handler
-     formatter = logging.Formatter(
-         "%(asctime)s - %(name)s - %(levelname)s - %(message)s",
-         datefmt="%Y-%m-%d %H:%M:%S",
-     )
-     file_handler.setFormatter(formatter)
-
-     # Add the handler to the logger
-     logger.addHandler(file_handler)
-
-     # Add a stream handler for console output
-     stream_handler = logging.StreamHandler()
-     stream_handler.setLevel(monitoring_config.logging.log_level)
-     stream_handler.setFormatter(formatter)
-     logger.addHandler(stream_handler)
-
-     return logger
-
-
- ########################################################
- #
- # HuggingFace/Remote Checkpointing
- #
- ########################################################
-
-
- @rank_zero_only
- @use_backoff()
- def initialize_hf_checkpointing(
-     checkpointing_config: CheckpointingConfig, fabric: L.Fabric
- ):
-     """Initialize HuggingFace Checkpointing.
-
-     Creates a HuggingFace repository if it doesn't exist, and creates a branch named after the run.
-
-     NOTE: this function is only called on rank 0.
-
-     Args:
-         checkpointing_config: Configuration object containing checkpointing settings; must have
-             a 'hf_checkpoint' attribute that specifies the HuggingFace repository id and
-             collection slug (if applicable) to save the checkpoint to.
-
-     Raises:
-         RuntimeError: If unable to create the HuggingFace repository after multiple attempts.
-     """
-
-     huggingface_repo_id = checkpointing_config.hf_checkpoint.repo_id
-     assert (
-         huggingface_repo_id is not None and huggingface_repo_id != ""
-     ), "hf_checkpoint.repo_id must be provided."
-
-     repo = create_repo(huggingface_repo_id, exist_ok=True)
-
-     # We can create a repo without a specified namespace (it will default to the username),
-     # however the rest of the HF calls need the fully qualified name.
-     # The fully qualified name is returned by create_repo, so we update the config for later calls.
-     checkpointing_config.hf_checkpoint.repo_id = repo.repo_id
-     huggingface_repo_id = repo.repo_id
-
-     if checkpointing_config.hf_checkpoint.collection_slug:
-         add_collection_item(
-             checkpointing_config.hf_checkpoint.collection_slug,
-             huggingface_repo_id,
-             repo.repo_type,
-             exists_ok=True,
-         )
-
-     create_branch(
-         repo_id=huggingface_repo_id,
-         branch=checkpointing_config.run_name,
-         exist_ok=True,
-     )
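The `linear_with_warmup` branch above boils down to a single multiplier on the base learning rate: it rises linearly to 1.0 over `lr_warmup_steps` and then decays linearly to 0.0 at `max_steps`. A minimal sketch of that lambda plugged into `LambdaLR`, decoupled from the `TrainingConfig` object, with the warmup and step counts chosen arbitrarily for illustration:

```python
import torch


def lr_lambda(step: int, num_warmup_steps: int = 10, max_steps: int = 100) -> float:
    # Linear warmup to 1.0, then linear decay to 0.0 at max_steps.
    if step < num_warmup_steps:
        return float(step) / float(max(1, num_warmup_steps))
    return max(0.0, float(max_steps - step) / float(max(1, max_steps - num_warmup_steps)))


param = torch.nn.Parameter(torch.zeros(1))
optimizer = torch.optim.AdamW([param], lr=3e-4)
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)

for step in range(100):
    optimizer.step()
    scheduler.step()
    if step in (0, 9, 50, 99):
        # Peak lr at the end of warmup, roughly half-decayed mid-run, ~0 near max_steps.
        print(step, scheduler.get_last_lr())
```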
src/training/utils/io.py DELETED
@@ -1,52 +0,0 @@
- """Defines a retry wrapper for io operations."""
-
- import time
- from functools import wraps
-
-
- def use_backoff(max_retries=2, initial_delay=1, backoff_factor=2):
-     """
-     Universal retry wrapper with exponential backoff for any function, but primarily for loading
-     and storing HuggingFace datasets and objects.
-
-     Example usage:
-
-     >>> @use_backoff(max_retries=10, initial_delay=1, backoff_factor=2)
-     >>> def important_io_operation(x):
-     >>>     return x + 1
-
-     Args:
-         max_retries: Maximum number of retry attempts (default: 2)
-         initial_delay: Initial delay between retries in seconds (default: 1)
-         backoff_factor: Multiplier for delay between retries (default: 2)
-
-     Returns:
-         A decorator that will retry the wrapped function up to max_retries times with exponential backoff
-
-     Raises:
-         Exception: If all retries fail
-     """
-
-     def _decorator(fn):
-         @wraps(fn)
-         def wrapper(*args, **kwargs):
-             current_delay = initial_delay
-             last_exception = None
-
-             for attempt in range(max_retries):
-                 try:
-                     return fn(*args, **kwargs)
-                 except Exception as e:
-                     last_exception = e
-                     if attempt < max_retries - 1:  # Don't sleep on the last attempt
-                         time.sleep(current_delay)
-                         current_delay *= backoff_factor
-
-             raise Exception(
-                 f"IO Operation failed after {max_retries} attempts: {str(last_exception)}"
-             )
-
-         return wrapper
-
-     return _decorator
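A quick sketch of the decorator in use, assuming `use_backoff` (defined above) is in scope; the flaky operation and its failure counter are made up for illustration and fail twice before succeeding:

```python
attempts = {"count": 0}


@use_backoff(max_retries=3, initial_delay=0.1, backoff_factor=2)
def flaky_io_operation():
    # Hypothetical IO call that raises on the first two attempts.
    attempts["count"] += 1
    if attempts["count"] < 3:
        raise ConnectionError("transient failure")
    return "ok"


print(flaky_io_operation())  # prints "ok" after sleeping ~0.1s and ~0.2s on the two failures
```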
src/training/utils/logging.py DELETED
@@ -1,48 +0,0 @@
- """
- Miscellaneous logging utilities.
- """
-
- from io import StringIO
-
- import yaml
- from lightning.fabric.utilities.rank_zero import rank_zero_only
- from rich.console import Console
- from rich.panel import Panel
-
-
- @rank_zero_only
- def pretty_print_yaml_config(logger, config: dict) -> None:
-     """
-     Pretty print a config with rich formatting. Assumes that the config has already been converted
-     to a dictionary - this can be done by calling `asdict` on the dataclass or by loading the config
-     from a yaml file.
-
-     NOTE: this function is only called on rank 0.
-
-     Args:
-         logger: Logger object to log the formatted output to.
-         config: Dictionary containing the config to pretty print.
-     """
-     # Create string buffer
-     output = StringIO()
-     console = Console(file=output, force_terminal=False)
-
-     # Convert to YAML string first
-     yaml_str = yaml.dump(
-         config, default_flow_style=False, sort_keys=False, Dumper=yaml.SafeDumper
-     )
-
-     # Create formatted panel
-     panel = Panel(
-         yaml_str,
-         border_style="blue",
-         padding=(0, 1),  # Reduced padding
-         expand=False,  # Don't expand to terminal width
-     )
-
-     # Print to buffer
-     console.print(panel)
-
-     # Log the formatted output
-     for line in output.getvalue().splitlines():
-         logger.info(line)
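A short sketch of calling this helper with a plain dictionary and a standard logger, assuming `rich` and `pyyaml` are installed and `pretty_print_yaml_config` (defined above) is in scope; the `DemoConfig` dataclass is a hypothetical stand-in for the project's config dataclasses:

```python
import logging
from dataclasses import asdict, dataclass

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("pico-train")


@dataclass
class DemoConfig:
    lr: float = 3e-4
    max_steps: int = 100


# Each line of the rich panel (a boxed YAML dump of the config) is logged individually.
pretty_print_yaml_config(logger, asdict(DemoConfig()))
```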