General
# Module-level setup for a distributed Llama training script: imports,
# global dtype configuration, and the (truncated) model config dataclass.
# NOTE(review): the original source had all newlines stripped (statements were
# fused together, e.g. "import dataclassesimport datetime"), which is invalid
# Python. This restores conventional formatting; no tokens were added or
# removed except to terminate the truncated docstring at the end.

import dataclasses
import datetime
import os

import datasets
import tokenizers
import torch
import torch.distributed as dist
import torch.nn as nn
import torch.nn.functional as F
import torch.optim.lr_scheduler as lr_scheduler
import tqdm
from torch import Tensor
from torch.distributed.checkpoint import load, save
from torch.distributed.checkpoint.default_planner import DefaultLoadPlanner
from torch.distributed.fsdp import FSDPModule, fully_shard
from torch.distributed.tensor import Replicate, Shard
from torch.distributed.tensor.parallel import (
    ColwiseParallel,
    PrepareModuleInput,
    RowwiseParallel,
    SequenceParallel,
    loss_parallel,
    parallelize_module,
)
from torch.utils.data.distributed import DistributedSampler

# Set default to bfloat16 — every tensor/parameter created after this point
# defaults to torch.bfloat16 unless a dtype is given explicitly.
torch.set_default_dtype(torch.bfloat16)

# NOTE(review): torch.cuda.nccl.version() requires a CUDA build of PyTorch;
# this line will fail on a CPU-only install — confirm the target environment.
print("NCCL version:", torch.cuda.nccl.version())


# Build the model
@dataclasses.dataclass
class LlamaConfig:
    """Define Llama ..."""
    # NOTE(review): the source is truncated here ("Define Llama…") — the rest
    # of the docstring and all of this dataclass's fields are not visible in
    # this chunk. The docstring was minimally terminated so the file parses;
    # do not treat this class body as complete.
WordPress
Dec 31, 2025