NousResearch/Hermes-4.3-36B-centralized
Captured source
source ↗ · published Oct 28, 2025 · seen 5d · captured 14h · http 200 · method plain · params 36B · downloads 16 · likes 2
This is the centrally trained version of Hermes-4.3-36B, released as a research artifact. It was trained with Nous Research's fork of Torchtitan at commit d91ee11d6d5717c95daefcd789e4616ad82b7477.
# torchtitan Config.toml
# NOTE: this toml config is a preset for 64 A100 GPUs.

[job]
dump_folder = "./outputs"
description = "Seed 36B training"

[profiling]
enable_profiling = false
save_traces_folder = "profile_trace"
profile_freq = 100

[metrics]
log_freq = 1
enable_tensorboard = false
save_tb_folder = "tb"
enable_wandb = true

[model]
name = "llama3"
flavor = "36B_seed_flex_attn"
tokenizer_path = "./assets/tokenizer/original/tokenizer.model"
# converters = ["float8"]

[optimizer]
name = "AdamW"
lr = 2.5e-5
eps = 1e-8
weight_decay = 0.01
beta1 = 0.9
beta2 = 0.999

[lr_scheduler]
# lr scheduler warm up
warmup_steps = 300
decay_type = "cosine"

[training]
# NOTE(review): on the 64-GPU preset, 384 / (2 * 64) = 3, so this presumably
# implies 3 gradient-accumulation steps per optimizer step -- confirm.
local_batch_size = 2
global_batch_size = 384
# 131072 = 128 * 1024
seq_len = 131072
# grad norm clipping
max_norm = 1.0
# The run length is set by `epochs`; the step-based alternative is kept
# commented out.
# steps = 1000
epochs = 4
dataset = "hermes-4"
dataset_type = "preprocessed"
dataset_path = "/home/emozilla/preprocessed-datasets/Hermes-4.3-ByteDance-Seed-OSS-24K"

[compile]
enable = false
components = ["model", "loss"]
# components = ["loss"]

[parallelism]
data_parallel_replicate_degree = 1
data_parallel_shard_degree = -1
tensor_parallel_degree = 1
# NOTE(review): tensor_parallel_degree is 1 and [compile] is disabled, so this
# flag presumably has no effect in this run -- confirm.
enable_async_tensor_parallel = true
pipeline_parallel_degree = 1
context_parallel_degree = 1

[checkpoint]
enable = true
folder = "/home/emozilla/dcp/hermes4.3-36b"
interval = 1000
last_save_model_only = true
last_save_in_hf = true
export_dtype = "bfloat16"
# ["disabled", "async", "async_with_pinned_mem"]
async_mode = "disabled"
initial_load_path = "/home/emozilla/dcp/Seed-OSS-36B-Base"

[activation_checkpoint]
# ["none", "selective", "full"]
mode = "full"
# mode = "selective"
# "int" = ac every positive int layer or 'op', ac based on ops policy
# selective_ac_option = "op"

# NOTE(review): `converters = ["float8"]` is commented out under [model], so
# this table is presumably inactive for this run -- confirm.
[quantize.linear.float8]
enable_fsdp_float8_all_gather = false
precompute_float8_dynamic_scale_for_fsdp = false
filter_fqns = ["output"]
Notability
notability 3.0/10 — Low traction, minor model release