braindeck committed · Commit bcdf9fa · 0 Parent(s)

Initial commit

This view is limited to 50 files because the commit contains too many changes; see the raw diff for the complete changeset.

Files changed (50)
  1. .gitignore +1 -0
  2. README.md +20 -0
  3. _infer.py +309 -0
  4. app.py +43 -0
  5. config/infer.yaml +56 -0
  6. prompts/__pycache__/base_instruction.cpython-311.pyc +0 -0
  7. prompts/__pycache__/infer_prompt.cpython-311.pyc +0 -0
  8. prompts/__pycache__/sft_prompt.cpython-311.pyc +0 -0
  9. prompts/base_instruction.py +24 -0
  10. prompts/infer_prompt.py +101 -0
  11. requirements.txt +4 -0
  12. requirements.txt.txt +200 -0
  13. scripts/sft_infer_pass1.sh +33 -0
  14. verl/__init__.py +40 -0
  15. verl/__pycache__/__init__.cpython-311.pyc +0 -0
  16. verl/__pycache__/protocol.cpython-311.pyc +0 -0
  17. verl/models/README.md +35 -0
  18. verl/models/__init__.py +13 -0
  19. verl/models/__pycache__/__init__.cpython-311.pyc +0 -0
  20. verl/models/__pycache__/registry.cpython-311.pyc +0 -0
  21. verl/models/llama/__init__.py +13 -0
  22. verl/models/llama/megatron/__init__.py +34 -0
  23. verl/models/llama/megatron/checkpoint_utils/__init__.py +13 -0
  24. verl/models/llama/megatron/checkpoint_utils/llama_loader.py +295 -0
  25. verl/models/llama/megatron/checkpoint_utils/llama_loader_depracated.py +425 -0
  26. verl/models/llama/megatron/checkpoint_utils/llama_saver.py +430 -0
  27. verl/models/llama/megatron/layers/__init__.py +25 -0
  28. verl/models/llama/megatron/layers/parallel_attention.py +425 -0
  29. verl/models/llama/megatron/layers/parallel_decoder.py +150 -0
  30. verl/models/llama/megatron/layers/parallel_linear.py +106 -0
  31. verl/models/llama/megatron/layers/parallel_mlp.py +74 -0
  32. verl/models/llama/megatron/layers/parallel_rmsnorm.py +48 -0
  33. verl/models/llama/megatron/modeling_llama_megatron.py +662 -0
  34. verl/models/mcore/__init__.py +18 -0
  35. verl/models/mcore/config_converter.py +197 -0
  36. verl/models/mcore/loader.py +468 -0
  37. verl/models/mcore/model_forward.py +50 -0
  38. verl/models/mcore/model_initializer.py +160 -0
  39. verl/models/mcore/readme.md +99 -0
  40. verl/models/mcore/registry.py +179 -0
  41. verl/models/mcore/saver.py +459 -0
  42. verl/models/mcore/util.py +190 -0
  43. verl/models/mcore/weight_converter.py +207 -0
  44. verl/models/qwen2/__init__.py +13 -0
  45. verl/models/qwen2/megatron/__init__.py +34 -0
  46. verl/models/qwen2/megatron/checkpoint_utils/__init__.py +13 -0
  47. verl/models/qwen2/megatron/checkpoint_utils/qwen2_loader.py +312 -0
  48. verl/models/qwen2/megatron/checkpoint_utils/qwen2_loader_depracated.py +442 -0
  49. verl/models/qwen2/megatron/checkpoint_utils/qwen2_saver.py +436 -0
  50. verl/models/qwen2/megatron/layers/__init__.py +20 -0
.gitignore ADDED
@@ -0,0 +1 @@
1
+ checkpoints/
README.md ADDED
@@ -0,0 +1,20 @@
1
+ ---
2
+ title: Text-to-Text Generation with DeepSeek-R1-Distill-Qwen-7B
3
+ emoji: 🚀
4
+ colorFrom: blue
5
+ colorTo: green
6
+ sdk: gradio
7
+ sdk_version: 4.19.2
8
+ app_file: app.py
9
+ pinned: false
10
+ ---
11
+
12
+ # Text-to-Text Generation with DeepSeek-R1-Distill-Qwen-7B
13
+
14
+ This is a simple Gradio interface for text-to-text generation using the `deepseek-ai/DeepSeek-R1-Distill-Qwen-7B` model.
15
+
16
+ ## How to use
17
+
18
+ 1. Enter a prompt in the text box.
19
+ 2. Click the "Generate" button.
20
+ 3. The model will generate a response in the "Response" text box.
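
Once the Space is up, the same endpoint can also be called from Python. A minimal sketch using `gradio_client`; the Space id and `api_name` below are placeholders (assumptions), so replace them with the values shown on the Space's "Use via API" page:

```python
# Sketch: call the hosted Gradio app programmatically.
# "braindeck/your-space-name" and api_name are placeholders; check client.view_api() for the real ones.
from gradio_client import Client

client = Client("braindeck/your-space-name")
client.view_api()  # lists the available endpoints and their signatures
result = client.predict("Explain LoRA in one sentence.", api_name="/generate_response")
print(result)
```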
_infer.py ADDED
@@ -0,0 +1,309 @@
1
+ #!/usr/bin/env python
2
+ # Copyright 2024 Bytedance
3
+ # Apache-2.0
4
+ #
5
+ # VERL + vLLM inference with runtime LoRA (no merge).
6
+ # - Wraps a LoRA .pt into a PEFT adapter and attaches via rollout.lora_modules
7
+ # - Mixed precision defaults for H100: dtype=bf16, kv_cache_dtype=fp8_e5m2
8
+ # - Pins max_model_len, max_num_batched_tokens, sets swap_space
9
+ # - Uses OmegaConf.open_dict to add keys safely (no "not in struct" errors)
10
+ # - Prevents FSDP from trying to load LoRA .pt as a full model
11
+
12
+ import os
13
+ import ast
14
+ import json
15
+ import hydra
16
+ import numpy as np
17
+ import ray
18
+ import torch
19
+ from pathlib import Path
20
+ from pprint import pprint
21
+
22
+ # Quiet logs
23
+ os.environ["NCCL_DEBUG"] = os.environ.get("NCCL_DEBUG", "WARN")
24
+ os.environ["TOKENIZERS_PARALLELISM"] = os.environ.get("TOKENIZERS_PARALLELISM", "true")
25
+
26
+ # vLLM CuMem allocator is incompatible with expandable_segments
27
+ _bad = os.environ.get("PYTORCH_CUDA_ALLOC_CONF", "")
28
+ if "expandable_segments:True" in _bad:
29
+ print(f"[fix] Removing incompatible PYTORCH_CUDA_ALLOC_CONF={_bad}")
30
+ os.environ.pop("PYTORCH_CUDA_ALLOC_CONF", None)
31
+
32
+ import pandas as pd
33
+ from omegaconf import OmegaConf, open_dict
34
+
35
+ from verl import DataProto
36
+ from verl.protocol import pad_dataproto_to_divisor, unpad_dataproto
37
+ from verl.single_controller.ray import RayClassWithInitArgs, RayResourcePool, RayWorkerGroup
38
+ from verl.utils import hf_tokenizer
39
+ from verl.utils.fs import copy_to_local
40
+ from verl.utils.hdfs_io import makedirs
41
+ from verl.utils.model import compute_position_id_with_mask
42
+ from verl.workers.fsdp_workers import ActorRolloutRefWorker
43
+
44
+ # ---------------- LoRA helpers ----------------
45
+
46
+ DEFAULT_TARGET_MODULES = [
47
+ "q_proj","k_proj","v_proj","o_proj",
48
+ "up_proj","gate_proj","down_proj",
49
+ ]
50
+
51
+ def _infer_lengths_and_defaults(config):
52
+ """Ensure rollout/data keys exist and set reasonable H100 defaults."""
53
+ # Ensure nested structs exist
54
+ with open_dict(config):
55
+ if "rollout" not in config:
56
+ config["rollout"] = OmegaConf.create()
57
+ if "data" not in config:
58
+ config["data"] = OmegaConf.create()
59
+ if "trainer" not in config:
60
+ config["trainer"] = OmegaConf.create()
61
+ if "ray_init" not in config:
62
+ config["ray_init"] = OmegaConf.create()
63
+
64
+ # Defaults that work on a single H100
65
+ with open_dict(config.rollout):
66
+ # If user didn't set these, choose H100-friendly defaults
67
+ config.rollout.setdefault("dtype", "bfloat16") # weights/activations
68
+ config.rollout.setdefault("kv_cache_dtype", "fp8_e5m2") # KV cache precision
69
+ config.rollout.setdefault("tensor_model_parallel_size", 1)
70
+ config.rollout.setdefault("enable_chunked_prefill", True)
71
+ config.rollout.setdefault("swap_space", 8) # GB of host swap for KV
72
+ config.rollout.setdefault("gpu_memory_utilization", 0.62) # adjust 0.60~0.75 if needed
73
+
74
+ # Pin lengths to avoid vLLM over-reserving KV cache
75
+ pl = int(config.rollout.get("prompt_length", 1024))
76
+ rl = int(config.rollout.get("response_length", 128))
77
+ need = int(pl + rl)
78
+ config.rollout.setdefault("max_model_len", need)
79
+ config.rollout.setdefault("max_num_batched_tokens", need)
80
+
81
+ # Users may pass +rollout.quantization={fp8|awq|gptq} to shrink weights further
82
+ # We don't force it here.
83
+
84
+ with open_dict(config.data):
85
+ config.data.setdefault("batch_size", 1)
86
+ config.data.setdefault("n_samples", 1)
87
+ config.data.setdefault("prompt_key", "prompt")
88
+
89
+ with open_dict(config.trainer):
90
+ config.trainer.setdefault("n_gpus_per_node", 1)
91
+ config.trainer.setdefault("nnodes", 1)
92
+
93
+ with open_dict(config.ray_init):
94
+ config.ray_init.setdefault("num_cpus", 4)
95
+
96
+ def _infer_lora_rank_from_state(sd):
97
+ for k, v in sd.items():
98
+ if k.endswith("lora_A.weight") and hasattr(v, "dim") and v.dim() == 2:
99
+ return int(v.shape[0])
100
+ return None
101
+
102
+ def _list_target_modules_from_state(sd):
103
+ found = set()
104
+ for k in sd.keys():
105
+ if "lora_A.weight" in k or "lora_B.weight" in k:
106
+ if ".q_proj." in k: found.add("q_proj")
107
+ if ".k_proj." in k: found.add("k_proj")
108
+ if ".v_proj." in k: found.add("v_proj")
109
+ if ".o_proj." in k: found.add("o_proj")
110
+ if ".up_proj." in k: found.add("up_proj")
111
+ if ".gate_proj." in k: found.add("gate_proj")
112
+ if ".down_proj." in k: found.add("down_proj")
113
+ return sorted(found)
114
+
115
+ def _write_adapter_config(adapter_dir, r, alpha, target_modules, dropout=0.0):
116
+ cfg = {
117
+ "peft_type": "LORA",
118
+ "auto_mapping": None,
119
+ "base_model_name_or_path": "",
120
+ "bias": "none",
121
+ "inference_mode": True,
122
+ "lora_alpha": int(alpha),
123
+ "lora_dropout": float(dropout),
124
+ "r": int(r),
125
+ "target_modules": target_modules,
126
+ "task_type": "CAUSAL_LM",
127
+ }
128
+ with open(os.path.join(adapter_dir, "adapter_config.json"), "w", encoding="utf-8") as f:
129
+ json.dump(cfg, f, ensure_ascii=False, indent=2)
130
+
131
+ def _wrap_lora_pt_as_peft(adapter_pt_path: str, out_dir: str,
132
+ fallback_rank=32, fallback_alpha=16):
133
+ os.makedirs(out_dir, exist_ok=True)
134
+ print(f"[lora] Loading LoRA state from: {adapter_pt_path}")
135
+ sd = torch.load(adapter_pt_path, map_location="cpu")
136
+ if isinstance(sd, dict) and "state_dict" in sd:
137
+ sd = sd["state_dict"]
138
+
139
+ r = _infer_lora_rank_from_state(sd) or int(fallback_rank)
140
+ tmods = _list_target_modules_from_state(sd) or DEFAULT_TARGET_MODULES
141
+ print(f"[lora] inferred rank={r}, target_modules={tmods}")
142
+
143
+ _write_adapter_config(out_dir, r=r, alpha=fallback_alpha, target_modules=tmods)
144
+ torch.save(sd, os.path.join(out_dir, "adapter_model.bin"))
145
+ return r, tmods
146
+
147
+ def _maybe_attach_lora_adapter(config):
148
+ """Attach LoRA adapter directory to vLLM rollout (runtime LoRA)."""
149
+ # Accept either +lora.pt_path or model.load_param_path as a hint
150
+ lora_pt = None
151
+ if "lora" in config and getattr(config.lora, "pt_path", ""):
152
+ lora_pt = config.lora.pt_path
153
+ elif getattr(config.model, "load_param_path", ""):
154
+ lora_pt = config.model.load_param_path
155
+
156
+ if not lora_pt or not Path(lora_pt).is_file():
157
+ print("[lora] No LoRA .pt provided; running base model only.")
158
+ return
159
+
160
+ adapter_dir = os.path.join("/tmp", "lora_adapter_vllm")
161
+ r, _ = _wrap_lora_pt_as_peft(lora_pt, adapter_dir, fallback_rank=32, fallback_alpha=16)
162
+
163
+ # Ensure rollout keys exist and add LoRA knobs required by vLLM
164
+ with open_dict(config):
165
+ if "rollout" not in config:
166
+ config["rollout"] = OmegaConf.create()
167
+ with open_dict(config.rollout):
168
+ config.rollout.setdefault("max_loras", 1)
169
+ config.rollout.setdefault("max_lora_rank", int(r))
170
+ config.rollout["lora_modules"] = [{"path": adapter_dir, "scale": 1.0}]
171
+ print(f"[lora] Attached PEFT adapter: {adapter_dir} (rank={r})")
172
+
173
+ # CRITICAL: don't let FSDP try to load the LoRA .pt as a full state dict
174
+ with open_dict(config.model):
175
+ if getattr(config.model, "load_param", False):
176
+ print("[lora] Disabling model.load_param to avoid FSDP load_state_dict mismatch.")
177
+ config.model["load_param"] = False
178
+
179
+ # ---------------- Hydra entry ----------------
180
+
181
+ @hydra.main(config_path="config", config_name="infer", version_base=None)
182
+ def main(config):
183
+ _infer_lengths_and_defaults(config)
184
+
185
+ # Ray env for workers
186
+ if not ray.is_initialized():
187
+ ray.init(
188
+ runtime_env={"env_vars": {
189
+ "TOKENIZERS_PARALLELISM": "true",
190
+ "NCCL_DEBUG": "WARN",
191
+ "PYTORCH_CUDA_ALLOC_CONF": "", # keep allocator happy for vLLM
192
+ }},
193
+ num_cpus=config.ray_init.num_cpus,
194
+ )
195
+
196
+ ray.get(main_task.remote(config))
197
+
198
+ @ray.remote(num_cpus=1)
199
+ def main_task(config):
200
+ print("[worker] PYTORCH_CUDA_ALLOC_CONF =", os.environ.get("PYTORCH_CUDA_ALLOC_CONF"))
201
+ pprint(OmegaConf.to_container(config, resolve=True))
202
+ OmegaConf.resolve(config)
203
+
204
+ # Build LoRA adapter if provided
205
+ _maybe_attach_lora_adapter(config)
206
+
207
+ # Optionally pre-gen dataset schema if your repo provides it
208
+ try:
209
+ from prompts.infer_prompt import infer_dataset
210
+ infer_dataset(
211
+ model_name=config.model.path,
212
+ data_path=os.path.dirname(os.path.dirname(config.data.path)),
213
+ )
214
+ except Exception as e:
215
+ print(f"[info] infer_dataset() skipped: {e}")
216
+
217
+ # ---- Tokenizer from base model
218
+ local_path = copy_to_local(config.model.path)
219
+ trust_remote_code = getattr(config.model, "trust_remote_code", False)
220
+ tokenizer = hf_tokenizer(local_path, trust_remote_code=trust_remote_code)
221
+ tokenizer.padding_side = "left"
222
+ if tokenizer.pad_token is None:
223
+ tokenizer.pad_token = tokenizer.eos_token
224
+
225
+ # ---- Sampling checks
226
+ if float(config.rollout.temperature) == 0.0:
227
+ assert int(config.data.n_samples) == 1, "When temperature=0, n_samples must be 1."
228
+ assert int(config.data.n_samples) >= 1, "n_samples should always >= 1"
229
+
230
+ # ---- Load dataset
231
+ dataset = pd.read_parquet(config.data.path)
232
+ prompt_key = getattr(config.data, "prompt_key", "prompt")
233
+ if prompt_key not in dataset.columns:
234
+ raise KeyError(f"Dataset missing column '{prompt_key}'")
235
+ chat_lst = dataset[prompt_key].tolist()
236
+ chat_lst = [chat.tolist() if hasattr(chat, "tolist") else chat for chat in chat_lst]
237
+
238
+ # ---- Worker group (vLLM inside Rollout)
239
+ ray_cls_with_init = RayClassWithInitArgs(cls=ray.remote(ActorRolloutRefWorker), config=config, role="rollout")
240
+ resource_pool = RayResourcePool(process_on_nodes=[config.trainer.n_gpus_per_node] * config.trainer.nnodes)
241
+ print("[debug] rollout.lora_modules =", config.rollout.get("lora_modules", None))
242
+ wg = RayWorkerGroup(resource_pool=resource_pool, ray_cls_with_init=ray_cls_with_init)
243
+ wg.init_model() # vLLM spins up; adapter used if set in rollout.lora_modules
244
+
245
+ total = len(dataset)
246
+ bs = int(config.data.batch_size)
247
+ num_batch = -(-total // bs)
248
+ slots = [[] for _ in range(int(config.data.n_samples))]
249
+
250
+ for b in range(num_batch):
251
+ print(f"[{b+1}/{num_batch}] Start to process.")
252
+ batch_chat = chat_lst[b * bs : (b + 1) * bs]
253
+
254
+ inputs = tokenizer.apply_chat_template(
255
+ batch_chat,
256
+ add_generation_prompt=True,
257
+ padding=True,
258
+ truncation=True,
259
+ max_length=int(config.rollout.prompt_length),
260
+ return_tensors="pt",
261
+ return_dict=True,
262
+ tokenize=True,
263
+ )
264
+ input_ids = inputs["input_ids"]
265
+ attention_mask = inputs["attention_mask"]
266
+ position_ids = compute_position_id_with_mask(attention_mask)
267
+ batch_dict = {"input_ids": input_ids, "attention_mask": attention_mask, "position_ids": position_ids}
268
+
269
+ data = DataProto.from_dict(batch_dict)
270
+ data_padded, pad_size = pad_dataproto_to_divisor(data, wg.world_size)
271
+
272
+ print(f"[{b+1}/{num_batch}] Start to generate.")
273
+ for n in range(int(config.data.n_samples)):
274
+ output_padded = wg.generate_sequences(data_padded)
275
+ output = unpad_dataproto(output_padded, pad_size=pad_size)
276
+ texts = []
277
+ for i in range(len(output)):
278
+ item = output[i]
279
+ pl = item.batch["prompts"].shape[-1]
280
+ valid_len = item.batch["attention_mask"][pl:].sum()
281
+ resp_ids = item.batch["responses"][:valid_len]
282
+ s = tokenizer.decode(resp_ids, skip_special_tokens=True)
283
+ print(f"[raw] Response {i}: {s!r}")
284
+ ix = s.find("</think>")
285
+ if ix != -1:
286
+ s = s[ix + len("</think>") :].lstrip()
287
+ print(f"Response {i}: {s!r}")
288
+ try:
289
+ texts.append(ast.literal_eval(s))
290
+ except Exception:
291
+ texts.append(s)
292
+ slots[n].extend(texts)
293
+
294
+ outputs = np.array(slots, dtype=object)
295
+ outputs = np.transpose(outputs, (1, 0)).tolist()
296
+ dataset["response"] = outputs
297
+
298
+ keep = ["file_id", "vt", "gt", "response"]
299
+ cols = [c for c in keep if c in dataset.columns]
300
+ if cols:
301
+ dataset = dataset[cols]
302
+
303
+ out_path = config.data.output_path
304
+ makedirs(os.path.dirname(out_path), exist_ok=True)
305
+ dataset.to_json(out_path, orient="records", lines=True, force_ascii=False)
306
+ print(f"[done] Wrote: {out_path}")
307
+
308
+ if __name__ == "__main__":
309
+ main()
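
`_wrap_lora_pt_as_peft` turns a bare LoRA `.pt` state dict into a PEFT-style adapter directory (`adapter_config.json` + `adapter_model.bin`) so vLLM can attach it at runtime through `rollout.lora_modules`. A minimal sanity-check sketch, assuming the checkpoint's keys already follow PEFT's `...lora_A.weight` / `...lora_B.weight` naming and that the default `/tmp/lora_adapter_vllm` directory has been written; the CPU load is only a structural check, not how the script serves the model:

```python
# Sketch only: verify the wrapped adapter directory loads with PEFT before handing it to vLLM.
import torch
from transformers import AutoModelForCausalLM
from peft import PeftModel

base = AutoModelForCausalLM.from_pretrained(
    "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
    torch_dtype=torch.bfloat16,
    device_map="cpu",          # CPU is enough for a structural check
)
# /tmp/lora_adapter_vllm is what _wrap_lora_pt_as_peft writes by default
model = PeftModel.from_pretrained(base, "/tmp/lora_adapter_vllm")
print(model.peft_config)       # should report the rank and target_modules inferred from the .pt
```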
app.py ADDED
@@ -0,0 +1,43 @@
1
+
2
+ import gradio as gr
3
+ from transformers import AutoTokenizer, AutoModelForCausalLM
4
+ import torch
5
+
6
+ # Load the model and tokenizer
7
+ tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Qwen-7B", trust_remote_code=True)
8
+ model = AutoModelForCausalLM.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Qwen-7B", trust_remote_code=True, torch_dtype=torch.bfloat16, device_map="auto")
9
+
10
+ def generate_response(prompt):
11
+ """
12
+ Generates a response from the model.
13
+ """
14
+ inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
15
+ outputs = model.generate(**inputs, max_new_tokens=512)
16
+
17
+ # Decode the generated text
18
+ generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
19
+
20
+ return generated_text
21
+
22
+ # Create the Gradio interface
23
+ with gr.Blocks() as demo:
24
+ gr.Markdown("# Text-to-Text Generation with DeepSeek-R1-Distill-Qwen-7B")
25
+ gr.Markdown("Enter a prompt and the model will generate a response.")
26
+
27
+ with gr.Row():
28
+ prompt_input = gr.Textbox(label="Prompt", lines=4, placeholder="Enter your prompt here...")
29
+
30
+ with gr.Row():
31
+ generate_button = gr.Button("Generate")
32
+
33
+ with gr.Row():
34
+ response_output = gr.Textbox(label="Response", lines=8, interactive=False)
35
+
36
+ generate_button.click(
37
+ fn=generate_response,
38
+ inputs=prompt_input,
39
+ outputs=response_output
40
+ )
41
+
42
+ if __name__ == "__main__":
43
+ demo.launch()
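
app.py feeds the raw prompt string straight into the tokenizer. DeepSeek-R1-Distill-Qwen-7B is a chat-tuned model, so wrapping the prompt with its bundled chat template usually yields better-behaved generations. A hedged alternative sketch for `generate_response`, reusing the `tokenizer` and `model` objects already created above:

```python
# Sketch: same generation, but formatted through the model's chat template.
def generate_response_chat(prompt: str) -> str:
    messages = [{"role": "user", "content": prompt}]
    input_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,   # append the assistant-turn marker
        return_tensors="pt",
    ).to(model.device)
    outputs = model.generate(input_ids, max_new_tokens=512)
    # Decode only the newly generated tokens, not the prompt.
    new_tokens = outputs[0][input_ids.shape[-1]:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True)
```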
config/infer.yaml ADDED
@@ -0,0 +1,56 @@
1
+ trainer:
2
+ nnodes: 1
3
+ n_gpus_per_node: 1
4
+
5
+ data:
6
+ path: ./data/parquet/test.parquet
7
+ prompt_key: prompt
8
+ n_samples: 1
9
+ output_path: ./checkpoints/grammar_generation.parquet
10
+ batch_size: 1
11
+
12
+ model:
13
+ path: deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
14
+ external_lib: null
15
+ load_param: False
16
+ load_param_path: null
17
+
18
+ rollout:
19
+ name: vllm
20
+ mode: sync # sync: LLM, async: AsyncLLM
21
+ temperature: 0.0
22
+ top_k: -1 # 0 for hf rollout, -1 for vllm rollout
23
+ top_p: 1.0
24
+ max_loras: 1
25
+ prompt_length: 1800
26
+ response_length: 512
27
+ # for vllm rollout
28
+ dtype: bfloat16 # should align with FSDP
29
+ gpu_memory_utilization: 0.9 # fraction of GPU memory vLLM may reserve (raised to leave room for the KV cache)
30
+ ignore_eos: False
31
+ enforce_eager: True
32
+ free_cache_engine: True
33
+ load_format: dummy_dtensor
34
+ tensor_model_parallel_size: 1
35
+ max_num_batched_tokens: 8192
36
+ max_model_len: 1800 # ≥ 1200 + 512
37
+ max_num_seqs: 1024
38
+ log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
39
+ log_prob_micro_batch_size_per_gpu: 1
40
+ # for fire vllm rollout
41
+ use_fire_sampling: False # enable FIRE https://arxiv.org/abs/2410.21236
42
+ # for hf rollout
43
+ do_sample: True
44
+ disable_log_stats: False
45
+ enable_chunked_prefill: True # OK because 8192 ≥ 3072
46
+ n: 1
47
+ # if beam search activated, top_k, temperature and top_p will be ignored
48
+
49
+ actor:
50
+ strategy: fsdp # This is for backward-compatibility
51
+ ulysses_sequence_parallel_size: 1 # sp size
52
+ fsdp_config:
53
+ fsdp_size: -1
54
+
55
+ ray_init:
56
+ num_cpus: null # `null` uses all available CPUs, which can hang on CPU-limited systems such as SLURM; set an explicit allowed number there.
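
The inline comments tie `max_model_len` to `prompt_length + response_length`, and `_infer.py` only fills these keys when they are missing, so a pinned value smaller than the sum silently wins. A small sketch (not part of the repo) that loads this YAML and flags that situation before launching inference:

```python
# Sketch: sanity-check the length budget in config/infer.yaml.
from omegaconf import OmegaConf

cfg = OmegaConf.load("config/infer.yaml")
need = int(cfg.rollout.prompt_length) + int(cfg.rollout.response_length)
if int(cfg.rollout.max_model_len) < need:
    print(f"warning: max_model_len={cfg.rollout.max_model_len} < prompt+response={need}; "
          "long prompts may be truncated or rejected by vLLM")
```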
prompts/__pycache__/base_instruction.cpython-311.pyc ADDED
Binary file (1.44 kB)
prompts/__pycache__/infer_prompt.cpython-311.pyc ADDED
Binary file (6.33 kB)
prompts/__pycache__/sft_prompt.cpython-311.pyc ADDED
Binary file (7.64 kB)
prompts/base_instruction.py ADDED
@@ -0,0 +1,24 @@
1
+ def basic_instruction(content, modelname):
2
+ system_instruction = (
3
+ "당신은 한국어 문장 교정 전문가입니다. "
4
+ "입력 문장은 다양한 오류(자모 분리, 철자 오류, 단어 누락 등)를 포함할 수 있습니다. "
5
+ "당신의 임무는 이러한 잘못된 문장을 완전하고 올바른 한국어 문장으로 복원하는 것입니다.\n"
6
+ "규칙:\n"
7
+ "•출력은 반드시 교정된 한국어 문장만 작성합니다.\n"
8
+ "•불필요한 설명, 이유, 따옴표는 포함하지 않습니다.\n"
9
+ )
10
+
11
+ user_instruction = (
12
+ f"잘못된 문장(노이즈): {content}\n\n"
13
+ "위 문장을 올바른 한국어 문장으로 교정하세요.\n"
14
+ "출력은 반드시 교정된 문장 하나만 작성하세요."
15
+ )
16
+
17
+ return [
18
+ {"role": "system", "content": system_instruction},
19
+ {"role": "user", "content": user_instruction},
20
+ ]
21
+
22
+
23
+ def get_instruction_func(modelname):
24
+ return lambda desc, _: basic_instruction(desc, modelname)
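
The instruction strings are Korean: the system message says, roughly, "You are an expert in correcting Korean sentences; the input may contain various errors (split jamo, spelling mistakes, missing words); restore it to a complete, correct Korean sentence and output only the corrected sentence, with no explanations, reasons, or quotes," and the user message supplies the noisy sentence and repeats the output-only rule. A small sketch showing how the returned message list is rendered into a prompt, assuming the DeepSeek tokenizer used elsewhere in the repo and a made-up noisy input:

```python
# Sketch: render the chat messages produced by basic_instruction into a single prompt string.
from transformers import AutoTokenizer
from prompts.base_instruction import get_instruction_func

model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

build = get_instruction_func(model_name)
messages = build("나 는 오늘 학교에 갓다", None)   # hypothetical noisy input sentence
prompt = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
print(prompt)
```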
prompts/infer_prompt.py ADDED
@@ -0,0 +1,101 @@
1
+ from prompts.base_instruction import get_instruction_func
2
+
3
+ def infer_dataset(
4
+ model_name: str,
5
+ data_path: str,
6
+ ):
7
+ import os, json
8
+ from typing import Any, Dict, List
9
+ from datasets import Dataset
10
+ from transformers import AutoTokenizer
11
+
12
+ MAX_TOKENS = 1200 # same as SFT
13
+
14
+ jsonl_path = os.path.join(data_path, "jsonl")
15
+ parquet_path = os.path.join(data_path, "parquet")
16
+ os.makedirs(parquet_path, exist_ok=True)
17
+
18
+ test_jsonl = os.path.join(jsonl_path, "test.jsonl")
19
+
20
+ # --- robust load: tolerant JSONL/array/concatenated JSON
21
+ rows = []
22
+ with open(test_jsonl, "r", encoding="utf-8") as f:
23
+ raw = f.read().strip()
24
+
25
+ try:
26
+ obj = json.loads(raw)
27
+ if isinstance(obj, list):
28
+ rows = [x for x in obj if isinstance(x, dict)]
29
+ except Exception:
30
+ pass
31
+
32
+ if not rows:
33
+ for ln in raw.replace("}{", "}\n{").splitlines():
34
+ ln = ln.strip()
35
+ if not ln:
36
+ continue
37
+ try:
38
+ x = json.loads(ln)
39
+ if isinstance(x, dict):
40
+ rows.append(x)
41
+ except Exception:
42
+ continue
43
+
44
+ test_dataset = Dataset.from_list(rows)
45
+
46
+ instruction_func = get_instruction_func(model_name)
47
+ tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
48
+
49
+ # ─── helpers ───
50
+ def _coerce(rec: Dict[str, Any]) -> Dict[str, Any]:
51
+ r = dict(rec)
52
+ r["vt"] = str(r.get("vt", "") or "")
53
+ return r
54
+
55
+ def _prompt_tokens(prompt_messages) -> int:
56
+ prompt_str = tokenizer.apply_chat_template(
57
+ prompt_messages, add_generation_prompt=True, tokenize=False
58
+ )
59
+ return len(tokenizer(prompt_str, add_special_tokens=False).input_ids)
60
+
61
+ def make_map_fn(split: str):
62
+ def process_fn(example: Dict[str, Any], idx: int) -> Dict[str, Any]:
63
+ ex = _coerce(example)
64
+ vt = ex.get("vt", "").strip()
65
+ if not vt:
66
+ return {}
67
+
68
+ chat_prompt = instruction_func(vt, model_name)
69
+ total_tokens = _prompt_tokens(chat_prompt)
70
+
71
+ extra = {
72
+ "split": split,
73
+ "index": idx,
74
+ "total_tokens": int(total_tokens),
75
+ "file_id": ex.get("file_id")
76
+ }
77
+
78
+ return {
79
+ "prompt": chat_prompt,
80
+ "extra_info": extra,
81
+ "total_tokens": int(total_tokens)
82
+ }
83
+ return process_fn
84
+
85
+ # build prompts + token counts
86
+ test_dataset = test_dataset.map(function=make_map_fn("test"), with_indices=True)
87
+
88
+ # drop rows where prompt is empty
89
+ test_dataset = test_dataset.filter(lambda ex: bool(ex.get("prompt")))
90
+
91
+ # drop long prompts (> MAX_TOKENS)
92
+ n_before_len = len(test_dataset)
93
+ test_dataset = test_dataset.filter(lambda ex: ex["total_tokens"] <= MAX_TOKENS)
94
+ kept = len(test_dataset)
95
+ dropped_long = n_before_len - kept
96
+
97
+ out_path = os.path.join(parquet_path, "test.parquet")
98
+ test_dataset.to_parquet(out_path)
99
+
100
+ print(f"[test] kept {kept} rows, dropped_long(>{MAX_TOKENS}) {dropped_long}")
101
+ print(f"Wrote {kept} rows → {out_path}")
requirements.txt ADDED
@@ -0,0 +1,4 @@
1
+ gradio
2
+ transformers
3
+ torch
4
+ accelerate
requirements.txt.txt ADDED
@@ -0,0 +1,200 @@
1
+ absl-py==2.3.1
2
+ accelerate==1.6.0
3
+ aiohappyeyeballs==2.6.1
4
+ aiohttp==3.11.16
5
+ aiohttp-cors==0.8.1
6
+ aiosignal==1.3.2
7
+ airportsdata==20250224
8
+ alabaster==1.0.0
9
+ annotated-types==0.7.0
10
+ antlr4-python3-runtime==4.9.3
11
+ anyio==4.9.0
12
+ astor==0.8.1
13
+ attrs==25.3.0
14
+ babel==2.17.0
15
+ blake3==1.0.4
16
+ cachetools==5.5.2
17
+ certifi==2025.1.31
18
+ charset-normalizer==3.4.1
19
+ click==8.1.8
20
+ cloudpickle==3.1.1
21
+ codetiming==1.4.0
22
+ colorful==0.5.6
23
+ compressed-tensors==0.9.2
24
+ cupy-cuda12x==13.4.1
25
+ datasets==3.5.0
26
+ depyf==0.18.0
27
+ dill==0.3.8
28
+ diskcache==5.6.3
29
+ distlib==0.3.9
30
+ distro==1.9.0
31
+ dnspython==2.7.0
32
+ docker-pycreds==0.4.0
33
+ docutils==0.21.2
34
+ einops==0.8.1
35
+ email_validator==2.2.0
36
+ fastapi==0.115.12
37
+ fastapi-cli==0.0.7
38
+ fastrlock==0.8.3
39
+ filelock==3.18.0
40
+ flash_attn==2.7.4.post1
41
+ flashinfer-python==0.2.5
42
+ frozenlist==1.5.0
43
+ fsspec==2024.6.1
44
+ gguf==0.10.0
45
+ gitdb==4.0.12
46
+ GitPython==3.1.44
47
+ google-api-core==2.24.2
48
+ google-auth==2.38.0
49
+ googleapis-common-protos==1.69.2
50
+ grpcio==1.71.0
51
+ h11==0.14.0
52
+ httpcore==1.0.8
53
+ httptools==0.6.4
54
+ httpx==0.28.1
55
+ huggingface-hub==0.30.2
56
+ hydra-core==1.3.2
57
+ idna==3.10
58
+ imagesize==1.4.1
59
+ importlib_metadata==8.6.1
60
+ interegular==0.3.3
61
+ Jinja2==3.1.6
62
+ jiter==0.9.0
63
+ jiwer==4.0.0
64
+ joblib==1.5.2
65
+ jsonschema==4.23.0
66
+ jsonschema-specifications==2024.10.1
67
+ lark==1.2.2
68
+ Levenshtein==0.27.1
69
+ liger_kernel==0.5.9
70
+ llguidance==0.7.14
71
+ llvmlite==0.43.0
72
+ lm-format-enforcer==0.10.11
73
+ markdown-it-py==3.0.0
74
+ MarkupSafe==2.1.5
75
+ mdurl==0.1.2
76
+ mistral_common==1.5.4
77
+ mpmath==1.3.0
78
+ msgpack==1.1.0
79
+ msgspec==0.19.0
80
+ multidict==6.4.3
81
+ multiprocess==0.70.16
82
+ nest-asyncio==1.6.0
83
+ networkx==3.3
84
+ ninja==1.11.1.4
85
+ nltk==3.9.1
86
+ numba==0.60.0
87
+ numpy==1.26.4
88
+ nvidia-cublas-cu12==12.4.5.8
89
+ nvidia-cuda-cupti-cu12==12.4.127
90
+ nvidia-cuda-nvrtc-cu12==12.4.127
91
+ nvidia-cuda-runtime-cu12==12.4.127
92
+ nvidia-cudnn-cu12==9.1.0.70
93
+ nvidia-cufft-cu12==11.2.1.3
94
+ nvidia-curand-cu12==10.3.5.147
95
+ nvidia-cusolver-cu12==11.6.1.9
96
+ nvidia-cusparse-cu12==12.3.1.170
97
+ nvidia-cusparselt-cu12==0.6.2
98
+ nvidia-ml-py==12.570.86
99
+ nvidia-nccl-cu12==2.21.5
100
+ nvidia-nvjitlink-cu12==12.4.127
101
+ nvidia-nvtx-cu12==12.4.127
102
+ omegaconf==2.3.0
103
+ openai==1.73.0
104
+ opencensus==0.11.4
105
+ opencensus-context==0.1.3
106
+ opencv-python-headless==4.11.0.86
107
+ orjson==3.10.16
108
+ outlines==0.1.11
109
+ outlines_core==0.1.26
110
+ packaging==24.2
111
+ pandas==2.2.3
112
+ partial-json-parser==0.2.1.1.post5
113
+ peft==0.15.1
114
+ pillow==11.2.1
115
+ platformdirs==4.3.7
116
+ prometheus-fastapi-instrumentator==7.1.0
117
+ prometheus_client==0.21.1
118
+ propcache==0.3.1
119
+ proto-plus==1.26.1
120
+ protobuf==5.29.4
121
+ psutil==7.0.0
122
+ py-cpuinfo==9.0.0
123
+ py-spy==0.4.0
124
+ pyarrow==19.0.1
125
+ pyasn1==0.6.1
126
+ pyasn1_modules==0.4.2
127
+ pybind11==2.13.6
128
+ pycountry==24.6.1
129
+ pydantic==2.11.3
130
+ pydantic_core==2.33.1
131
+ Pygments==2.19.1
132
+ pylatexenc==2.10
133
+ python-dateutil==2.9.0.post0
134
+ python-dotenv==1.1.0
135
+ python-json-logger==3.3.0
136
+ python-Levenshtein==0.27.1
137
+ python-multipart==0.0.20
138
+ pytz==2025.2
139
+ PyYAML==6.0.2
140
+ pyzmq==26.4.0
141
+ RapidFuzz==3.14.1
142
+ ray==2.44.1
143
+ referencing==0.36.2
144
+ regex==2024.11.6
145
+ requests==2.32.3
146
+ rich==14.0.0
147
+ rich-toolkit==0.14.1
148
+ roman-numerals-py==3.1.0
149
+ rouge_score==0.1.2
150
+ rpds-py==0.24.0
151
+ rsa==4.9
152
+ safetensors==0.5.3
153
+ scipy==1.15.2
154
+ sentencepiece==0.2.0
155
+ sentry-sdk==2.25.1
156
+ setproctitle==1.3.5
157
+ shellingham==1.5.4
158
+ six==1.17.0
159
+ smart-open==7.1.0
160
+ smmap==5.0.2
161
+ sniffio==1.3.1
162
+ snowballstemmer==2.2.0
163
+ Sphinx==8.2.3
164
+ sphinxcontrib-applehelp==2.0.0
165
+ sphinxcontrib-devhelp==2.0.0
166
+ sphinxcontrib-htmlhelp==2.1.0
167
+ sphinxcontrib-jsmath==1.0.1
168
+ sphinxcontrib-qthelp==2.0.0
169
+ sphinxcontrib-serializinghtml==2.0.0
170
+ starlette==0.46.1
171
+ sympy==1.13.1
172
+ tensordict==0.6.2
173
+ tiktoken==0.9.0
174
+ timeout-decorator==0.5.0
175
+ tokenizers==0.21.1
176
+ torch==2.6.0
177
+ torchaudio==2.6.0
178
+ torchdata==0.11.0
179
+ torchvision==0.21.0
180
+ tqdm==4.67.1
181
+ transformers==4.51.2
182
+ triton==3.2.0
183
+ typer==0.15.2
184
+ typing-inspection==0.4.0
185
+ typing_extensions==4.12.2
186
+ tzdata==2025.2
187
+ urllib3==2.4.0
188
+ uvicorn==0.34.0
189
+ uvloop==0.21.0
190
+ virtualenv==20.30.0
191
+ vllm==0.8.2
192
+ wandb==0.19.9
193
+ watchfiles==1.0.5
194
+ websockets==15.0.1
195
+ wrapt==1.17.2
196
+ xformers==0.0.29.post2
197
+ xgrammar==0.1.16
198
+ xxhash==3.5.0
199
+ yarl==1.19.0
200
+ zipp==3.21.0
scripts/sft_infer_pass1.sh ADDED
@@ -0,0 +1,33 @@
1
+ #!/bin/bash
2
+
4
+ set -x
5
+
6
+ python ./_infer.py \
7
+ model.path=./checkpoints/model \
8
+ model.load_param=False \
9
+ data.path=./data/parquet/test.parquet \
10
+ data.output_path=./model_output/[email protected] \
11
+ data.batch_size=32 data.n_samples=1 \
12
+ rollout.tensor_model_parallel_size=1 \
13
+ rollout.temperature=0.7 rollout.top_p=0.9 rollout.n=1 rollout.do_sample=True \
14
+ rollout.prompt_length=1200 rollout.response_length=512 \
15
+ rollout.enable_chunked_prefill=True \
16
+ +rollout.kv_cache_dtype=fp8_e5m2 \
17
+ rollout.max_model_len=1800 \
18
+ rollout.max_num_batched_tokens=1800 \
19
+ rollout.max_num_seqs=1 \
20
+ +model.trust_remote_code=True \
21
+ +rollout.kv_cache_block_size=16 \
22
+ +rollout.swap_space=16 \
23
+ rollout.gpu_memory_utilization=0.7
24
+
25
+ # python ./_infer.py \
26
+ # model.load_param=True \
27
+ # model.load_param_path="./checkpoints/merged_r1qwen14b/model.pt" \
28
+ # data.output_path="./model_output/[email protected]" \
29
+ # data.n_samples=10\
30
+ # data.path="./data/parquet/test.parquet" \
31
+ # rollout.temperature=0.9\
32
+ # rollout.top_p=0.9 \
33
+ # rollout.n=1\
verl/__init__.py ADDED
@@ -0,0 +1,40 @@
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import logging
16
+ import os
17
+
18
+ from .protocol import DataProto
19
+ from .utils.logging_utils import set_basic_config
20
+
21
+ version_folder = os.path.dirname(os.path.join(os.path.abspath(__file__)))
22
+
23
+ with open(os.path.join(version_folder, "version/version")) as f:
24
+ __version__ = f.read().strip()
25
+
26
+
27
+ set_basic_config(level=logging.WARNING)
28
+
29
+
30
+ __all__ = ["DataProto", "__version__"]
31
+
32
+ if os.getenv("VERL_USE_MODELSCOPE", "False").lower() == "true":
33
+ import importlib
34
+
35
+ if importlib.util.find_spec("modelscope") is None:
36
+ raise ImportError("You are using the modelscope hub, please install modelscope by `pip install modelscope -U`")
37
+ # Patch hub to download models from modelscope to speed up.
38
+ from modelscope.utils.hf_util import patch_hub
39
+
40
+ patch_hub()
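
The ModelScope branch only fires if the environment variable is set before `verl` is first imported. A minimal sketch, assuming `modelscope` is installed:

```python
# Sketch: route Hugging Face Hub downloads through ModelScope (faster in some regions).
# VERL_USE_MODELSCOPE must be set before the first `import verl`.
import os

os.environ["VERL_USE_MODELSCOPE"] = "true"

import verl  # patch_hub() runs inside verl/__init__.py at import time
print(verl.__version__)
```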
verl/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (1.69 kB)
verl/__pycache__/protocol.cpython-311.pyc ADDED
Binary file (47 kB)
verl/models/README.md ADDED
@@ -0,0 +1,35 @@
1
+ # Models
2
+ Common model zoos such as huggingface/transformers struggle when using PyTorch native model parallelism. Following the design principle of vLLM, we keep a simple, parallelizable, highly optimized implementation with packed inputs in verl.
3
+ ## Adding a New Huggingface Model
4
+ ### Step 1: Copy the model file from HF to verl
5
+ - Add a new file under verl/models/hf
6
+ - Copy ONLY the model file from huggingface/transformers/models to verl/models/hf
7
+
8
+ ### Step 2: Modify the model file to use packed inputs
9
+ - Remove all the code related to inference (kv cache)
10
+ - Modify the inputs to include only
11
+ - input_ids (total_nnz,)
12
+ - cu_seqlens (total_nnz + 1,)
13
+ - max_seqlen_in_batch: int
14
+ - Note that this requires using flash attention with causal mask.
15
+
16
+ ### Step 2.5: Add tests
17
+ - Add a test to compare this version and the huggingface version
18
+ - Following the infrastructure and add tests to tests/models/hf
19
+
20
+ ### Step 3: Add a function to apply tensor parallelism
21
+ - Please follow
22
+ - https://pytorch.org/docs/stable/distributed.tensor.parallel.html
23
+ - https://pytorch.org/tutorials/intermediate/TP_tutorial.html
24
+ - General comments
25
+ - Tensor Parallelism in native PyTorch is NOT auto-parallelism. It works by specifying, via configs, how model parameters and inputs/outputs are resharded; these configs are then registered as hooks that reshard inputs/outputs before/after the model forward.
26
+
27
+ ### Step 4: Add a function to apply data parallelism
28
+ - Please use FSDP2 APIs
29
+ - See demo here https://github.com/pytorch/torchtitan/blob/main/torchtitan/parallelisms/parallelize_llama.py#L413
30
+
31
+ ### Step 5: Add a function to apply pipeline parallelism
32
+ - Comes in Pytorch 2.4
33
+ - Currently only in alpha in nightly version
34
+ - Check torchtitan for more details
35
+
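
Step 2 asks for packed (unpadded) inputs. Below is a small, framework-free sketch of that format for a right-padded batch; it only illustrates the shapes (`input_ids` of shape `(total_nnz,)`, `cu_seqlens` of shape `(batch_size + 1,)`, and `max_seqlen_in_batch`), not verl's actual packing utilities:

```python
# Sketch: pack a right-padded batch into the unpadded layout described above.
import torch

input_ids_padded = torch.tensor([[11, 12, 13, 0, 0],
                                 [21, 22, 23, 24, 25]])         # (batch, seqlen), 0 = pad
attention_mask = (input_ids_padded != 0).int()

seqlens = attention_mask.sum(dim=-1)                            # tokens per sequence
input_ids = input_ids_padded[attention_mask.bool()]             # (total_nnz,)
cu_seqlens = torch.nn.functional.pad(seqlens.cumsum(0), (1, 0)) # (batch_size + 1,) = [0, 3, 8]
max_seqlen_in_batch = int(seqlens.max())

print(input_ids.shape, cu_seqlens.tolist(), max_seqlen_in_batch)
```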
verl/models/__init__.py ADDED
@@ -0,0 +1,13 @@
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
verl/models/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (176 Bytes)
verl/models/__pycache__/registry.cpython-311.pyc ADDED
Binary file (2.13 kB)
verl/models/llama/__init__.py ADDED
@@ -0,0 +1,13 @@
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
verl/models/llama/megatron/__init__.py ADDED
@@ -0,0 +1,34 @@
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from .modeling_llama_megatron import (
16
+ ParallelLlamaForCausalLM,
17
+ # rmpad with megatron
18
+ ParallelLlamaForCausalLMRmPad,
19
+ # rmpad with megatron and pipeline parallelism
20
+ ParallelLlamaForCausalLMRmPadPP,
21
+ ParallelLlamaForValueRmPad,
22
+ ParallelLlamaForValueRmPadPP,
23
+ # original model with megatron
24
+ ParallelLlamaModel,
25
+ )
26
+
27
+ __all__ = [
28
+ "ParallelLlamaForCausalLM",
29
+ "ParallelLlamaForCausalLMRmPad",
30
+ "ParallelLlamaForCausalLMRmPadPP",
31
+ "ParallelLlamaForValueRmPad",
32
+ "ParallelLlamaForValueRmPadPP",
33
+ "ParallelLlamaModel",
34
+ ]
verl/models/llama/megatron/checkpoint_utils/__init__.py ADDED
@@ -0,0 +1,13 @@
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
verl/models/llama/megatron/checkpoint_utils/llama_loader.py ADDED
@@ -0,0 +1,295 @@
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import time
16
+
17
+ import torch
18
+ import torch.distributed as dist
19
+
20
+
21
+ def _megatron_calc_layer_map(config):
22
+ """Calculate the mapping of global layer_idx to local layer_idx
23
+ Returns:
24
+ layer_map (Dict: int -> tuple(int, int, int)):
25
+ mapping from the global layer index to
26
+ a tuple of (pp_rank, virtual_pp_rank, layer_idx inside model)
27
+ """
28
+ from megatron.core import mpu
29
+
30
+ print(f"get megatron data parallel size: {mpu.get_data_parallel_world_size()}")
31
+
32
+ pp_size = mpu.get_pipeline_model_parallel_world_size()
33
+ virtual_pp_size = mpu.get_virtual_pipeline_model_parallel_world_size() or 1
34
+
35
+ layer_map = dict()
36
+ num_layers_per_model = config.num_hidden_layers // pp_size // virtual_pp_size
37
+ assert num_layers_per_model * pp_size * virtual_pp_size == config.num_hidden_layers
38
+
39
+ for pp_rank_idx in range(pp_size):
40
+ for virtual_pp_rank_idx in range(virtual_pp_size):
41
+ layer_offset = virtual_pp_rank_idx * (config.num_hidden_layers // virtual_pp_size) + pp_rank_idx * num_layers_per_model
42
+ for layer_idx in range(num_layers_per_model):
43
+ layer_map[layer_offset + layer_idx] = (
44
+ pp_rank_idx,
45
+ virtual_pp_rank_idx,
46
+ layer_idx,
47
+ )
48
+ return layer_map
49
+
50
+
51
+ def load_state_dict_to_megatron_llama(state_dict, wrapped_models, config, params_dtype, is_value_model=False, tie_word_embeddings=False):
52
+ """Load merged state_dict to sharded Megatron module in training."""
53
+ from megatron.core import DistributedDataParallel as LocalDDP
54
+ from megatron.core import mpu
55
+ from megatron.core.transformer.module import Float16Module
56
+ from torch.nn.parallel import DistributedDataParallel as torchDDP
57
+
58
+ from verl.utils.megatron_utils import print_rank_0, unwrap_model
59
+
60
+ start_time = time.time()
61
+
62
+ def _get_gpt_model(model):
63
+ return model
64
+
65
+ def fetch_params(module):
66
+ for param in module.parameters():
67
+ torch.distributed.broadcast(param.data, src=mpu.get_data_parallel_src_rank(), group=mpu.get_data_parallel_group())  # torch.distributed has no fetch(); broadcast from the data-parallel source rank
68
+
69
+ dp_rank = mpu.get_data_parallel_rank()
70
+ pp_rank = mpu.get_pipeline_model_parallel_rank()
71
+ pp_size = mpu.get_pipeline_model_parallel_world_size()
72
+ virtual_pp_size = mpu.get_virtual_pipeline_model_parallel_world_size() or 1
73
+ mp_group = mpu.get_model_parallel_group()
74
+
75
+ if torch.distributed.get_rank() == 0:
76
+ assert mp_group.rank() == 0, f"mp_rank:[{mp_group.rank}] != 0 on rank #0"
77
+ assert pp_rank == 0, f"pp_rank:[{pp_rank}] != 0 on rank #0"
78
+ assert dp_rank == 0, f"dp_rank:[{dp_rank}] != 0 on rank #0"
79
+
80
+ if not isinstance(wrapped_models, (list, tuple)):
81
+ wrapped_models = list(wrapped_models)
82
+
83
+ assert len(wrapped_models) == virtual_pp_size
84
+ num_layers_per_model = config.num_hidden_layers // pp_size // virtual_pp_size
85
+ assert num_layers_per_model * pp_size * virtual_pp_size == config.num_hidden_layers, f"num_layers_per_model: {num_layers_per_model} * pp_size: {pp_size} * virtual_pp_size {virtual_pp_size} != config.num_hidden_layers: {config.num_hidden_layers}"
86
+
87
+ models = [None] * len(wrapped_models)
88
+
89
+ for i, wrapped_model in enumerate(wrapped_models):
90
+ models[i] = unwrap_model(wrapped_model, (torchDDP, LocalDDP, Float16Module))
91
+ gpt_model_module = _get_gpt_model(models[i])
92
+ assert len(gpt_model_module.model.layers) == num_layers_per_model
93
+
94
+ def _fetch_tensor(tensor, name) -> torch.Tensor:
95
+ """fetch tensor"""
96
+ nonlocal state_dict
97
+ if tensor is not None:
98
+ tensor.data.copy_(state_dict[name])
99
+
100
+ def _fetch_tp_shard_tensor_vocab(tensor, name, chunk_dim=0, mutate_func=None) -> torch.Tensor:
101
+ """fetch tensor in tp shards"""
102
+ nonlocal state_dict
103
+ tp_rank = mpu.get_tensor_model_parallel_rank()
104
+ tp_size = mpu.get_tensor_model_parallel_world_size()
105
+ if name in state_dict:
106
+ full_weight = state_dict[name]
107
+
108
+ if mutate_func is not None:
109
+ full_weight = mutate_func(full_weight)
110
+ tensor_chunk = torch.chunk(full_weight, tp_size, dim=chunk_dim)
111
+ if tensor is not None:
112
+ tensor.data.copy_(tensor_chunk[tp_rank])
113
+ else:
114
+ print(f"tp_shard tensor:[{name}] not in state_dict, skip loading")
115
+
116
+ def _fetch_tp_shard_tensor(tensor, name, chunk_dim=0, mutate_func=None) -> torch.Tensor:
117
+ """fetch tensor in tp shards"""
118
+ nonlocal state_dict
119
+ tp_rank = mpu.get_tensor_model_parallel_rank()
120
+ tp_size = mpu.get_tensor_model_parallel_world_size()
121
+ if name in state_dict:
122
+ full_weight = state_dict[name]
123
+
124
+ if mutate_func is not None:
125
+ full_weight = mutate_func(full_weight)
126
+ tensor_chunk = torch.chunk(full_weight, tp_size, dim=chunk_dim)
127
+ if tensor is not None:
128
+ tensor.data.copy_(tensor_chunk[tp_rank])
129
+ else:
130
+ print(f"tp_shard tensor:[{name}] not in state_dict, skip loading")
131
+
132
+ def _fetch_tp_shard_tensor_gate_up(tensor, gate_name, up_name) -> torch.Tensor:
133
+ """fetch gate_up tensor in tp shards"""
134
+ nonlocal state_dict
135
+ nonlocal mp_group
136
+ tp_rank = mpu.get_tensor_model_parallel_rank()
137
+ tp_size = mpu.get_tensor_model_parallel_world_size()
138
+ if gate_name in state_dict and up_name in state_dict:
139
+ gate_weight = state_dict[gate_name]
140
+ up_weight = state_dict[up_name]
141
+ new_gate_up_weight = torch.empty(config.intermediate_size * 2, config.hidden_size, dtype=params_dtype, device=torch.cuda.current_device())
142
+ for i in range(tp_size):
143
+ intermediate_size_tp = config.intermediate_size // tp_size
144
+ gate_weight_tp = gate_weight[i * intermediate_size_tp : (i + 1) * intermediate_size_tp]
145
+ up_weight_tp = up_weight[i * intermediate_size_tp : (i + 1) * intermediate_size_tp]
146
+ new_gate_up_weight[intermediate_size_tp * 2 * i : intermediate_size_tp * 2 * (i + 1)].copy_(torch.cat([gate_weight_tp, up_weight_tp], dim=0))
147
+
148
+ tensor_chunk = torch.chunk(new_gate_up_weight, tp_size, dim=0)
149
+ if tensor is not None:
150
+ tensor.data.copy_(tensor_chunk[tp_rank])
151
+ else:
152
+ print(f"tp_shard tensor:[{gate_name}, {up_name}] not in state_dict, skip loading")
153
+
154
+ def _fetch_tp_shard_tensor_qkv(tensor, q_name, k_name, v_name) -> torch.Tensor:
155
+ """fetch tensor in tp shards across mp_group"""
156
+ nonlocal state_dict
157
+ nonlocal mp_group
158
+ tp_rank = mpu.get_tensor_model_parallel_rank()
159
+ tp_size = mpu.get_tensor_model_parallel_world_size()
160
+ assert q_name in state_dict and k_name in state_dict and v_name in state_dict
161
+ full_weight_q = state_dict[q_name]
162
+ full_weight_k = state_dict[k_name]
163
+ full_weight_v = state_dict[v_name]
164
+
165
+ hidden_size_per_head = config.hidden_size // config.num_attention_heads
166
+
167
+ if config.num_key_value_heads >= tp_size:
168
+ q_size_tp = config.hidden_size // tp_size
169
+ kv_size_tp = hidden_size_per_head * config.num_key_value_heads // tp_size
170
+ total_size = q_size_tp + 2 * kv_size_tp
171
+ new_weight_qkv = torch.empty(total_size * tp_size, config.hidden_size, dtype=params_dtype, device=torch.cuda.current_device())
172
+ for i in range(tp_size):
173
+ q_part = full_weight_q[i * q_size_tp : (i + 1) * q_size_tp]
174
+ k_part = full_weight_k[i * kv_size_tp : (i + 1) * kv_size_tp]
175
+ v_part = full_weight_v[i * kv_size_tp : (i + 1) * kv_size_tp]
176
+ new_weight_qkv[i * total_size : (i + 1) * total_size].copy_(torch.cat([q_part, k_part, v_part], dim=0))
177
+
178
+ else:
179
+ q_size_tp = config.hidden_size // tp_size
180
+ kv_size_tp = hidden_size_per_head
181
+ total_size = q_size_tp + 2 * kv_size_tp
182
+ new_weight_qkv = torch.empty(total_size * tp_size, config.hidden_size, dtype=params_dtype, device=torch.cuda.current_device())
183
+ for i in range(tp_size):
184
+ q_part = full_weight_q[i * q_size_tp : (i + 1) * q_size_tp]
185
+ start_idx = i * config.num_key_value_heads // tp_size * hidden_size_per_head
186
+ end_idx = (i * config.num_key_value_heads // tp_size + 1) * hidden_size_per_head
187
+ k_part = full_weight_k[start_idx:end_idx]
188
+ v_part = full_weight_v[start_idx:end_idx]
189
+ new_weight_qkv[i * total_size : (i + 1) * total_size].copy_(torch.cat([q_part, k_part, v_part], dim=0))
190
+
191
+ tensor_chunk = torch.chunk(new_weight_qkv, tp_size, dim=0)
192
+ if tensor is not None:
193
+ tensor.data.copy_(tensor_chunk[tp_rank])
194
+
195
+ # Embeddings
196
+ # -------------------
197
+ print_rank_0("loading embeddings...")
198
+ gpt_model_module = _get_gpt_model(models[0])
199
+ embed_tokens_weight = None
200
+ if pp_rank == 0:
201
+ embed_tokens_weight = gpt_model_module.model.embed_tokens.weight
202
+ _fetch_tp_shard_tensor_vocab(embed_tokens_weight, "model.embed_tokens.weight")
203
+
204
+ # Transformer layers
205
+ # -------------------
206
+ layer_map = _megatron_calc_layer_map(config)
207
+
208
+ pp_rank = mpu.get_pipeline_model_parallel_rank()
209
+ pp_size = mpu.get_pipeline_model_parallel_world_size()
210
+ num_layer_per_pp = config.num_hidden_layers // pp_size
211
+ vpp_size = mpu.get_virtual_pipeline_model_parallel_world_size()
212
+
213
+ layer_list = []
214
+ if vpp_size is not None:
215
+ for vpp_rank in range(vpp_size):
216
+ num_layer_vpp_chunk = num_layer_per_pp // vpp_size
217
+ num_layer_this_model = num_layer_vpp_chunk
218
+ offset = vpp_rank * (config.num_hidden_layers // mpu.get_virtual_pipeline_model_parallel_world_size()) + (mpu.get_pipeline_model_parallel_rank() * num_layer_vpp_chunk)
219
+ layer_list.extend(list(range(offset, offset + num_layer_this_model)))
220
+ else:
221
+ num_layer_this_model = num_layer_per_pp
222
+ offset = pp_rank * num_layer_per_pp
223
+ layer_list.extend(list(range(offset, offset + num_layer_this_model)))
224
+
225
+ for layer in layer_list:
226
+ print_rank_0(f"loading layer #{layer}...")
227
+ layer_name = f"model.layers.{layer}"
228
+ dst_pp_rank, dst_virtual_pp_rank, dst_layer_idx = layer_map[layer]
229
+
230
+ gpt_model_module = _get_gpt_model(models[dst_virtual_pp_rank])
231
+ sync_layer = gpt_model_module.model.layers[dst_layer_idx]
232
+
233
+ _fetch_tensor(
234
+ sync_layer.input_layernorm.weight if dst_pp_rank == pp_rank else None,
235
+ f"{layer_name}.input_layernorm.weight",
236
+ )
237
+
238
+ _fetch_tp_shard_tensor_qkv(
239
+ sync_layer.self_attn.qkv_proj.weight if dst_pp_rank == pp_rank else None,
240
+ f"{layer_name}.self_attn.q_proj.weight",
241
+ f"{layer_name}.self_attn.k_proj.weight",
242
+ f"{layer_name}.self_attn.v_proj.weight",
243
+ )
244
+
245
+ _fetch_tp_shard_tensor(
246
+ sync_layer.self_attn.o_proj.weight if dst_pp_rank == pp_rank else None,
247
+ f"{layer_name}.self_attn.o_proj.weight",
248
+ chunk_dim=1,
249
+ )
250
+
251
+ _fetch_tensor(
252
+ sync_layer.post_attention_layernorm.weight if dst_pp_rank == pp_rank else None,
253
+ f"{layer_name}.post_attention_layernorm.weight",
254
+ )
255
+
256
+ _fetch_tp_shard_tensor_gate_up(
257
+ sync_layer.mlp.gate_up_proj.weight if dst_pp_rank == pp_rank else None,
258
+ f"{layer_name}.mlp.gate_proj.weight",
259
+ f"{layer_name}.mlp.up_proj.weight",
260
+ )
261
+
262
+ _fetch_tp_shard_tensor(
263
+ sync_layer.mlp.down_proj.weight if dst_pp_rank == pp_rank else None,
264
+ f"{layer_name}.mlp.down_proj.weight",
265
+ chunk_dim=1,
266
+ )
267
+ # Final Layernorm
268
+ # -------------------
269
+ print_rank_0("loading final layernorm...")
270
+ gpt_model_module = _get_gpt_model(models[-1])
271
+ _fetch_tensor(
272
+ getattr(gpt_model_module.model.norm, "weight", None),
273
+ "model.norm.weight",
274
+ )
275
+
276
+ print_rank_0("loading lm_head...")
277
+ if pp_rank + 1 == pp_size:
278
+ lm_head_weight = gpt_model_module.lm_head.weight
279
+
280
+ if is_value_model:
281
+ if "lm_head.weight" in state_dict and state_dict["lm_head.weight"].shape[0] == 1:
282
+ _fetch_tensor(lm_head_weight, "lm_head.weight")
283
+ print_rank_0("load lm_head weight")
284
+ elif "reward_head.weight" in state_dict and state_dict["reward_head.weight"].shape[0] == 1:
285
+ _fetch_tensor(lm_head_weight, "reward_head.weight")
286
+ print_rank_0("load lm_head from value_head weight")
287
+ else:
288
+ _fetch_tensor(None, "lm_head.weight")
289
+ print_rank_0("fail to match lm_head in value_model")
290
+ else:
291
+ _fetch_tp_shard_tensor(lm_head_weight, "lm_head.weight")
292
+
293
+ dist.barrier()
294
+ torch.cuda.empty_cache()
295
+ print_rank_0(f"loading megatron ckpt done, time elapsed {time.time() - start_time}s")
verl/models/llama/megatron/checkpoint_utils/llama_loader_depracated.py ADDED
@@ -0,0 +1,425 @@
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import time
16
+
17
+ import torch
18
+ import torch.distributed as dist
19
+
20
+
21
+ def _megatron_calc_layer_map(config):
22
+ """Calculate the mapping of global layer_idx to local layer_idx
23
+ Returns:
24
+ layer_map (Dict: int -> tuple(int, int, int)):
25
+ mapping from the global layer index to
26
+ a tuple of (pp_rank, virtual_pp_rank, layer_idx inside model)
27
+ """
28
+ from megatron.core import mpu
29
+
30
+ print(f"get megatron data parallel size: {mpu.get_data_parallel_world_size()}")
31
+
32
+ pp_size = mpu.get_pipeline_model_parallel_world_size()
33
+ virtual_pp_size = mpu.get_virtual_pipeline_model_parallel_world_size() or 1
34
+
35
+ layer_map = dict()
36
+ num_layers_per_model = config.num_hidden_layers // pp_size // virtual_pp_size
37
+ assert num_layers_per_model * pp_size * virtual_pp_size == config.num_hidden_layers
38
+
39
+ for pp_rank_idx in range(pp_size):
40
+ for virtual_pp_rank_idx in range(virtual_pp_size):
41
+ layer_offset = virtual_pp_rank_idx * (config.num_hidden_layers // virtual_pp_size) + pp_rank_idx * num_layers_per_model
42
+ for layer_idx in range(num_layers_per_model):
43
+ layer_map[layer_offset + layer_idx] = (
44
+ pp_rank_idx,
45
+ virtual_pp_rank_idx,
46
+ layer_idx,
47
+ )
48
+ return layer_map
49
+
50
+
51
+ def load_state_dict_to_megatron_llama(state_dict, wrapped_models, config, params_dtype, is_value_model=False, tie_word_embeddings=False):
52
+ """Load merged state_dict to sharded Megatron module in training."""
53
+ from megatron.core import DistributedDataParallel as LocalDDP
54
+ from megatron.core import mpu
55
+ from megatron.core.transformer.module import Float16Module
56
+ from torch.nn.parallel import DistributedDataParallel as torchDDP
57
+
58
+ from verl.utils.megatron_utils import print_rank_0, unwrap_model
59
+
60
+ start_time = time.time()
61
+
62
+ def _get_gpt_model(model):
63
+ return model
64
+
65
+ def broadcast_params(module):
66
+ for param in module.parameters():
67
+ torch.distributed.broadcast(param.data, src=mpu.get_data_parallel_src_rank(), group=mpu.get_data_parallel_group())
68
+
69
+ dp_rank = mpu.get_data_parallel_rank()
70
+ pp_rank = mpu.get_pipeline_model_parallel_rank()
71
+ pp_size = mpu.get_pipeline_model_parallel_world_size()
72
+ virtual_pp_size = mpu.get_virtual_pipeline_model_parallel_world_size() or 1
73
+ mp_group = mpu.get_model_parallel_group()
74
+
75
+ if torch.distributed.get_rank() == 0:
76
+ assert mp_group.rank() == 0, f"mp_rank:[{mp_group.rank}] != 0 on rank #0"
77
+ assert pp_rank == 0, f"pp_rank:[{pp_rank}] != 0 on rank #0"
78
+ assert dp_rank == 0, f"dp_rank:[{dp_rank}] != 0 on rank #0"
79
+
80
+ if not isinstance(wrapped_models, (list, tuple)):
81
+ wrapped_models = list(wrapped_models)
82
+
83
+ assert len(wrapped_models) == virtual_pp_size
84
+ num_layers_per_model = config.num_hidden_layers // pp_size // virtual_pp_size
85
+ assert num_layers_per_model * pp_size * virtual_pp_size == config.num_hidden_layers, f"num_layers_per_model: {num_layers_per_model} * pp_size: {pp_size} * virtual_pp_size {virtual_pp_size} != config.num_hidden_layers: {config.num_hidden_layers}"
86
+
87
+ models = [None] * len(wrapped_models)
88
+
89
+ for i, wrapped_model in enumerate(wrapped_models):
90
+ models[i] = unwrap_model(wrapped_model, (torchDDP, LocalDDP, Float16Module))
91
+ gpt_model_module = _get_gpt_model(models[i])
92
+ assert len(gpt_model_module.model.layers) == num_layers_per_model
93
+
94
+ def _broadcast_tensor(tensor, name) -> torch.Tensor:
95
+ """broadcast tensor from rank0 across mp_group"""
96
+ nonlocal state_dict
97
+ nonlocal mp_group
98
+ if torch.distributed.get_rank() == 0:
99
+ if name in state_dict:
100
+ weight = state_dict[name]
101
+ tensor_shape = weight.shape
102
+ else:
103
+ tensor_shape = None
104
+ else:
105
+ weight = None
106
+ tensor_shape = None
107
+
108
+ obj_list = [tensor_shape]
109
+ dist.broadcast_object_list(obj_list, src=0, group=mp_group)
110
+ tensor_shape = obj_list[0]
111
+
112
+ if tensor_shape is None:
113
+ # all or none ranks in the mp_group should reach here
114
+ print_rank_0(f"tensor:[{name}] not in state_dict, skip load")
115
+ return
116
+
117
+ if tensor is None:
118
+ tensor = torch.empty(
119
+ tensor_shape,
120
+ dtype=params_dtype,
121
+ device=torch.cuda.current_device(),
122
+ requires_grad=False,
123
+ )
124
+ if torch.distributed.get_rank() == 0:
125
+ tensor.data.copy_(weight)
126
+ dist.broadcast(tensor, src=0, group=mp_group)
127
+
128
+ def _broadcast_tp_shard_tensor_vocab(tensor, name, chunk_dim=0, mutate_func=None) -> torch.Tensor:
129
+ """broadcast tensor in tp shards across mp_group"""
130
+ nonlocal state_dict
131
+ nonlocal mp_group
132
+ tp_rank = mpu.get_tensor_model_parallel_rank()
133
+ tp_size = mpu.get_tensor_model_parallel_world_size()
134
+
135
+ if torch.distributed.get_rank() == 0:
136
+ if name in state_dict:
137
+ full_weight = state_dict[name]
138
+
139
+ if mutate_func is not None:
140
+ full_weight = mutate_func(full_weight)
141
+ tensor_chunk = torch.chunk(full_weight, tp_size, dim=chunk_dim)
142
+ chunk_shape = tensor_chunk[0].shape
143
+ else:
144
+ chunk_shape = None
145
+ else:
146
+ chunk_shape = None
147
+
148
+ obj_list = [chunk_shape]
149
+ dist.broadcast_object_list(obj_list, src=0, group=mp_group)
150
+ chunk_shape = obj_list[0]
151
+ if chunk_shape is None:
152
+ # all or none ranks in the mp_group should reach here
153
+ print_rank_0(f"tp_shard tensor:[{name}] not in state_dict, skip loading")
154
+ return
155
+
156
+ if tensor is None:
157
+ sync_tensor = torch.empty(
158
+ chunk_shape,
159
+ dtype=params_dtype,
160
+ device=torch.cuda.current_device(),
161
+ requires_grad=False,
162
+ )
163
+ else:
164
+ assert tensor.shape == chunk_shape, f"rank #{torch.distributed.get_rank()} tensor {name} shape {tensor.shape} != {chunk_shape}"
165
+ sync_tensor = torch.empty_like(tensor, device=torch.cuda.current_device(), requires_grad=False)
166
+
167
+ for i in range(tp_size):
168
+ if torch.distributed.get_rank() == 0:
169
+ sync_tensor.data.copy_(tensor_chunk[i])
170
+ dist.broadcast(sync_tensor, src=0, group=mp_group)
171
+ if (i == tp_rank) and (tensor is not None):
172
+ tensor.data.copy_(sync_tensor)
173
+
174
+ def _broadcast_tp_shard_tensor(tensor, name, chunk_dim=0, mutate_func=None) -> torch.Tensor:
175
+ """broadcast tensor in tp shards across mp_group"""
176
+ nonlocal state_dict
177
+ nonlocal mp_group
178
+ tp_rank = mpu.get_tensor_model_parallel_rank()
179
+ tp_size = mpu.get_tensor_model_parallel_world_size()
180
+
181
+ if torch.distributed.get_rank() == 0:
182
+ if name in state_dict:
183
+ full_weight = state_dict[name]
184
+ if mutate_func is not None:
185
+ full_weight = mutate_func(full_weight)
186
+ tensor_chunk = torch.chunk(full_weight, tp_size, dim=chunk_dim)
187
+ chunk_shape = tensor_chunk[0].shape
188
+ else:
189
+ chunk_shape = None
190
+ else:
191
+ chunk_shape = None
192
+
193
+ obj_list = [chunk_shape]
194
+ dist.broadcast_object_list(obj_list, src=0, group=mp_group)
195
+ chunk_shape = obj_list[0]
196
+ if chunk_shape is None:
197
+ # all or none ranks in the mp_group should reach here
198
+ print_rank_0(f"tp_shard tensor:[{name}] not in state_dict, skip loading")
199
+ return
200
+
201
+ if tensor is None:
202
+ sync_tensor = torch.empty(
203
+ chunk_shape,
204
+ dtype=params_dtype,
205
+ device=torch.cuda.current_device(),
206
+ requires_grad=False,
207
+ )
208
+ else:
209
+ assert tensor.shape == chunk_shape, f"rank #{torch.distributed.get_rank()} tensor {name} shape {tensor.shape} != {chunk_shape}"
210
+ sync_tensor = torch.empty_like(tensor, device=torch.cuda.current_device(), requires_grad=False)
211
+
212
+ for i in range(tp_size):
213
+ if torch.distributed.get_rank() == 0:
214
+ sync_tensor.data.copy_(tensor_chunk[i])
215
+ dist.broadcast(sync_tensor, src=0, group=mp_group)
216
+ if (i == tp_rank) and (tensor is not None):
217
+ tensor.data.copy_(sync_tensor)
218
+
219
+ def _broadcast_tp_shard_tensor_gate_up(tensor, gate_name, up_name) -> torch.Tensor:
220
+ """broadcast tensor in tp shards across mp_group"""
221
+ nonlocal state_dict
222
+ nonlocal mp_group
223
+ tp_rank = mpu.get_tensor_model_parallel_rank()
224
+ tp_size = mpu.get_tensor_model_parallel_world_size()
225
+
226
+ if torch.distributed.get_rank() == 0:
227
+ gate_weight = state_dict[gate_name]
228
+ up_weight = state_dict[up_name]
229
+ new_gate_up_weight = torch.empty(config.intermediate_size * 2, config.hidden_size, dtype=params_dtype, device=torch.cuda.current_device())
230
+ for i in range(tp_size):
231
+ intermediate_size_tp = config.intermediate_size // tp_size
232
+ gate_weight_tp = gate_weight[i * intermediate_size_tp : (i + 1) * intermediate_size_tp]
233
+ up_weight_tp = up_weight[i * intermediate_size_tp : (i + 1) * intermediate_size_tp]
234
+ new_gate_up_weight[intermediate_size_tp * 2 * i : intermediate_size_tp * 2 * (i + 1)].copy_(torch.cat([gate_weight_tp, up_weight_tp], dim=0))
235
+
236
+ tensor_chunk = torch.chunk(new_gate_up_weight, tp_size, dim=0)
237
+ chunk_shape = tensor_chunk[0].shape
238
+ else:
239
+ chunk_shape = None
240
+
241
+ obj_list = [chunk_shape]
242
+ dist.broadcast_object_list(obj_list, src=0, group=mp_group)
243
+ chunk_shape = obj_list[0]
244
+ if chunk_shape is None:
245
+ # all or none ranks in the mp_group should reach here
246
+ print_rank_0(f"tp_shard tensor:[{gate_name, up_name}] not in state_dict, skip loading")
247
+ return
248
+
249
+ if tensor is None:
250
+ sync_tensor = torch.empty(
251
+ chunk_shape,
252
+ dtype=params_dtype,
253
+ device=torch.cuda.current_device(),
254
+ requires_grad=False,
255
+ )
256
+ else:
257
+ assert tensor.shape == chunk_shape, f"rank #{torch.distributed.get_rank() == 0:} tensor {gate_name, up_name} shape {tensor.shape} != {chunk_shape}"
258
+ sync_tensor = torch.empty_like(tensor, device=torch.cuda.current_device(), requires_grad=False)
259
+
260
+ for i in range(tp_size):
261
+ if torch.distributed.get_rank() == 0:
262
+ sync_tensor.data.copy_(tensor_chunk[i])
263
+ dist.broadcast(sync_tensor, src=0, group=mp_group)
264
+ if (i == tp_rank) and (tensor is not None):
265
+ tensor.data.copy_(sync_tensor)
266
+
267
+ def _broadcast_tp_shard_tensor_qkv(tensor, q_name, k_name, v_name) -> torch.Tensor:
268
+ """broadcast tensor in tp shards across mp_group"""
269
+ nonlocal state_dict
270
+ nonlocal mp_group
271
+ tp_rank = mpu.get_tensor_model_parallel_rank()
272
+ tp_size = mpu.get_tensor_model_parallel_world_size()
273
+
274
+ if torch.distributed.get_rank() == 0:
275
+ assert q_name in state_dict and k_name in state_dict and v_name in state_dict
276
+ full_weight_q = state_dict[q_name]
277
+ full_weight_k = state_dict[k_name]
278
+ full_weight_v = state_dict[v_name]
279
+
280
+ hidden_size_per_head = config.hidden_size // config.num_attention_heads
281
+
282
+ if config.num_key_value_heads >= tp_size:
283
+ q_size_tp = config.hidden_size // tp_size
284
+ kv_size_tp = hidden_size_per_head * config.num_key_value_heads // tp_size
285
+ total_size = q_size_tp + 2 * kv_size_tp
286
+ new_weight_qkv = torch.empty(total_size * tp_size, config.hidden_size, dtype=params_dtype, device=torch.cuda.current_device())
287
+ for i in range(tp_size):
288
+ q_part = full_weight_q[i * q_size_tp : (i + 1) * q_size_tp]
289
+ k_part = full_weight_k[i * kv_size_tp : (i + 1) * kv_size_tp]
290
+ v_part = full_weight_v[i * kv_size_tp : (i + 1) * kv_size_tp]
291
+ new_weight_qkv[i * total_size : (i + 1) * total_size].copy_(torch.cat([q_part, k_part, v_part], dim=0))
292
+
293
+ else:
294
+ q_size_tp = config.hidden_size // tp_size
295
+ kv_size_tp = hidden_size_per_head
296
+ total_size = q_size_tp + 2 * kv_size_tp
297
+ new_weight_qkv = torch.empty(total_size * tp_size, config.hidden_size, dtype=params_dtype, device=torch.cuda.current_device())
298
+ for i in range(tp_size):
299
+ q_part = full_weight_q[i * q_size_tp : (i + 1) * q_size_tp]
300
+ start_idx = i * config.num_key_value_heads // tp_size * hidden_size_per_head
301
+ end_idx = (i * config.num_key_value_heads // tp_size + 1) * hidden_size_per_head
302
+ k_part = full_weight_k[start_idx:end_idx]
303
+ v_part = full_weight_v[start_idx:end_idx]
304
+ new_weight_qkv[i * total_size : (i + 1) * total_size].copy_(torch.cat([q_part, k_part, v_part], dim=0))
305
+
306
+ tensor_chunk = torch.chunk(new_weight_qkv, tp_size, dim=0)
307
+ chunk_shape = tensor_chunk[0].shape
308
+ else:
309
+ chunk_shape = None
310
+
311
+ obj_list = [chunk_shape]
312
+ dist.broadcast_object_list(obj_list, src=0, group=mp_group)
313
+ chunk_shape = obj_list[0]
314
+ if chunk_shape is None:
315
+ # all or none ranks in the mp_group should reach here
316
+ print_rank_0(f"tp_shard tensor:[{q_name, k_name, v_name}] not in state_dict, skip loading")
317
+ return
318
+
319
+ if tensor is None:
320
+ sync_tensor = torch.empty(
321
+ chunk_shape,
322
+ dtype=params_dtype,
323
+ device=torch.cuda.current_device(),
324
+ requires_grad=False,
325
+ )
326
+ else:
327
+ assert tensor.shape == chunk_shape, f"rank #{torch.distributed.get_rank()} tensor {q_name} shape {tensor.shape} != {chunk_shape}"
328
+ sync_tensor = torch.empty_like(tensor, device=torch.cuda.current_device(), requires_grad=False)
329
+
330
+ for i in range(tp_size):
331
+ if torch.distributed.get_rank() == 0:
332
+ sync_tensor.data.copy_(tensor_chunk[i])
333
+ dist.broadcast(sync_tensor, src=0, group=mp_group)
334
+ if (i == tp_rank) and (tensor is not None):
335
+ tensor.data.copy_(sync_tensor)
336
+
337
+ if dp_rank == 0:
338
+ # Embeddings
339
+ # -------------------
340
+ print_rank_0("loading embeddings...")
341
+ gpt_model_module = _get_gpt_model(models[0])
342
+ embed_tokens_weight = None
343
+ if pp_rank == 0:
344
+ embed_tokens_weight = gpt_model_module.model.embed_tokens.weight
345
+ _broadcast_tp_shard_tensor_vocab(embed_tokens_weight, "model.embed_tokens.weight")
346
+
347
+ # Transformer layers
348
+ # -------------------
349
+ layer_map = _megatron_calc_layer_map(config)
350
+
351
+ for layer in range(config.num_hidden_layers):
352
+ print_rank_0(f"loading layer #{layer}...")
353
+ layer_name = f"model.layers.{layer}"
354
+ dst_pp_rank, dst_virtual_pp_rank, dst_layer_idx = layer_map[layer]
355
+
356
+ gpt_model_module = _get_gpt_model(models[dst_virtual_pp_rank])
357
+ sync_layer = gpt_model_module.model.layers[dst_layer_idx]
358
+
359
+ _broadcast_tensor(
360
+ sync_layer.input_layernorm.weight if dst_pp_rank == pp_rank else None,
361
+ f"{layer_name}.input_layernorm.weight",
362
+ )
363
+
364
+ _broadcast_tp_shard_tensor_qkv(
365
+ sync_layer.self_attn.qkv_proj.weight if dst_pp_rank == pp_rank else None,
366
+ f"{layer_name}.self_attn.q_proj.weight",
367
+ f"{layer_name}.self_attn.k_proj.weight",
368
+ f"{layer_name}.self_attn.v_proj.weight",
369
+ )
370
+
371
+ _broadcast_tp_shard_tensor(
372
+ sync_layer.self_attn.o_proj.weight if dst_pp_rank == pp_rank else None,
373
+ f"{layer_name}.self_attn.o_proj.weight",
374
+ chunk_dim=1,
375
+ )
376
+
377
+ _broadcast_tensor(
378
+ sync_layer.post_attention_layernorm.weight if dst_pp_rank == pp_rank else None,
379
+ f"{layer_name}.post_attention_layernorm.weight",
380
+ )
381
+
382
+ _broadcast_tp_shard_tensor_gate_up(
383
+ sync_layer.mlp.gate_up_proj.weight if dst_pp_rank == pp_rank else None,
384
+ f"{layer_name}.mlp.gate_proj.weight",
385
+ f"{layer_name}.mlp.up_proj.weight",
386
+ )
387
+
388
+ _broadcast_tp_shard_tensor(
389
+ sync_layer.mlp.down_proj.weight if dst_pp_rank == pp_rank else None,
390
+ f"{layer_name}.mlp.down_proj.weight",
391
+ chunk_dim=1,
392
+ )
393
+ # Final Layernorm
394
+ # -------------------
395
+ print_rank_0("loading final layernorm...")
396
+ gpt_model_module = _get_gpt_model(models[-1])
397
+ _broadcast_tensor(
398
+ getattr(gpt_model_module.model.norm, "weight", None),
399
+ "model.norm.weight",
400
+ )
401
+
402
+ print_rank_0("loading lm_head...")
403
+ lm_head_weight = None
404
+ if pp_rank + 1 == pp_size:
405
+ lm_head_weight = gpt_model_module.lm_head.weight
406
+
407
+ if is_value_model:
408
+ if "lm_head.weight" in state_dict and state_dict["lm_head.weight"].shape[0] == 1:
409
+ _broadcast_tensor(lm_head_weight, "lm_head.weight")
410
+ print_rank_0("load lm_head weight")
411
+ elif "reward_head.weight" in state_dict and state_dict["reward_head.weight"].shape[0] == 1:
412
+ _broadcast_tensor(lm_head_weight, "reward_head.weight")
413
+ print_rank_0("load lm_head from value_head weight")
414
+ else:
415
+ _broadcast_tensor(None, "lm_head.weight")
416
+ print_rank_0("fail to match lm_head in value_model")
417
+ else:
418
+ _broadcast_tp_shard_tensor(lm_head_weight, "lm_head.weight")
419
+ dist.barrier()
420
+ # Broadcast weights inside data parallel groups
421
+ for wrapped_model in wrapped_models:
422
+ broadcast_params(wrapped_model)
423
+
424
+ torch.cuda.empty_cache()
425
+ print_rank_0(f"loading megatron ckpt done, time elapsed {time.time() - start_time}s")
verl/models/llama/megatron/checkpoint_utils/llama_saver.py ADDED
@@ -0,0 +1,430 @@
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import time
16
+
17
+ import torch
18
+ import torch.distributed as dist
19
+ from megatron.core import mpu
20
+ from megatron.core.distributed import DistributedDataParallel as LocalDDP
21
+ from megatron.core.transformer.module import Float16Module
22
+ from torch.nn.parallel import DistributedDataParallel as torchDDP
23
+
24
+ from verl.utils.megatron_utils import print_rank_0, unwrap_model
25
+
26
+
27
+ def _megatron_calc_global_rank(tp_rank: int = 0, dp_rank: int = 0, pp_rank: int = 0):
28
+ """given TP,DP,PP rank to get the global rank."""
29
+
30
+ tp_size = mpu.get_tensor_model_parallel_world_size()
31
+ dp_size = mpu.get_data_parallel_world_size()
32
+ pp_size = mpu.get_pipeline_model_parallel_world_size()
33
+ assert tp_size * dp_size * pp_size == torch.distributed.get_world_size(), f"{tp_size} x {dp_size} x {pp_size} != {torch.distributed.get_world_size()}"
34
+ # We only support TP-DP-PP grouping, for correctness when resharding
35
+ return (pp_rank * dp_size + dp_rank) * tp_size + tp_rank
36
+
37
+
38
+ def _megatron_calc_layer_map(config):
39
+ """Calculate the mapping of global layer_idx to local layer_idx
40
+ Returns:
41
+ layer_map (Dict: int -> tuple(int, int, int)):
42
+ mapping from the global layer index to
43
+ a tuple of (pp_rank, virtual_pp_rank, layer_idx inside model)
44
+ """
45
+ from megatron.core import mpu
46
+
47
+ pp_size = mpu.get_pipeline_model_parallel_world_size()
48
+ virtual_pp_size = mpu.get_virtual_pipeline_model_parallel_world_size() or 1
49
+
50
+ layer_map = dict()
51
+ num_layers_per_model = config.num_hidden_layers // pp_size // virtual_pp_size
52
+ assert num_layers_per_model * pp_size * virtual_pp_size == config.num_hidden_layers
53
+
54
+ for pp_rank_idx in range(pp_size):
55
+ for virtual_pp_rank_idx in range(virtual_pp_size):
56
+ layer_offset = virtual_pp_rank_idx * (config.num_hidden_layers // virtual_pp_size) + pp_rank_idx * num_layers_per_model
57
+ for layer_idx in range(num_layers_per_model):
58
+ layer_map[layer_offset + layer_idx] = (
59
+ pp_rank_idx,
60
+ virtual_pp_rank_idx,
61
+ layer_idx,
62
+ )
63
+ return layer_map
64
+
65
+
66
+ def merge_megatron_ckpt_llama(wrapped_models, config, dtype, is_value_model=False, tie_word_embeddings=False):
67
+ """Merge sharded parameters of a Megatron module into a merged checkpoint.
68
+
69
+ Args:
70
+ wrapped_models (list of megatron.core.distributed.DistributedDataParallel):
71
+ The local DDP wrapped megatron modules.
72
+ config (PretrainedConfig):
73
+ the HF model config
74
+ dtype: dtype of the parameters in the merged state_dict
75
+ is_value_model: whether the model is a value model
76
+ tie_word_embeddings: unused for llama; kept only to match the qwen2 interface
77
+ Returns:
78
+ state_dict (dict):
79
+ The merged state_dict on rank 0, and an empty dictionary on all other ranks.
80
+ """
81
+ start_time = time.time()
82
+
83
+ def _get_gpt_model(model):
84
+ return model
85
+
86
+ dp_rank = mpu.get_data_parallel_rank()
87
+ pp_size = mpu.get_pipeline_model_parallel_world_size()
88
+ pp_rank = mpu.get_pipeline_model_parallel_rank()
89
+ virtual_pp_size = mpu.get_virtual_pipeline_model_parallel_world_size() or 1
90
+ mp_group = mpu.get_model_parallel_group()
91
+
92
+ if dist.get_rank() == 0:
93
+ assert mp_group.rank() == 0, f"mp_rank:[{mp_group.rank}] != 0 on rank #0"
94
+ assert pp_rank == 0, f"pp_rank:[{pp_rank}] != 0 on rank #0"
95
+ assert dp_rank == 0, f"dp_rank:[{dp_rank}] != 0 on rank #0"
96
+
97
+ if not isinstance(wrapped_models, (list, tuple)):
98
+ wrapped_models = list(wrapped_models)
99
+
100
+ assert len(wrapped_models) == virtual_pp_size
101
+ num_layers_per_model = config.num_hidden_layers // pp_size // virtual_pp_size
102
+ assert num_layers_per_model * pp_size * virtual_pp_size == config.num_hidden_layers
103
+
104
+ models = [None] * len(wrapped_models)
105
+
106
+ for i, wrapped_model in enumerate(wrapped_models):
107
+ models[i] = unwrap_model(wrapped_model, (torchDDP, LocalDDP, Float16Module))
108
+ assert len(models[i].model.layers) == num_layers_per_model, "len model layers {} not equal to num_layers_per_model {}".format(len(models[i].model.layers), num_layers_per_model)
109
+
110
+ state_dict = dict()
111
+
112
+ def _get_cpu_tensor(tensor: torch.Tensor):
113
+ if tensor is None:
114
+ return None
115
+ if tensor.device == torch.device("cpu"):
116
+ return tensor.detach().clone()
117
+ return tensor.detach().cpu()
118
+
119
+ def _broadcast_tensor(tensor, name, src_pp_rank) -> torch.Tensor:
120
+ """broadcast tensor across mp_group"""
121
+ nonlocal state_dict
122
+ nonlocal mp_group
123
+ src_rank = _megatron_calc_global_rank(tp_rank=0, dp_rank=0, pp_rank=src_pp_rank)
124
+
125
+ if torch.distributed.get_rank() == src_rank:
126
+ if tensor is None:
127
+ weight = None
128
+ tensor_shape = None
129
+ else:
130
+ weight = tensor
131
+ tensor_shape = weight.shape
132
+ else:
133
+ weight = None
134
+ tensor_shape = None
135
+
136
+ obj_list = [tensor_shape]
137
+ dist.broadcast_object_list(obj_list, src=src_rank, group=mp_group)
138
+ tensor_shape = obj_list[0]
139
+
140
+ if tensor_shape is None:
141
+ # all or none ranks in the mp_group should reach here
142
+ print_rank_0(f"tensor:[{name}] not exist, skip collect")
143
+ return
144
+
145
+ if weight is None:
146
+ weight = torch.empty(
147
+ tensor_shape,
148
+ dtype=dtype,
149
+ device=torch.cuda.current_device(),
150
+ requires_grad=False,
151
+ )
152
+
153
+ dist.broadcast(weight, src=src_rank, group=mp_group)
154
+
155
+ if torch.distributed.get_rank() == 0:
156
+ state_dict[name] = _get_cpu_tensor(weight)
157
+
158
+ def _broadcast_tp_shard_tensor(tensor, name, src_pp_rank, concat_dim=0, mutate_func=None) -> torch.Tensor:
159
+ """broadcast tensor in tp shards across mp_group"""
160
+ nonlocal state_dict
161
+ nonlocal mp_group
162
+ tp_size = mpu.get_tensor_model_parallel_world_size()
163
+ src_rank = _megatron_calc_global_rank(tp_rank=0, dp_rank=0, pp_rank=src_pp_rank)
164
+
165
+ chunk_shape = tensor.shape if torch.distributed.get_rank() == src_rank else None
166
+
167
+ obj_list = [chunk_shape]
168
+ dist.broadcast_object_list(obj_list, src=src_rank, group=mp_group)
169
+ chunk_shape = obj_list[0]
170
+ if chunk_shape is None:
171
+ # all or none ranks in the mp_group should reach here
172
+ print_rank_0(f"tp_shard tensor:[{name}] not exist, skip collecting")
173
+ return
174
+
175
+ buffer_tensor = torch.empty(
176
+ chunk_shape,
177
+ dtype=dtype,
178
+ device=torch.cuda.current_device(),
179
+ requires_grad=False,
180
+ )
181
+
182
+ chunk_tensors = [None] * tp_size
183
+
184
+ for i in range(tp_size):
185
+ cur_src_rank = _megatron_calc_global_rank(tp_rank=i, dp_rank=0, pp_rank=src_pp_rank)
186
+ sync_tensor = tensor if torch.distributed.get_rank() == cur_src_rank else buffer_tensor
187
+ dist.broadcast(sync_tensor, src=cur_src_rank, group=mp_group)
188
+
189
+ if torch.distributed.get_rank() == 0:
190
+ chunk_tensors[i] = _get_cpu_tensor(sync_tensor)
191
+
192
+ if torch.distributed.get_rank() == 0:
193
+ full_tensor = torch.concat(chunk_tensors, dim=concat_dim)
194
+ if mutate_func is not None:
195
+ full_tensor = mutate_func(full_tensor)
196
+ state_dict[name] = full_tensor
197
+
198
+ def _broadcast_tp_shard_tensor_gate_up(tensor, gate_name, up_name, src_pp_rank) -> torch.Tensor:
199
+ """broadcast tensor in tp shards across mp_group"""
200
+ nonlocal state_dict
201
+ nonlocal mp_group
202
+ tp_size = mpu.get_tensor_model_parallel_world_size()
203
+ src_rank = _megatron_calc_global_rank(tp_rank=0, dp_rank=0, pp_rank=src_pp_rank)
204
+
205
+ chunk_shape = tensor.shape if torch.distributed.get_rank() == src_rank else None
206
+
207
+ obj_list = [chunk_shape]
208
+ dist.broadcast_object_list(obj_list, src=src_rank, group=mp_group)
209
+ chunk_shape = obj_list[0]
210
+ if chunk_shape is None:
211
+ # all or none ranks in the mp_group should reach here
212
+ print_rank_0(f"tp_shard tensor:[{gate_name, up_name}] not exist, skip collecting")
213
+ return
214
+
215
+ buffer_tensor = torch.empty(
216
+ chunk_shape,
217
+ dtype=dtype,
218
+ device=torch.cuda.current_device(),
219
+ requires_grad=False,
220
+ )
221
+
222
+ chunk_tensors = [None] * tp_size
223
+
224
+ for i in range(tp_size):
225
+ cur_src_rank = _megatron_calc_global_rank(tp_rank=i, dp_rank=0, pp_rank=src_pp_rank)
226
+ sync_tensor = tensor if torch.distributed.get_rank() == cur_src_rank else buffer_tensor
227
+ dist.broadcast(sync_tensor, src=cur_src_rank, group=mp_group)
228
+
229
+ if torch.distributed.get_rank() == 0:
230
+ chunk_tensors[i] = _get_cpu_tensor(sync_tensor)
231
+
232
+ if torch.distributed.get_rank() == 0:
233
+ full_tensor = torch.concat(chunk_tensors, dim=0)
234
+ intermediate_size_tp = config.intermediate_size // tp_size
235
+ gate_weight_list = []
236
+ up_weight_list = []
237
+ for i in range(tp_size):
238
+ gate_up_weight_tp = full_tensor[intermediate_size_tp * 2 * i : intermediate_size_tp * 2 * (i + 1)]
239
+ gate_weight_tp = gate_up_weight_tp[:intermediate_size_tp]
240
+ up_weight_tp = gate_up_weight_tp[intermediate_size_tp:]
241
+ gate_weight_list.append(gate_weight_tp)
242
+ up_weight_list.append(up_weight_tp)
243
+
244
+ state_dict[gate_name] = torch.cat(gate_weight_list, dim=0)
245
+ state_dict[up_name] = torch.cat(up_weight_list, dim=0)
246
+
247
+ def _broadcast_tp_shard_tensor_qkv(tensor, q_name, k_name, v_name, src_pp_rank):
248
+ """broadcast tensor in tp shards across mp_group"""
249
+ nonlocal state_dict
250
+ nonlocal mp_group
251
+ tp_size = mpu.get_tensor_model_parallel_world_size()
252
+ src_rank = _megatron_calc_global_rank(tp_rank=0, dp_rank=0, pp_rank=src_pp_rank)
253
+
254
+ chunk_shape = tensor.shape if torch.distributed.get_rank() == src_rank else None
255
+
256
+ obj_list = [chunk_shape]
257
+ dist.broadcast_object_list(obj_list, src=src_rank, group=mp_group)
258
+ chunk_shape = obj_list[0]
259
+ if chunk_shape is None:
260
+ # all or none ranks in the mp_group should reach here
261
+ print_rank_0(f"tp_shard tensor:[{q_name}] not exist, skip collecting")
262
+ return
263
+
264
+ buffer_tensor = torch.empty(
265
+ chunk_shape,
266
+ dtype=dtype,
267
+ device=torch.cuda.current_device(),
268
+ requires_grad=False,
269
+ )
270
+
271
+ chunk_tensors = [None] * tp_size
272
+
273
+ for i in range(tp_size):
274
+ cur_src_rank = _megatron_calc_global_rank(tp_rank=i, dp_rank=0, pp_rank=src_pp_rank)
275
+ sync_tensor = tensor if torch.distributed.get_rank() == cur_src_rank else buffer_tensor
276
+ dist.broadcast(sync_tensor, src=cur_src_rank, group=mp_group)
277
+
278
+ if torch.distributed.get_rank() == 0:
279
+ chunk_tensors[i] = _get_cpu_tensor(sync_tensor)
280
+
281
+ if torch.distributed.get_rank() == 0:
282
+ full_tensor = torch.concat(chunk_tensors, dim=0)
283
+ q_weight_list = []
284
+ k_weight_list = []
285
+ v_weight_list = []
286
+ hidden_size_per_head = config.hidden_size // config.num_attention_heads
287
+
288
+ if config.num_key_value_heads >= tp_size:
289
+ q_size_tp = config.hidden_size // tp_size
290
+ kv_size_tp = hidden_size_per_head * config.num_key_value_heads // tp_size
291
+ total_size = q_size_tp + 2 * kv_size_tp
292
+ for i in range(tp_size):
293
+ qkv_part = full_tensor[i * total_size : (i + 1) * total_size]
294
+ q_part = qkv_part[:q_size_tp]
295
+ k_part = qkv_part[q_size_tp : q_size_tp + kv_size_tp]
296
+ v_part = qkv_part[q_size_tp + kv_size_tp : total_size]
297
+ q_weight_list.append(q_part)
298
+ k_weight_list.append(k_part)
299
+ v_weight_list.append(v_part)
300
+ else:
301
+ q_size_tp = config.hidden_size // tp_size
302
+ kv_size_tp = hidden_size_per_head
303
+ total_size = q_size_tp + 2 * kv_size_tp
304
+ for i in range(tp_size):
305
+ qkv_part = full_tensor[i * total_size : (i + 1) * total_size]
306
+ q_part = qkv_part[:q_size_tp]
307
+ k_part = qkv_part[q_size_tp : q_size_tp + kv_size_tp]
308
+ v_part = qkv_part[q_size_tp + kv_size_tp : total_size]
309
+ q_weight_list.append(q_part)
310
+ if i * config.num_key_value_heads % tp_size == 0:
311
+ k_weight_list.append(k_part)
312
+ v_weight_list.append(v_part)
313
+
314
+ state_dict[q_name] = torch.cat(q_weight_list, dim=0)
315
+ state_dict[k_name] = torch.cat(k_weight_list, dim=0)
316
+ state_dict[v_name] = torch.cat(v_weight_list, dim=0)
317
+
318
+ # empty cache before collecting weights
319
+ torch.cuda.empty_cache()
320
+ # Embeddings
321
+ # -------------------
322
+ if dp_rank == 0:
323
+ # Embeddings
324
+ # -------------------
325
+ print_rank_0("collecting embeddings...")
326
+ gpt_model_module = _get_gpt_model(models[0])
327
+ _broadcast_tp_shard_tensor(
328
+ gpt_model_module.model.embed_tokens.weight if pp_rank == 0 else None,
329
+ "model.embed_tokens.weight",
330
+ src_pp_rank=0,
331
+ )
332
+
333
+ # Transformer layers
334
+ # -------------------
335
+ layer_map = _megatron_calc_layer_map(config)
336
+ for layer in range(config.num_hidden_layers):
337
+ print_rank_0(f"collecting layer #{layer}...")
338
+ layer_name = f"model.layers.{layer}"
339
+ src_pp_rank, src_virtual_pp_rank, src_layer_idx = layer_map[layer]
340
+
341
+ gpt_model_module = _get_gpt_model(models[src_virtual_pp_rank])
342
+ sync_layer = gpt_model_module.model.layers[src_layer_idx]
343
+
344
+ _broadcast_tensor(
345
+ sync_layer.input_layernorm.weight,
346
+ f"{layer_name}.input_layernorm.weight",
347
+ src_pp_rank=src_pp_rank,
348
+ )
349
+
350
+ _broadcast_tp_shard_tensor_qkv(
351
+ sync_layer.self_attn.qkv_proj.weight,
352
+ f"{layer_name}.self_attn.q_proj.weight",
353
+ f"{layer_name}.self_attn.k_proj.weight",
354
+ f"{layer_name}.self_attn.v_proj.weight",
355
+ src_pp_rank=src_pp_rank,
356
+ )
357
+
358
+ _broadcast_tp_shard_tensor(
359
+ sync_layer.self_attn.o_proj.weight,
360
+ f"{layer_name}.self_attn.o_proj.weight",
361
+ concat_dim=1,
362
+ src_pp_rank=src_pp_rank,
363
+ )
364
+
365
+ _broadcast_tensor(
366
+ sync_layer.post_attention_layernorm.weight,
367
+ f"{layer_name}.post_attention_layernorm.weight",
368
+ src_pp_rank=src_pp_rank,
369
+ )
370
+
371
+ _broadcast_tp_shard_tensor_gate_up(
372
+ sync_layer.mlp.gate_up_proj.weight,
373
+ f"{layer_name}.mlp.gate_proj.weight",
374
+ f"{layer_name}.mlp.up_proj.weight",
375
+ src_pp_rank=src_pp_rank,
376
+ )
377
+
378
+ _broadcast_tp_shard_tensor(
379
+ sync_layer.mlp.down_proj.weight,
380
+ f"{layer_name}.mlp.down_proj.weight",
381
+ concat_dim=1,
382
+ src_pp_rank=src_pp_rank,
383
+ )
384
+
385
+ # Final Layernorm
386
+ # -------------------
387
+ print_rank_0("collecting final layernorm...")
388
+ gpt_model_module = _get_gpt_model(models[-1])
389
+ _broadcast_tensor(
390
+ getattr(gpt_model_module.model.norm, "weight", None),
391
+ "model.norm.weight",
392
+ src_pp_rank=pp_size - 1,
393
+ )
394
+
395
+ print_rank_0("collecting lm_head...")
396
+
397
+ if is_value_model:
398
+ if pp_rank == pp_size - 1:
399
+ print(f"gpt_model_module.lm_head.weight: {gpt_model_module.lm_head.weight.shape}")
400
+ _broadcast_tensor(
401
+ gpt_model_module.lm_head.weight if pp_rank == pp_size - 1 else None,
402
+ "lm_head.weight",
403
+ src_pp_rank=pp_size - 1,
404
+ )
405
+ _broadcast_tensor(
406
+ gpt_model_module.reward_head.weight if pp_rank == pp_size - 1 and getattr(gpt_model_module, "reward_weight", None) is not None else None,
407
+ "reward_head.weight",
408
+ src_pp_rank=pp_size - 1,
409
+ )
410
+
411
+ else:
412
+ _broadcast_tp_shard_tensor(
413
+ getattr(gpt_model_module.lm_head, "weight", None) if pp_rank == pp_size - 1 else None,
414
+ "lm_head.weight",
415
+ src_pp_rank=pp_size - 1,
416
+ )
417
+
418
+ dist.barrier()
419
+
420
+ torch.cuda.empty_cache()
421
+ if torch.distributed.get_rank() == 0:
422
+ if dtype not in [torch.float16, torch.bfloat16, torch.float32]:
423
+ print(f'Unknown/unsupported dtype to save: {dtype}"')
424
+ exit(1)
425
+ for k, v in state_dict.items():
426
+ if dtype != v.dtype:
427
+ state_dict[k] = v.to(dtype)
428
+
429
+ print_rank_0(f"merge megatron ckpt done, time elapsed {time.time() - start_time}s")
430
+ return state_dict
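Note on the saver above: `_broadcast_tp_shard_tensor_gate_up` undoes the per-tensor-parallel-shard gate/up interleaving that the loader applies. Here is a standalone round-trip sketch with plain PyTorch and made-up sizes (illustrative only, not repository code):

```python
import torch

intermediate_size, hidden_size, tp_size = 8, 4, 2     # illustrative sizes only
gate = torch.randn(intermediate_size, hidden_size)
up = torch.randn(intermediate_size, hidden_size)

# Megatron layout: each TP shard stores [gate_shard; up_shard] stacked along dim 0.
per_tp = intermediate_size // tp_size
merged = torch.cat(
    [torch.cat([gate[i * per_tp:(i + 1) * per_tp], up[i * per_tp:(i + 1) * per_tp]], dim=0) for i in range(tp_size)],
    dim=0,
)

# Recover the HF layout, mirroring the slicing in the saver above.
gate_parts, up_parts = [], []
for i in range(tp_size):
    block = merged[2 * per_tp * i : 2 * per_tp * (i + 1)]
    gate_parts.append(block[:per_tp])
    up_parts.append(block[per_tp:])

assert torch.equal(torch.cat(gate_parts), gate)
assert torch.equal(torch.cat(up_parts), up)
```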
verl/models/llama/megatron/layers/__init__.py ADDED
@@ -0,0 +1,25 @@
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from .parallel_attention import ParallelLlamaAttention
16
+ from .parallel_decoder import ParallelLlamaDecoderLayer, ParallelLlamaDecoderLayerRmPad
17
+ from .parallel_linear import (
18
+ LinearForLastLayer,
19
+ MergedColumnParallelLinear,
20
+ QKVParallelLinear,
21
+ )
22
+ from .parallel_mlp import ParallelLlamaMLP
23
+ from .parallel_rmsnorm import ParallelLlamaRMSNorm
24
+
25
+ __all__ = ["LinearForLastLayer", "MergedColumnParallelLinear", "QKVParallelLinear", "ParallelLlamaAttention", "ParallelLlamaDecoderLayer", "ParallelLlamaDecoderLayerRmPad", "ParallelLlamaMLP", "ParallelLlamaRMSNorm"]
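The `QKVParallelLinear` exported here emits one fused projection per tensor-parallel rank; the attention layer in the next file splits it by `q_size`/`k_size`/`v_size`. A hedged sketch of that split with illustrative GQA sizes (plain PyTorch, not repository code):

```python
import torch

hidden_size, num_heads, num_kv_heads, tp_size = 32, 8, 4, 2   # illustrative sizes only
head_dim = hidden_size // num_heads                           # 4
q_size = (num_heads // tp_size) * head_dim                    # 16 per TP rank
kv_size = (num_kv_heads // tp_size) * head_dim                # 8 per TP rank

qkv = torch.randn(3, 5, q_size + 2 * kv_size)                 # (batch, seq, fused projection)
q, k, v = qkv.split([q_size, kv_size, kv_size], dim=-1)
assert q.shape[-1] == 16 and k.shape[-1] == 8 and v.shape[-1] == 8
```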
verl/models/llama/megatron/layers/parallel_attention.py ADDED
@@ -0,0 +1,425 @@
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
5
+ # and OPT implementations in this library. It has been modified from its
6
+ # original forms to accommodate minor architectural differences compared
7
+ # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
8
+ #
9
+ # Licensed under the Apache License, Version 2.0 (the "License");
10
+ # you may not use this file except in compliance with the License.
11
+ # You may obtain a copy of the License at
12
+ #
13
+ # http://www.apache.org/licenses/LICENSE-2.0
14
+ #
15
+ # Unless required by applicable law or agreed to in writing, software
16
+ # distributed under the License is distributed on an "AS IS" BASIS,
17
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18
+ # See the License for the specific language governing permissions and
19
+ # limitations under the License.
20
+
21
+ import math
22
+ from typing import Optional, Tuple
23
+
24
+ import torch
25
+ import torch.nn.functional as F
26
+ from einops import rearrange
27
+ from flash_attn.layers.rotary import apply_rotary_emb
28
+ from megatron.core import ModelParallelConfig, tensor_parallel
29
+ from megatron.core import parallel_state as mpu
30
+ from torch import nn
31
+ from transformers import LlamaConfig
32
+ from transformers.utils import is_flash_attn_2_available
33
+
34
+ from verl.models.llama.megatron.layers.parallel_linear import QKVParallelLinear
35
+ from verl.utils.megatron import tensor_parallel as tp_utils
36
+
37
+
38
+ class LlamaRotaryEmbedding(nn.Module):
39
+ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
40
+ super().__init__()
41
+
42
+ self.dim = dim
43
+ self.max_position_embeddings = max_position_embeddings
44
+ self.base = base
45
+ inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
46
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
47
+
48
+ # Build here to make `torch.jit.trace` work.
49
+ self._set_cos_sin_cache(seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype())
50
+
51
+ def _set_cos_sin_cache(self, seq_len, device, dtype):
52
+ self.max_seq_len_cached = seq_len
53
+ t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
54
+
55
+ freqs = torch.einsum("i,j->ij", t, self.inv_freq)
56
+ # Different from paper, but it uses a different permutation in order to obtain the same calculation
57
+ emb = torch.cat((freqs, freqs), dim=-1)
58
+ self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
59
+ self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
60
+
61
+ def forward(self, x, seq_len=None):
62
+ # x: [bs, num_attention_heads, seq_len, head_size]
63
+ if seq_len > self.max_seq_len_cached:
64
+ self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
65
+
66
+ return (
67
+ self.cos_cached[:seq_len].to(dtype=x.dtype),
68
+ self.sin_cached[:seq_len].to(dtype=x.dtype),
69
+ )
70
+
71
+
72
+ class LlamaLinearScalingRotaryEmbedding(LlamaRotaryEmbedding):
73
+ """LlamaRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""
74
+
75
+ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
76
+ self.scaling_factor = scaling_factor
77
+ super().__init__(dim, max_position_embeddings, base, device)
78
+
79
+ def _set_cos_sin_cache(self, seq_len, device, dtype):
80
+ self.max_seq_len_cached = seq_len
81
+ t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
82
+ t = t / self.scaling_factor
83
+
84
+ freqs = torch.einsum("i,j->ij", t, self.inv_freq)
85
+ # Different from paper, but it uses a different permutation in order to obtain the same calculation
86
+ emb = torch.cat((freqs, freqs), dim=-1)
87
+ self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
88
+ self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
89
+
90
+
91
+ class LlamaDynamicNTKScalingRotaryEmbedding(LlamaRotaryEmbedding):
92
+ """LlamaRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla"""
93
+
94
+ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
95
+ self.scaling_factor = scaling_factor
96
+ super().__init__(dim, max_position_embeddings, base, device)
97
+
98
+ def _set_cos_sin_cache(self, seq_len, device, dtype):
99
+ self.max_seq_len_cached = seq_len
100
+
101
+ if seq_len > self.max_position_embeddings:
102
+ base = self.base * ((self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1)) ** (self.dim / (self.dim - 2))
103
+ inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
104
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
105
+
106
+ t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
107
+
108
+ freqs = torch.einsum("i,j->ij", t, self.inv_freq)
109
+ # Different from paper, but it uses a different permutation in order to obtain the same calculation
110
+ emb = torch.cat((freqs, freqs), dim=-1)
111
+ self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
112
+ self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
113
+
114
+
115
+ class LlamaLlama3ScalingRotaryEmbedding(LlamaRotaryEmbedding):
116
+ def __init__(self, dim, config, max_position_embeddings=2048, base=10000, device=None):
117
+ super().__init__(dim, max_position_embeddings, base, device)
118
+
119
+ self.factor = config.rope_scaling["factor"] # `8` in the original implementation
120
+ self.high_freq_factor = config.rope_scaling["high_freq_factor"] # `1` in the original implementation
121
+ self.low_freq_factor = config.rope_scaling["low_freq_factor"] # `4` in the original implementation
122
+ self.old_context_len = config.rope_scaling["original_max_position_embeddings"] # `8192` in the original implementation
123
+
124
+ low_freq_wavelen = self.old_context_len / self.low_freq_factor
125
+ high_freq_wavelen = self.old_context_len / self.high_freq_factor
126
+
127
+ wavelen = 2 * math.pi / self.inv_freq
128
+ # wavelen < high_freq_wavelen: do nothing; wavelen > low_freq_wavelen: divide by factor
129
+ inv_freq_llama = torch.where(wavelen > low_freq_wavelen, self.inv_freq / self.factor, self.inv_freq)
130
+ # otherwise: interpolate between the two, using a smooth factor
131
+ smooth_factor = (self.old_context_len / wavelen - self.low_freq_factor) / (self.high_freq_factor - self.low_freq_factor)
132
+ smoothed_inv_freq = (1 - smooth_factor) * inv_freq_llama / self.factor + smooth_factor * inv_freq_llama
133
+ is_medium_freq = ~(wavelen < high_freq_wavelen) * ~(wavelen > low_freq_wavelen)
134
+ inv_freq = torch.where(is_medium_freq, smoothed_inv_freq, inv_freq_llama)
135
+
136
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
137
+
138
+ # Build here to make `torch.jit.trace` work.
139
+ self._set_cos_sin_cache(seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype())
140
+
141
+
142
+ def rotate_half(x):
143
+ """Rotates half the hidden dims of the input."""
144
+ x1 = x[..., : x.shape[-1] // 2]
145
+ x2 = x[..., x.shape[-1] // 2 :]
146
+ return torch.cat((-x2, x1), dim=-1)
147
+
148
+
149
+ def apply_rotary_pos_emb(q, k, cos, sin, position_ids):
150
+ cos = cos[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim]
151
+ sin = sin[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim]
152
+ q_embed = (q * cos) + (rotate_half(q) * sin)
153
+ k_embed = (k * cos) + (rotate_half(k) * sin)
154
+ return q_embed, k_embed
155
+
156
+
157
+ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
158
+ """
159
+ This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
160
+ num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
161
+ """
162
+ batch, num_key_value_heads, slen, head_dim = hidden_states.shape
163
+ if n_rep == 1:
164
+ return hidden_states
165
+ hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
166
+ return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
167
+
168
+
169
+ class ParallelLlamaAttention(nn.Module):
170
+ """Multi-headed attention from 'Attention Is All You Need' paper"""
171
+
172
+ def __init__(self, config: LlamaConfig, megatron_config: ModelParallelConfig):
173
+ super().__init__()
174
+ self.config = config
175
+ self.megatron_config = megatron_config
176
+ self.hidden_size = config.hidden_size
177
+ self.num_heads = config.num_attention_heads
178
+ self.head_dim = self.hidden_size // self.num_heads
179
+ self.num_key_value_heads = config.num_key_value_heads
180
+ self.num_key_value_groups = self.num_heads // self.num_key_value_heads
181
+ self.max_position_embeddings = config.max_position_embeddings
182
+ self.rope_theta = config.rope_theta
183
+
184
+ # assign values after tp
185
+ tp_size = mpu.get_tensor_model_parallel_world_size()
186
+ assert self.num_heads % tp_size == 0, f"num_head must be divisible by tp_size. Got num_head={self.num_heads}, tp_size={tp_size}"
187
+ assert self.num_key_value_heads % tp_size == 0, f"num_key_value_heads must be divisible by tp_size. Got num_key_value_heads={self.num_key_value_heads}, tp_size={tp_size}"
188
+
189
+ self.num_heads_per_tp = self.num_heads // tp_size
190
+ self.num_key_value_heads_per_tp = self.num_key_value_heads // tp_size
191
+ self.hidden_size_per_tp = self.hidden_size // tp_size
192
+
193
+ if (self.head_dim * self.num_heads) != self.hidden_size:
194
+ raise ValueError(f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size} and `num_heads`: {self.num_heads}).")
195
+
196
+ column_kwargs = tp_utils.get_default_kwargs_for_column_parallel_linear()
197
+ row_kwargs = tp_utils.get_default_kwargs_for_row_parallel_linear()
198
+
199
+ if megatron_config is not None:
200
+ assert column_kwargs.get("config", False), "must have ModelParallelConfig"
201
+ assert row_kwargs.get("config", False), "must have ModelParallelConfig"
202
+ tp_utils.update_kwargs_with_config(column_kwargs, megatron_config)
203
+ tp_utils.update_kwargs_with_config(row_kwargs, megatron_config)
204
+
205
+ # [self.q_size, self.k_size, self.v_size]
206
+ self.qkv_proj = QKVParallelLinear(
207
+ input_size=self.hidden_size,
208
+ num_heads=self.num_heads,
209
+ num_key_value_heads=self.num_key_value_heads,
210
+ head_dim=self.head_dim,
211
+ bias=config.attention_bias,
212
+ gather_output=False,
213
+ skip_bias_add=False,
214
+ **column_kwargs,
215
+ )
216
+
217
+ self.q_size = self.num_heads_per_tp * self.head_dim
218
+ self.k_size = self.num_key_value_heads_per_tp * self.head_dim
219
+ self.v_size = self.num_key_value_heads_per_tp * self.head_dim
220
+
221
+ self.o_proj = tensor_parallel.RowParallelLinear(
222
+ input_size=self.num_heads * self.head_dim,
223
+ output_size=self.hidden_size,
224
+ bias=config.attention_bias,
225
+ input_is_parallel=True,
226
+ skip_bias_add=False,
227
+ **row_kwargs,
228
+ )
229
+
230
+ self._init_rope()
231
+
232
+ def _init_rope(self):
233
+ if self.config.rope_scaling is None:
234
+ self.rotary_emb = LlamaRotaryEmbedding(
235
+ self.head_dim,
236
+ max_position_embeddings=self.max_position_embeddings,
237
+ base=self.rope_theta,
238
+ )
239
+ else:
240
+ rope_type_key = "type" if "type" in self.config.rope_scaling else "rope_type"
241
+ scaling_type = self.config.rope_scaling[rope_type_key]
242
+ scaling_factor = self.config.rope_scaling["factor"]
243
+ if scaling_type == "linear":
244
+ self.rotary_emb = LlamaLinearScalingRotaryEmbedding(
245
+ self.head_dim,
246
+ max_position_embeddings=self.max_position_embeddings,
247
+ scaling_factor=scaling_factor,
248
+ base=self.rope_theta,
249
+ )
250
+ elif scaling_type == "dynamic":
251
+ self.rotary_emb = LlamaDynamicNTKScalingRotaryEmbedding(
252
+ self.head_dim,
253
+ max_position_embeddings=self.max_position_embeddings,
254
+ scaling_factor=scaling_factor,
255
+ base=self.rope_theta,
256
+ )
257
+ elif scaling_type == "llama3":
258
+ self.rotary_emb = LlamaLlama3ScalingRotaryEmbedding(
259
+ self.head_dim,
260
+ self.config,
261
+ max_position_embeddings=self.max_position_embeddings,
262
+ base=self.rope_theta,
263
+ )
264
+ else:
265
+ raise ValueError(f"Unknown RoPE scaling type {scaling_type}")
266
+
267
+ def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
268
+ return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
269
+
270
+ def forward(
271
+ self,
272
+ hidden_states: torch.Tensor,
273
+ attention_mask: Optional[torch.Tensor] = None,
274
+ position_ids: Optional[torch.LongTensor] = None,
275
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
276
+ bsz, q_len, _ = hidden_states.size()
277
+ qkv = self.qkv_proj(hidden_states)[0]
278
+ query_states, key_states, value_states = qkv.split([self.q_size, self.k_size, self.v_size], dim=-1)
279
+
280
+ query_states = query_states.view(bsz, q_len, self.num_heads_per_tp, self.head_dim).transpose(1, 2)
281
+ key_states = key_states.view(bsz, q_len, self.num_key_value_heads_per_tp, self.head_dim).transpose(1, 2)
282
+ value_states = value_states.view(bsz, q_len, self.num_key_value_heads_per_tp, self.head_dim).transpose(1, 2)
283
+
284
+ kv_seq_len = key_states.shape[-2]
285
+ cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
286
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
287
+
288
+ key_states = repeat_kv(key_states, self.num_key_value_groups)
289
+ value_states = repeat_kv(value_states, self.num_key_value_groups)
290
+
291
+ attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
292
+
293
+ if attn_weights.size() != (bsz, self.num_heads_per_tp, q_len, kv_seq_len):
294
+ raise ValueError(f"Attention weights should be of size {(bsz, self.num_heads_per_tp, q_len, kv_seq_len)}, but is {attn_weights.size()}")
295
+
296
+ if attention_mask is not None:
297
+ if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
298
+ raise ValueError(f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}")
299
+ attn_weights = attn_weights + attention_mask
300
+
301
+ # upcast attention to fp32
302
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
303
+ attn_output = torch.matmul(attn_weights, value_states)
304
+
305
+ if attn_output.size() != (bsz, self.num_heads_per_tp, q_len, self.head_dim):
306
+ raise ValueError(f"`attn_output` should be of size {(bsz, self.num_heads_per_tp, q_len, self.head_dim)}, but is {attn_output.size()}")
307
+
308
+ attn_output = attn_output.transpose(1, 2).contiguous()
309
+ attn_output = attn_output.reshape(bsz, q_len, self.hidden_size_per_tp)
310
+ attn_output = self.o_proj(attn_output)[0]
311
+ return attn_output
312
+
313
+
314
+ """
315
+ Remove padding Attention
316
+ - Using Flash-attn 2
317
+ - Compatible with sequence parallel
318
+ """
319
+
320
+
321
+ if is_flash_attn_2_available():
322
+ from flash_attn import flash_attn_varlen_func
323
+ from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa
324
+
325
+
326
+ def apply_rotary_pos_emb_rmpad(q, k, cos, sin, position_ids, indices, sequence_length):
327
+ batch_size = position_ids.shape[0]
328
+
329
+ q = pad_input(q, indices, batch_size, sequence_length) # (batch_size, seqlen, num_head, head_dim)
330
+ k = pad_input(k, indices, batch_size, sequence_length)
331
+ cos = cos[position_ids].unsqueeze(2) # [bs, seq_len, 1, dim]
332
+ sin = sin[position_ids].unsqueeze(2) # [bs, seq_len, 1, dim]
333
+ q_embed = (q * cos) + (rotate_half(q) * sin)
334
+ k_embed = (k * cos) + (rotate_half(k) * sin)
335
+
336
+ q_embed = index_first_axis(rearrange(q_embed, "b s ... -> (b s) ..."), indices)
337
+ k_embed = index_first_axis(rearrange(k_embed, "b s ... -> (b s) ..."), indices)
338
+
339
+ return q_embed, k_embed
340
+
341
+
342
+ # use flash-attn rotary embeddings with rmpad
343
+ # cos/sin should be: (seq_length, rotary_dim / 2)
344
+ def apply_rotary_pos_emb_rmpad_flash(q, k, cos, sin, cu_seqlens, max_seqlen):
345
+ q_embed = apply_rotary_emb(q, cos, sin, interleaved=False, inplace=False, cu_seqlens=cu_seqlens, max_seqlen=max_seqlen)
346
+ k_embed = apply_rotary_emb(k, cos, sin, interleaved=False, inplace=False, cu_seqlens=cu_seqlens, max_seqlen=max_seqlen)
347
+ return q_embed, k_embed
348
+
349
+
350
+ class ParallelLlamaAttentionRmPad(ParallelLlamaAttention):
351
+ def forward(
352
+ self,
353
+ hidden_states: torch.Tensor,
354
+ position_ids: Optional[torch.LongTensor] = None,
355
+ sequence_length: int = None,
356
+ indices: torch.Tensor = None,
357
+ cu_seqlens: torch.Tensor = None,
358
+ max_seqlen_in_batch: int = None,
359
+ ):
360
+ total_nnz, _, _ = hidden_states.size() # This is the total_nnz padded after sequence parallel
361
+
362
+ if self.megatron_config.sequence_parallel:
363
+ total_nnz = total_nnz * mpu.get_tensor_model_parallel_world_size()
364
+
365
+ qkv = self.qkv_proj(hidden_states)[0]
366
+ query_states, key_states, value_states = qkv.split([self.q_size, self.k_size, self.v_size], dim=-1) # (total_nnz, 1, hidden_size)
367
+
368
+ if self.megatron_config.sequence_parallel:
369
+ sequence_parallel_pad = total_nnz - cu_seqlens[-1]
370
+ total_nnz = cu_seqlens[-1] # total_nnz before sp padding
371
+ query_states = query_states[:total_nnz]
372
+ key_states = key_states[:total_nnz]
373
+ value_states = value_states[:total_nnz]
374
+
375
+ # Flash attention requires the input to have the shape
376
+ # batch_size x seq_length x head_dim x hidden_dim
377
+ # therefore we just need to keep the original shape
378
+ query_states = query_states.view(total_nnz, self.num_heads_per_tp, self.head_dim)
379
+ key_states = key_states.view(total_nnz, self.num_key_value_heads_per_tp, self.head_dim)
380
+ value_states = value_states.view(total_nnz, self.num_key_value_heads_per_tp, self.head_dim)
381
+
382
+ cos, sin = self.rotary_emb(value_states, seq_len=sequence_length)
383
+ cos, sin = cos[:, : cos.shape[1] // 2], sin[:, : sin.shape[1] // 2] # flash attn only needs half
384
+ query_states, key_states = apply_rotary_pos_emb_rmpad_flash(query_states, key_states, cos, sin, cu_seqlens=cu_seqlens, max_seqlen=max_seqlen_in_batch)
385
+ # query_states, key_states = apply_rotary_pos_emb_rmpad(query_states, key_states, cos, sin, position_ids, indices,
386
+
387
+ # TODO: llama does not have dropout in the config??
388
+ # It is recommended to use dropout with FA according to the docs
389
+ # when training.
390
+ dropout_rate = 0.0 # if not self.training else self.attn_dropout
391
+
392
+ # In PEFT, usually we cast the layer norms in float32 for training stability reasons
393
+ # therefore the input hidden states get silently cast to float32. Hence, we need to
394
+ # cast them back to float16 just to be sure everything works as expected.
395
+ # This might slow down training & inference, so it is recommended not to cast the LayerNorms
396
+ # in fp32. (LlamaRMSNorm handles it correctly)
397
+ input_dtype = query_states.dtype
398
+ if input_dtype == torch.float32:
399
+ query_states = query_states.to(torch.float16)
400
+ key_states = key_states.to(torch.float16)
401
+ value_states = value_states.to(torch.float16)
402
+
403
+ attn_output_unpad = flash_attn_varlen_func(
404
+ query_states,
405
+ key_states,
406
+ value_states,
407
+ cu_seqlens_q=cu_seqlens,
408
+ cu_seqlens_k=cu_seqlens,
409
+ max_seqlen_q=max_seqlen_in_batch,
410
+ max_seqlen_k=max_seqlen_in_batch,
411
+ dropout_p=dropout_rate,
412
+ softmax_scale=None,
413
+ causal=True,
414
+ )
415
+
416
+ attn_output_unpad = attn_output_unpad.to(input_dtype)
417
+ attn_output_unpad = attn_output_unpad.reshape(total_nnz, 1, self.hidden_size_per_tp).contiguous()
418
+
419
+ # sequence parallel reduce_scatter is performed inside RowColumnParallel if enabled
420
+ # Here we need to repad
421
+ if self.megatron_config.sequence_parallel:
422
+ attn_output_unpad = F.pad(attn_output_unpad, pad=(0, 0, 0, 0, 0, sequence_parallel_pad))
423
+
424
+ attn_output_unpad = self.o_proj(attn_output_unpad)[0]
425
+ return attn_output_unpad
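A small self-contained sanity check of the rotary helpers defined above (`rotate_half` / `apply_rotary_pos_emb`), rebuilt here with plain PyTorch and made-up shapes; it only relies on the fact that RoPE is a rotation and therefore norm-preserving (illustrative only, not repository code):

```python
import torch

def rotate_half(x):
    x1, x2 = x[..., : x.shape[-1] // 2], x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)

bsz, heads, seq, dim = 1, 2, 4, 8
q = torch.randn(bsz, heads, seq, dim)

# Build cos/sin caches the same way LlamaRotaryEmbedding does.
inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2).float() / dim))
freqs = torch.einsum("i,j->ij", torch.arange(seq).float(), inv_freq)
emb = torch.cat((freqs, freqs), dim=-1)          # (seq, dim)
cos, sin = emb.cos(), emb.sin()

position_ids = torch.arange(seq).unsqueeze(0)    # (bsz, seq)
cos_sel = cos[position_ids].unsqueeze(1)         # (bsz, 1, seq, dim)
sin_sel = sin[position_ids].unsqueeze(1)
q_rot = q * cos_sel + rotate_half(q) * sin_sel

# Each (x_i, x_{i+dim/2}) pair is rotated by the same angle, so norms are preserved.
assert torch.allclose(q_rot.norm(dim=-1), q.norm(dim=-1), atol=1e-5)
```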
verl/models/llama/megatron/layers/parallel_decoder.py ADDED
@@ -0,0 +1,150 @@
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
5
+ # and OPT implementations in this library. It has been modified from its
6
+ # original forms to accommodate minor architectural differences compared
7
+ # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
8
+ #
9
+ # Licensed under the Apache License, Version 2.0 (the "License");
10
+ # you may not use this file except in compliance with the License.
11
+ # You may obtain a copy of the License at
12
+ #
13
+ # http://www.apache.org/licenses/LICENSE-2.0
14
+ #
15
+ # Unless required by applicable law or agreed to in writing, software
16
+ # distributed under the License is distributed on an "AS IS" BASIS,
17
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18
+ # See the License for the specific language governing permissions and
19
+ # limitations under the License.
20
+
21
+ from typing import Optional, Tuple
22
+
23
+ import torch
24
+ from megatron.core import ModelParallelConfig
25
+ from torch import nn
26
+ from transformers import LlamaConfig
27
+
28
+ from verl.utils.megatron_utils import TransformerConfig, convert_config
29
+
30
+ from .parallel_attention import ParallelLlamaAttention, ParallelLlamaAttentionRmPad
31
+ from .parallel_mlp import ParallelLlamaMLP
32
+ from .parallel_rmsnorm import ParallelLlamaRMSNorm
33
+
34
+
35
+ class ParallelLlamaDecoderLayer(nn.Module):
36
+ def __init__(self, config: LlamaConfig, megatron_config: ModelParallelConfig, layer_idx: int):
37
+ super().__init__()
38
+ self.config: TransformerConfig = convert_config(config, megatron_config)
39
+ self.layer_idx = layer_idx
40
+ self.hidden_size = config.hidden_size
41
+ self.self_attn = ParallelLlamaAttention(config=config, megatron_config=megatron_config)
42
+
43
+ self.mlp = ParallelLlamaMLP(config, megatron_config=megatron_config)
44
+ self.input_layernorm = ParallelLlamaRMSNorm(config, megatron_config)
45
+ self.post_attention_layernorm = ParallelLlamaRMSNorm(config, megatron_config)
46
+
47
+ def forward(
48
+ self,
49
+ hidden_states: torch.Tensor,
50
+ attention_mask: Optional[torch.Tensor] = None,
51
+ position_ids: Optional[torch.LongTensor] = None,
52
+ ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
53
+ """
54
+ Args:
55
+ hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
56
+ attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
57
+ `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
58
+ output_attentions (`bool`, *optional*):
59
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
60
+ returned tensors for more detail.
61
+ use_cache (`bool`, *optional*):
62
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
63
+ (see `past_key_values`).
64
+ past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
65
+ """
66
+
67
+ residual = hidden_states
68
+
69
+ hidden_states = self.input_layernorm(hidden_states)
70
+
71
+ # Note: sequence parallel is hidden inside ColumnParallelLinear
72
+ # reduce scatter is hidden inside RowParallelLinear
73
+
74
+ # Self Attention
75
+ hidden_states = self.self_attn(
76
+ hidden_states=hidden_states,
77
+ attention_mask=attention_mask,
78
+ position_ids=position_ids,
79
+ )
80
+
81
+ # TODO: add sequence parallel operator reduce_scatter here
82
+
83
+ hidden_states = residual + hidden_states
84
+
85
+ # Fully Connected
86
+ residual = hidden_states
87
+ hidden_states = self.post_attention_layernorm(hidden_states)
88
+
89
+ # TODO: add sequence parallel operator all_gather here
90
+
91
+ hidden_states = self.mlp(hidden_states)
92
+
93
+ # TODO: add sequence parallel operator reduce_scatter here
94
+
95
+ hidden_states = residual + hidden_states
96
+
97
+ outputs = hidden_states
98
+
99
+ return outputs
100
+
101
+
102
+ class ParallelLlamaDecoderLayerRmPad(nn.Module):
103
+ def __init__(self, config: LlamaConfig, megatron_config: ModelParallelConfig, layer_idx: int):
104
+ super().__init__()
105
+ self.config: TransformerConfig = convert_config(config, megatron_config)
106
+ self.layer_idx = layer_idx
107
+ self.hidden_size = config.hidden_size
108
+ self.self_attn = ParallelLlamaAttentionRmPad(config=config, megatron_config=megatron_config)
109
+
110
+ self.mlp = ParallelLlamaMLP(config, megatron_config=megatron_config)
111
+ self.input_layernorm = ParallelLlamaRMSNorm(config, megatron_config)
112
+ self.post_attention_layernorm = ParallelLlamaRMSNorm(config, megatron_config)
113
+
114
+ def forward(
115
+ self,
116
+ hidden_states: torch.Tensor,
117
+ position_ids: Optional[torch.LongTensor] = None,
118
+ sequence_length: int = None,
119
+ indices: torch.Tensor = None,
120
+ cu_seqlens: int = None,
121
+ max_seqlen_in_batch: int = None,
122
+ ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
123
+ residual = hidden_states # (total_nnz // sp, 1, hidden_size)
124
+
125
+ hidden_states = self.input_layernorm(hidden_states)
126
+
127
+ # Self Attention
128
+ # (total_nnz // sp, 1, hidden_size) -> all-gather (total_nnz, 1, hidden_size)
129
+ # -> col + row -> reduce-scatter -> (total_nnz // sp, 1, hidden_size)
130
+ hidden_states = self.self_attn(
131
+ hidden_states=hidden_states,
132
+ position_ids=position_ids,
133
+ sequence_length=sequence_length,
134
+ indices=indices,
135
+ cu_seqlens=cu_seqlens,
136
+ max_seqlen_in_batch=max_seqlen_in_batch,
137
+ )
138
+
139
+ hidden_states = residual + hidden_states
140
+
141
+ # Fully Connected
142
+ # shape changes same as attn
143
+ residual = hidden_states
144
+ hidden_states = self.post_attention_layernorm(hidden_states)
145
+ hidden_states = self.mlp(hidden_states)
146
+ hidden_states = residual + hidden_states
147
+
148
+ outputs = hidden_states
149
+
150
+ return outputs
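
The decoder layers above follow the standard pre-norm residual pattern: RMSNorm, attention, residual add, then RMSNorm, MLP, residual add, with the tensor-parallel gather/scatter hidden inside the column/row parallel linears. A minimal single-GPU sketch of that control flow (plain nn.Modules standing in for the parallel attention/MLP, LayerNorm standing in for RMSNorm; all names here are illustrative only, not part of the commit):

import torch
from torch import nn

class TinyPreNormBlock(nn.Module):
    # Illustrative stand-in for ParallelLlamaDecoderLayer's residual structure.
    def __init__(self, hidden_size: int, attn: nn.Module, mlp: nn.Module):
        super().__init__()
        self.input_layernorm = nn.LayerNorm(hidden_size)
        self.post_attention_layernorm = nn.LayerNorm(hidden_size)
        self.attn, self.mlp = attn, mlp

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # attention sub-block with residual connection
        hidden_states = hidden_states + self.attn(self.input_layernorm(hidden_states))
        # MLP sub-block with residual connection
        return hidden_states + self.mlp(self.post_attention_layernorm(hidden_states))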
verl/models/llama/megatron/layers/parallel_linear.py ADDED
@@ -0,0 +1,106 @@
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ # Copyright 2023 The vLLM team.
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ # Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/linear.py
15
+
16
+ import torch
17
+ from megatron.core import tensor_parallel
18
+
19
+
20
+ class QKVParallelLinear(tensor_parallel.ColumnParallelLinear):
21
+ def __init__(
22
+ self,
23
+ input_size,
24
+ num_heads,
25
+ num_key_value_heads,
26
+ head_dim,
27
+ *,
28
+ bias=True,
29
+ gather_output=True,
30
+ skip_bias_add=False,
31
+ **kwargs,
32
+ ):
33
+ # Keep input parameters, and already restrict the head numbers
34
+ self.input_size = input_size
35
+ self.q_output_size = num_heads * head_dim
36
+ self.kv_output_size = num_key_value_heads * head_dim
37
+ self.head_dim = head_dim
38
+ self.gather_output = gather_output
39
+ self.skip_bias_add = skip_bias_add
40
+
41
+ input_size = self.input_size
42
+ output_size = (num_heads + 2 * num_key_value_heads) * self.head_dim
43
+
44
+ super().__init__(
45
+ input_size=input_size,
46
+ output_size=output_size,
47
+ bias=bias,
48
+ gather_output=gather_output,
49
+ skip_bias_add=skip_bias_add,
50
+ **kwargs,
51
+ )
52
+
53
+
54
+ class MergedColumnParallelLinear(tensor_parallel.ColumnParallelLinear):
55
+ def __init__(
56
+ self,
57
+ input_size,
58
+ gate_ouput_size,
59
+ up_output_size,
60
+ *,
61
+ bias=True,
62
+ gather_output=True,
63
+ skip_bias_add=False,
64
+ **kwargs,
65
+ ):
66
+ # Keep input parameters, and already restrict the head numbers
67
+ self.input_size = input_size
68
+ self.output_size = gate_ouput_size + up_output_size
69
+ self.gather_output = gather_output
70
+ self.skip_bias_add = skip_bias_add
71
+
72
+ super().__init__(
73
+ input_size=self.input_size,
74
+ output_size=self.output_size,
75
+ bias=bias,
76
+ gather_output=gather_output,
77
+ skip_bias_add=skip_bias_add,
78
+ **kwargs,
79
+ )
80
+
81
+
82
+ class LinearForLastLayer(torch.nn.Linear):
83
+ def __init__(
84
+ self,
85
+ input_size,
86
+ output_size,
87
+ *,
88
+ config,
89
+ bias=True,
90
+ ):
91
+ super().__init__(in_features=input_size, out_features=output_size, bias=bias)
92
+ self.sequence_parallel = config.sequence_parallel
93
+ if self.sequence_parallel:
94
+ self.weight.sequence_parallel = True
95
+
96
+ def forward(
97
+ self,
98
+ input_,
99
+ weight=None,
100
+ runtime_gather_output=None,
101
+ ):
102
+ logits = super().forward(input_)
103
+ logits = logits.float()
104
+ if self.sequence_parallel:
105
+ logits = tensor_parallel.gather_from_sequence_parallel_region(logits, tensor_parallel_output_grad=False)
106
+ return logits, None
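
QKVParallelLinear above fuses the query, key and value projections into a single column-parallel matmul whose output width is (num_heads + 2 * num_key_value_heads) * head_dim; the attention layer then splits that output back into q/k/v. A standalone sketch of the packed projection and split, without Megatron and with made-up sizes (in the parallel version each rank only holds its share of the heads):

import torch
from torch import nn

# Hypothetical sizes for illustration only.
hidden_size, num_heads, num_kv_heads, head_dim = 64, 8, 2, 8

qkv_proj = nn.Linear(hidden_size, (num_heads + 2 * num_kv_heads) * head_dim, bias=False)
x = torch.randn(4, 16, hidden_size)  # (batch, seq, hidden)
q, k, v = qkv_proj(x).split(
    [num_heads * head_dim, num_kv_heads * head_dim, num_kv_heads * head_dim], dim=-1
)
print(q.shape, k.shape, v.shape)  # (4, 16, 64) (4, 16, 16) (4, 16, 16)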
verl/models/llama/megatron/layers/parallel_mlp.py ADDED
@@ -0,0 +1,74 @@
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
5
+ # and OPT implementations in this library. It has been modified from its
6
+ # original forms to accommodate minor architectural differences compared
7
+ # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
8
+ #
9
+ # Licensed under the Apache License, Version 2.0 (the "License");
10
+ # you may not use this file except in compliance with the License.
11
+ # You may obtain a copy of the License at
12
+ #
13
+ # http://www.apache.org/licenses/LICENSE-2.0
14
+ #
15
+ # Unless required by applicable law or agreed to in writing, software
16
+ # distributed under the License is distributed on an "AS IS" BASIS,
17
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18
+ # See the License for the specific language governing permissions and
19
+ # limitations under the License.
20
+
21
+ from megatron.core import ModelParallelConfig, tensor_parallel
22
+ from megatron.core import parallel_state as mpu
23
+ from torch import nn
24
+ from transformers.activations import ACT2FN
25
+
26
+ from verl.models.llama.megatron.layers.parallel_linear import MergedColumnParallelLinear
27
+ from verl.utils.megatron import tensor_parallel as tp_utils
28
+
29
+
30
+ class ParallelLlamaMLP(nn.Module):
31
+ def __init__(self, config, megatron_config: ModelParallelConfig = None) -> None:
32
+ super().__init__()
33
+ self.config = config
34
+ self.hidden_size = config.hidden_size
35
+ self.intermediate_size = config.intermediate_size
36
+ # The weight is only [hidden_size, intermediate_size // model_parallel_world_size]
37
+
38
+ column_kwargs = tp_utils.get_default_kwargs_for_column_parallel_linear()
39
+ row_kwargs = tp_utils.get_default_kwargs_for_row_parallel_linear()
40
+
41
+ if megatron_config is not None:
42
+ assert column_kwargs.get("config", False), "must have ModelParallelConfig"
43
+ assert row_kwargs.get("config", False), "must have ModelParallelConfig"
44
+ tp_utils.update_kwargs_with_config(row_kwargs, megatron_config)
45
+ tp_utils.update_kwargs_with_config(column_kwargs, megatron_config)
46
+
47
+ tp_size = mpu.get_tensor_model_parallel_world_size()
48
+
49
+ self.gate_up_proj = MergedColumnParallelLinear(
50
+ input_size=self.hidden_size,
51
+ gate_ouput_size=self.intermediate_size,
52
+ up_output_size=self.intermediate_size,
53
+ bias=False,
54
+ gather_output=False,
55
+ skip_bias_add=False,
56
+ **column_kwargs,
57
+ )
58
+ self.gate_size = self.intermediate_size // tp_size
59
+
60
+ self.down_proj = tensor_parallel.RowParallelLinear(
61
+ input_size=self.intermediate_size,
62
+ output_size=self.hidden_size,
63
+ bias=False,
64
+ input_is_parallel=True,
65
+ skip_bias_add=False,
66
+ **row_kwargs,
67
+ )
68
+
69
+ self.act_fn = ACT2FN[config.hidden_act]
70
+
71
+ def forward(self, x):
72
+ gate_up = self.gate_up_proj(x)[0]
73
+ gate, up = gate_up.split(self.gate_size, dim=-1)
74
+ return self.down_proj(self.act_fn(gate) * up)[0]
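
ParallelLlamaMLP implements the SwiGLU feed-forward used by Llama: one fused gate/up projection, SiLU applied to the gate half, an elementwise product, then the down projection. A non-parallel reference of the same computation (sizes and class name are illustrative):

import torch
import torch.nn.functional as F
from torch import nn

class TinySwiGLU(nn.Module):
    # Plain-PyTorch equivalent of the gate_up_proj / down_proj pattern above.
    def __init__(self, hidden_size: int = 64, intermediate_size: int = 128):
        super().__init__()
        self.gate_up_proj = nn.Linear(hidden_size, 2 * intermediate_size, bias=False)
        self.down_proj = nn.Linear(intermediate_size, hidden_size, bias=False)
        self.intermediate_size = intermediate_size

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        gate, up = self.gate_up_proj(x).split(self.intermediate_size, dim=-1)
        return self.down_proj(F.silu(gate) * up)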
verl/models/llama/megatron/layers/parallel_rmsnorm.py ADDED
@@ -0,0 +1,48 @@
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import numbers
16
+
17
+ import torch
18
+ from apex.normalization.fused_layer_norm import fused_rms_norm_affine
19
+ from megatron.core import ModelParallelConfig
20
+ from torch import nn
21
+ from transformers import LlamaConfig
22
+
23
+ from verl.utils.megatron import sequence_parallel as sp_utils
24
+
25
+
26
+ class ParallelLlamaRMSNorm(nn.Module):
27
+ def __init__(self, config: LlamaConfig, megatron_config: ModelParallelConfig):
28
+ """
29
+ LlamaRMSNorm is equivalent to T5LayerNorm
30
+ """
31
+ super().__init__()
32
+ if isinstance(config.hidden_size, numbers.Integral):
33
+ normalized_shape = (config.hidden_size,)
34
+ self.normalized_shape = torch.Size(normalized_shape)
35
+ self.weight = nn.Parameter(torch.ones(self.normalized_shape))
36
+ self.variance_epsilon = config.rms_norm_eps
37
+
38
+ if megatron_config.sequence_parallel:
39
+ sp_utils.mark_parameter_as_sequence_parallel(self.weight)
40
+
41
+ def forward(self, hidden_states):
42
+ return fused_rms_norm_affine(
43
+ input=hidden_states,
44
+ weight=self.weight,
45
+ normalized_shape=self.normalized_shape,
46
+ eps=self.variance_epsilon,
47
+ memory_efficient=True,
48
+ )
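
The Apex fused kernel above computes standard RMS normalization. A plain eager-mode reference, useful for sanity-checking numerics and written under the usual LlamaRMSNorm semantics (this helper is not part of the commit):

import torch

def rms_norm_reference(x: torch.Tensor, weight: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:
    # y = x / sqrt(mean(x^2) + eps) * weight, with the statistics computed in fp32.
    orig_dtype = x.dtype
    x = x.float()
    variance = x.pow(2).mean(dim=-1, keepdim=True)
    return (x * torch.rsqrt(variance + eps)).to(orig_dtype) * weight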
verl/models/llama/megatron/modeling_llama_megatron.py ADDED
@@ -0,0 +1,662 @@
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
5
+ # and OPT implementations in this library. It has been modified from its
6
+ # original forms to accommodate minor architectural differences compared
7
+ # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
8
+ #
9
+ # Licensed under the Apache License, Version 2.0 (the "License");
10
+ # you may not use this file except in compliance with the License.
11
+ # You may obtain a copy of the License at
12
+ #
13
+ # http://www.apache.org/licenses/LICENSE-2.0
14
+ #
15
+ # Unless required by applicable law or agreed to in writing, software
16
+ # distributed under the License is distributed on an "AS IS" BASIS,
17
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18
+ # See the License for the specific language governing permissions and
19
+ # limitations under the License.
20
+ """PyTorch LLaMA model with Megatron-style acceleration."""
21
+
22
+ from typing import Optional, Tuple, Union
23
+
24
+ import torch
25
+ import torch.utils.checkpoint
26
+ from megatron.core import ModelParallelConfig, mpu, tensor_parallel
27
+ from torch import nn
28
+ from transformers.modeling_outputs import BaseModelOutputWithPast
29
+ from transformers.models.llama.configuration_llama import LlamaConfig
30
+ from transformers.models.llama.modeling_llama import CausalLMOutputWithPast
31
+
32
+ from verl.utils.megatron import sequence_parallel as sp_utils
33
+ from verl.utils.megatron import tensor_parallel as tp_utils
34
+ from verl.utils.megatron_utils import TransformerConfig, convert_config
35
+
36
+ from .layers import ParallelLlamaDecoderLayer, ParallelLlamaDecoderLayerRmPad, ParallelLlamaRMSNorm
37
+
38
+ """
39
+ TODO:
40
+ 1. Add weight initialization. Here we need to be careful on TP weight init.
41
+ 2. Add sequence parallel
42
+ 3. Load checkpoint from meta LLama pretrained checkpoint
43
+ """
44
+
45
+
46
+ # Copied from transformers.models.bart.modeling_bart._make_causal_mask
47
+ def _make_causal_mask(input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device):
48
+ """
49
+ Make causal mask used for bi-directional self-attention.
50
+ """
51
+ bsz, tgt_len = input_ids_shape
52
+ mask = torch.full((tgt_len, tgt_len), torch.finfo(dtype).min, device=device)
53
+ mask_cond = torch.arange(mask.size(-1), device=device)
54
+ mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
55
+ mask = mask.to(dtype)
56
+ return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len)
57
+
58
+
59
+ # Copied from transformers.models.bart.modeling_bart._expand_mask
60
+ def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
61
+ """
62
+ Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
63
+ """
64
+ bsz, src_len = mask.size()
65
+ tgt_len = tgt_len if tgt_len is not None else src_len
66
+
67
+ expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
68
+
69
+ inverted_mask = 1.0 - expanded_mask
70
+
71
+ return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
72
+
73
+
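Together these two helpers produce the additive attention mask: _make_causal_mask puts the dtype's most negative value above the diagonal, and _expand_mask lifts a (batch, seq) padding mask into the same (batch, 1, tgt, src) shape so the two can simply be summed. A toy, self-contained illustration of the same idea for seq_len = 3 with the last token padded (values and shapes are for illustration only):

import torch

dtype = torch.float32
neg = torch.finfo(dtype).min
causal = torch.triu(torch.full((3, 3), neg), diagonal=1)    # zeros on/below the diagonal
padding = torch.tensor([[0.0, 0.0, neg]]).expand(3, 3)      # block attention to the padded key
combined = causal + padding                                  # analogous to the summed mask above
print(combined)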
74
+ class ParallelLlamaModel(nn.Module):
75
+ """
76
+ Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`LlamaDecoderLayer`]
77
+
78
+ Args:
79
+ config: LlamaConfig
80
+ """
81
+
82
+ def __init__(self, config: LlamaConfig, megatron_config: ModelParallelConfig):
83
+ super().__init__()
84
+ self.config: TransformerConfig = convert_config(config, megatron_config)
85
+ self.padding_idx = config.pad_token_id
86
+ self.vocab_size = config.vocab_size
87
+ embedding_kwargs = tp_utils.get_default_kwargs_for_parallel_embedding()
88
+ if megatron_config is not None:
89
+ assert embedding_kwargs.get("config", False), "must have ModelParallelConfig"
90
+ tp_utils.update_kwargs_with_config(embedding_kwargs, megatron_config)
91
+ self.embed_tokens = tensor_parallel.VocabParallelEmbedding(num_embeddings=config.vocab_size, embedding_dim=config.hidden_size, **embedding_kwargs)
92
+
93
+ self.layers = nn.ModuleList([ParallelLlamaDecoderLayer(config, megatron_config, layer_idx=idx) for idx in range(config.num_hidden_layers)])
94
+ self.norm = ParallelLlamaRMSNorm(config, megatron_config)
95
+
96
+ # Copied from transformers.models.bart.modeling_bart.BartDecoder._prepare_decoder_attention_mask
97
+ def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds):
98
+ # create causal mask
99
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
100
+ combined_attention_mask = None
101
+ if input_shape[-1] > 1:
102
+ combined_attention_mask = _make_causal_mask(
103
+ input_shape,
104
+ inputs_embeds.dtype,
105
+ device=inputs_embeds.device,
106
+ )
107
+
108
+ if attention_mask is not None:
109
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
110
+ expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to(inputs_embeds.device)
111
+ combined_attention_mask = expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask
112
+
113
+ return combined_attention_mask
114
+
115
+ def forward(
116
+ self,
117
+ input_ids: torch.LongTensor = None,
118
+ attention_mask: Optional[torch.Tensor] = None,
119
+ position_ids: Optional[torch.LongTensor] = None,
120
+ ) -> Union[Tuple, BaseModelOutputWithPast]:
121
+ """
122
+
123
+ Args:
124
+ input_ids: input ids. shape (batch_size, seq_length)
125
+ attention_mask: attention_mask. shape (batch_size, seq_length)
126
+ position_ids: position ids. shape (batch_size, seq_length)
127
+
128
+ Returns:
129
+
130
+ """
131
+ batch_size, seq_length = input_ids.shape
132
+ inputs_embeds = self.embed_tokens(input_ids)
133
+ # embed positions
134
+
135
+ attention_mask = self._prepare_decoder_attention_mask(attention_mask, (batch_size, seq_length), inputs_embeds)
136
+
137
+ hidden_states = inputs_embeds
138
+
139
+ for idx, decoder_layer in enumerate(self.layers):
140
+ layer_outputs = decoder_layer(
141
+ hidden_states,
142
+ attention_mask=attention_mask,
143
+ position_ids=position_ids,
144
+ )
145
+
146
+ hidden_states = layer_outputs
147
+
148
+ hidden_states = self.norm(hidden_states)
149
+
150
+ return hidden_states
151
+
152
+
153
+ class ParallelLlamaForCausalLM(nn.Module):
154
+ def __init__(self, config: LlamaConfig, megatron_config: ModelParallelConfig):
155
+ super().__init__()
156
+ self.config: TransformerConfig = convert_config(config, megatron_config)
157
+ self.model = ParallelLlamaModel(config, megatron_config=megatron_config)
158
+ self.vocab_size = config.vocab_size
159
+
160
+ column_kwargs = tp_utils.get_default_kwargs_for_column_parallel_linear()
161
+ if megatron_config is not None:
162
+ assert column_kwargs.get("config", False), "must have ModelParallelConfig"
163
+ tp_utils.update_kwargs_with_config(column_kwargs, megatron_config)
164
+
165
+ self.lm_head = tensor_parallel.ColumnParallelLinear(
166
+ input_size=config.hidden_size,
167
+ output_size=config.vocab_size,
168
+ bias=False,
169
+ gather_output=False,
170
+ skip_bias_add=False,
171
+ **column_kwargs,
172
+ )
173
+
174
+ def forward(
175
+ self,
176
+ input_ids: torch.LongTensor = None,
177
+ attention_mask: Optional[torch.Tensor] = None,
178
+ position_ids: Optional[torch.LongTensor] = None,
179
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
180
+ r"""
181
+ Args:
182
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
183
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
184
+ config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
185
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
186
+
187
+ Returns:
188
+ """
189
+
190
+ # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
191
+ outputs = self.model(
192
+ input_ids=input_ids,
193
+ attention_mask=attention_mask,
194
+ position_ids=position_ids,
195
+ )
196
+
197
+ hidden_states = outputs
198
+ logits = self.lm_head(hidden_states)[0]
199
+
200
+ logits = tensor_parallel.gather_from_tensor_model_parallel_region(logits)
201
+
202
+ logits = logits.float()
203
+ return CausalLMOutputWithPast(
204
+ loss=None,
205
+ logits=logits,
206
+ past_key_values=None,
207
+ hidden_states=None,
208
+ attentions=None,
209
+ )
210
+
211
+
212
+ from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa
213
+
214
+
215
+ class ParallelLlamaModelRmPad(nn.Module):
216
+ """
217
+ Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`LlamaDecoderLayer`]
218
+
219
+ Args:
220
+ config: LlamaConfig
221
+ """
222
+
223
+ def __init__(self, config: LlamaConfig, megatron_config: ModelParallelConfig):
224
+ super().__init__()
225
+ self.config: TransformerConfig = convert_config(config, megatron_config)
226
+ self.padding_idx = config.pad_token_id
227
+ self.vocab_size = config.vocab_size
228
+ embedding_kwargs = tp_utils.get_default_kwargs_for_parallel_embedding()
229
+ self.megatron_config = megatron_config
230
+ if megatron_config is not None:
231
+ assert embedding_kwargs.get("config", False), "must have ModelParallelConfig"
232
+ tp_utils.update_kwargs_with_config(embedding_kwargs, self.megatron_config)
233
+ self.embed_tokens = tensor_parallel.VocabParallelEmbedding(num_embeddings=config.vocab_size, embedding_dim=config.hidden_size, **embedding_kwargs)
234
+
235
+ self.layers = nn.ModuleList([ParallelLlamaDecoderLayerRmPad(config, megatron_config, layer_idx=idx) for idx in range(config.num_hidden_layers)])
236
+ self.norm = ParallelLlamaRMSNorm(config, megatron_config)
237
+
238
+ def forward(
239
+ self,
240
+ input_ids: torch.Tensor,
241
+ position_ids: Optional[torch.LongTensor] = None,
242
+ sequence_length: int = None,
243
+ indices: torch.Tensor = None,
244
+ cu_seqlens: int = None,
245
+ max_seqlen_in_batch: int = None,
246
+ ) -> Union[Tuple, BaseModelOutputWithPast]:
247
+ """
248
+
249
+ Args:
250
+ input_ids: input ids. shape (1, total_nnz)
251
+ position_ids: position ids. shape (batch_size, seq_length)
252
+
253
+ Returns:
254
+
255
+ """
256
+ inputs_embeds = self.embed_tokens(input_ids) # (1, total_nnz) -> (1, total_nnz, hidden_size)
257
+
258
+ # (1, total_nnz, hidden_size) -> (total_nnz, 1, hidden_size) -> (total_nnz // sp, 1, hidden_size)
259
+ inputs_embeds = inputs_embeds.transpose(0, 1)
260
+ if self.megatron_config.sequence_parallel:
261
+ inputs_embeds = tensor_parallel.scatter_to_sequence_parallel_region(inputs_embeds)
262
+
263
+ hidden_states = inputs_embeds
264
+ for idx, decoder_layer in enumerate(self.layers):
265
+ layer_outputs = decoder_layer(
266
+ hidden_states,
267
+ position_ids=position_ids,
268
+ sequence_length=sequence_length,
269
+ indices=indices,
270
+ cu_seqlens=cu_seqlens,
271
+ max_seqlen_in_batch=max_seqlen_in_batch,
272
+ )
273
+
274
+ hidden_states = layer_outputs
275
+
276
+ hidden_states = self.norm(hidden_states)
277
+
278
+ return hidden_states
279
+
280
+
281
+ class ParallelLlamaForCausalLMRmPad(nn.Module):
282
+ def __init__(self, config: LlamaConfig, megatron_config: ModelParallelConfig):
283
+ super().__init__()
284
+ self.config: TransformerConfig = convert_config(config, megatron_config)
285
+ self.megatron_config = megatron_config
286
+ self.model = ParallelLlamaModelRmPad(config, megatron_config=megatron_config)
287
+ self.vocab_size = config.vocab_size
288
+ self._init_head(config)
289
+
290
+ def _init_head(self, config):
291
+ column_kwargs = tp_utils.get_default_kwargs_for_column_parallel_linear()
292
+ if self.megatron_config is not None:
293
+ assert column_kwargs.get("config", False), "must have ModelParallelConfig"
294
+ tp_utils.update_kwargs_with_config(column_kwargs, self.megatron_config)
295
+ self.lm_head = tensor_parallel.ColumnParallelLinear(
296
+ input_size=config.hidden_size,
297
+ output_size=config.vocab_size,
298
+ bias=False,
299
+ gather_output=False,
300
+ skip_bias_add=False,
301
+ **column_kwargs,
302
+ )
303
+
304
+ def _forward_head(self, hidden_states):
305
+ # all_gather from sequence parallel region is performed inside lm_head
306
+ logits = self.lm_head(hidden_states)[0]
307
+ logits = logits.float() # (total_nnz_padded, 1, vocab_size // tp)
308
+ logits = tensor_parallel.gather_from_tensor_model_parallel_region(logits) # (total_nnz_padded, 1, vocab_size)
309
+ return logits
310
+
311
+ def forward(
312
+ self,
313
+ input_ids: torch.LongTensor = None,
314
+ attention_mask: Optional[torch.Tensor] = None,
315
+ position_ids: Optional[torch.LongTensor] = None,
316
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
317
+ r"""
318
+ Args:
319
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
320
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
321
+ config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
322
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
323
+
324
+ Returns:
325
+ """
326
+ batch_size, sequence_length = input_ids.shape
327
+
328
+ # remove padding here
329
+ input_ids, indices, cu_seqlens, max_seqlen_in_batch, *_ = unpad_input(input_ids.unsqueeze(dim=-1), attention_mask) # (total_nnz, 1)
330
+
331
+ # pad input_ids to multiple of tp for all tp ranks
332
+ # TODO: for better performance, the sp padding should be removed at each layer; the performance gap has not been measured
333
+ if self.megatron_config.sequence_parallel:
334
+ input_ids = sp_utils.pad_to_sequence_parallel(input_ids)
335
+
336
+ input_ids = input_ids.transpose(0, 1) # (1, total_nnz+pad)
337
+
338
+ outputs = self.model(
339
+ input_ids=input_ids,
340
+ position_ids=position_ids,
341
+ sequence_length=sequence_length,
342
+ indices=indices,
343
+ cu_seqlens=cu_seqlens,
344
+ max_seqlen_in_batch=max_seqlen_in_batch,
345
+ )
346
+
347
+ hidden_states = outputs
348
+
349
+ logits = self._forward_head(hidden_states)
350
+
351
+ # remove padding from sequence parallel
352
+ if self.megatron_config.sequence_parallel:
353
+ total_nnz = cu_seqlens[-1]
354
+ logits = logits[:total_nnz]  # (total_nnz, 1, vocab_size)
355
+
356
+ logits = torch.squeeze(logits, dim=1) # remove the artificial batch dimension
357
+ # add removed padding back
358
+ logits = pad_input(logits, indices, batch_size, seqlen=sequence_length) # (batch_size, sequence_length, vocab_size)
359
+
360
+ return CausalLMOutputWithPast(
361
+ loss=None,
362
+ logits=logits,
363
+ past_key_values=None,
364
+ hidden_states=None,
365
+ attentions=None,
366
+ )
367
+
368
+
369
+ class ParallelLlamaForValueRmPad(ParallelLlamaForCausalLMRmPad):
370
+ def _init_head(self, config):
371
+ column_kwargs = tp_utils.get_default_kwargs_for_column_parallel_linear()
372
+ if self.megatron_config is not None:
373
+ assert column_kwargs.get("config", False), "must have ModelParallelConfig"
374
+ tp_utils.update_kwargs_with_config(column_kwargs, self.megatron_config)
375
+ self.lm_head = nn.Linear(in_features=config.hidden_size, out_features=1, bias=False)
376
+ # lm_head is effectively the same as sequence parallel
377
+ sp_utils.mark_parameter_as_sequence_parallel(self.lm_head.weight)
378
+
379
+ def _forward_head(self, hidden_states):
380
+ logits = self.lm_head(hidden_states) # (total_nnz_padded // tp, 1, 1)
381
+ logits = logits.float()
382
+ if self.megatron_config.sequence_parallel:
383
+ logits = tensor_parallel.gather_from_sequence_parallel_region(logits, tensor_parallel_output_grad=False)
384
+ return logits
385
+
386
+ def forward(
387
+ self,
388
+ input_ids: torch.LongTensor = None,
389
+ attention_mask: Optional[torch.Tensor] = None,
390
+ position_ids: Optional[torch.LongTensor] = None,
391
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
392
+ output = super().forward(input_ids, attention_mask, position_ids)
393
+ output.logits = torch.squeeze(output.logits, dim=-1)
394
+ return output
395
+
396
+
397
+ """
398
+ Support pipeline parallelism
399
+ """
400
+
401
+
402
+ class ParallelLlamaModelRmPadPP(nn.Module):
403
+ """
404
+ Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`LlamaDecoderLayer`]
405
+ This model definition supports pipeline parallelism. To support pp and vpp,
406
+ - This model only contains layer in this pp stage and vpp chunk
407
+ - When calling get_model in Megatron, this rank will instantiate all the vpp chunks in this pp.
408
+ Args:
409
+ config: LlamaConfig
410
+ """
411
+
412
+ def __init__(self, config: LlamaConfig, megatron_config: ModelParallelConfig, pre_process, post_process):
413
+ super().__init__()
414
+ self.config: TransformerConfig = convert_config(config, megatron_config)
415
+ self.padding_idx = config.pad_token_id
416
+ self.vocab_size = config.vocab_size
417
+ self.pre_process = pre_process
418
+ self.post_process = post_process
419
+ self.megatron_config = megatron_config
420
+ embedding_kwargs = tp_utils.get_default_kwargs_for_parallel_embedding()
421
+ if megatron_config is not None:
422
+ assert embedding_kwargs.get("config", False), "must have ModelParallelConfig"
423
+ tp_utils.update_kwargs_with_config(embedding_kwargs, self.megatron_config)
424
+ if pre_process:
425
+ self.embed_tokens = tensor_parallel.VocabParallelEmbedding(num_embeddings=config.vocab_size, embedding_dim=config.hidden_size, **embedding_kwargs)
426
+ else:
427
+ self.embed_tokens = None
428
+
429
+ pp_rank = mpu.get_pipeline_model_parallel_rank()
430
+ pp_size = megatron_config.pipeline_model_parallel_size
431
+ self.num_layer_per_pp = config.num_hidden_layers // pp_size
432
+ vpp_size = megatron_config.virtual_pipeline_model_parallel_size
433
+ vpp_rank = mpu.get_virtual_pipeline_model_parallel_rank()
434
+
435
+ if vpp_size is not None:
436
+ self.layers = nn.ModuleList()
437
+ self.num_layer_vpp_chunk = self.num_layer_per_pp // vpp_size
438
+ self.num_layer_this_model = self.num_layer_vpp_chunk
439
+ offset = vpp_rank * (config.num_hidden_layers // vpp_size) + (pp_rank * self.num_layer_vpp_chunk)
440
+ else:
441
+ self.num_layer_this_model = self.num_layer_per_pp
442
+ offset = pp_rank * self.num_layer_per_pp
443
+
444
+ self.layers = nn.ModuleList()
445
+ for i in range(self.num_layer_this_model):
446
+ layer = ParallelLlamaDecoderLayerRmPad(config, megatron_config, layer_idx=offset + i)
447
+ self.layers.add_module(f"{i}", layer)
448
+
449
+ if post_process:
450
+ self.norm = ParallelLlamaRMSNorm(config, megatron_config)
451
+ else:
452
+ self.norm = None
453
+
454
+ def set_input_tensor(self, input_tensor):
455
+ """Set input tensor to be used instead of forward()'s input.
456
+
457
+ When doing pipeline parallelism the input from the previous
458
+ stage comes from communication, not from the input, so the
459
+ model's forward_step_func won't have it. This function is thus
460
+ used by internal code to bypass the input provided by the
461
+ forward_step_func"""
462
+ self.input_tensor = input_tensor
463
+
464
+ def forward(
465
+ self,
466
+ input_ids: torch.Tensor,
467
+ position_ids: Optional[torch.LongTensor] = None,
468
+ sequence_length: int = None,
469
+ indices: torch.Tensor = None,
470
+ cu_seqlens: int = None,
471
+ max_seqlen_in_batch: int = None,
472
+ ) -> Union[Tuple, BaseModelOutputWithPast]:
473
+ """
474
+
475
+ Args:
476
+ input_ids: input ids. shape (1, total_nnz)
477
+ position_ids: position ids. shape (batch_size, seq_length)
478
+
479
+ Returns:
480
+
481
+ """
482
+ if self.pre_process:
483
+ inputs_embeds = self.embed_tokens(input_ids) # (1, total_nnz) -> (1, total_nnz, hidden_size)
484
+
485
+ # vocab parallel embedding will not do sequence parallel reduce-scatter in open source megatron
486
+ # so we need to handle it here:
487
+ # (1, total_nnz, hidden_size) -> (total_nnz, 1, hidden_size) -> (total_nnz // sp, 1, hidden_size)
488
+ inputs_embeds = inputs_embeds.transpose(0, 1)
489
+ if self.megatron_config.sequence_parallel:
490
+ inputs_embeds = tensor_parallel.scatter_to_sequence_parallel_region(inputs_embeds)
491
+
492
+ hidden_states = inputs_embeds
493
+ else:
494
+ # self.hidden_states should be passed by Megatron
495
+ hidden_states = self.input_tensor
496
+
497
+ for idx, decoder_layer in enumerate(self.layers):
498
+ layer_outputs = decoder_layer(
499
+ hidden_states,
500
+ position_ids=position_ids,
501
+ sequence_length=sequence_length,
502
+ indices=indices,
503
+ cu_seqlens=cu_seqlens,
504
+ max_seqlen_in_batch=max_seqlen_in_batch,
505
+ )
506
+
507
+ hidden_states = layer_outputs
508
+
509
+ if self.post_process:
510
+ hidden_states = self.norm(hidden_states)
511
+
512
+ return hidden_states
513
+
514
+
515
+ class ParallelLlamaForCausalLMRmPadPP(nn.Module):
516
+ def __init__(
517
+ self,
518
+ config: LlamaConfig,
519
+ megatron_config: ModelParallelConfig,
520
+ pre_process,
521
+ post_process,
522
+ share_embeddings_and_output_weights=False,
523
+ ):
524
+ super().__init__()
525
+ self.config: TransformerConfig = convert_config(config, megatron_config)
526
+ self.megatron_config = megatron_config
527
+ self.model = ParallelLlamaModelRmPadPP(config, megatron_config=megatron_config, pre_process=pre_process, post_process=post_process)
528
+ assert share_embeddings_and_output_weights is False, "Llama model does not support sharing embedding and output weights"
529
+ self.share_embeddings_and_output_weights = share_embeddings_and_output_weights
530
+ self.vocab_size = config.vocab_size
531
+ self.pre_process = pre_process
532
+ self.post_process = post_process
533
+ if post_process:
534
+ self._init_head(config)
535
+
536
+ def set_input_tensor(self, input_tensor):
537
+ """Set input tensor to be used instead of forward()'s input.
538
+
539
+ When doing pipeline parallelism the input from the previous
540
+ stage comes from communication, not from the input, so the
541
+ model's forward_step_func won't have it. This function is thus
542
+ used by internal code to bypass the input provided by the
543
+ forward_step_func"""
544
+ assert len(input_tensor) == 1
545
+ self.model.set_input_tensor(input_tensor[0])
546
+
547
+ def _init_head(self, config):
548
+ column_kwargs = tp_utils.get_default_kwargs_for_column_parallel_linear()
549
+ if self.megatron_config is not None:
550
+ assert column_kwargs.get("config", False), "must have ModelParallelConfig"
551
+ tp_utils.update_kwargs_with_config(column_kwargs, self.megatron_config)
552
+ self.lm_head = tensor_parallel.ColumnParallelLinear(
553
+ input_size=config.hidden_size,
554
+ output_size=config.vocab_size,
555
+ bias=False,
556
+ gather_output=False,
557
+ skip_bias_add=False,
558
+ **column_kwargs,
559
+ )
560
+
561
+ def _forward_head(self, hidden_states):
562
+ # all_gather from sequence parallel region is performed inside lm_head
563
+ # logits shape before forward_head hidden_states.shape: [4, 32, 4096]
564
+ logits = self.lm_head(hidden_states)[0]
565
+ # logits shape after forward_head logits.shape: [8, 32, 8]
566
+ logits = logits.float() # (total_nnz_padded, 1, vocab_size // tp)
567
+ return logits
568
+
569
+ def forward(
570
+ self,
571
+ # original input
572
+ *,
573
+ input_ids: torch.LongTensor = None,
574
+ attention_mask: Optional[torch.Tensor] = None,
575
+ position_ids: Optional[torch.LongTensor] = None,
576
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
577
+ r"""
578
+ Args:
579
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
580
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
581
+ config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
582
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
583
+
584
+ Returns:
585
+ """
586
+
587
+ # Note that input_ids, attention_mask and position_ids should be passed to every pp layer.
588
+ # In the first pp, input_ids will be used, in other pp layers hidden_states will be used inside self.model
589
+ batch_size, sequence_length = input_ids.shape
590
+ # remove padding here
591
+ input_ids_rmpad, indices, cu_seqlens, max_seqlen_in_batch, *_ = unpad_input(input_ids.unsqueeze(dim=-1), attention_mask) # (total_nnz, 1)
592
+
593
+ # pad input_ids to multiple of tp for all tp ranks
594
+ # TODO: for better performance, the sp padding should be removed at each layer; the performance gap has not been measured
595
+ if self.megatron_config.sequence_parallel:
596
+ input_ids_rmpad = sp_utils.pad_to_sequence_parallel(input_ids_rmpad)
597
+
598
+ input_ids_rmpad = input_ids_rmpad.transpose(0, 1) # (1, total_nnz+pad)
599
+
600
+ outputs = self.model(
601
+ input_ids=input_ids_rmpad,
602
+ position_ids=position_ids,
603
+ sequence_length=sequence_length,
604
+ indices=indices,
605
+ cu_seqlens=cu_seqlens,
606
+ max_seqlen_in_batch=max_seqlen_in_batch,
607
+ )
608
+
609
+ if self.post_process:
610
+ hidden_states = outputs
611
+ # print(f'hidden_states.shape = {hidden_states.shape}') # torch.Size([4, 32, 4096])
612
+ logits = self._forward_head(hidden_states)
613
+ logits = torch.squeeze(logits, dim=1) # remove the artificial batch dimension # torch.Size([8, 32, 16])
614
+
615
+ # remove padding from sequence parallel
616
+ if self.megatron_config.sequence_parallel:
617
+ total_nnz = cu_seqlens[-1]
618
+ logits = logits[:total_nnz]  # (total_nnz, 1, vocab_size)
619
+ # add removed padding back. If input is already rmpad, we let the caller pad_input
620
+ logits = pad_input(logits, indices, batch_size, seqlen=sequence_length) # (batch_size, sequence_length, vocab_size)
621
+
622
+ return CausalLMOutputWithPast(
623
+ loss=None,
624
+ logits=logits,
625
+ past_key_values=None,
626
+ hidden_states=None,
627
+ attentions=None,
628
+ )
629
+ else:
630
+ return outputs
631
+
632
+
633
+ class ParallelLlamaForValueRmPadPP(ParallelLlamaForCausalLMRmPadPP):
634
+ def _init_head(self, config):
635
+ column_kwargs = tp_utils.get_default_kwargs_for_column_parallel_linear()
636
+ if self.megatron_config is not None:
637
+ assert column_kwargs.get("config", False), "must have ModelParallelConfig"
638
+ tp_utils.update_kwargs_with_config(column_kwargs, self.megatron_config)
639
+ self.lm_head = nn.Linear(in_features=config.hidden_size, out_features=1, bias=False)
640
+ # lm_head is effectively the same as sequence parallel
641
+ sp_utils.mark_parameter_as_sequence_parallel(self.lm_head.weight)
642
+
643
+ def _forward_head(self, hidden_states):
644
+ logits = self.lm_head(hidden_states) # (total_nnz_padded // tp, 1, 1)
645
+ logits = logits.float()
646
+ if self.megatron_config.sequence_parallel:
647
+ logits = tensor_parallel.gather_from_sequence_parallel_region(logits, tensor_parallel_output_grad=False)
648
+ return logits
649
+
650
+ def forward(
651
+ self,
652
+ *,
653
+ input_ids: torch.LongTensor = None,
654
+ attention_mask: Optional[torch.Tensor] = None,
655
+ position_ids: Optional[torch.LongTensor] = None,
656
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
657
+ output = super().forward(input_ids=input_ids, attention_mask=attention_mask, position_ids=position_ids)
658
+ if self.post_process:
659
+ output.logits = torch.squeeze(output.logits, dim=-1)
660
+ return output
661
+ else:
662
+ return output
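
All of the RmPad variants in this file run the transformer on sequences with padding removed: real tokens are flattened into a (1, total_nnz) tensor, and the logits are scattered back into the (batch, seq, ...) layout at the end. A minimal pure-PyTorch sketch of that unpad/re-pad round trip, conceptually what flash_attn's unpad_input/pad_input do (tensors and sizes below are made up):

import torch

input_ids = torch.tensor([[5, 6, 7, 0], [8, 9, 0, 0]])  # (batch=2, seq=4), 0 = pad
attention_mask = (input_ids != 0).int()

# "unpad": keep only the real tokens and remember their flat positions
indices = attention_mask.flatten().nonzero(as_tuple=False).squeeze(-1)
tokens_rmpad = input_ids.flatten()[indices]              # shape (total_nnz,) == (5,)

# ... the model would run on tokens_rmpad and emit one row per real token ...
hidden_rmpad = torch.randn(tokens_rmpad.numel(), 8)

# "re-pad": scatter the rows back into the (batch * seq, hidden) layout
hidden_padded = torch.zeros(input_ids.numel(), 8)
hidden_padded[indices] = hidden_rmpad
hidden_padded = hidden_padded.view(2, 4, 8)              # (batch, seq, hidden)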
verl/models/mcore/__init__.py ADDED
@@ -0,0 +1,18 @@
1
+ # Copyright 2025 Bytedance Ltd. and/or its affiliates
2
+ # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ from .registry import get_mcore_forward_fn, get_mcore_weight_converter, hf_to_mcore_config, init_mcore_model
17
+
18
+ __all__ = ["hf_to_mcore_config", "init_mcore_model", "get_mcore_forward_fn", "get_mcore_weight_converter"]
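
The package deliberately exposes only these four registry entry points. A hedged sketch of how they fit together (the exact signatures live in registry.py elsewhere in this commit; the argument names and keyword usage below are assumptions, not the confirmed API):

import torch
from transformers import AutoConfig
from verl.models.mcore import hf_to_mcore_config, init_mcore_model

hf_config = AutoConfig.from_pretrained("Qwen/Qwen2.5-7B-Instruct")
# Assumed (hf_config, dtype) signature for hf_to_mcore_config.
tf_config = hf_to_mcore_config(hf_config, torch.bfloat16)
# Assumed keyword arguments for init_mcore_model; consult registry.py for the real interface.
model = init_mcore_model(tf_config, hf_config, pre_process=True, post_process=True)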
verl/models/mcore/config_converter.py ADDED
@@ -0,0 +1,197 @@
1
+ # Copyright 2025 Bytedance Ltd. and/or its affiliates
2
+ # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
3
+ # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ # convert huggingface config to mcore transformer config
18
+
19
+
20
+ import torch
21
+ import torch.nn.functional as F
22
+ from megatron.core.transformer import MLATransformerConfig, TransformerConfig
23
+ from transformers import PretrainedConfig
24
+
25
+
26
+ def _get_base_transformer_config(hf_config: PretrainedConfig, dtype: torch.dtype, **kwargs) -> TransformerConfig:
27
+ """
28
+ Create a base TransformerConfig with common parameters across different model architectures.
29
+ TODO: (ycl) use dataclass or converter config?
30
+
31
+ Args:
32
+ hf_config: HuggingFace model configuration
33
+ dtype: Data type for the model
34
+ **kwargs: Additional parameters to override defaults
35
+
36
+ Returns:
37
+ TransformerConfig with common parameters
38
+ """
39
+ from megatron.core import parallel_state as mpu
40
+
41
+ # Common parallel state parameters
42
+ overlap_p2p_comm = mpu.get_virtual_pipeline_model_parallel_world_size() is not None and mpu.get_virtual_pipeline_model_parallel_world_size() > 1
43
+ batch_p2p_comm = False
44
+
45
+ # Base configuration with common parameters
46
+ base_config = {
47
+ # Model architecture parameters
48
+ "num_layers": hf_config.num_hidden_layers,
49
+ "hidden_size": hf_config.hidden_size,
50
+ "num_attention_heads": hf_config.num_attention_heads,
51
+ "num_query_groups": hf_config.num_key_value_heads,
52
+ "ffn_hidden_size": hf_config.intermediate_size,
53
+ "attention_dropout": hf_config.attention_dropout,
54
+ "hidden_dropout": getattr(hf_config, "hidden_dropout", 0.0),
55
+ "kv_channels": getattr(hf_config, "head_dim", None),
56
+ "layernorm_epsilon": hf_config.rms_norm_eps,
57
+ # Activation and normalization
58
+ "activation_func": F.silu,
59
+ "normalization": "RMSNorm",
60
+ "gated_linear_unit": True,
61
+ # Data types
62
+ "pipeline_dtype": dtype,
63
+ "params_dtype": dtype,
64
+ "bf16": dtype is torch.bfloat16,
65
+ # Parallel configuration
66
+ "tensor_model_parallel_size": mpu.get_tensor_model_parallel_world_size(),
67
+ "pipeline_model_parallel_size": mpu.get_pipeline_model_parallel_world_size(),
68
+ "virtual_pipeline_model_parallel_size": mpu.get_virtual_pipeline_model_parallel_world_size(),
69
+ "context_parallel_size": mpu.get_context_parallel_world_size(),
70
+ "overlap_p2p_comm": overlap_p2p_comm,
71
+ "batch_p2p_comm": batch_p2p_comm,
72
+ "sequence_parallel": mpu.get_tensor_model_parallel_world_size() > 1,
73
+ # Common settings
74
+ "variable_seq_lengths": True,
75
+ "masked_softmax_fusion": True,
76
+ "moe_token_dispatcher_type": "alltoall",
77
+ }
78
+
79
+ # Update with any provided overrides
80
+ base_config.update(kwargs)
81
+ print(f"Overridden TF init config: {base_config}")
82
+
83
+ return TransformerConfig(**base_config)
84
+
85
+
86
+ def hf_to_mcore_config_dense(hf_config: PretrainedConfig, dtype: torch.dtype) -> TransformerConfig:
87
+ # for LlamaForCausalLM or Qwen2ForCausalLM
88
+ qkv_bias = True if "Qwen2ForCausalLM" in hf_config.architectures else getattr(hf_config, "attention_bias", False)
89
+ qk_layernorm = True if "Qwen3ForCausalLM" in hf_config.architectures else False
90
+
91
+ return _get_base_transformer_config(
92
+ hf_config=hf_config,
93
+ dtype=dtype,
94
+ use_cpu_initialization=False,
95
+ add_bias_linear=False,
96
+ add_qkv_bias=qkv_bias,
97
+ qk_layernorm=qk_layernorm,
98
+ )
99
+
100
+
101
+ def hf_to_mcore_config_qwen2moe(hf_config: PretrainedConfig, dtype: torch.dtype) -> TransformerConfig:
102
+ return _get_base_transformer_config(
103
+ hf_config=hf_config,
104
+ dtype=dtype,
105
+ use_cpu_initialization=False,
106
+ add_bias_linear=False,
107
+ layernorm_epsilon=hf_config.rms_norm_eps,
108
+ # MoE specific
109
+ moe_ffn_hidden_size=hf_config.moe_intermediate_size,
110
+ moe_router_bias_update_rate=0.001,
111
+ moe_router_topk=hf_config.num_experts_per_tok,
112
+ num_moe_experts=hf_config.num_experts,
113
+ moe_shared_expert_intermediate_size=hf_config.shared_expert_intermediate_size,
114
+ moe_aux_loss_coeff=hf_config.router_aux_loss_coef,
115
+ # moe_aux_loss_coeff=0.0,
116
+ moe_router_load_balancing_type="aux_loss",
117
+ moe_shared_expert_overlap=True,
118
+ moe_grouped_gemm=True,
119
+ moe_router_score_function="softmax",
120
+ # Other optimizations
121
+ persist_layer_norm=True,
122
+ bias_activation_fusion=True,
123
+ bias_dropout_fusion=True,
124
+ # Qwen specific
125
+ moe_router_pre_softmax=True,
126
+ add_qkv_bias=True,
127
+ )
128
+
129
+
130
+ def hf_to_mcore_config_mixtral(hf_config: PretrainedConfig, dtype: torch.dtype) -> TransformerConfig:
131
+ return _get_base_transformer_config(
132
+ hf_config=hf_config,
133
+ dtype=dtype,
134
+ use_cpu_initialization=False,
135
+ add_bias_linear=False,
136
+ layernorm_epsilon=hf_config.rms_norm_eps,
137
+ # MoE specific
138
+ num_moe_experts=hf_config.num_local_experts,
139
+ moe_aux_loss_coeff=hf_config.router_aux_loss_coef,
140
+ moe_router_topk=hf_config.num_experts_per_tok,
141
+ moe_router_pre_softmax=True,
142
+ moe_router_load_balancing_type="aux_loss",
143
+ moe_router_score_function="softmax",
144
+ moe_shared_expert_intermediate_size=None, # mixtral has no shared expert
145
+ moe_shared_expert_overlap=False, # mixtral has no shared expert
146
+ moe_ffn_hidden_size=hf_config.intermediate_size,
147
+ moe_router_bias_update_rate=0.001,
148
+ # moe_permute_fusion=True, # need TE 2.1+
149
+ moe_grouped_gemm=True,
150
+ # Other optimizations
151
+ persist_layer_norm=True,
152
+ apply_rope_fusion=True,
153
+ bias_activation_fusion=True,
154
+ bias_dropout_fusion=True,
155
+ )
156
+
157
+
158
+ def hf_to_mcore_config_qwen3moe(hf_config: PretrainedConfig, dtype: torch.dtype) -> TransformerConfig:
159
+ return _get_base_transformer_config(
160
+ hf_config=hf_config,
161
+ dtype=dtype,
162
+ use_cpu_initialization=False,
163
+ add_bias_linear=False,
164
+ layernorm_epsilon=hf_config.rms_norm_eps,
165
+ # MoE specific
166
+ moe_ffn_hidden_size=hf_config.moe_intermediate_size,
167
+ moe_router_bias_update_rate=0.001,
168
+ moe_router_topk=hf_config.num_experts_per_tok,
169
+ num_moe_experts=hf_config.num_experts,
170
+ moe_aux_loss_coeff=hf_config.router_aux_loss_coef,
171
+ # moe_aux_loss_coeff=0.0,
172
+ moe_router_load_balancing_type="aux_loss",
173
+ moe_grouped_gemm=True,
174
+ moe_router_score_function="softmax",
175
+ # Other optimizations
176
+ persist_layer_norm=True,
177
+ bias_activation_fusion=True,
178
+ bias_dropout_fusion=True,
179
+ # Qwen specific
180
+ moe_router_pre_softmax=True,
181
+ qk_layernorm=True,
182
+ )
183
+
184
+
185
+ def hf_to_mcore_config_dpskv3(hf_config: PretrainedConfig, dtype: torch.dtype) -> MLATransformerConfig:
186
+ # DeepseekV3ForCausalLM
187
+ raise NotImplementedError("DeepseekV3ForCausalLM is not supported yet")
188
+
189
+
190
+ def hf_to_mcore_config_qwen2_5_vl(hf_config: PretrainedConfig, dtype: torch.dtype) -> TransformerConfig:
191
+ # Qwen2_5_VLForConditionalGeneration
192
+ raise NotImplementedError("Qwen2_5_VLForConditionalGeneration is not supported yet")
193
+
194
+
195
+ def hf_to_mcore_config_llama4(hf_config: PretrainedConfig, dtype: torch.dtype) -> TransformerConfig:
196
+ # Llama4ForConditionalGeneration
197
+ raise NotImplementedError("Llama4ForConditionalGeneration is not supported yet")
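
Each converter above maps one HuggingFace architecture onto a Megatron TransformerConfig; the dispatch on hf_config.architectures happens in registry.py. A hypothetical sketch of that dispatch, reusing the functions defined in this file (the table below is illustrative, not the registry's actual contents):

import torch
from transformers import PretrainedConfig

_CONVERTERS = {
    "LlamaForCausalLM": hf_to_mcore_config_dense,
    "Qwen2ForCausalLM": hf_to_mcore_config_dense,
    "Qwen2MoeForCausalLM": hf_to_mcore_config_qwen2moe,
    "MixtralForCausalLM": hf_to_mcore_config_mixtral,
    "Qwen3MoeForCausalLM": hf_to_mcore_config_qwen3moe,
}

def convert(hf_config: PretrainedConfig, dtype: torch.dtype = torch.bfloat16):
    # Pick the converter for the first declared architecture.
    return _CONVERTERS[hf_config.architectures[0]](hf_config, dtype)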
verl/models/mcore/loader.py ADDED
@@ -0,0 +1,468 @@
1
+ # Copyright 2025 Bytedance Ltd. and/or its affiliates
2
+ # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import time
17
+
18
+ import torch
19
+ import torch.distributed as dist
20
+
21
+ from .saver import _megatron_calc_global_rank
22
+
23
+
24
+ def _megatron_calc_layer_map(config):
25
+ """Calculate the mapping of global layer_idx to local layer_idx
26
+ Returns:
27
+ layer_map (Dict: int -> tuple(int, int, int)):
28
+ mapping from the global layer index to
29
+ a tuple of (pp_rank, virtual_pp_rank, layer_idx inside model)
30
+ """
31
+ from megatron.core import mpu
32
+
33
+ pp_size = mpu.get_pipeline_model_parallel_world_size()
34
+ virtual_pp_size = mpu.get_virtual_pipeline_model_parallel_world_size() or 1
35
+
36
+ layer_map = dict()
37
+ num_layers_per_model = config.num_hidden_layers // pp_size // virtual_pp_size
38
+ assert num_layers_per_model * pp_size * virtual_pp_size == config.num_hidden_layers
39
+
40
+ for pp_rank_idx in range(pp_size):
41
+ for virtual_pp_rank_idx in range(virtual_pp_size):
42
+ layer_offset = virtual_pp_rank_idx * (config.num_hidden_layers // virtual_pp_size) + pp_rank_idx * num_layers_per_model
43
+ for layer_idx in range(num_layers_per_model):
44
+ layer_map[layer_offset + layer_idx] = (
45
+ pp_rank_idx,
46
+ virtual_pp_rank_idx,
47
+ layer_idx,
48
+ )
49
+ return layer_map
50
+
51
+
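_megatron_calc_layer_map interleaves virtual-pipeline chunks across pipeline stages: with 32 layers, pp_size=4 and virtual_pp_size=2, each pipeline rank owns two chunks of 4 consecutive layers, 16 layers apart. A standalone recomputation of the same arithmetic (no parallel state needed; the sizes are just an example):

num_layers, pp_size, vpp_size = 32, 4, 2
per_model = num_layers // pp_size // vpp_size          # 4 layers per (pp, vpp) chunk

layer_map = {}
for pp in range(pp_size):
    for vpp in range(vpp_size):
        offset = vpp * (num_layers // vpp_size) + pp * per_model
        for local in range(per_model):
            layer_map[offset + local] = (pp, vpp, local)

assert layer_map[0] == (0, 0, 0)    # first chunk on pp rank 0
assert layer_map[4] == (1, 0, 0)    # first chunk on pp rank 1
assert layer_map[16] == (0, 1, 0)   # second (virtual) chunk on pp rank 0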
52
+ def load_state_dict_to_megatron_gptmodel(state_dict, wrapped_models, config, params_dtype, is_value_model=False):
53
+ """Load merged state_dict to sharded Megatron module in training."""
54
+ from megatron.core import DistributedDataParallel as LocalDDP
55
+ from megatron.core import mpu
56
+ from megatron.core.transformer.module import Float16Module
57
+ from torch.nn.parallel import DistributedDataParallel as torchDDP
58
+
59
+ from verl.utils.megatron_utils import print_rank_0, unwrap_model
60
+
61
+ start_time = time.time()
62
+
63
+ def _get_gpt_model(model):
64
+ return model
65
+
66
+ def broadcast_params(module):
67
+ for param in module.parameters():
68
+ torch.distributed.broadcast(param.data, src=mpu.get_data_parallel_src_rank(), group=mpu.get_data_parallel_group())
69
+
70
+ dp_rank = mpu.get_data_parallel_rank()
71
+ pp_rank = mpu.get_pipeline_model_parallel_rank()
72
+ cp_rank = mpu.get_context_parallel_rank()
73
+ src_rank = _megatron_calc_global_rank(tp_rank=0, dp_rank=0, pp_rank=0, cp_rank=cp_rank)
74
+ pp_size = mpu.get_pipeline_model_parallel_world_size()
75
+ virtual_pp_size = mpu.get_virtual_pipeline_model_parallel_world_size() or 1
76
+ mp_group = mpu.get_model_parallel_group()
77
+
78
+ if torch.distributed.get_rank() == src_rank:
79
+ assert mp_group.rank() == 0, f"mp_rank:[{mp_group.rank()}] != 0 on rank #0"
80
+ assert pp_rank == 0, f"pp_rank:[{pp_rank}] != 0 on rank #0"
81
+ assert dp_rank == 0, f"dp_rank:[{dp_rank}] != 0 on rank #0"
82
+
83
+ if not isinstance(wrapped_models, (list, tuple)):
84
+ wrapped_models = [wrapped_models]
85
+
86
+ assert len(wrapped_models) == virtual_pp_size
87
+ num_layers_per_model = config.num_hidden_layers // pp_size // virtual_pp_size
88
+ assert num_layers_per_model * pp_size * virtual_pp_size == config.num_hidden_layers
89
+
90
+ models = [None] * len(wrapped_models)
91
+
92
+ for i, wrapped_model in enumerate(wrapped_models):
93
+ models[i] = unwrap_model(wrapped_model, (torchDDP, LocalDDP, Float16Module))
94
+ gpt_model_module = _get_gpt_model(models[i])
95
+ assert len(gpt_model_module.decoder.layers) == num_layers_per_model
96
+
97
+ def _broadcast_tensor(tensor, name) -> torch.Tensor:
98
+ """broadcast tensor from rank0 across mp_group"""
99
+ nonlocal state_dict
100
+ nonlocal mp_group
101
+ if torch.distributed.get_rank() == src_rank:
102
+ if name in state_dict:
103
+ weight = state_dict[name]
104
+ tensor_shape = weight.shape
105
+ else:
106
+ tensor_shape = None
107
+ else:
108
+ weight = None
109
+ tensor_shape = None
110
+
111
+ obj_list = [tensor_shape]
112
+ dist.broadcast_object_list(obj_list, src=src_rank, group=mp_group)
113
+ tensor_shape = obj_list[0]
114
+
115
+ if tensor_shape is None:
116
+ # all or none ranks in the mp_group should reach here
117
+ print_rank_0(f"tensor:[{name}] not in state_dict, skip load")
118
+ return
119
+
120
+ if tensor is None:
121
+ tensor = torch.empty(
122
+ tensor_shape,
123
+ dtype=params_dtype,
124
+ device=torch.cuda.current_device(),
125
+ requires_grad=False,
126
+ )
127
+ if torch.distributed.get_rank() == src_rank:
128
+ tensor.data.copy_(weight)
129
+ dist.broadcast(tensor, src=src_rank, group=mp_group)
130
+
131
+ def _broadcast_tp_shard_tensor_vocab(tensor, name, chunk_dim=0, mutate_func=None) -> torch.Tensor:
132
+ """broadcast tensor in tp shards across mp_group"""
133
+ nonlocal state_dict
134
+ nonlocal mp_group
135
+ tp_rank = mpu.get_tensor_model_parallel_rank()
136
+ tp_size = mpu.get_tensor_model_parallel_world_size()
137
+
138
+ if torch.distributed.get_rank() == src_rank:
139
+ if name in state_dict:
140
+ full_weight = state_dict[name]
141
+
142
+ if mutate_func is not None:
143
+ full_weight = mutate_func(full_weight)
144
+ tensor_chunk = torch.chunk(full_weight, tp_size, dim=chunk_dim)
145
+ chunk_shape = tensor_chunk[0].shape
146
+ else:
147
+ chunk_shape = None
148
+ else:
149
+ chunk_shape = None
150
+
151
+ obj_list = [chunk_shape]
152
+ dist.broadcast_object_list(obj_list, src=src_rank, group=mp_group)
153
+ chunk_shape = obj_list[0]
154
+ if chunk_shape is None:
155
+ # all or none ranks in the mp_group should reach here
156
+ print_rank_0(f"tp_shard tensor:[{name}] not in state_dict, skip loading")
157
+ return
158
+
159
+ if tensor is None:
160
+ sync_tensor = torch.empty(
161
+ chunk_shape,
162
+ dtype=params_dtype,
163
+ device=torch.cuda.current_device(),
164
+ requires_grad=False,
165
+ )
166
+ else:
167
+ assert tensor.shape == chunk_shape, f"rank #{torch.distributed.get_rank()} tensor {name} shape {tensor.shape} != {chunk_shape}"
168
+ sync_tensor = torch.empty_like(tensor, device=torch.cuda.current_device(), requires_grad=False)
169
+
170
+ for i in range(tp_size):
171
+ if torch.distributed.get_rank() == src_rank:
172
+ sync_tensor.data.copy_(tensor_chunk[i])
173
+ dist.broadcast(sync_tensor, src=src_rank, group=mp_group)
174
+ if (i == tp_rank) and (tensor is not None):
175
+ tensor.data.copy_(sync_tensor)
176
+
177
+ def _broadcast_tp_shard_tensor(tensor, name, chunk_dim=0, mutate_func=None) -> torch.Tensor:
178
+ """broadcast tensor in tp shards across mp_group"""
179
+ nonlocal state_dict
180
+ nonlocal mp_group
181
+ tp_rank = mpu.get_tensor_model_parallel_rank()
182
+ tp_size = mpu.get_tensor_model_parallel_world_size()
183
+
184
+ if torch.distributed.get_rank() == src_rank:
185
+ if name in state_dict:
186
+ full_weight = state_dict[name]
187
+ if mutate_func is not None:
188
+ full_weight = mutate_func(full_weight)
189
+ tensor_chunk = torch.chunk(full_weight, tp_size, dim=chunk_dim)
190
+ chunk_shape = tensor_chunk[0].shape
191
+ else:
192
+ chunk_shape = None
193
+ else:
194
+ chunk_shape = None
195
+
196
+ obj_list = [chunk_shape]
197
+ dist.broadcast_object_list(obj_list, src=src_rank, group=mp_group)
198
+ chunk_shape = obj_list[0]
199
+ if chunk_shape is None:
200
+ # all or none ranks in the mp_group should reach here
201
+ print_rank_0(f"tp_shard tensor:[{name}] not in state_dict, skip loading")
202
+ return
203
+
204
+ if tensor is None:
205
+ sync_tensor = torch.empty(
206
+ chunk_shape,
207
+ dtype=params_dtype,
208
+ device=torch.cuda.current_device(),
209
+ requires_grad=False,
210
+ )
211
+ else:
212
+ assert tensor.shape == chunk_shape, f"rank #{torch.distributed.get_rank()} tensor {name} shape {tensor.shape} != {chunk_shape}"
213
+ sync_tensor = torch.empty_like(tensor, device=torch.cuda.current_device(), requires_grad=False)
214
+
215
+ for i in range(tp_size):
216
+ if torch.distributed.get_rank() == src_rank:
217
+ sync_tensor.data.copy_(tensor_chunk[i])
218
+ dist.broadcast(sync_tensor, src=src_rank, group=mp_group)
219
+ if (i == tp_rank) and (tensor is not None):
220
+ tensor.data.copy_(sync_tensor)
221
+
222
+ def _broadcast_tp_shard_tensor_gate_up(tensor, gate_name, up_name) -> torch.Tensor:
223
+ """broadcast tensor in tp shards across mp_group"""
224
+ nonlocal state_dict
225
+ nonlocal mp_group
226
+ tp_rank = mpu.get_tensor_model_parallel_rank()
227
+ tp_size = mpu.get_tensor_model_parallel_world_size()
228
+
229
+ if torch.distributed.get_rank() == src_rank:
230
+ gate_weight = state_dict[gate_name]
231
+ up_weight = state_dict[up_name]
232
+ new_gate_up_weight = torch.empty(config.intermediate_size * 2, config.hidden_size, dtype=params_dtype, device=torch.cuda.current_device())
233
+ for i in range(tp_size):
234
+ intermediate_size_tp = config.intermediate_size // tp_size
235
+ gate_weight_tp = gate_weight[i * intermediate_size_tp : (i + 1) * intermediate_size_tp]
236
+ up_weight_tp = up_weight[i * intermediate_size_tp : (i + 1) * intermediate_size_tp]
237
+ new_gate_up_weight[intermediate_size_tp * 2 * i : intermediate_size_tp * 2 * (i + 1)].copy_(torch.cat([gate_weight_tp, up_weight_tp], dim=0))
238
+
239
+ tensor_chunk = torch.chunk(new_gate_up_weight, tp_size, dim=0)
240
+ chunk_shape = tensor_chunk[0].shape
241
+ else:
242
+ chunk_shape = None
243
+
244
+ obj_list = [chunk_shape]
245
+ dist.broadcast_object_list(obj_list, src=src_rank, group=mp_group)
246
+ chunk_shape = obj_list[0]
247
+ if chunk_shape is None:
248
+ # all or none ranks in the mp_group should reach here
249
+ print_rank_0(f"tp_shard tensor:[{gate_name, up_name}] not in state_dict, skip loading")
250
+ return
251
+
252
+ if tensor is None:
253
+ sync_tensor = torch.empty(
254
+ chunk_shape,
255
+ dtype=params_dtype,
256
+ device=torch.cuda.current_device(),
257
+ requires_grad=False,
258
+ )
259
+ else:
260
+ assert tensor.shape == chunk_shape, f"rank #{torch.distributed.get_rank()} tensor {gate_name, up_name} shape {tensor.shape} != {chunk_shape}"
261
+ sync_tensor = torch.empty_like(tensor, device=torch.cuda.current_device(), requires_grad=False)
262
+
263
+ for i in range(tp_size):
264
+ if torch.distributed.get_rank() == src_rank:
265
+ sync_tensor.data.copy_(tensor_chunk[i])
266
+ dist.broadcast(sync_tensor, src=src_rank, group=mp_group)
267
+ if (i == tp_rank) and (tensor is not None):
268
+ tensor.data.copy_(sync_tensor)
269
+
270
+ def _broadcast_tp_shard_tensor_qkv(tensor, q_name, k_name, v_name, bias=False) -> torch.Tensor:
271
+ """broadcast tensor in tp shards across mp_group"""
272
+ nonlocal state_dict
273
+ nonlocal mp_group
274
+ tp_rank = mpu.get_tensor_model_parallel_rank()
275
+ tp_size = mpu.get_tensor_model_parallel_world_size()
276
+
277
+ if torch.distributed.get_rank() == src_rank:
278
+ assert q_name in state_dict and k_name in state_dict and v_name in state_dict
279
+ full_weight_q = state_dict[q_name]
280
+ full_weight_k = state_dict[k_name]
281
+ full_weight_v = state_dict[v_name]
282
+
283
+ hidden_size_per_head = config.hidden_size // config.num_attention_heads
284
+
285
+ if config.num_key_value_heads >= tp_size:
286
+ q_size_tp = config.hidden_size // tp_size
287
+ kv_size_tp = hidden_size_per_head * config.num_key_value_heads // tp_size
288
+ total_size = q_size_tp + 2 * kv_size_tp
289
+ sizes = [total_size * tp_size]
290
+ if not bias:
291
+ sizes.append(config.hidden_size)
292
+ new_weight_qkv = torch.empty(*sizes, dtype=params_dtype, device=torch.cuda.current_device())
293
+ for i in range(tp_size):
294
+ q_part = full_weight_q[i * q_size_tp : (i + 1) * q_size_tp]
295
+ k_part = full_weight_k[i * kv_size_tp : (i + 1) * kv_size_tp]
296
+ v_part = full_weight_v[i * kv_size_tp : (i + 1) * kv_size_tp]
297
+ num_query_groups_per_partition = models[0].config.num_query_groups // tp_size
298
+ new_weight_qkv_this_tp = new_weight_qkv[i * total_size : (i + 1) * total_size]
299
+ q_part_per_head = torch.chunk(q_part, num_query_groups_per_partition, dim=0)
300
+ k_part_per_head = torch.chunk(k_part, num_query_groups_per_partition, dim=0)
301
+ v_part_per_head = torch.chunk(v_part, num_query_groups_per_partition, dim=0)
302
+ total_size_per_head = total_size // num_query_groups_per_partition
303
+ for j in range(num_query_groups_per_partition):
304
+ new_weight_qkv_this_tp[j * total_size_per_head : (j + 1) * total_size_per_head].copy_(torch.cat([q_part_per_head[j], k_part_per_head[j], v_part_per_head[j]], dim=0))
305
+
306
+ else:
307
+ q_size_tp = config.hidden_size // tp_size
308
+ kv_size_tp = hidden_size_per_head
309
+ total_size = q_size_tp + 2 * kv_size_tp
310
+ sizes = [total_size * tp_size]
311
+ if not bias:
312
+ sizes.append(config.hidden_size)
313
+ new_weight_qkv = torch.empty(*sizes, dtype=params_dtype, device=torch.cuda.current_device())
314
+ for i in range(tp_size):
315
+ q_part = full_weight_q[i * q_size_tp : (i + 1) * q_size_tp]
316
+ start_idx = i * config.num_key_value_heads // tp_size * hidden_size_per_head
317
+ end_idx = (i * config.num_key_value_heads // tp_size + 1) * hidden_size_per_head
318
+ k_part = full_weight_k[start_idx:end_idx]
319
+ v_part = full_weight_v[start_idx:end_idx]
320
+ new_weight_qkv_this_tp = new_weight_qkv[i * total_size : (i + 1) * total_size]
321
+ q_part_per_head = torch.chunk(q_part, config.num_attention_heads, dim=0)
322
+ k_part_per_head = torch.chunk(k_part, config.num_attention_heads, dim=0)
323
+ v_part_per_head = torch.chunk(v_part, config.num_attention_heads, dim=0)
324
+ total_size_per_head = total_size // config.num_attention_heads
325
+ for j in range(config.num_attention_heads):
326
+ new_weight_qkv_this_tp[j * total_size_per_head : (j + 1) * total_size_per_head].copy_(torch.cat([q_part_per_head[j], k_part_per_head[j], v_part_per_head[j]], dim=0))
327
+
328
+ tensor_chunk = torch.chunk(new_weight_qkv, tp_size, dim=0)
329
+ chunk_shape = tensor_chunk[0].shape
330
+ else:
331
+ chunk_shape = None
332
+
333
+ obj_list = [chunk_shape]
334
+ dist.broadcast_object_list(obj_list, src=src_rank, group=mp_group)
335
+ chunk_shape = obj_list[0]
336
+ if chunk_shape is None:
337
+ # all or none ranks in the mp_group should reach here
338
+ print_rank_0(f"tp_shard tensor:[{q_name, k_name, v_name}] not in state_dict, skip loading")
339
+ return
340
+
341
+ if tensor is None:
342
+ sync_tensor = torch.empty(
343
+ chunk_shape,
344
+ dtype=params_dtype,
345
+ device=torch.cuda.current_device(),
346
+ requires_grad=False,
347
+ )
348
+ else:
349
+ assert tensor.shape == chunk_shape, f"rank #{torch.distributed.get_rank()} tensor {q_name} shape {tensor.shape} != {chunk_shape}"
350
+ sync_tensor = torch.empty_like(tensor, device=torch.cuda.current_device(), requires_grad=False)
351
+
352
+ for i in range(tp_size):
353
+ if torch.distributed.get_rank() == src_rank:
354
+ sync_tensor.data.copy_(tensor_chunk[i])
355
+ dist.broadcast(sync_tensor, src=src_rank, group=mp_group)
356
+ if (i == tp_rank) and (tensor is not None):
357
+ tensor.data.copy_(sync_tensor)
358
+
359
+ if dp_rank == 0:
360
+ # Embeddings
361
+ # -------------------
362
+ print_rank_0("loading embeddings...")
363
+ gpt_model_module = _get_gpt_model(models[0])
364
+ embed_tokens_weight = None
365
+ if pp_rank == 0:
366
+ embed_tokens_weight = gpt_model_module.embedding.word_embeddings.weight
367
+ _broadcast_tp_shard_tensor_vocab(embed_tokens_weight, "model.embed_tokens.weight")
368
+
369
+ # Transformer layers
370
+ # -------------------
371
+ layer_map = _megatron_calc_layer_map(config)
372
+
373
+ for layer in range(config.num_hidden_layers):
374
+ print_rank_0(f"loading layer #{layer}...")
375
+ layer_name = f"model.layers.{layer}"
376
+ dst_pp_rank, dst_virtual_pp_rank, dst_layer_idx = layer_map[layer]
377
+
378
+ gpt_model_module = _get_gpt_model(models[dst_virtual_pp_rank])
379
+ sync_layer = gpt_model_module.decoder.layers[dst_layer_idx]
380
+
381
+ _broadcast_tensor(
382
+ sync_layer.self_attention.linear_qkv.layer_norm_weight if dst_pp_rank == pp_rank else None,
383
+ f"{layer_name}.input_layernorm.weight",
384
+ )
385
+
386
+ if f"{layer_name}.self_attn.q_norm.weight" in state_dict:
387
+ _broadcast_tensor(
388
+ sync_layer.self_attention.q_layernorm.weight if dst_pp_rank == pp_rank else None,
389
+ f"{layer_name}.self_attn.q_norm.weight",
390
+ )
391
+ _broadcast_tensor(
392
+ sync_layer.self_attention.k_layernorm.weight if dst_pp_rank == pp_rank else None,
393
+ f"{layer_name}.self_attn.k_norm.weight",
394
+ )
395
+
396
+ _broadcast_tp_shard_tensor_qkv(
397
+ sync_layer.self_attention.linear_qkv.weight if dst_pp_rank == pp_rank else None,
398
+ f"{layer_name}.self_attn.q_proj.weight",
399
+ f"{layer_name}.self_attn.k_proj.weight",
400
+ f"{layer_name}.self_attn.v_proj.weight",
401
+ )
402
+ if f"{layer_name}.self_attn.q_proj.bias" in state_dict:
403
+ _broadcast_tp_shard_tensor_qkv(
404
+ sync_layer.self_attention.linear_qkv.bias if dst_pp_rank == pp_rank else None,
405
+ f"{layer_name}.self_attn.q_proj.bias",
406
+ f"{layer_name}.self_attn.k_proj.bias",
407
+ f"{layer_name}.self_attn.v_proj.bias",
408
+ bias=True,
409
+ )
410
+
411
+ _broadcast_tp_shard_tensor(
412
+ sync_layer.self_attention.linear_proj.weight if dst_pp_rank == pp_rank else None,
413
+ f"{layer_name}.self_attn.o_proj.weight",
414
+ chunk_dim=1,
415
+ )
416
+ _broadcast_tensor(
417
+ sync_layer.mlp.linear_fc1.layer_norm_weight if dst_pp_rank == pp_rank else None,
418
+ f"{layer_name}.post_attention_layernorm.weight",
419
+ )
420
+
421
+ _broadcast_tp_shard_tensor_gate_up(
422
+ sync_layer.mlp.linear_fc1.weight if dst_pp_rank == pp_rank else None,
423
+ f"{layer_name}.mlp.gate_proj.weight",
424
+ f"{layer_name}.mlp.up_proj.weight",
425
+ )
426
+
427
+ _broadcast_tp_shard_tensor(
428
+ sync_layer.mlp.linear_fc2.weight if dst_pp_rank == pp_rank else None,
429
+ f"{layer_name}.mlp.down_proj.weight",
430
+ chunk_dim=1,
431
+ )
432
+ # Final Layernorm
433
+ # -------------------
434
+ print_rank_0("loading final layernorm...")
435
+ gpt_model_module = _get_gpt_model(models[-1])
436
+ _broadcast_tensor(
437
+ getattr(gpt_model_module.decoder.final_layernorm, "weight", None),
438
+ "model.norm.weight",
439
+ )
440
+
441
+ print_rank_0("loading lm_head...")
442
+ lm_head_weight = None
443
+ if pp_rank + 1 == pp_size:
444
+ lm_head_weight = gpt_model_module.output_layer.weight
445
+
446
+ if is_value_model:
447
+ # if torch.distributed.get_rank() == src_rank:
448
+ if "lm_head.weight" in state_dict and state_dict["lm_head.weight"].shape[0] == 1:
449
+ _broadcast_tensor(lm_head_weight, "lm_head.weight")
450
+ elif "reward_head.weight" in state_dict and state_dict["reward_head.weight"].shape[0] == 1:
451
+ _broadcast_tensor(lm_head_weight, "reward_head.weight")
452
+ print_rank_0("load lm_head from value_head weight")
453
+ else:
454
+ _broadcast_tensor(None, "lm_head.weight")
455
+ print_rank_0("failed to match lm_head in value_model")
456
+ # else:
457
+
458
+ # _broadcast_tensor(lm_head_weight, "lm_head.weight")
459
+
460
+ else:
461
+ _broadcast_tp_shard_tensor(lm_head_weight, "lm_head.weight")
462
+ dist.barrier()
463
+ # Broadcast weights inside data parallel groups
464
+ for wrapped_model in wrapped_models:
465
+ broadcast_params(wrapped_model)
466
+ pass
467
+ torch.cuda.empty_cache()
468
+ print_rank_0(f"loading megatron ckpt done, time elapsed {time.time() - start_time}s")
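All of the `_broadcast_*` helpers above follow the same two-phase pattern: the ranks first agree on the tensor shape with `broadcast_object_list`, then move the data with a single `broadcast`; a `None` shape lets every rank skip a missing key consistently instead of deadlocking on a mismatched collective. Below is a minimal stand-alone sketch of that pattern (the function and argument names are illustrative, and it assumes an initialized process group and a CUDA device):

```python
import torch
import torch.distributed as dist

def broadcast_from_src(state_dict, name, src_rank, group, dtype):
    """Illustrative two-phase broadcast: shape handshake first, then the data."""
    # Phase 1: only the source rank knows whether the key exists and what shape it has.
    if dist.get_rank() == src_rank:
        weight = state_dict.get(name)
        shape = None if weight is None else weight.shape
    else:
        weight, shape = None, None
    obj = [shape]
    dist.broadcast_object_list(obj, src=src_rank, group=group)
    shape = obj[0]
    if shape is None:
        return None  # every rank takes this branch together when the key is missing

    # Phase 2: allocate a buffer of the agreed shape and broadcast the actual data.
    buf = torch.empty(shape, dtype=dtype, device=torch.cuda.current_device())
    if dist.get_rank() == src_rank:
        buf.copy_(weight)
    dist.broadcast(buf, src=src_rank, group=group)
    return buf
```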
verl/models/mcore/model_forward.py ADDED
@@ -0,0 +1,50 @@
1
+ # Copyright 2025 Bytedance Ltd. and/or its affiliates
2
+ # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
3
+ # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ from verl.utils.megatron_utils import unwrap_model
18
+
19
+ from .util import postprocess_packed_seqs, preprocess_packed_seqs, recover_left_padding, remove_left_padding
20
+
21
+
22
+ def gptmodel_forward(model, input_ids, attention_mask, position_ids, sequence_parallel, value_model=False, pack_seqs=True):
23
+ """Default forward pass for GPT models with optional sequence packing."""
24
+ pre_process = unwrap_model(model).pre_process
25
+ post_process = unwrap_model(model).post_process
26
+ if pack_seqs:
27
+ batch_size, seq_len = attention_mask.shape[:2]
28
+ input_ids_rmpad, packed_seq_params = preprocess_packed_seqs(input_ids, attention_mask, pre_process=pre_process)
29
+ input_ids_rmpad = input_ids_rmpad.contiguous()
30
+ output_orig = model(
31
+ input_ids=input_ids_rmpad,
32
+ attention_mask=None,
33
+ position_ids=position_ids,
34
+ packed_seq_params=packed_seq_params,
35
+ )
36
+
37
+ output = postprocess_packed_seqs(output_orig, packed_seq_params, attention_mask, batch_size, seq_len, post_process=post_process)
38
+ else:
39
+ batch_size, sequence_length = attention_mask.shape
40
+ new_input_ids, new_attention_mask, new_position_ids = remove_left_padding(input_ids, attention_mask, position_ids, sequence_parallel, pre_process=pre_process)
41
+ output = model(input_ids=new_input_ids, attention_mask=new_attention_mask, position_ids=new_position_ids)
42
+ output = recover_left_padding(output, new_attention_mask, attention_mask, sequence_length, post_process=post_process)
43
+ if value_model and post_process:
44
+ output = output[..., 0]
45
+ return output
46
+
47
+
48
+ def gptmodel_forward_qwen2_5_vl(*args, **kwargs):
49
+ """Forward pass for Qwen2.5 VL model (not implemented)."""
50
+ raise NotImplementedError("VLM is not supported yet")
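The packed-sequence path above relies on `preprocess_packed_seqs`/`postprocess_packed_seqs` (defined in `util.py`) to drop padding before the forward pass and restore the padded layout afterwards. The snippet below is a self-contained, CPU-only illustration of that round trip using plain tensor indexing; it mirrors the idea, not the exact helper API:

```python
import torch

input_ids = torch.tensor([[11, 12, 13, 0, 0],
                          [21, 22, 0, 0, 0]])   # 0 = padding token (toy example)
attention_mask = input_ids.ne(0)

# Pack: drop padding and remember per-sequence lengths (thd-style layout).
seqlens = attention_mask.sum(dim=1)                                             # tensor([3, 2])
cu_seqlens = torch.cat([torch.zeros(1, dtype=torch.long), seqlens.cumsum(0)])   # [0, 3, 5]
packed = input_ids[attention_mask]                                              # tensor([11, 12, 13, 21, 22])

# Stand-in for the model output: one value per packed token.
packed_out = packed.float() * 10

# Unpack: scatter the packed results back into the padded (batch, seq) layout.
restored = torch.zeros(input_ids.shape, dtype=packed_out.dtype)
restored[attention_mask] = packed_out
assert restored[0, 2] == 130.0 and restored[1, 4] == 0.0
```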
verl/models/mcore/model_initializer.py ADDED
@@ -0,0 +1,160 @@
1
+ # Copyright 2025 Bytedance Ltd. and/or its affiliates
2
+ # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
3
+ # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ # use mcore transformer config to initialize the model
18
+ from abc import ABC, abstractmethod
19
+
20
+ from megatron.core.models.gpt.gpt_layer_specs import get_gpt_decoder_block_spec
21
+ from megatron.core.models.gpt.gpt_model import GPTModel
22
+
23
+ from .config_converter import PretrainedConfig, TransformerConfig
24
+
25
+
26
+ class BaseModelInitializer(ABC):
27
+ """Base class for model initializers."""
28
+
29
+ def __init__(self, tfconfig: TransformerConfig, hf_config: PretrainedConfig):
30
+ self.tfconfig = tfconfig
31
+ self.hf_config = hf_config
32
+
33
+ @abstractmethod
34
+ def get_transformer_layer_spec(self):
35
+ """Get the transformer layer specification.
36
+ https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/models/gpt/gpt_layer_specs.py"""
37
+ pass
38
+
39
+ def get_rope_scaling_args(self) -> dict:
40
+ """Get rope scaling args."""
41
+ rope_scaling_args = {}
42
+ if "rope_scaling" in self.hf_config:
43
+ if self.hf_config.rope_scaling is not None:
44
+ assert self.hf_config.rope_scaling["type"] == "linear", "only linear scaling is supported for now"
45
+ rope_scaling_args["seq_len_interpolation_factor"] = self.hf_config.rope_scaling["factor"]
46
+ return rope_scaling_args
47
+
48
+ def initialize(
49
+ self,
50
+ pre_process: bool = True,
51
+ post_process: bool = True,
52
+ share_embeddings_and_output_weights: bool = False,
53
+ value: bool = False,
54
+ **extra_kwargs,
55
+ ) -> GPTModel:
56
+ """Initialize a GPT model with the given configuration.
57
+ https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/models/gpt/gpt_model.py
58
+
59
+ Args:
60
+ pre_process (bool): include embedding layer.
61
+ post_process (bool): include an output layer.
62
+ share_embeddings_and_output_weights (bool): input embeddings and output logit weights are shared.
63
+ value (bool): add an extra linear layer for classification or regression.
64
+
65
+ Returns:
66
+ GPTModel: An initialized GPT model instance
67
+ """
68
+ transformer_layer_spec = self.get_transformer_layer_spec()
69
+ rope_scaling_args = self.get_rope_scaling_args()
70
+
71
+ model = GPTModel(
72
+ config=self.tfconfig,
73
+ transformer_layer_spec=transformer_layer_spec,
74
+ vocab_size=self.hf_config.vocab_size,
75
+ max_sequence_length=self.hf_config.max_position_embeddings,
76
+ pre_process=pre_process,
77
+ post_process=post_process,
78
+ share_embeddings_and_output_weights=share_embeddings_and_output_weights,
79
+ position_embedding_type="rope",
80
+ rotary_base=self.hf_config.rope_theta,
81
+ **rope_scaling_args,
82
+ )
83
+
84
+ if post_process and value:
85
+ from verl.models.llama.megatron.layers.parallel_linear import LinearForLastLayer
86
+
87
+ model.output_layer = LinearForLastLayer(input_size=self.tfconfig.hidden_size, output_size=1, config=self.tfconfig)
88
+
89
+ return model
90
+
91
+
92
+ class DenseModel(BaseModelInitializer):
93
+ """Initializer for dense models like Llama and Qwen2."""
94
+
95
+ def get_transformer_layer_spec(self):
96
+ assert self.tfconfig.normalization == "RMSNorm", "only RMSNorm is supported for now"
97
+ return get_gpt_decoder_block_spec(self.tfconfig, use_transformer_engine=True)
98
+
99
+
100
+ class Qwen2MoEModel(BaseModelInitializer):
101
+ """Initializer for Qwen2 MoE models."""
102
+
103
+ def get_transformer_layer_spec(self):
104
+ assert self.tfconfig.normalization == "RMSNorm", "only RMSNorm is supported for now"
105
+ transformer_layer_spec = get_gpt_decoder_block_spec(self.tfconfig, use_transformer_engine=True)
106
+
107
+ # Patch layer spec for shared experts
108
+ for i in range(len(transformer_layer_spec.layer_specs)):
109
+ transformer_layer_spec.layer_specs[i].submodules.mlp.submodules.shared_experts.params["gate"] = True
110
+
111
+ return transformer_layer_spec
112
+
113
+ def initialize(self, freeze_moe_router: bool = True, **kwargs):
114
+ # Qwen default freeze_moe_router: true
115
+ model = super().initialize(**kwargs)
116
+ if freeze_moe_router:
117
+ for layer in model.decoder.layers:
118
+ layer.mlp.router.weight.requires_grad = False
119
+ layer.mlp.shared_experts.gate_weight.requires_grad = False
120
+ return model
121
+
122
+
123
+ class MixtralModel(BaseModelInitializer):
124
+ """Initializer for Mixtral models."""
125
+
126
+ def get_transformer_layer_spec(self):
127
+ assert self.tfconfig.normalization == "RMSNorm", "only RMSNorm is supported for now"
128
+ transformer_layer_spec = get_gpt_decoder_block_spec(self.tfconfig, use_transformer_engine=True)
129
+ return transformer_layer_spec
130
+
131
+ def initialize(self, freeze_moe_router: bool = False, **kwargs):
132
+ model = super().initialize(**kwargs)
133
+ if freeze_moe_router:
134
+ for layer in model.decoder.layers:
135
+ layer.mlp.router.weight.requires_grad = False
136
+ return model
137
+
138
+
139
+ class Qwen3MoEModel(BaseModelInitializer):
140
+ """Initializer for Qwen3 MoE models."""
141
+
142
+ def get_transformer_layer_spec(self):
143
+ assert self.tfconfig.normalization == "RMSNorm", "only RMSNorm is supported for now"
144
+ transformer_layer_spec = get_gpt_decoder_block_spec(self.tfconfig, use_transformer_engine=True)
145
+ return transformer_layer_spec
146
+
147
+ def initialize(self, freeze_moe_router: bool = True, **kwargs):
148
+ # Qwen default freeze_moe_router: true
149
+ model = super().initialize(**kwargs)
150
+ if freeze_moe_router:
151
+ for layer in model.decoder.layers:
152
+ layer.mlp.router.weight.requires_grad = False
153
+ return model
154
+
155
+
156
+ class Qwen25VLModel(BaseModelInitializer):
157
+ """Initializer for Qwen2.5 VL models."""
158
+
159
+ def get_transformer_layer_spec(self):
160
+ raise NotImplementedError("VLM is not supported yet")
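A minimal usage sketch for the initializers above, assuming Megatron's parallel state has already been initialized and using the dense config converter from this commit; the model name and dtype are examples only. With `value=True`, the language-model head is replaced by a single-output `LinearForLastLayer`, which is how critic/reward models are built:

```python
import torch
from transformers import AutoConfig

from verl.models.mcore.config_converter import hf_to_mcore_config_dense
from verl.models.mcore.model_initializer import DenseModel

hf_config = AutoConfig.from_pretrained("Qwen/Qwen2-7B-Instruct")    # example model
tfconfig = hf_to_mcore_config_dense(hf_config, torch.bfloat16)      # HF config -> TransformerConfig

initializer = DenseModel(tfconfig, hf_config)
critic = initializer.initialize(
    pre_process=True,      # this pipeline stage owns the embedding
    post_process=True,     # this pipeline stage owns the output layer
    share_embeddings_and_output_weights=False,
    value=True,            # swap the LM head for a 1-dim value head
)
```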
verl/models/mcore/readme.md ADDED
@@ -0,0 +1,99 @@
1
+ # verl Megatron-Core Models
2
+ Earlier versions of verl use `Megatron-LM` 0.4 and workaround huggingface model classes. To benefit from the latest features and speedups of modern Megatron, we are migrating to `Megatron-Core` (mcore) and adopting the recommended `GPTModel` class for all language models. With mcore `GPTModel`, we can use the latest features such as `context parallel`, `expert parallel`, and `dist_checkpointing`, and we can adopt new mcore features with little effort in the future.
3
+
4
+ The migration has been successful with the help of the mcore team and the community. What we have done is:
5
+ 1. update `Megatron` version to `0.11.0`
6
+ 2. migrate `LlamaForCausalLM` and `Qwen2ForCausalLM` to mcore `GPTModel`
7
+ 3. support sequence packing/thd format.
8
+ 4. support `tensor parallel`, `pipeline parallel`, `sequence parallel`, `virtual pipeline parallel`, `context parallel`.
9
+ 5. support the mcore `dist_checkpointing` feature and a basic offline weights conversion script from huggingface to the mcore `dist_checkpointing` format.
10
+
11
+ We are working on the following features:
12
+ - support `Qwen2MoeForCausalLM`
13
+ - support `MixtralForCausalLM`
14
+ - support `DeepseekV3ForCausalLM`
15
+ - support `expert parallel`
16
+
17
+ Features we invite the community to contribute:
18
+ - better scripts for offline weights conversion from huggingface to the mcore `dist_checkpointing` format.
19
+ - conversion of large models with multiple GPUs
20
+ - conversion of large models with single GPU
21
+ - refactor `megatron_checkpoint_manager.py` to use the `dist_checkpointing` format.
22
+ - support llama4
23
+ - support qwen2.5-vl
24
+
25
+ To track the progress of verl mcore integration, please refer to the [mcore integration issue](https://github.com/volcengine/verl/issues/1033).
26
+
27
+ ## How things work now
28
+ To engage the community in contributing, here are the key steps in our mcore integration process and features under development.
29
+
30
+ The huggingface `transformers` library is the de facto standard model zoo, while mcore excels at computation efficiency. The main challenge is converting between the two.
31
+ The main steps are:
32
+ 1. modelling the huggingface model with mcore `GPTModel`
33
+ - a. convert the huggingface config to mcore `TransformerConfig`
34
+ - b. init the mcore `GPTModel` with the converted config
35
+ - c. load the huggingface model weights to the `GPTModel`
36
+ 2. online weight conversion from mcore to huggingface (because the rollout engine `vLLM` consumes the huggingface format)
37
+ - a. bridge the gap between mcore and huggingface weights format and name mapping
38
+ - b. online resharding the mcore weights to rollout engine
39
+ - this part is complicated by the composition of multiple parallel strategies between mcore and the rollout engine
40
+ 3. support the mcore features in verl
41
+ - a. support `tensor parallel`, `pipeline parallel`, `sequence parallel`, `virtual pipeline parallel`, `context parallel`
42
+ - b. support recompute and other mcore speed-up features
43
+
44
+ 4. checkpointing
45
+ - a. support resuming verl training.
46
+ - b. support exporting the mcore checkpoint to huggingface format, for downstream inference.
47
+
48
+ ### Modelling the huggingface model with mcore `GPTModel`
49
+ The first step is to convert the huggingface config to an mcore `TransformerConfig` and initialize the mcore `GPTModel` with the converted config. See the code in `verl/models/mcore/config_converter.py` and `verl/models/mcore/model_initializer.py`; the corresponding model forward code is in `verl/models/mcore/model_forward.py`. A short usage sketch follows the list below.
50
+
51
+ There are two ways of loading the huggingface model weights into the `GPTModel`:
52
+ 1. Runtime loading
53
+ - every rank loads the entire huggingface model weights and then shards and converts them to mcore weights.
54
+ - speed is slow and memory consumption is high.
55
+ - this way is deprecated and will not support new models.
56
+ 2. Offline loading
57
+ - use an offline script to convert the huggingface model weights to mcore weights and save them in the mcore `dist_checkpointing` format.
58
+ - online loading and sharding are handled automatically by the mcore `dist_checkpointing` format; the speed is fast and memory consumption is low.
59
+ - the offline script is in `verl/scripts/converter_hf_to_mcore.py`.
60
+
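For example, a sketch of steps a and b using the registry helpers added in this commit (it assumes Megatron's parallel state is already initialized; the model name is illustrative):

```python
import torch
from transformers import AutoConfig
from verl.models.mcore.registry import hf_to_mcore_config, init_mcore_model

hf_config = AutoConfig.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")  # example model
tfconfig = hf_to_mcore_config(hf_config, torch.bfloat16)    # step a: HF config -> TransformerConfig
model = init_mcore_model(tfconfig, hf_config, pre_process=True, post_process=True)  # step b
# step c: weights are then loaded either at runtime (deprecated) or from the offline
# dist_checkpointing directory produced by verl/scripts/converter_hf_to_mcore.py.
```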
61
+ ### online weight conversion from mcore to huggingface
62
+ See function `convert_megatron_model_to_transformers_model` in `verl/utils/megatron_utils.py` for the details.
63
+
64
+ It should be refactored for extensibility and better performance.
65
+
66
+ ### support the mcore features in verl
67
+ Most of the features of `GPTModel` are supported out of the box in verl by changing the `TransformerConfig`, except those related to parallel strategies, such as `expert parallel`.
68
+ Features related to parallel strategies require changes to the online weight conversion (especially the resharding part) and to verl work dispatching.
69
+
70
+ ### checkpointing
71
+ The existing checkpointing code is in `verl/utils/checkpoint/megatron_checkpoint_manager.py`, and the script to convert a checkpoint to huggingface format is in `verl/scripts/model_merger.py`.
72
+
73
+ The existing checkpoint format simply saves every rank's weights and optimizer states. It should be refactored to use the `dist_checkpointing` format.
74
+
75
+
76
+ ## How to support new models
77
+ 1. make sure the model is supported by vLLM
78
+ 2. modelling the huggingface model with mcore `GPTModel` (the [Pai-Megatron-Patch](https://github.com/alibaba/Pai-Megatron-Patch/tree/main) is a good reference)
79
+ - a. convert the huggingface config to mcore `TransformerConfig`
80
+ - b. init the mcore `GPTModel` with the converted config
81
+ - c. load the huggingface model weights to the `GPTModel`
82
+ - d. for VLMs the interface might be different; it is OK to add a new model class with `GPTModel` as its module.
83
+ 3. offline weights conversion from huggingface to mcore `dist_checkpointing` format
84
+ 4. support online weights conversion from mcore to huggingface
85
+ - it is recommended to initialize a vLLM model with the converted mcore weights, and then test whether the generated sequences are correct.
86
+
87
+
88
+ ## How to scale up to larger models like deepseek-v3 or other 100B+ models
89
+ The greatest challenge in scaling up to larger models is memory consumption.
90
+
91
+ The necessary features under development for scaling up are
92
+ 1. Training engine part
93
+ - expert parallel
94
+ 2. Rollout engine part
95
+ - pipeline parallel
96
+ - expert parallel
97
+ - more efficient and general weight resharding and loading
98
+ 3. Offline weights conversion
99
+ - support weights larger than a single GPU's memory
verl/models/mcore/registry.py ADDED
@@ -0,0 +1,179 @@
1
+ # Copyright 2025 Bytedance Ltd. and/or its affiliates
2
+ # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """
16
+ Registry module for model architecture components.
17
+ """
18
+
19
+ from enum import Enum
20
+ from typing import Callable, Dict, Type
21
+
22
+ import torch
23
+ import torch.nn as nn
24
+
25
+ from .config_converter import (
26
+ PretrainedConfig,
27
+ TransformerConfig,
28
+ hf_to_mcore_config_dense,
29
+ hf_to_mcore_config_dpskv3,
30
+ hf_to_mcore_config_llama4,
31
+ hf_to_mcore_config_mixtral,
32
+ hf_to_mcore_config_qwen2_5_vl,
33
+ hf_to_mcore_config_qwen2moe,
34
+ hf_to_mcore_config_qwen3moe,
35
+ )
36
+ from .model_forward import (
37
+ gptmodel_forward,
38
+ )
39
+ from .model_initializer import (
40
+ BaseModelInitializer,
41
+ DenseModel,
42
+ MixtralModel,
43
+ Qwen2MoEModel,
44
+ Qwen3MoEModel,
45
+ Qwen25VLModel,
46
+ )
47
+ from .weight_converter import (
48
+ McoreToHFWeightConverterDense,
49
+ McoreToHFWeightConverterMixtral,
50
+ McoreToHFWeightConverterQwen2Moe,
51
+ McoreToHFWeightConverterQwen3Moe,
52
+ )
53
+
54
+
55
+ class SupportedModel(Enum):
56
+ LLAMA = "LlamaForCausalLM" # tested
57
+ QWEN2 = "Qwen2ForCausalLM" # tested
58
+ QWEN2_MOE = "Qwen2MoeForCausalLM" # pending
59
+ DEEPSEEK_V3 = "DeepseekV3ForCausalLM" # not tested
60
+ MIXTRAL = "MixtralForCausalLM" # tested
61
+ QWEN2_5_VL = "Qwen2_5_VLForConditionalGeneration" # not supported
62
+ LLAMA4 = "Llama4ForConditionalGeneration" # not tested
63
+ QWEN3 = "Qwen3ForCausalLM" # tested
64
+ QWEN3_MOE = "Qwen3MoeForCausalLM" # not tested
65
+
66
+
67
+ # Registry for model configuration converters
68
+ MODEL_CONFIG_CONVERTER_REGISTRY: Dict[SupportedModel, Callable[[PretrainedConfig, torch.dtype], TransformerConfig]] = {
69
+ SupportedModel.LLAMA: hf_to_mcore_config_dense,
70
+ SupportedModel.QWEN2: hf_to_mcore_config_dense,
71
+ SupportedModel.QWEN2_MOE: hf_to_mcore_config_qwen2moe,
72
+ SupportedModel.DEEPSEEK_V3: hf_to_mcore_config_dpskv3,
73
+ SupportedModel.MIXTRAL: hf_to_mcore_config_mixtral,
74
+ SupportedModel.QWEN2_5_VL: hf_to_mcore_config_qwen2_5_vl,
75
+ SupportedModel.LLAMA4: hf_to_mcore_config_llama4,
76
+ SupportedModel.QWEN3: hf_to_mcore_config_dense,
77
+ SupportedModel.QWEN3_MOE: hf_to_mcore_config_qwen3moe,
78
+ }
79
+
80
+ # Registry for model initializers
81
+ MODEL_INITIALIZER_REGISTRY: Dict[SupportedModel, Type[BaseModelInitializer]] = {
82
+ SupportedModel.LLAMA: DenseModel,
83
+ SupportedModel.QWEN2: DenseModel,
84
+ SupportedModel.QWEN2_MOE: Qwen2MoEModel,
85
+ SupportedModel.MIXTRAL: MixtralModel,
86
+ SupportedModel.DEEPSEEK_V3: DenseModel,
87
+ SupportedModel.QWEN2_5_VL: Qwen25VLModel,
88
+ SupportedModel.LLAMA4: DenseModel,
89
+ SupportedModel.QWEN3: DenseModel,
90
+ SupportedModel.QWEN3_MOE: Qwen3MoEModel,
91
+ }
92
+
93
+ # Registry for model forward functions
94
+ MODEL_FORWARD_REGISTRY: Dict[SupportedModel, Callable] = {
95
+ SupportedModel.LLAMA: gptmodel_forward,
96
+ SupportedModel.QWEN2: gptmodel_forward,
97
+ SupportedModel.QWEN2_MOE: gptmodel_forward,
98
+ SupportedModel.MIXTRAL: gptmodel_forward,
99
+ SupportedModel.DEEPSEEK_V3: gptmodel_forward,
100
+ SupportedModel.QWEN2_5_VL: gptmodel_forward,
101
+ SupportedModel.LLAMA4: gptmodel_forward,
102
+ SupportedModel.QWEN3: gptmodel_forward,
103
+ SupportedModel.QWEN3_MOE: gptmodel_forward,
104
+ }
105
+
106
+ # Registry for model weight converters
107
+ MODEL_WEIGHT_CONVERTER_REGISTRY: Dict[SupportedModel, Type] = {
108
+ SupportedModel.LLAMA: McoreToHFWeightConverterDense,
109
+ SupportedModel.QWEN2: McoreToHFWeightConverterDense,
110
+ SupportedModel.QWEN2_MOE: McoreToHFWeightConverterQwen2Moe,
111
+ SupportedModel.MIXTRAL: McoreToHFWeightConverterMixtral,
112
+ SupportedModel.QWEN3: McoreToHFWeightConverterDense,
113
+ SupportedModel.QWEN3_MOE: McoreToHFWeightConverterQwen3Moe,
114
+ }
115
+
116
+
117
+ def get_supported_model(model_type: str) -> SupportedModel:
118
+ try:
119
+ return SupportedModel(model_type)
120
+ except ValueError as err:
121
+ supported_models = [e.value for e in SupportedModel]
122
+ raise NotImplementedError(f"Model Type: {model_type} not supported. Supported models: {supported_models}") from err
123
+
124
+
125
+ def hf_to_mcore_config(hf_config: PretrainedConfig, dtype: torch.dtype) -> TransformerConfig:
126
+ assert len(hf_config.architectures) == 1, "Only one architecture is supported for now"
127
+ model = get_supported_model(hf_config.architectures[0])
128
+ return MODEL_CONFIG_CONVERTER_REGISTRY[model](hf_config, dtype)
129
+
130
+
131
+ def init_mcore_model(
132
+ tfconfig: TransformerConfig,
133
+ hf_config: PretrainedConfig,
134
+ pre_process: bool = True,
135
+ post_process: bool = None,
136
+ *,
137
+ share_embeddings_and_output_weights: bool = False,
138
+ value: bool = False,
139
+ **extra_kwargs, # may be used for vlm and moe
140
+ ) -> nn.Module:
141
+ """
142
+ Initialize a Mcore model.
143
+
144
+ Args:
145
+ tfconfig: The transformer config.
146
+ hf_config: The HuggingFace config.
147
+ pre_process: Whether this pipeline stage includes the embedding layer.
148
+ post_process: Whether this pipeline stage includes the output layer.
149
+ share_embeddings_and_output_weights: Whether to share embeddings and output weights.
150
+ value: Whether to add an extra linear output layer for value prediction.
151
+ **extra_kwargs: Additional keyword arguments.
152
+
153
+ Returns:
154
+ The initialized model.
155
+ """
156
+ assert len(hf_config.architectures) == 1, "Only one architecture is supported for now"
157
+ model = get_supported_model(hf_config.architectures[0])
158
+ initializer_cls = MODEL_INITIALIZER_REGISTRY[model]
159
+ initializer = initializer_cls(tfconfig, hf_config)
160
+ return initializer.initialize(pre_process=pre_process, post_process=post_process, share_embeddings_and_output_weights=share_embeddings_and_output_weights, value=value, **extra_kwargs)
161
+
162
+
163
+ def get_mcore_forward_fn(hf_config: PretrainedConfig) -> Callable:
164
+ """
165
+ Get the forward function for given model architecture.
166
+ """
167
+ assert len(hf_config.architectures) == 1, "Only one architecture is supported for now"
168
+ model = get_supported_model(hf_config.architectures[0])
169
+ return MODEL_FORWARD_REGISTRY[model]
170
+
171
+
172
+ def get_mcore_weight_converter(hf_config: PretrainedConfig, dtype: torch.dtype) -> Callable:
173
+ """
174
+ Get the weight converter for given model architecture.
175
+ """
176
+ assert len(hf_config.architectures) == 1, "Only one architecture is supported for now"
177
+ model = get_supported_model(hf_config.architectures[0])
178
+ tfconfig = hf_to_mcore_config(hf_config, dtype)
179
+ return MODEL_WEIGHT_CONVERTER_REGISTRY[model](hf_config, tfconfig)
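A usage sketch for the remaining lookups, assuming `hf_config` describes one of the architectures listed in `SupportedModel`; the model name and dtype are examples:

```python
import torch
from transformers import AutoConfig

from verl.models.mcore.registry import get_mcore_forward_fn, get_mcore_weight_converter

hf_config = AutoConfig.from_pretrained("Qwen/Qwen2-7B-Instruct")   # example model

forward_fn = get_mcore_forward_fn(hf_config)                       # gptmodel_forward for dense models
weight_converter = get_mcore_weight_converter(hf_config, torch.bfloat16)
# forward_fn is called inside the Megatron worker's forward step, while weight_converter
# maps mcore parameter names/layouts back to huggingface format for the rollout engine.
```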
verl/models/mcore/saver.py ADDED
@@ -0,0 +1,459 @@
1
+ # Copyright 2025 Bytedance Ltd. and/or its affiliates
2
+ # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import time
17
+
18
+ import torch
19
+ import torch.distributed as dist
20
+ from megatron.core import mpu
21
+ from megatron.core.distributed import DistributedDataParallel as LocalDDP
22
+ from megatron.core.transformer.module import Float16Module
23
+ from torch.nn.parallel import DistributedDataParallel as torchDDP
24
+
25
+ from verl.utils.megatron_utils import print_rank_0, unwrap_model
26
+
27
+
28
+ def _megatron_calc_global_rank(tp_rank: int = 0, dp_rank: int = 0, pp_rank: int = 0, cp_rank: int = 0, ep_rank: int = 0):
29
+ """Calculate global rank with support for CP/EP parallelism"""
30
+
31
+ # Get parallel sizes for each dimension
32
+ tp_size = mpu.get_tensor_model_parallel_world_size()
33
+ dp_size = mpu.get_data_parallel_world_size()
34
+ pp_size = mpu.get_pipeline_model_parallel_world_size()
35
+ cp_size = mpu.get_context_parallel_world_size()
36
+ # ep_size = mpu.get_expert_model_parallel_world_size()
37
+
38
+ # Verify total GPU count matches (must be consistent with parallel_state.py)
39
+ total_size = tp_size * dp_size * pp_size * cp_size
40
+ assert total_size == torch.distributed.get_world_size(), f"{tp_size}x{dp_size}x{pp_size}x{cp_size} != {torch.distributed.get_world_size()}"
41
+
42
+ # Core calculation logic (corresponds to RankGenerator order parameter)
43
+ # Assumes default order is "tp-cp-ep-dp-pp"
44
+ return ((pp_rank * dp_size + dp_rank) * cp_size + cp_rank) * tp_size + tp_rank
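A quick self-contained check of this formula under an assumed 8-GPU layout (tp=2, cp=1, dp=2, pp=2), confirming that tp is the fastest-varying dimension and pp the slowest:

```python
tp_size, cp_size, dp_size, pp_size = 2, 1, 2, 2   # illustrative 8-GPU layout

def global_rank(tp, dp, pp, cp=0):
    return ((pp * dp_size + dp) * cp_size + cp) * tp_size + tp

assert global_rank(tp=1, dp=0, pp=0) == 1   # tp moves first
assert global_rank(tp=0, dp=1, pp=0) == 2   # then dp (cp_size is 1 here)
assert global_rank(tp=0, dp=0, pp=1) == 4   # pp moves slowest
assert global_rank(tp=1, dp=1, pp=1) == 7
```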
45
+
46
+
47
+ def _megatron_calc_layer_map(config):
48
+ """Calculate the mapping of global layer_idx to local layer_idx
49
+ Returns:
50
+ layer_map (Dict: int -> tuple(int, int, int)):
51
+ mapping from the global layer index to
52
+ a tuple of (pp_rank, virtual_pp_rank, layer_idx inside model)
53
+ """
54
+ from megatron.core import mpu
55
+
56
+ pp_size = mpu.get_pipeline_model_parallel_world_size()
57
+ virtual_pp_size = mpu.get_virtual_pipeline_model_parallel_world_size() or 1
58
+
59
+ layer_map = dict()
60
+ num_layers_per_model = config.num_hidden_layers // pp_size // virtual_pp_size
61
+ assert num_layers_per_model * pp_size * virtual_pp_size == config.num_hidden_layers
62
+
63
+ for pp_rank_idx in range(pp_size):
64
+ for virtual_pp_rank_idx in range(virtual_pp_size):
65
+ layer_offset = virtual_pp_rank_idx * (config.num_hidden_layers // virtual_pp_size) + pp_rank_idx * num_layers_per_model
66
+ for layer_idx in range(num_layers_per_model):
67
+ layer_map[layer_offset + layer_idx] = (
68
+ pp_rank_idx,
69
+ virtual_pp_rank_idx,
70
+ layer_idx,
71
+ )
72
+ return layer_map
73
+
74
+
75
+ def merge_megatron_ckpt_gptmodel(wrapped_models, config, dtype, is_value_model=False, tie_word_embeddings=False):
76
+ """Merge sharded parameters of a Megatron module into a merged checkpoint.
77
+
78
+ Args:
79
+ wrapped_models (list of megatron.core.distributed.DistributedDataParallel):
80
+ The local DDP wrapped megatron modules.
81
+ config (str or None):
82
+ HF config for model
83
+ dtype: model params type
84
+ is_value_model: if model is value model
85
+ tie_word_embeddings: tie_word_embeddings
86
+ Returns:
87
+ state_dict (dict):
88
+ The merged state_dict in rank 0, and an empty dictionary in other ranks.
89
+ """
90
+ start_time = time.time()
91
+
92
+ def _get_gpt_model(model):
93
+ return model
94
+
95
+ dp_rank = mpu.get_data_parallel_rank()
96
+ pp_size = mpu.get_pipeline_model_parallel_world_size()
97
+ pp_rank = mpu.get_pipeline_model_parallel_rank()
98
+ cp_rank = mpu.get_context_parallel_rank()
99
+ virtual_pp_size = mpu.get_virtual_pipeline_model_parallel_world_size() or 1
100
+ mp_group = mpu.get_model_parallel_group()
101
+
102
+ if dist.get_rank() == 0:
103
+ assert mp_group.rank() == 0, f"mp_rank:[{mp_group.rank()}] != 0 on rank #0"
104
+ assert pp_rank == 0, f"pp_rank:[{pp_rank}] != 0 on rank #0"
105
+ assert dp_rank == 0, f"dp_rank:[{dp_rank}] != 0 on rank #0"
106
+
107
+ if not isinstance(wrapped_models, (list, tuple)):
108
+ wrapped_models = [wrapped_models]  # wrap a single module into a list
109
+
110
+ assert len(wrapped_models) == virtual_pp_size
111
+ num_layers_per_model = config.num_hidden_layers // pp_size // virtual_pp_size
112
+ assert num_layers_per_model * pp_size * virtual_pp_size == config.num_hidden_layers
113
+
114
+ models = [None] * len(wrapped_models)
115
+
116
+ for i, wrapped_model in enumerate(wrapped_models):
117
+ models[i] = unwrap_model(wrapped_model, (torchDDP, LocalDDP, Float16Module))
118
+ assert len(models[i].decoder.layers) == num_layers_per_model, "len model layers {} not equal to num_layers_per_model {}".format(len(models[i].decoder.layers), num_layers_per_model)
119
+
120
+ state_dict = dict()
121
+
122
+ def _get_cpu_tensor(tensor: torch.Tensor):
123
+ if tensor is None:
124
+ return None
125
+ if tensor.device == torch.device("cpu"):
126
+ return tensor.detach().clone()
127
+ return tensor.detach().cpu()
128
+
129
+ def _broadcast_tensor(tensor, name, src_pp_rank) -> torch.Tensor:
130
+ """broadcast tensor across mp_group"""
131
+ nonlocal state_dict
132
+ nonlocal mp_group
133
+ src_rank = _megatron_calc_global_rank(tp_rank=0, dp_rank=0, pp_rank=src_pp_rank, cp_rank=cp_rank)
134
+
135
+ if torch.distributed.get_rank() == src_rank:
136
+ if tensor is None:
137
+ weight = None
138
+ tensor_shape = None
139
+ else:
140
+ weight = tensor
141
+ tensor_shape = weight.shape
142
+ else:
143
+ weight = None
144
+ tensor_shape = None
145
+
146
+ obj_list = [tensor_shape]
147
+ dist.broadcast_object_list(obj_list, src=src_rank, group=mp_group)
148
+ tensor_shape = obj_list[0]
149
+
150
+ if tensor_shape is None:
151
+ # all or none ranks in the mp_group should reach here
152
+ print_rank_0(f"tensor:[{name}] not exist, skip collect")
153
+ return
154
+
155
+ if weight is None:
156
+ weight = torch.empty(
157
+ tensor_shape,
158
+ dtype=dtype,
159
+ device=torch.cuda.current_device(),
160
+ requires_grad=False,
161
+ )
162
+
163
+ dist.broadcast(weight, src=src_rank, group=mp_group)
164
+
165
+ if torch.distributed.get_rank() == 0:
166
+ state_dict[name] = _get_cpu_tensor(weight)
167
+
168
+ def _broadcast_tp_shard_tensor(tensor, name, src_pp_rank, concat_dim=0, mutate_func=None) -> torch.Tensor:
169
+ """broadcast tensor in tp shards across mp_group"""
170
+ nonlocal state_dict
171
+ nonlocal mp_group
172
+ # tp_rank = mpu.get_tensor_model_parallel_rank()
173
+ tp_size = mpu.get_tensor_model_parallel_world_size()
174
+ src_rank = _megatron_calc_global_rank(tp_rank=0, dp_rank=0, pp_rank=src_pp_rank, cp_rank=cp_rank)
175
+
176
+ chunk_shape = tensor.shape if torch.distributed.get_rank() == src_rank else None
177
+
178
+ obj_list = [chunk_shape]
179
+ dist.broadcast_object_list(obj_list, src=src_rank, group=mp_group)
180
+ chunk_shape = obj_list[0]
181
+ if chunk_shape is None:
182
+ # all or none ranks in the mp_group should reach here
183
+ print_rank_0(f"tp_shard tensor:[{name}] not exist, skip collecting")
184
+ return
185
+
186
+ buffer_tensor = torch.empty(
187
+ chunk_shape,
188
+ dtype=dtype,
189
+ device=torch.cuda.current_device(),
190
+ requires_grad=False,
191
+ )
192
+
193
+ chunk_tensors = [None] * tp_size
194
+
195
+ for i in range(tp_size):
196
+ cur_src_rank = _megatron_calc_global_rank(tp_rank=i, dp_rank=0, pp_rank=src_pp_rank, cp_rank=cp_rank)
197
+ sync_tensor = tensor if torch.distributed.get_rank() == cur_src_rank else buffer_tensor
198
+ dist.broadcast(sync_tensor, src=cur_src_rank, group=mp_group)
199
+
200
+ if torch.distributed.get_rank() == 0:
201
+ chunk_tensors[i] = _get_cpu_tensor(sync_tensor)
202
+
203
+ if torch.distributed.get_rank() == 0:
204
+ full_tensor = torch.concat(chunk_tensors, dim=concat_dim)
205
+ if mutate_func is not None:
206
+ full_tensor = mutate_func(full_tensor)
207
+ state_dict[name] = full_tensor
208
+
209
+ def _broadcast_tp_shard_tensor_gate_up(tensor, gate_name, up_name, src_pp_rank) -> torch.Tensor:
210
+ """broadcast tensor in tp shards across mp_group"""
211
+ nonlocal state_dict
212
+ nonlocal mp_group
213
+ # tp_rank = mpu.get_tensor_model_parallel_rank()
214
+ tp_size = mpu.get_tensor_model_parallel_world_size()
215
+ src_rank = _megatron_calc_global_rank(tp_rank=0, dp_rank=0, pp_rank=src_pp_rank, cp_rank=cp_rank)
216
+
217
+ chunk_shape = tensor.shape if torch.distributed.get_rank() == src_rank else None
218
+
219
+ obj_list = [chunk_shape]
220
+ dist.broadcast_object_list(obj_list, src=src_rank, group=mp_group)
221
+ chunk_shape = obj_list[0]
222
+ if chunk_shape is None:
223
+ # all or none ranks in the mp_group should reach here
224
+ print_rank_0(f"tp_shard tensor:[{gate_name, up_name}] not exist, skip collecting")
225
+ return
226
+
227
+ buffer_tensor = torch.empty(
228
+ chunk_shape,
229
+ dtype=dtype,
230
+ device=torch.cuda.current_device(),
231
+ requires_grad=False,
232
+ )
233
+
234
+ chunk_tensors = [None] * tp_size
235
+
236
+ for i in range(tp_size):
237
+ cur_src_rank = _megatron_calc_global_rank(tp_rank=i, dp_rank=0, pp_rank=src_pp_rank, cp_rank=cp_rank)
238
+ sync_tensor = tensor if torch.distributed.get_rank() == cur_src_rank else buffer_tensor
239
+ dist.broadcast(sync_tensor, src=cur_src_rank, group=mp_group)
240
+
241
+ if torch.distributed.get_rank() == 0:
242
+ chunk_tensors[i] = _get_cpu_tensor(sync_tensor)
243
+
244
+ if torch.distributed.get_rank() == 0:
245
+ full_tensor = torch.concat(chunk_tensors, dim=0)
246
+ intermediate_size_tp = config.intermediate_size // tp_size
247
+ gate_weight_list = []
248
+ up_weight_list = []
249
+ for i in range(tp_size):
250
+ gate_up_weight_tp = full_tensor[intermediate_size_tp * 2 * i : intermediate_size_tp * 2 * (i + 1)]
251
+ gate_weight_tp = gate_up_weight_tp[:intermediate_size_tp]
252
+ up_weight_tp = gate_up_weight_tp[intermediate_size_tp:]
253
+ gate_weight_list.append(gate_weight_tp)
254
+ up_weight_list.append(up_weight_tp)
255
+
256
+ state_dict[gate_name] = torch.cat(gate_weight_list, dim=0)
257
+ state_dict[up_name] = torch.cat(up_weight_list, dim=0)
258
+
259
+ def _broadcast_tp_shard_tensor_qkv(tensor, q_name, k_name, v_name, src_pp_rank):
260
+ """broadcast tensor in tp shards across mp_group"""
261
+ nonlocal state_dict
262
+ nonlocal mp_group
263
+ # tp_rank = mpu.get_tensor_model_parallel_rank()
264
+ tp_size = mpu.get_tensor_model_parallel_world_size()
265
+ src_rank = _megatron_calc_global_rank(tp_rank=0, dp_rank=0, pp_rank=src_pp_rank, cp_rank=cp_rank)
266
+
267
+ chunk_shape = tensor.shape if torch.distributed.get_rank() == src_rank else None
268
+
269
+ obj_list = [chunk_shape]
270
+ dist.broadcast_object_list(obj_list, src=src_rank, group=mp_group)
271
+ chunk_shape = obj_list[0]
272
+ if chunk_shape is None:
273
+ # all or none ranks in the mp_group should reach here
274
+ print_rank_0(f"tp_shard tensor:[{q_name}] not exist, skip collecting")
275
+ return
276
+
277
+ buffer_tensor = torch.empty(
278
+ chunk_shape,
279
+ dtype=dtype,
280
+ device=torch.cuda.current_device(),
281
+ requires_grad=False,
282
+ )
283
+
284
+ chunk_tensors = [None] * tp_size
285
+
286
+ for i in range(tp_size):
287
+ cur_src_rank = _megatron_calc_global_rank(tp_rank=i, dp_rank=0, pp_rank=src_pp_rank, cp_rank=cp_rank)
288
+ sync_tensor = tensor if torch.distributed.get_rank() == cur_src_rank else buffer_tensor
289
+ dist.broadcast(sync_tensor, src=cur_src_rank, group=mp_group)
290
+
291
+ if torch.distributed.get_rank() == 0:
292
+ chunk_tensors[i] = _get_cpu_tensor(sync_tensor)
293
+
294
+ if torch.distributed.get_rank() == 0:
295
+ full_tensor = torch.concat(chunk_tensors, dim=0)
296
+ q_weight_list = []
297
+ k_weight_list = []
298
+ v_weight_list = []
299
+ hidden_size_per_head = config.hidden_size // config.num_attention_heads
300
+
301
+ if config.num_key_value_heads >= tp_size:
302
+ q_size_tp = config.hidden_size // tp_size
303
+ kv_size_tp = hidden_size_per_head * config.num_key_value_heads // tp_size
304
+ total_size = q_size_tp + 2 * kv_size_tp
305
+ for i in range(tp_size):
306
+ num_query_groups_per_partition = wrapped_models[0].config.num_query_groups // tp_size
307
+ qkv_part = full_tensor[i * total_size : (i + 1) * total_size]
308
+ q_size_chunk = q_size_tp // num_query_groups_per_partition
309
+ kv_size_chunk = kv_size_tp // num_query_groups_per_partition
310
+ for qkv_part_chunk in qkv_part.chunk(num_query_groups_per_partition):
311
+ q_part = qkv_part_chunk[:q_size_chunk]
312
+ k_part = qkv_part_chunk[q_size_chunk : q_size_chunk + kv_size_chunk]
313
+ v_part = qkv_part_chunk[q_size_chunk + kv_size_chunk :]
314
+ q_weight_list.append(q_part)
315
+ k_weight_list.append(k_part)
316
+ v_weight_list.append(v_part)
317
+ else:
318
+ q_size_tp = config.hidden_size // tp_size
319
+ kv_size_tp = hidden_size_per_head
320
+ total_size = q_size_tp + 2 * kv_size_tp
321
+ for i in range(tp_size):
322
+ num_query_groups_per_partition = wrapped_models[0].config.num_query_groups // tp_size
323
+ qkv_part = full_tensor[i * total_size : (i + 1) * total_size]
324
+ q_size_chunk = q_size_tp // num_query_groups_per_partition
325
+ kv_size_chunk = kv_size_tp // num_query_groups_per_partition
326
+ for qkv_part_chunk in qkv_part.chunk(num_query_groups_per_partition):
327
+ q_part = qkv_part_chunk[:q_size_chunk]
328
+ k_part = qkv_part_chunk[q_size_chunk : q_size_chunk + kv_size_chunk]
329
+ v_part = qkv_part_chunk[q_size_chunk + kv_size_chunk :]
330
+ q_weight_list.append(q_part)
331
+ if i * config.num_key_value_heads % tp_size == 0:
332
+ k_weight_list.append(k_part)
333
+ v_weight_list.append(v_part)
334
+
335
+ state_dict[q_name] = torch.cat(q_weight_list, dim=0)
336
+ state_dict[k_name] = torch.cat(k_weight_list, dim=0)
337
+ state_dict[v_name] = torch.cat(v_weight_list, dim=0)
338
+
339
+ # empty cache before collecting weights
340
+ torch.cuda.empty_cache()
341
+ # Embeddings
342
+ # -------------------
343
+ if dp_rank == 0 and cp_rank == 0: # models are identical across cp ranks
344
+ # Embeddings
345
+ # -------------------
346
+ print_rank_0("collecting embeddings...")
347
+ gpt_model_module = _get_gpt_model(models[0])
348
+ _broadcast_tp_shard_tensor(
349
+ gpt_model_module.embedding.word_embeddings.weight if pp_rank == 0 else None,
350
+ "model.embed_tokens.weight",
351
+ src_pp_rank=0,
352
+ )
353
+
354
+ # Transformer layers
355
+ # -------------------
356
+ layer_map = _megatron_calc_layer_map(config)
357
+ for layer in range(config.num_hidden_layers):
358
+ print_rank_0(f"collecting layer #{layer}...")
359
+ layer_name = f"model.layers.{layer}"
360
+ src_pp_rank, src_virtual_pp_rank, src_layer_idx = layer_map[layer]
361
+
362
+ gpt_model_module = _get_gpt_model(models[src_virtual_pp_rank])
363
+ sync_layer = gpt_model_module.decoder.layers[src_layer_idx]
364
+
365
+ _broadcast_tensor(
366
+ sync_layer.self_attention.linear_qkv.layer_norm_weight,
367
+ f"{layer_name}.input_layernorm.weight",
368
+ src_pp_rank=src_pp_rank,
369
+ )
370
+
371
+ _broadcast_tp_shard_tensor_qkv(
372
+ sync_layer.self_attention.linear_qkv.weight,
373
+ f"{layer_name}.self_attn.q_proj.weight",
374
+ f"{layer_name}.self_attn.k_proj.weight",
375
+ f"{layer_name}.self_attn.v_proj.weight",
376
+ src_pp_rank=src_pp_rank,
377
+ )
378
+
379
+ if getattr(sync_layer.self_attention.linear_qkv, "bias", None) is not None and sync_layer.self_attention.linear_qkv.bias.numel() > 0:
380
+ _broadcast_tp_shard_tensor_qkv(
381
+ sync_layer.self_attention.linear_qkv.bias,
382
+ f"{layer_name}.self_attn.q_proj.bias",
383
+ f"{layer_name}.self_attn.k_proj.bias",
384
+ f"{layer_name}.self_attn.v_proj.bias",
385
+ src_pp_rank=src_pp_rank,
386
+ )
387
+
388
+ _broadcast_tp_shard_tensor(
389
+ sync_layer.self_attention.linear_proj.weight,
390
+ f"{layer_name}.self_attn.o_proj.weight",
391
+ concat_dim=1,
392
+ src_pp_rank=src_pp_rank,
393
+ )
394
+
395
+ _broadcast_tensor(
396
+ sync_layer.mlp.linear_fc1.layer_norm_weight,
397
+ f"{layer_name}.post_attention_layernorm.weight",
398
+ src_pp_rank=src_pp_rank,
399
+ )
400
+
401
+ _broadcast_tp_shard_tensor_gate_up(
402
+ sync_layer.mlp.linear_fc1.weight,
403
+ f"{layer_name}.mlp.gate_proj.weight",
404
+ f"{layer_name}.mlp.up_proj.weight",
405
+ src_pp_rank=src_pp_rank,
406
+ )
407
+
408
+ _broadcast_tp_shard_tensor(
409
+ sync_layer.mlp.linear_fc2.weight,
410
+ f"{layer_name}.mlp.down_proj.weight",
411
+ concat_dim=1,
412
+ src_pp_rank=src_pp_rank,
413
+ )
414
+
415
+ # Final Layernorm
416
+ # -------------------
417
+ print_rank_0("collecting final layernorm...")
418
+ gpt_model_module = _get_gpt_model(models[-1])
419
+ _broadcast_tensor(
420
+ getattr(gpt_model_module.decoder.final_layernorm, "weight", None),
421
+ "model.norm.weight",
422
+ src_pp_rank=pp_size - 1,
423
+ )
424
+
425
+ if tie_word_embeddings:
426
+ print_rank_0("tie word embedding skip load lm_head...")
427
+ else:
428
+ print_rank_0("collecting lm_head...")
429
+
430
+ if is_value_model:
431
+ lm_head_weight = None
432
+ if pp_rank == pp_size - 1:
433
+ lm_head_weight = getattr(gpt_model_module.output_layer, "weight", None)
434
+ _broadcast_tensor(lm_head_weight, "lm_head.weight", src_pp_rank=pp_size - 1)
435
+
436
+ else:
437
+ _broadcast_tp_shard_tensor(
438
+ getattr(gpt_model_module.output_layer, "weight", None) if pp_rank == pp_size - 1 else None,
439
+ "lm_head.weight",
440
+ src_pp_rank=pp_size - 1,
441
+ )
442
+
443
+ dist.barrier()
444
+ torch.cuda.empty_cache()
445
+ if torch.distributed.get_rank() == 0:
446
+ for k, v in state_dict.items():
447
+ if dtype != v.dtype:
448
+ state_dict[k] = v.to(dtype)
449
+
450
+ print_rank_0(f"merge megatron ckpt done, time elapsed {time.time() - start_time}s")
451
+ return state_dict
452
+
453
+
454
+ def merge_megatron_ckpt_gptmodel_qwen_moe(wrapped_models, config, dtype, is_value_model=False, tie_word_embeddings=False):
455
+ raise NotImplementedError("merge_megatron_ckpt_gptmodel_qwen_moe is not implemented")
456
+
457
+
458
+ def merge_megatron_ckpt_gptmodel_mixtral(wrapped_models, config, dtype, is_value_model=False, tie_word_embeddings=False):
459
+ raise NotImplementedError("merge_megatron_ckpt_gptmodel_mixtral is not implemented")
verl/models/mcore/util.py ADDED
@@ -0,0 +1,190 @@
1
+ # Copyright 2025 Bytedance Ltd. and/or its affiliates
2
+ # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import torch
17
+ from megatron.core import parallel_state as mpu
18
+ from megatron.core.packed_seq_params import PackedSeqParams
19
+
20
+
21
+ def preprocess_packed_seqs(input_ids: torch.Tensor, attention_mask: torch.Tensor, pre_process: bool = True) -> tuple[torch.Tensor, PackedSeqParams]:
22
+ """
23
+ Preprocess packed sequences
24
+ CP splits the sequence into CP*2 chunks, and each GPU keeps 2 of them (GPU0 gets the first and last chunks, GPU1 the second and second-to-last, and so on); this balances the causal-attention load across CP ranks.
25
+ See https://github.com/NVIDIA/TransformerEngine/issues/1368
26
+ """
27
+ batch_size = input_ids.shape[0]
28
+
29
+ seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
30
+ tp_size = mpu.get_tensor_model_parallel_world_size()
31
+ cp_size = mpu.get_context_parallel_world_size()
32
+ cp_rank = mpu.get_context_parallel_rank()
33
+ align_size = tp_size * cp_size * 2 if cp_size > 1 else tp_size
34
+
35
+ pad_size = (align_size - seqlens_in_batch % align_size) % align_size
36
+ seqlens_in_batch_padded = seqlens_in_batch + pad_size
37
+ cu_seqlens = torch.zeros(batch_size + 1, dtype=torch.int32, device=input_ids.device)
38
+ cu_seqlens[1:] = torch.cumsum(seqlens_in_batch, dim=0)
39
+ cu_seqlens_padded = torch.zeros(batch_size + 1, dtype=torch.int32, device=input_ids.device)
40
+ cu_seqlens_padded[1:] = torch.cumsum(seqlens_in_batch_padded, dim=0)
41
+ max_seqlen_in_batch = seqlens_in_batch_padded.max().item()
42
+
43
+ shape = list(input_ids.shape[1:])
44
+ shape[0] = seqlens_in_batch_padded.sum().item() // cp_size
45
+ if pre_process:
46
+ input_ids_rmpad = torch.zeros(shape, dtype=input_ids.dtype, device=input_ids.device)
47
+ for i in range(batch_size):
48
+ if cp_size <= 1:
49
+ seqlen = seqlens_in_batch[i]
50
+ input_ids_rmpad[cu_seqlens_padded[i] : cu_seqlens_padded[i] + seqlen] = input_ids[i, attention_mask[i]]
51
+ continue
52
+ seqlen = seqlens_in_batch_padded[i] // cp_size
53
+ half_seqlen = seqlen // 2
54
+ start_idx = cu_seqlens_padded[i] // cp_size
55
+ # split to 2 chunks
56
+ d = input_ids[i, attention_mask[i]]
57
+ input_ids_rmpad[start_idx : start_idx + half_seqlen] = d[half_seqlen * cp_rank : half_seqlen * (cp_rank + 1)]
58
+
59
+ remain_start = seqlens_in_batch_padded[i] - half_seqlen * (cp_rank + 1)
60
+ remain_end = seqlens_in_batch_padded[i] - half_seqlen * cp_rank
61
+ remain_end = min(remain_end, d.shape[0])
62
+ remain_len = remain_end - remain_start
63
+ if remain_len > 0:
64
+ input_ids_rmpad[start_idx + half_seqlen : start_idx + half_seqlen + remain_len] = d[remain_start:remain_end]
65
+
66
+ packed_seq_params = PackedSeqParams(
67
+ qkv_format="thd",
68
+ cu_seqlens_q=cu_seqlens_padded,
69
+ max_seqlen_q=max_seqlen_in_batch,
70
+ cu_seqlens_kv=cu_seqlens_padded,
71
+ max_seqlen_kv=max_seqlen_in_batch,
72
+ cu_seqlens_q_padded=cu_seqlens_padded,
73
+ cu_seqlens_kv_padded=cu_seqlens_padded,
74
+ )
75
+ if pre_process:
76
+ return input_ids_rmpad.unsqueeze(0), packed_seq_params
77
+ else:
78
+ return input_ids, packed_seq_params
79
+
80
+
81
+ def postprocess_packed_seqs(
82
+ output: torch.Tensor,
83
+ packed_seq_params: PackedSeqParams,
84
+ attention_mask: torch.Tensor,
85
+ batch_size: int,
86
+ seq_len: int,
87
+ post_process: bool = True,
88
+ ) -> torch.Tensor:
89
+ """
90
+ Postprocess packed sequences
91
+ """
92
+ if not post_process:
93
+ return output
94
+ shape = [batch_size, seq_len] + list(output.shape[2:]) # 1,packed, dim -> batch_size, seq_len, dim
95
+ output_new = torch.zeros(shape, dtype=output.dtype, device=output.device)
96
+
97
+ cp_size = mpu.get_context_parallel_world_size()
98
+ # all gather output across context parallel group
99
+ if cp_size > 1:
100
+ # output shape: [1, packed_len, hidden_dim]
101
+ # need to gather across cp group and concatenate in sequence dimension
102
+ output_list = [torch.empty_like(output) for _ in range(cp_size)]
103
+ torch.distributed.all_gather(output_list, output.detach(), group=mpu.get_context_parallel_group())
104
+ output_list[mpu.get_context_parallel_rank()] = output
105
+ else:
106
+ output_list = [output]
107
+ for i in range(batch_size):
108
+ if cp_size <= 1:
109
+ s = attention_mask[i].sum().item()
110
+ output_new[i, attention_mask[i]] = output[0][packed_seq_params.cu_seqlens_q_padded[i] : packed_seq_params.cu_seqlens_q_padded[i] + s]
111
+ continue
112
+ s_len_padded_chunk = (packed_seq_params.cu_seqlens_q_padded[i + 1] - packed_seq_params.cu_seqlens_q_padded[i]) // cp_size
113
+ half_seqlen = s_len_padded_chunk // 2
114
+ s_len = attention_mask[i].sum().item()
115
+ s_len_padded = s_len_padded_chunk * cp_size
116
+ tmp = torch.empty(s_len_padded, *output.shape[2:], device=output.device)
117
+ for j in range(cp_size):
118
+ o = output_list[j][0]
119
+ # split to 2 chunks
120
+ packed_start_idx = packed_seq_params.cu_seqlens_q_padded[i] // cp_size
121
+ o0, o1 = (
122
+ o[packed_start_idx : packed_start_idx + half_seqlen],
123
+ o[packed_start_idx + half_seqlen : packed_start_idx + s_len_padded_chunk],
124
+ )
125
+ tmp[j * half_seqlen : (j + 1) * half_seqlen] = o0
126
+ tmp[s_len_padded - (j + 1) * half_seqlen : s_len_padded - j * half_seqlen] = o1
127
+ output_new[i, attention_mask[i]] = tmp[:s_len]
128
+
129
+ return output_new
130
+
131
+
132
+ def remove_left_padding(
133
+ input_ids: torch.Tensor,
134
+ attention_mask: torch.Tensor,
135
+ position_ids: torch.Tensor,
136
+ sequence_parallel: bool = False,
137
+ pre_process: bool = True,
138
+ ):
139
+ """
140
+ Remove left padding from input_ids, attention_mask and position_ids
141
+ return new_input_ids, new_attention_mask, new_position_ids
142
+ """
143
+ assert attention_mask.ndim == 2
144
+ assert position_ids.ndim == 2
145
+ cp_size = mpu.get_context_parallel_world_size()
146
+ assert cp_size == 1, "Context parallel size without seq_pack is not supported"
147
+ batch_size = input_ids.shape[0]
148
+ shape = list(input_ids.shape) # batch_size, seq_len,...
149
+ seq_lens = attention_mask.sum(dim=1)
150
+ seq_len = seq_lens.max().item()
151
+ if sequence_parallel:
152
+ sp_world_size = mpu.get_tensor_model_parallel_world_size()
153
+ pad_size = (sp_world_size - seq_len % sp_world_size) % sp_world_size
154
+ seq_len = seq_len + pad_size
155
+ shape[1] = seq_len
156
+ if pre_process:
157
+ new_input_ids = torch.zeros(dtype=input_ids.dtype, device=input_ids.device, size=shape)
158
+ new_attention_mask = torch.zeros(dtype=attention_mask.dtype, device=attention_mask.device, size=(batch_size, seq_len))
159
+ new_position_ids = torch.zeros(dtype=position_ids.dtype, device=position_ids.device, size=(batch_size, seq_len))
160
+ for i in range(batch_size):
161
+ if pre_process:
162
+ new_input_ids[i, : seq_lens[i]] = input_ids[i, attention_mask[i]]
163
+ new_attention_mask[i, : seq_lens[i]] = attention_mask[i, attention_mask[i]]
164
+ new_position_ids[i, : seq_lens[i]] = position_ids[i, attention_mask[i]]
165
+ if pre_process:
166
+ return new_input_ids, new_attention_mask, new_position_ids
167
+ else:
168
+ return input_ids, new_attention_mask, new_position_ids
169
+
170
+
171
+ def recover_left_padding(
172
+ result,
173
+ attention_mask: torch.Tensor,
174
+ original_attention_mask: torch.Tensor,
175
+ origin_seqlen: int,
176
+ post_process: bool = True,
177
+ ):
178
+ """
179
+ Recover left padding from result
180
+ return result
181
+ """
182
+ if not post_process:
183
+ return result
184
+ shape = list(result.shape)
185
+ batch_size = shape[0]
186
+ shape[1] = origin_seqlen
187
+ new_result = torch.zeros(dtype=result.dtype, device=result.device, size=shape)
188
+ for i in range(batch_size):
189
+ new_result[i, original_attention_mask[i]] = result[i, attention_mask[i]]
190
+ return new_result
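Note: the context-parallel path in `preprocess_packed_seqs` above is easiest to see on a toy example. The sketch below is illustrative only (single process, no `torch.distributed` or Megatron setup) and shows the 2-chunks-per-rank assignment the function implements for load-balanced causal attention:

```python
import torch

# Each padded sequence is cut into cp_size * 2 chunks; CP rank r keeps chunk r and
# chunk (2 * cp_size - 1 - r), so early and late tokens are paired on each rank.
cp_size = 2
seq = torch.arange(16)                     # one padded sequence of length 16
chunks = list(seq.chunk(cp_size * 2))      # 4 chunks of length 4

for cp_rank in range(cp_size):
    local = torch.cat([chunks[cp_rank], chunks[2 * cp_size - 1 - cp_rank]])
    print(f"cp_rank {cp_rank} keeps tokens: {local.tolist()}")
# cp_rank 0 keeps tokens: [0, 1, 2, 3, 12, 13, 14, 15]
# cp_rank 1 keeps tokens: [4, 5, 6, 7, 8, 9, 10, 11]
```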
verl/models/mcore/weight_converter.py ADDED
@@ -0,0 +1,207 @@
1
+ # Copyright 2025 Bytedance Ltd. and/or its affiliates
2
+ # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
3
+ # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ # online conversion of mcore weights to plain huggingface weights, without any fusion
18
+ # including format conversion and name mapping
19
+ # not including resharding
20
+ import torch
21
+ from megatron.core.transformer import TransformerConfig
22
+ from transformers import PretrainedConfig
23
+
24
+
25
+ class McoreToHFWeightConverterBase:
26
+ def __init__(self, hf_config: PretrainedConfig, mcore_config: TransformerConfig):
27
+ self.hf_config = hf_config
28
+ self.mcore_config = mcore_config
29
+
30
+ def convert_param(self, name: str, params_one_group: list[torch.Tensor]) -> torch.Tensor:
31
+ raise NotImplementedError
32
+
33
+
34
+ class McoreToHFWeightConverterDense(McoreToHFWeightConverterBase):
35
+ def _convert_attention_param(self, name: str, params: list[torch.Tensor]) -> tuple[list[str], list[torch.Tensor]]:
36
+ # 'decoder.layers.0.self_attention.linear_proj.weight'
37
+ # 'decoder.layers.0.self_attention.linear_qkv.layer_norm_weight'
38
+ # 'decoder.layers.0.self_attention.linear_qkv.weight'
39
+ # 'decoder.layers.0.self_attention.linear_qkv.bias'
40
+ layer_number = name.split(".")[2]
41
+ convert_names = []
42
+ if "self_attention.linear_qkv.bias" in name or "self_attention.linear_qkv.weight" in name:
43
+ param_type = name.split(".")[-1]
44
+ assert param_type == "bias" or param_type == "weight"
45
+ convert_names.append(f"model.layers.{layer_number}.self_attn.q_proj.{param_type}")
46
+ convert_names.append(f"model.layers.{layer_number}.self_attn.k_proj.{param_type}")
47
+ convert_names.append(f"model.layers.{layer_number}.self_attn.v_proj.{param_type}")
48
+ assert len(params) == 3
49
+ elif "self_attention.linear_proj.weight" in name:
50
+ convert_names.append(f"model.layers.{layer_number}.self_attn.o_proj.weight")
51
+ assert len(params) == 1
52
+ elif "self_attention.linear_qkv.layer_norm_weight" in name:
53
+ convert_names.append(f"model.layers.{layer_number}.input_layernorm.weight")
54
+ assert len(params) == 1
55
+ elif "self_attention.q_layernorm.weight" in name:
56
+ convert_names.append(f"model.layers.{layer_number}.self_attn.q_norm.weight")
57
+ assert len(params) == 1
58
+ elif "self_attention.k_layernorm.weight" in name:
59
+ convert_names.append(f"model.layers.{layer_number}.self_attn.k_norm.weight")
60
+ assert len(params) == 1
61
+ else:
62
+ raise NotImplementedError(f"Unsupported parameter name: {name}")
63
+ return convert_names, params
64
+
65
+ def _convert_mlp_param(self, name: str, params: list[torch.Tensor]) -> tuple[list[str], list[torch.Tensor]]:
66
+ # 'decoder.layers.0.mlp.linear_fc1.layer_norm_weight'
67
+ # 'decoder.layers.0.mlp.linear_fc1.weight'
68
+ # 'decoder.layers.0.mlp.linear_fc2.weight'
69
+ layer_number = name.split(".")[2]
70
+ convert_names = []
71
+ if "mlp.linear_fc1.weight" in name:
72
+ # split gate_proj and up_proj
73
+ convert_names.append(f"model.layers.{layer_number}.mlp.gate_proj.weight")
74
+ convert_names.append(f"model.layers.{layer_number}.mlp.up_proj.weight")
75
+ assert len(params) == 2
76
+ elif "mlp.linear_fc1.layer_norm_weight" in name:
77
+ convert_names.append(f"model.layers.{layer_number}.post_attention_layernorm.weight")
78
+ assert len(params) == 1
79
+ elif "mlp.linear_fc2.weight" in name:
80
+ convert_names.append(f"model.layers.{layer_number}.mlp.down_proj.weight")
81
+ assert len(params) == 1
82
+ else:
83
+ raise NotImplementedError(f"Unsupported parameter name: {name}")
84
+ return convert_names, params
85
+
86
+ def convert_param(self, name: str, params_one_group: list[torch.Tensor]) -> tuple[list[str], list[torch.Tensor]]:
87
+ direct_name_mapping = {
88
+ "embedding.word_embeddings.weight": "model.embed_tokens.weight",
89
+ "decoder.final_layernorm.weight": "model.norm.weight",
90
+ "output_layer.weight": "lm_head.weight",
91
+ }
92
+ if name in direct_name_mapping:
93
+ return [direct_name_mapping[name]], [params_one_group[0]]
94
+
95
+ if "self_attention" in name:
96
+ return self._convert_attention_param(name, params_one_group)
97
+ elif "mlp" in name:
98
+ return self._convert_mlp_param(name, params_one_group)
99
+ else:
100
+ raise NotImplementedError(f"Unsupported parameter name: {name}")
101
+
102
+
103
+ class McoreToHFWeightConverterQwen2Moe(McoreToHFWeightConverterDense):
104
+ def _convert_mlp_param(self, name: str, params: list[torch.Tensor]) -> tuple[list[str], list[torch.Tensor]]:
105
+ # 'decoder.layers.0.pre_mlp_layernorm.weight',
106
+ # 'decoder.layers.0.mlp.router.weight',
107
+ # 'decoder.layers.0.mlp.shared_experts.gate_weight',
108
+ # 'decoder.layers.0.mlp.shared_experts.linear_fc1.weight',
109
+ # 'decoder.layers.0.mlp.shared_experts.linear_fc2.weight'
110
+ # moe1
111
+ # 'decoder.layers.0.mlp.experts.linear_fc1.weight0',
112
+ # 'decoder.layers.0.mlp.experts.linear_fc1.weight1',
113
+ # 'decoder.layers.0.mlp.experts.linear_fc1.weight2',
114
+ # 'decoder.layers.0.mlp.experts.linear_fc1.weight3',
115
+ # moe2
116
+ # 'decoder.layers.0.mlp.experts.linear_fc2.weight0',
117
+ # 'decoder.layers.0.mlp.experts.linear_fc2.weight1',
118
+ layer_number = name.split(".")[2]
119
+ convert_names = []
120
+ if "pre_mlp_layernorm" in name:
121
+ convert_names.append(f"model.layers.{layer_number}.post_attention_layernorm.weight")
122
+ assert len(params) == 1
123
+ elif "mlp.router.weight" in name:
124
+ convert_names.append(f"model.layers.{layer_number}.mlp.gate.weight")
125
+ assert len(params) == 1
126
+ elif "shared_experts.gate_weight" in name:
127
+ convert_names.append(f"model.layers.{layer_number}.mlp.shared_expert_gate.weight")
128
+ assert len(params) == 1
129
+ elif "shared_experts.linear_fc1.weight" in name: # split gate_proj and up_proj
130
+ convert_names.append(f"model.layers.{layer_number}.mlp.shared_expert.gate_proj.weight")
131
+ convert_names.append(f"model.layers.{layer_number}.mlp.shared_expert.up_proj.weight")
132
+ assert len(params) == 2
133
+ elif "shared_experts.linear_fc2.weight" in name:
134
+ convert_names.append(f"model.layers.{layer_number}.mlp.shared_expert.down_proj.weight")
135
+ assert len(params) == 1
136
+ elif "mlp.experts.linear_fc1" in name: # split gate_proj and up_proj
137
+ expert_id = name.split("weight")[-1]
138
+ convert_names.append(f"model.layers.{layer_number}.mlp.experts.{expert_id}.gate_proj.weight")
139
+ convert_names.append(f"model.layers.{layer_number}.mlp.experts.{expert_id}.up_proj.weight")
140
+ assert len(params) == 2
141
+ elif "mlp.experts.linear_fc2" in name:
142
+ expert_id = name.split("weight")[-1]
143
+ convert_names.append(f"model.layers.{layer_number}.mlp.experts.{expert_id}.down_proj.weight")
144
+ assert len(params) == 1
145
+ else:
146
+ raise NotImplementedError(f"Unsupported parameter name: {name}")
147
+ return convert_names, params
148
+
149
+
150
+ class McoreToHFWeightConverterMixtral(McoreToHFWeightConverterDense):
151
+ def _convert_mlp_param(self, name: str, params: list[torch.Tensor]) -> tuple[list[str], list[torch.Tensor]]:
152
+ # decoder.layers.0.mlp.router.weight
153
+ # decoder.layers.0.mlp.experts.linear_fc1.weight0 - weight7
154
+ # decoder.layers.0.mlp.experts.linear_fc2.weight0 - weight7
155
+
156
+ layer_number = name.split(".")[2]
157
+ convert_names = []
158
+ if "pre_mlp_layernorm" in name:
159
+ convert_names.append(f"model.layers.{layer_number}.post_attention_layernorm.weight")
160
+ elif "mlp.router.weight" in name:
161
+ convert_names.append(f"model.layers.{layer_number}.block_sparse_moe.gate.weight")
162
+ elif "mlp.experts.linear_fc1.weight" in name:
163
+ expert_id = name.split("weight")[-1]
164
+ convert_names.append(f"model.layers.{layer_number}.block_sparse_moe.experts.{expert_id}.w1.weight")
165
+ convert_names.append(f"model.layers.{layer_number}.block_sparse_moe.experts.{expert_id}.w3.weight")
166
+ elif "mlp.experts.linear_fc2.weight" in name:
167
+ expert_id = name.split("weight")[-1]
168
+ convert_names.append(f"model.layers.{layer_number}.block_sparse_moe.experts.{expert_id}.w2.weight")
169
+ else:
170
+ raise NotImplementedError(f"Unsupported parameter name: {name}")
171
+ return convert_names, params
172
+
173
+
174
+ class McoreToHFWeightConverterQwen3Moe(McoreToHFWeightConverterDense):
175
+ def _convert_mlp_param(self, name: str, params: list[torch.Tensor]) -> tuple[list[str], list[torch.Tensor]]:
176
+ # qwen3 moe has no shared expert
177
+
178
+ # 'decoder.layers.0.pre_mlp_layernorm.weight',
179
+ # 'decoder.layers.0.mlp.router.weight',
180
+ # moe1
181
+ # 'decoder.layers.0.mlp.experts.linear_fc1.weight0',
182
+ # 'decoder.layers.0.mlp.experts.linear_fc1.weight1',
183
+ # 'decoder.layers.0.mlp.experts.linear_fc1.weight2',
184
+ # 'decoder.layers.0.mlp.experts.linear_fc1.weight3',
185
+ # moe2
186
+ # 'decoder.layers.0.mlp.experts.linear_fc2.weight0',
187
+ # 'decoder.layers.0.mlp.experts.linear_fc2.weight1',
188
+ layer_number = name.split(".")[2]
189
+ convert_names = []
190
+ if "pre_mlp_layernorm" in name:
191
+ convert_names.append(f"model.layers.{layer_number}.post_attention_layernorm.weight")
192
+ assert len(params) == 1
193
+ elif "mlp.router.weight" in name:
194
+ convert_names.append(f"model.layers.{layer_number}.mlp.gate.weight")
195
+ assert len(params) == 1
196
+ elif "mlp.experts.linear_fc1" in name: # split gate_proj and up_proj
197
+ expert_id = name.split("weight")[-1]
198
+ convert_names.append(f"model.layers.{layer_number}.mlp.experts.{expert_id}.gate_proj.weight")
199
+ convert_names.append(f"model.layers.{layer_number}.mlp.experts.{expert_id}.up_proj.weight")
200
+ assert len(params) == 2
201
+ elif "mlp.experts.linear_fc2" in name:
202
+ expert_id = name.split("weight")[-1]
203
+ convert_names.append(f"model.layers.{layer_number}.mlp.experts.{expert_id}.down_proj.weight")
204
+ assert len(params) == 1
205
+ else:
206
+ raise NotImplementedError(f"Unsupported parameter name: {name}")
207
+ return convert_names, params
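Note: a minimal usage sketch of the dense converter defined above. It assumes the package is importable as `verl.models.mcore.weight_converter`; the configs and tensor shapes are placeholders, and the caller is expected to have already split the fused `linear_qkv` weight into separate q/k/v tensors:

```python
import torch
from transformers import PretrainedConfig

from verl.models.mcore.weight_converter import McoreToHFWeightConverterDense

hf_cfg = PretrainedConfig()   # placeholder HF config
mcore_cfg = None              # placeholder; a Megatron TransformerConfig in real use
converter = McoreToHFWeightConverterDense(hf_cfg, mcore_cfg)

# q/k/v already split out of the fused linear_qkv weight (toy shapes)
q, k, v = torch.zeros(8, 8), torch.zeros(2, 8), torch.zeros(2, 8)
names, params = converter.convert_param(
    "decoder.layers.0.self_attention.linear_qkv.weight", [q, k, v]
)
print(names)
# ['model.layers.0.self_attn.q_proj.weight',
#  'model.layers.0.self_attn.k_proj.weight',
#  'model.layers.0.self_attn.v_proj.weight']
```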
verl/models/qwen2/__init__.py ADDED
@@ -0,0 +1,13 @@
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
verl/models/qwen2/megatron/__init__.py ADDED
@@ -0,0 +1,34 @@
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from .modeling_qwen2_megatron import (
16
+ ParallelQwen2ForCausalLM,
17
+ # rmpad with megatron
18
+ ParallelQwen2ForCausalLMRmPad,
19
+ # rmpad with megatron and pipeline parallelism
20
+ ParallelQwen2ForCausalLMRmPadPP,
21
+ ParallelQwen2ForValueRmPad,
22
+ ParallelQwen2ForValueRmPadPP,
23
+ # original model with megatron
24
+ ParallelQwen2Model,
25
+ )
26
+
27
+ __all__ = [
28
+ "ParallelQwen2ForCausalLM",
29
+ "ParallelQwen2ForCausalLMRmPad",
30
+ "ParallelQwen2ForCausalLMRmPadPP",
31
+ "ParallelQwen2ForValueRmPad",
32
+ "ParallelQwen2ForValueRmPadPP",
33
+ "ParallelQwen2Model",
34
+ ]
verl/models/qwen2/megatron/checkpoint_utils/__init__.py ADDED
@@ -0,0 +1,13 @@
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
verl/models/qwen2/megatron/checkpoint_utils/qwen2_loader.py ADDED
@@ -0,0 +1,312 @@
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import time
16
+
17
+ import torch
18
+ import torch.distributed as dist
19
+
20
+
21
+ def _megatron_calc_layer_map(config):
22
+ """Calculate the mapping of global layer_idx to local layer_idx
23
+ Returns:
24
+ layer_map (Dict: int -> tuple(int, int, int)):
25
+ mapping from the global layer index to
26
+ a tuple of (pp_rank, virtual_pp_rank, layer_idx inside model)
27
+ """
28
+ from megatron.core import mpu
29
+
30
+ pp_size = mpu.get_pipeline_model_parallel_world_size()
31
+ virtual_pp_size = mpu.get_virtual_pipeline_model_parallel_world_size() or 1
32
+
33
+ layer_map = dict()
34
+ num_layers_per_model = config.num_hidden_layers // pp_size // virtual_pp_size
35
+ assert num_layers_per_model * pp_size * virtual_pp_size == config.num_hidden_layers
36
+
37
+ for pp_rank_idx in range(pp_size):
38
+ for virtual_pp_rank_idx in range(virtual_pp_size):
39
+ layer_offset = virtual_pp_rank_idx * (config.num_hidden_layers // virtual_pp_size) + pp_rank_idx * num_layers_per_model
40
+ for layer_idx in range(num_layers_per_model):
41
+ layer_map[layer_offset + layer_idx] = (
42
+ pp_rank_idx,
43
+ virtual_pp_rank_idx,
44
+ layer_idx,
45
+ )
46
+ return layer_map
47
+
48
+
49
+ def load_state_dict_to_megatron_qwen2(state_dict, wrapped_models, config, params_dtype, is_value_model=False, tie_word_embeddings=False):
50
+ """Load merged state_dict to sharded Megatron module in training."""
51
+ from megatron.core import DistributedDataParallel as LocalDDP
52
+ from megatron.core import mpu
53
+ from megatron.core.transformer.module import Float16Module
54
+ from torch.nn.parallel import DistributedDataParallel as torchDDP
55
+
56
+ from verl.utils.megatron_utils import print_rank_0, unwrap_model
57
+
58
+ start_time = time.time()
59
+
60
+ def _get_gpt_model(model):
61
+ return model
62
+
63
+ def fetch_params(module):
64
+ for param in module.parameters():
65
+ torch.distributed.broadcast(param.data, src=mpu.get_data_parallel_src_rank(), group=mpu.get_data_parallel_group())
66
+
67
+ dp_rank = mpu.get_data_parallel_rank()
68
+ pp_rank = mpu.get_pipeline_model_parallel_rank()
69
+ pp_size = mpu.get_pipeline_model_parallel_world_size()
70
+ virtual_pp_size = mpu.get_virtual_pipeline_model_parallel_world_size() or 1
71
+ mp_group = mpu.get_model_parallel_group()
72
+
73
+ if torch.distributed.get_rank() == 0:
74
+ assert mp_group.rank() == 0, f"mp_rank:[{mp_group.rank()}] != 0 on rank #0"
75
+ assert pp_rank == 0, f"pp_rank:[{pp_rank}] != 0 on rank #0"
76
+ assert dp_rank == 0, f"dp_rank:[{dp_rank}] != 0 on rank #0"
77
+
78
+ if not isinstance(wrapped_models, (list, tuple)):
79
+ wrapped_models = list(wrapped_models)
80
+
81
+ assert len(wrapped_models) == virtual_pp_size
82
+ num_layers_per_model = config.num_hidden_layers // pp_size // virtual_pp_size
83
+ assert num_layers_per_model * pp_size * virtual_pp_size == config.num_hidden_layers, f"num_layers_per_model: {num_layers_per_model} * pp_size: {pp_size} * virtual_pp_size: {virtual_pp_size} != config.num_hidden_layers: {config.num_hidden_layers}"
84
+
85
+ models = [None] * len(wrapped_models)
86
+
87
+ for i, wrapped_model in enumerate(wrapped_models):
88
+ models[i] = unwrap_model(wrapped_model, (torchDDP, LocalDDP, Float16Module))
89
+ gpt_model_module = _get_gpt_model(models[i])
90
+ assert len(gpt_model_module.model.layers) == num_layers_per_model
91
+
92
+ def _fetch_tensor(tensor, name) -> torch.Tensor:
93
+ """fetch tensor"""
94
+ nonlocal state_dict
95
+ if tensor is not None:
96
+ tensor = tensor.data.copy_(state_dict[name], non_blocking=True)
97
+
98
+ def _fetch_tp_shard_tensor_vocab(tensor, name, chunk_dim=0, mutate_func=None) -> torch.Tensor:
99
+ """fetch tensor in tp shards"""
100
+ nonlocal state_dict
101
+ tp_rank = mpu.get_tensor_model_parallel_rank()
102
+ tp_size = mpu.get_tensor_model_parallel_world_size()
103
+ if name in state_dict:
104
+ full_weight = state_dict[name]
105
+
106
+ if mutate_func is not None:
107
+ full_weight = mutate_func(full_weight)
108
+ tensor_chunk = torch.chunk(full_weight, tp_size, dim=chunk_dim)
109
+ if tensor is not None:
110
+ tensor = tensor.data.copy_(tensor_chunk[tp_rank], non_blocking=True)
111
+ else:
112
+ print(f"tp_shard tensor:[{name}] not in state_dict, skip loading")
113
+
114
+ def _fetch_tp_shard_tensor(tensor, name, chunk_dim=0, mutate_func=None) -> torch.Tensor:
115
+ """fetch tensor in tp shards"""
116
+ nonlocal state_dict
117
+ tp_rank = mpu.get_tensor_model_parallel_rank()
118
+ tp_size = mpu.get_tensor_model_parallel_world_size()
119
+ if name in state_dict:
120
+ full_weight = state_dict[name]
121
+
122
+ if mutate_func is not None:
123
+ full_weight = mutate_func(full_weight)
124
+ tensor_chunk = torch.chunk(full_weight, tp_size, dim=chunk_dim)
125
+ if tensor is not None:
126
+ tensor = tensor.data.copy_(tensor_chunk[tp_rank], non_blocking=True)
127
+ else:
128
+ print(f"tp_shard tensor:[{name}] not in state_dict, skip loading")
129
+
130
+ def _fetch_tp_shard_tensor_gate_up(tensor, gate_name, up_name) -> torch.Tensor:
131
+ """fetch gate_up tensor in tp shards"""
132
+ nonlocal state_dict
133
+ nonlocal mp_group
134
+ tp_rank = mpu.get_tensor_model_parallel_rank()
135
+ tp_size = mpu.get_tensor_model_parallel_world_size()
136
+ if gate_name in state_dict and up_name in state_dict:
137
+ gate_weight = state_dict[gate_name]
138
+ up_weight = state_dict[up_name]
139
+ new_gate_up_weight = torch.empty(config.intermediate_size * 2, config.hidden_size, dtype=params_dtype, device=torch.cuda.current_device())
140
+ for i in range(tp_size):
141
+ intermediate_size_tp = config.intermediate_size // tp_size
142
+ gate_weight_tp = gate_weight[i * intermediate_size_tp : (i + 1) * intermediate_size_tp]
143
+ up_weight_tp = up_weight[i * intermediate_size_tp : (i + 1) * intermediate_size_tp]
144
+ new_gate_up_weight[intermediate_size_tp * 2 * i : intermediate_size_tp * 2 * (i + 1)].copy_(torch.cat([gate_weight_tp, up_weight_tp], dim=0))
145
+
146
+ tensor_chunk = torch.chunk(new_gate_up_weight, tp_size, dim=0)
147
+ if tensor is not None:
148
+ tensor = tensor.data.copy_(tensor_chunk[tp_rank], non_blocking=True)
149
+ else:
150
+ print(f"tp_shard tensor:[{gate_name}, {up_name}] not in state_dict, skip loading")
151
+
152
+ def _fetch_tp_shard_tensor_qkv(tensor, q_name, k_name, v_name, bias=False) -> torch.Tensor:
153
+ """fetch tensor in tp shards across mp_group"""
154
+ nonlocal state_dict
155
+ nonlocal mp_group
156
+ tp_rank = mpu.get_tensor_model_parallel_rank()
157
+ tp_size = mpu.get_tensor_model_parallel_world_size()
158
+ assert q_name in state_dict and k_name in state_dict and v_name in state_dict
159
+ full_weight_q = state_dict[q_name]
160
+ full_weight_k = state_dict[k_name]
161
+ full_weight_v = state_dict[v_name]
162
+
163
+ hidden_size_per_head = config.hidden_size // config.num_attention_heads
164
+
165
+ if config.num_key_value_heads >= tp_size:
166
+ q_size_tp = config.hidden_size // tp_size
167
+ kv_size_tp = hidden_size_per_head * config.num_key_value_heads // tp_size
168
+ total_size = q_size_tp + 2 * kv_size_tp
169
+ if not bias:
170
+ new_weight_qkv = torch.empty(total_size * tp_size, config.hidden_size, dtype=params_dtype, device=torch.cuda.current_device())
171
+ else:
172
+ new_weight_qkv = torch.empty(total_size * tp_size, dtype=params_dtype, device=torch.cuda.current_device())
173
+ for i in range(tp_size):
174
+ q_part = full_weight_q[i * q_size_tp : (i + 1) * q_size_tp]
175
+ k_part = full_weight_k[i * kv_size_tp : (i + 1) * kv_size_tp]
176
+ v_part = full_weight_v[i * kv_size_tp : (i + 1) * kv_size_tp]
177
+ new_weight_qkv[i * total_size : (i + 1) * total_size].copy_(torch.cat([q_part, k_part, v_part], dim=0))
178
+
179
+ else:
180
+ q_size_tp = config.hidden_size // tp_size
181
+ kv_size_tp = hidden_size_per_head
182
+ total_size = q_size_tp + 2 * kv_size_tp
183
+ if not bias:
184
+ new_weight_qkv = torch.empty(total_size * tp_size, config.hidden_size, dtype=params_dtype, device=torch.cuda.current_device())
185
+ else:
186
+ new_weight_qkv = torch.empty(total_size * tp_size, dtype=params_dtype, device=torch.cuda.current_device())
187
+ for i in range(tp_size):
188
+ q_part = full_weight_q[i * q_size_tp : (i + 1) * q_size_tp]
189
+ start_idx = i * config.num_key_value_heads // tp_size * hidden_size_per_head
190
+ end_idx = (i * config.num_key_value_heads // tp_size + 1) * hidden_size_per_head
191
+ k_part = full_weight_k[start_idx:end_idx]
192
+ v_part = full_weight_v[start_idx:end_idx]
193
+ new_weight_qkv[i * total_size : (i + 1) * total_size].copy_(torch.cat([q_part, k_part, v_part], dim=0))
194
+
195
+ tensor_chunk = torch.chunk(new_weight_qkv, tp_size, dim=0)
196
+ if tensor is not None:
197
+ tensor = tensor.data.copy_(tensor_chunk[tp_rank], non_blocking=True)
198
+
199
+ # Embeddings
200
+ # -------------------
201
+ print_rank_0("loading embeddings...")
202
+ gpt_model_module = _get_gpt_model(models[0])
203
+ if pp_rank == 0:
204
+ embed_tokens_weight = gpt_model_module.model.embed_tokens.weight
205
+ _fetch_tp_shard_tensor_vocab(embed_tokens_weight, "model.embed_tokens.weight")
206
+
207
+ # Transformer layers
208
+ # -------------------
209
+ layer_map = _megatron_calc_layer_map(config)
210
+
211
+ pp_rank = mpu.get_pipeline_model_parallel_rank()
212
+ pp_size = mpu.get_pipeline_model_parallel_world_size()
213
+ num_layer_per_pp = config.num_hidden_layers // pp_size
214
+ vpp_size = mpu.get_virtual_pipeline_model_parallel_world_size()
215
+
216
+ layer_list = []
217
+ if vpp_size is not None:
218
+ for vpp_rank in range(vpp_size):
219
+ num_layer_vpp_chunk = num_layer_per_pp // vpp_size
220
+ num_layer_this_model = num_layer_vpp_chunk
221
+ offset = vpp_rank * (config.num_hidden_layers // mpu.get_virtual_pipeline_model_parallel_world_size()) + (mpu.get_pipeline_model_parallel_rank() * num_layer_vpp_chunk)
222
+ layer_list.extend(list(range(offset, offset + num_layer_this_model)))
223
+ else:
224
+ num_layer_this_model = num_layer_per_pp
225
+ offset = pp_rank * num_layer_per_pp
226
+ layer_list.extend(list(range(offset, offset + num_layer_this_model)))
227
+
228
+ for layer in layer_list:
229
+ print(f"{torch.distributed.get_rank()} loading layer #{layer}...")
230
+ layer_name = f"model.layers.{layer}"
231
+ dst_pp_rank, dst_virtual_pp_rank, dst_layer_idx = layer_map[layer]
232
+
233
+ print(f"{torch.distributed.get_rank()} offset: {offset}, num_layer_this_model: {num_layer_this_model}, layer_name: {layer_name}, layer_map[layer]: {layer_map[layer]}")
234
+
235
+ gpt_model_module = _get_gpt_model(models[dst_virtual_pp_rank])
236
+ sync_layer = gpt_model_module.model.layers[dst_layer_idx]
237
+
238
+ _fetch_tensor(
239
+ sync_layer.input_layernorm.weight if dst_pp_rank == pp_rank else None,
240
+ f"{layer_name}.input_layernorm.weight",
241
+ )
242
+
243
+ _fetch_tp_shard_tensor_qkv(
244
+ sync_layer.self_attn.qkv_proj.weight if dst_pp_rank == pp_rank else None,
245
+ f"{layer_name}.self_attn.q_proj.weight",
246
+ f"{layer_name}.self_attn.k_proj.weight",
247
+ f"{layer_name}.self_attn.v_proj.weight",
248
+ )
249
+
250
+ _fetch_tp_shard_tensor_qkv(
251
+ sync_layer.self_attn.qkv_proj.bias if dst_pp_rank == pp_rank else None,
252
+ f"{layer_name}.self_attn.q_proj.bias",
253
+ f"{layer_name}.self_attn.k_proj.bias",
254
+ f"{layer_name}.self_attn.v_proj.bias",
255
+ bias=True,
256
+ )
257
+
258
+ _fetch_tp_shard_tensor(
259
+ sync_layer.self_attn.o_proj.weight if dst_pp_rank == pp_rank else None,
260
+ f"{layer_name}.self_attn.o_proj.weight",
261
+ chunk_dim=1,
262
+ )
263
+
264
+ _fetch_tensor(
265
+ sync_layer.post_attention_layernorm.weight if dst_pp_rank == pp_rank else None,
266
+ f"{layer_name}.post_attention_layernorm.weight",
267
+ )
268
+
269
+ _fetch_tp_shard_tensor_gate_up(
270
+ sync_layer.mlp.gate_up_proj.weight if dst_pp_rank == pp_rank else None,
271
+ f"{layer_name}.mlp.gate_proj.weight",
272
+ f"{layer_name}.mlp.up_proj.weight",
273
+ )
274
+
275
+ _fetch_tp_shard_tensor(
276
+ sync_layer.mlp.down_proj.weight if dst_pp_rank == pp_rank else None,
277
+ f"{layer_name}.mlp.down_proj.weight",
278
+ chunk_dim=1,
279
+ )
280
+ # Final Layernorm
281
+ # -------------------
282
+ print_rank_0("loading final layernorm...")
283
+ gpt_model_module = _get_gpt_model(models[-1])
284
+ _fetch_tensor(
285
+ getattr(gpt_model_module.model.norm, "weight", None),
286
+ "model.norm.weight",
287
+ )
288
+
289
+ if tie_word_embeddings:
290
+ print_rank_0("tie_word_embeddings skip load lm_head")
291
+ else:
292
+ print_rank_0("loading lm_head...")
293
+ if pp_rank + 1 == pp_size:
294
+ lm_head_weight = gpt_model_module.lm_head.weight
295
+
296
+ if is_value_model:
297
+ if "lm_head.weight" in state_dict and state_dict["lm_head.weight"].shape[0] == 1:
298
+ _fetch_tensor(lm_head_weight, "lm_head.weight")
299
+ print_rank_0("load lm_head from value_head weight")
300
+ elif "reward_head.weight" in state_dict and state_dict["reward_head.weight"].shape[0] == 1:
301
+ _fetch_tensor(lm_head_weight, "reward_head.weight")
302
+ print_rank_0("load lm_head from value_head weight")
303
+ else:
304
+ _fetch_tensor(None, "lm_head.weight")
305
+ print_rank_0("fail to match lm_head in value_model")
306
+
307
+ else:
308
+ _fetch_tp_shard_tensor(lm_head_weight, "lm_head.weight")
309
+
310
+ dist.barrier()
311
+ torch.cuda.empty_cache()
312
+ print_rank_0(f"loading megatron ckpt done, time elapsed {time.time() - start_time}s")
verl/models/qwen2/megatron/checkpoint_utils/qwen2_loader_depracated.py ADDED
@@ -0,0 +1,442 @@
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import time
16
+
17
+ import torch
18
+ import torch.distributed as dist
19
+
20
+
21
+ def _megatron_calc_layer_map(config):
22
+ """Calculate the mapping of global layer_idx to local layer_idx
23
+ Returns:
24
+ layer_map (Dict: int -> tuple(int, int, int)):
25
+ mapping from the global layer index to
26
+ a tuple of (pp_rank, virtual_pp_rank, layer_idx inside model)
27
+ """
28
+ from megatron.core import mpu
29
+
30
+ pp_size = mpu.get_pipeline_model_parallel_world_size()
31
+ virtual_pp_size = mpu.get_virtual_pipeline_model_parallel_world_size() or 1
32
+
33
+ layer_map = dict()
34
+ num_layers_per_model = config.num_hidden_layers // pp_size // virtual_pp_size
35
+ assert num_layers_per_model * pp_size * virtual_pp_size == config.num_hidden_layers
36
+
37
+ for pp_rank_idx in range(pp_size):
38
+ for virtual_pp_rank_idx in range(virtual_pp_size):
39
+ layer_offset = virtual_pp_rank_idx * (config.num_hidden_layers // virtual_pp_size) + pp_rank_idx * num_layers_per_model
40
+ for layer_idx in range(num_layers_per_model):
41
+ layer_map[layer_offset + layer_idx] = (
42
+ pp_rank_idx,
43
+ virtual_pp_rank_idx,
44
+ layer_idx,
45
+ )
46
+ return layer_map
47
+
48
+
49
+ def load_state_dict_to_megatron_qwen2(state_dict, wrapped_models, config, params_dtype, is_value_model=False, tie_word_embeddings=False):
50
+ """Load merged state_dict to sharded Megatron module in training."""
51
+ from megatron.core import DistributedDataParallel as LocalDDP
52
+ from megatron.core import mpu
53
+ from megatron.core.transformer.module import Float16Module
54
+ from torch.nn.parallel import DistributedDataParallel as torchDDP
55
+
56
+ from verl.utils.megatron_utils import print_rank_0, unwrap_model
57
+
58
+ start_time = time.time()
59
+
60
+ def _get_gpt_model(model):
61
+ return model
62
+
63
+ def broadcast_params(module):
64
+ for param in module.parameters():
65
+ torch.distributed.broadcast(param.data, src=mpu.get_data_parallel_src_rank(), group=mpu.get_data_parallel_group())
66
+
67
+ dp_rank = mpu.get_data_parallel_rank()
68
+ pp_rank = mpu.get_pipeline_model_parallel_rank()
69
+ pp_size = mpu.get_pipeline_model_parallel_world_size()
70
+ virtual_pp_size = mpu.get_virtual_pipeline_model_parallel_world_size() or 1
71
+ mp_group = mpu.get_model_parallel_group()
72
+
73
+ if torch.distributed.get_rank() == 0:
74
+ assert mp_group.rank() == 0, f"mp_rank:[{mp_group.rank()}] != 0 on rank #0"
75
+ assert pp_rank == 0, f"pp_rank:[{pp_rank}] != 0 on rank #0"
76
+ assert dp_rank == 0, f"dp_rank:[{dp_rank}] != 0 on rank #0"
77
+
78
+ if not isinstance(wrapped_models, (list, tuple)):
79
+ wrapped_models = list(wrapped_models)
80
+
81
+ assert len(wrapped_models) == virtual_pp_size
82
+ num_layers_per_model = config.num_hidden_layers // pp_size // virtual_pp_size
83
+ assert num_layers_per_model * pp_size * virtual_pp_size == config.num_hidden_layers, f"num_layers_per_model: {num_layers_per_model} * pp_size: {pp_size} * virtual_pp_size: {virtual_pp_size} != config.num_hidden_layers: {config.num_hidden_layers}"
84
+
85
+ models = [None] * len(wrapped_models)
86
+
87
+ for i, wrapped_model in enumerate(wrapped_models):
88
+ models[i] = unwrap_model(wrapped_model, (torchDDP, LocalDDP, Float16Module))
89
+ gpt_model_module = _get_gpt_model(models[i])
90
+ assert len(gpt_model_module.model.layers) == num_layers_per_model
91
+
92
+ def _broadcast_tensor(tensor, name) -> torch.Tensor:
93
+ """broadcast tensor from rank0 across mp_group"""
94
+ nonlocal state_dict
95
+ nonlocal mp_group
96
+ if torch.distributed.get_rank() == 0:
97
+ if name in state_dict:
98
+ weight = state_dict[name]
99
+ tensor_shape = weight.shape
100
+ else:
101
+ tensor_shape = None
102
+ else:
103
+ weight = None
104
+ tensor_shape = None
105
+
106
+ obj_list = [tensor_shape]
107
+ dist.broadcast_object_list(obj_list, src=0, group=mp_group)
108
+ tensor_shape = obj_list[0]
109
+
110
+ if tensor_shape is None:
111
+ # all or none ranks in the mp_group should reach here
112
+ print_rank_0(f"tensor:[{name}] not in state_dict, skip load")
113
+ return
114
+
115
+ if tensor is None:
116
+ tensor = torch.empty(
117
+ tensor_shape,
118
+ dtype=params_dtype,
119
+ device=torch.cuda.current_device(),
120
+ requires_grad=False,
121
+ )
122
+ if torch.distributed.get_rank() == 0:
123
+ tensor.data.copy_(weight)
124
+ dist.broadcast(tensor, src=0, group=mp_group)
125
+
126
+ def _broadcast_tp_shard_tensor_vocab(tensor, name, chunk_dim=0, mutate_func=None) -> torch.Tensor:
127
+ """broadcast tensor in tp shards across mp_group"""
128
+ nonlocal state_dict
129
+ nonlocal mp_group
130
+ tp_rank = mpu.get_tensor_model_parallel_rank()
131
+ tp_size = mpu.get_tensor_model_parallel_world_size()
132
+
133
+ if torch.distributed.get_rank() == 0:
134
+ if name in state_dict:
135
+ full_weight = state_dict[name]
136
+
137
+ if mutate_func is not None:
138
+ full_weight = mutate_func(full_weight)
139
+ tensor_chunk = torch.chunk(full_weight, tp_size, dim=chunk_dim)
140
+ chunk_shape = tensor_chunk[0].shape
141
+ else:
142
+ chunk_shape = None
143
+ else:
144
+ chunk_shape = None
145
+
146
+ obj_list = [chunk_shape]
147
+ dist.broadcast_object_list(obj_list, src=0, group=mp_group)
148
+ chunk_shape = obj_list[0]
149
+ if chunk_shape is None:
150
+ # all or none ranks in the mp_group should reach here
151
+ print_rank_0(f"tp_shard tensor:[{name}] not in state_dict, skip loading")
152
+ return
153
+
154
+ if tensor is None:
155
+ sync_tensor = torch.empty(
156
+ chunk_shape,
157
+ dtype=params_dtype,
158
+ device=torch.cuda.current_device(),
159
+ requires_grad=False,
160
+ )
161
+ else:
162
+ assert tensor.shape == chunk_shape, f"rank #{torch.distributed.get_rank()} tensor {name} shape {tensor.shape} != {chunk_shape}"
163
+ sync_tensor = torch.empty_like(tensor, device=torch.cuda.current_device(), requires_grad=False)
164
+
165
+ for i in range(tp_size):
166
+ if torch.distributed.get_rank() == 0:
167
+ sync_tensor.data.copy_(tensor_chunk[i])
168
+ dist.broadcast(sync_tensor, src=0, group=mp_group)
169
+ if (i == tp_rank) and (tensor is not None):
170
+ tensor.data.copy_(sync_tensor)
171
+
172
+ def _broadcast_tp_shard_tensor(tensor, name, chunk_dim=0, mutate_func=None) -> torch.Tensor:
173
+ """broadcast tensor in tp shards across mp_group"""
174
+ nonlocal state_dict
175
+ nonlocal mp_group
176
+ tp_rank = mpu.get_tensor_model_parallel_rank()
177
+ tp_size = mpu.get_tensor_model_parallel_world_size()
178
+
179
+ if torch.distributed.get_rank() == 0:
180
+ if name in state_dict:
181
+ full_weight = state_dict[name]
182
+ if mutate_func is not None:
183
+ full_weight = mutate_func(full_weight)
184
+ tensor_chunk = torch.chunk(full_weight, tp_size, dim=chunk_dim)
185
+ chunk_shape = tensor_chunk[0].shape
186
+ else:
187
+ chunk_shape = None
188
+ else:
189
+ chunk_shape = None
190
+
191
+ obj_list = [chunk_shape]
192
+ dist.broadcast_object_list(obj_list, src=0, group=mp_group)
193
+ chunk_shape = obj_list[0]
194
+ if chunk_shape is None:
195
+ # all or none ranks in the mp_group should reach here
196
+ print_rank_0(f"tp_shard tensor:[{name}] not in state_dict, skip loading")
197
+ return
198
+
199
+ if tensor is None:
200
+ sync_tensor = torch.empty(
201
+ chunk_shape,
202
+ dtype=params_dtype,
203
+ device=torch.cuda.current_device(),
204
+ requires_grad=False,
205
+ )
206
+ else:
207
+ assert tensor.shape == chunk_shape, f"rank #{torch.distributed.get_rank()} tensor {name} shape {tensor.shape} != {chunk_shape}"
208
+ sync_tensor = torch.empty_like(tensor, device=torch.cuda.current_device(), requires_grad=False)
209
+
210
+ for i in range(tp_size):
211
+ if torch.distributed.get_rank() == 0:
212
+ sync_tensor.data.copy_(tensor_chunk[i])
213
+ dist.broadcast(sync_tensor, src=0, group=mp_group)
214
+ if (i == tp_rank) and (tensor is not None):
215
+ tensor.data.copy_(sync_tensor)
216
+
217
+ def _broadcast_tp_shard_tensor_gate_up(tensor, gate_name, up_name) -> torch.Tensor:
218
+ """broadcast tensor in tp shards across mp_group"""
219
+ nonlocal state_dict
220
+ nonlocal mp_group
221
+ tp_rank = mpu.get_tensor_model_parallel_rank()
222
+ tp_size = mpu.get_tensor_model_parallel_world_size()
223
+
224
+ if torch.distributed.get_rank() == 0:
225
+ gate_weight = state_dict[gate_name]
226
+ up_weight = state_dict[up_name]
227
+ new_gate_up_weight = torch.empty(config.intermediate_size * 2, config.hidden_size, dtype=params_dtype, device=torch.cuda.current_device())
228
+ for i in range(tp_size):
229
+ intermediate_size_tp = config.intermediate_size // tp_size
230
+ gate_weight_tp = gate_weight[i * intermediate_size_tp : (i + 1) * intermediate_size_tp]
231
+ up_weight_tp = up_weight[i * intermediate_size_tp : (i + 1) * intermediate_size_tp]
232
+ new_gate_up_weight[intermediate_size_tp * 2 * i : intermediate_size_tp * 2 * (i + 1)].copy_(torch.cat([gate_weight_tp, up_weight_tp], dim=0))
233
+
234
+ tensor_chunk = torch.chunk(new_gate_up_weight, tp_size, dim=0)
235
+ chunk_shape = tensor_chunk[0].shape
236
+ else:
237
+ chunk_shape = None
238
+
239
+ obj_list = [chunk_shape]
240
+ dist.broadcast_object_list(obj_list, src=0, group=mp_group)
241
+ chunk_shape = obj_list[0]
242
+ if chunk_shape is None:
243
+ # all or none ranks in the mp_group should reach here
244
+ print_rank_0(f"tp_shard tensor:[{gate_name, up_name}] not in state_dict, skip loading")
245
+ return
246
+
247
+ if tensor is None:
248
+ sync_tensor = torch.empty(
249
+ chunk_shape,
250
+ dtype=params_dtype,
251
+ device=torch.cuda.current_device(),
252
+ requires_grad=False,
253
+ )
254
+ else:
255
+ assert tensor.shape == chunk_shape, f"rank #{torch.distributed.get_rank() == 0:} tensor {gate_name, up_name} shape {tensor.shape} != {chunk_shape}"
256
+ sync_tensor = torch.empty_like(tensor, device=torch.cuda.current_device(), requires_grad=False)
257
+
258
+ for i in range(tp_size):
259
+ if torch.distributed.get_rank() == 0:
260
+ sync_tensor.data.copy_(tensor_chunk[i])
261
+ dist.broadcast(sync_tensor, src=0, group=mp_group)
262
+ if (i == tp_rank) and (tensor is not None):
263
+ tensor.data.copy_(sync_tensor)
264
+
265
+ def _broadcast_tp_shard_tensor_qkv(tensor, q_name, k_name, v_name, bias=False) -> torch.Tensor:
266
+ """broadcast tensor in tp shards across mp_group"""
267
+ nonlocal state_dict
268
+ nonlocal mp_group
269
+ tp_rank = mpu.get_tensor_model_parallel_rank()
270
+ tp_size = mpu.get_tensor_model_parallel_world_size()
271
+
272
+ if torch.distributed.get_rank() == 0:
273
+ assert q_name in state_dict and k_name in state_dict and v_name in state_dict
274
+ full_weight_q = state_dict[q_name]
275
+ full_weight_k = state_dict[k_name]
276
+ full_weight_v = state_dict[v_name]
277
+
278
+ hidden_size_per_head = config.hidden_size // config.num_attention_heads
279
+
280
+ if config.num_key_value_heads >= tp_size:
281
+ q_size_tp = config.hidden_size // tp_size
282
+ kv_size_tp = hidden_size_per_head * config.num_key_value_heads // tp_size
283
+ total_size = q_size_tp + 2 * kv_size_tp
284
+ if not bias:
285
+ new_weight_qkv = torch.empty(total_size * tp_size, config.hidden_size, dtype=params_dtype, device=torch.cuda.current_device())
286
+ else:
287
+ new_weight_qkv = torch.empty(total_size * tp_size, dtype=params_dtype, device=torch.cuda.current_device())
288
+ for i in range(tp_size):
289
+ q_part = full_weight_q[i * q_size_tp : (i + 1) * q_size_tp]
290
+ k_part = full_weight_k[i * kv_size_tp : (i + 1) * kv_size_tp]
291
+ v_part = full_weight_v[i * kv_size_tp : (i + 1) * kv_size_tp]
292
+ new_weight_qkv[i * total_size : (i + 1) * total_size].copy_(torch.cat([q_part, k_part, v_part], dim=0))
293
+
294
+ else:
295
+ q_size_tp = config.hidden_size // tp_size
296
+ kv_size_tp = hidden_size_per_head
297
+ total_size = q_size_tp + 2 * kv_size_tp
298
+ if not bias:
299
+ new_weight_qkv = torch.empty(total_size * tp_size, config.hidden_size, dtype=params_dtype, device=torch.cuda.current_device())
300
+ else:
301
+ new_weight_qkv = torch.empty(total_size * tp_size, dtype=params_dtype, device=torch.cuda.current_device())
302
+ for i in range(tp_size):
303
+ q_part = full_weight_q[i * q_size_tp : (i + 1) * q_size_tp]
304
+ start_idx = i * config.num_key_value_heads // tp_size * hidden_size_per_head
305
+ end_idx = (i * config.num_key_value_heads // tp_size + 1) * hidden_size_per_head
306
+ k_part = full_weight_k[start_idx:end_idx]
307
+ v_part = full_weight_v[start_idx:end_idx]
308
+ new_weight_qkv[i * total_size : (i + 1) * total_size].copy_(torch.cat([q_part, k_part, v_part], dim=0))
309
+
310
+ tensor_chunk = torch.chunk(new_weight_qkv, tp_size, dim=0)
311
+ chunk_shape = tensor_chunk[0].shape
312
+ else:
313
+ chunk_shape = None
314
+
315
+ obj_list = [chunk_shape]
316
+ dist.broadcast_object_list(obj_list, src=0, group=mp_group)
317
+ chunk_shape = obj_list[0]
318
+ if chunk_shape is None:
319
+ # all or none ranks in the mp_group should reach here
320
+ print_rank_0(f"tp_shard tensor:[{q_name, k_name, v_name}] not in state_dict, skip loading")
321
+ return
322
+
323
+ if tensor is None:
324
+ sync_tensor = torch.empty(
325
+ chunk_shape,
326
+ dtype=params_dtype,
327
+ device=torch.cuda.current_device(),
328
+ requires_grad=False,
329
+ )
330
+ else:
331
+ assert tensor.shape == chunk_shape, f"rank #{torch.distributed.get_rank()} tensor {q_name} shape {tensor.shape} != {chunk_shape}"
332
+ sync_tensor = torch.empty_like(tensor, device=torch.cuda.current_device(), requires_grad=False)
333
+
334
+ for i in range(tp_size):
335
+ if torch.distributed.get_rank() == 0:
336
+ sync_tensor.data.copy_(tensor_chunk[i])
337
+ dist.broadcast(sync_tensor, src=0, group=mp_group)
338
+ if (i == tp_rank) and (tensor is not None):
339
+ tensor.data.copy_(sync_tensor)
340
+
341
+ if dp_rank == 0:
342
+ # Embeddings
343
+ # -------------------
344
+ print_rank_0("loading embeddings...")
345
+ gpt_model_module = _get_gpt_model(models[0])
346
+ embed_tokens_weight = None
347
+ if pp_rank == 0:
348
+ embed_tokens_weight = gpt_model_module.model.embed_tokens.weight
349
+ _broadcast_tp_shard_tensor_vocab(embed_tokens_weight, "model.embed_tokens.weight")
350
+
351
+ # Transformer layers
352
+ # -------------------
353
+ layer_map = _megatron_calc_layer_map(config)
354
+
355
+ for layer in range(config.num_hidden_layers):
356
+ print_rank_0(f"loading layer #{layer}...")
357
+ layer_name = f"model.layers.{layer}"
358
+ dst_pp_rank, dst_virtual_pp_rank, dst_layer_idx = layer_map[layer]
359
+
360
+ gpt_model_module = _get_gpt_model(models[dst_virtual_pp_rank])
361
+ sync_layer = gpt_model_module.model.layers[dst_layer_idx]
362
+
363
+ _broadcast_tensor(
364
+ sync_layer.input_layernorm.weight if dst_pp_rank == pp_rank else None,
365
+ f"{layer_name}.input_layernorm.weight",
366
+ )
367
+
368
+ _broadcast_tp_shard_tensor_qkv(
369
+ sync_layer.self_attn.qkv_proj.weight if dst_pp_rank == pp_rank else None,
370
+ f"{layer_name}.self_attn.q_proj.weight",
371
+ f"{layer_name}.self_attn.k_proj.weight",
372
+ f"{layer_name}.self_attn.v_proj.weight",
373
+ )
374
+
375
+ _broadcast_tp_shard_tensor_qkv(
376
+ sync_layer.self_attn.qkv_proj.bias if dst_pp_rank == pp_rank else None,
377
+ f"{layer_name}.self_attn.q_proj.bias",
378
+ f"{layer_name}.self_attn.k_proj.bias",
379
+ f"{layer_name}.self_attn.v_proj.bias",
380
+ bias=True,
381
+ )
382
+
383
+ _broadcast_tp_shard_tensor(
384
+ sync_layer.self_attn.o_proj.weight if dst_pp_rank == pp_rank else None,
385
+ f"{layer_name}.self_attn.o_proj.weight",
386
+ chunk_dim=1,
387
+ )
388
+
389
+ _broadcast_tensor(
390
+ sync_layer.post_attention_layernorm.weight if dst_pp_rank == pp_rank else None,
391
+ f"{layer_name}.post_attention_layernorm.weight",
392
+ )
393
+
394
+ _broadcast_tp_shard_tensor_gate_up(
395
+ sync_layer.mlp.gate_up_proj.weight if dst_pp_rank == pp_rank else None,
396
+ f"{layer_name}.mlp.gate_proj.weight",
397
+ f"{layer_name}.mlp.up_proj.weight",
398
+ )
399
+
400
+ _broadcast_tp_shard_tensor(
401
+ sync_layer.mlp.down_proj.weight if dst_pp_rank == pp_rank else None,
402
+ f"{layer_name}.mlp.down_proj.weight",
403
+ chunk_dim=1,
404
+ )
405
+ # Final Layernorm
406
+ # -------------------
407
+ print_rank_0("loading final layernorm...")
408
+ gpt_model_module = _get_gpt_model(models[-1])
409
+ _broadcast_tensor(
410
+ getattr(gpt_model_module.model.norm, "weight", None),
411
+ "model.norm.weight",
412
+ )
413
+
414
+ if tie_word_embeddings:
415
+ print_rank_0("tie_word_embeddings skip load lm_head")
416
+ else:
417
+ print_rank_0("loading lm_head...")
418
+ lm_head_weight = None
419
+ if pp_rank + 1 == pp_size:
420
+ lm_head_weight = gpt_model_module.lm_head.weight
421
+
422
+ if is_value_model:
423
+ if "lm_head.weight" in state_dict and state_dict["lm_head.weight"].shape[0] == 1:
424
+ _broadcast_tensor(lm_head_weight, "lm_head.weight")
425
+ print_rank_0("load lm_head from value_head weight")
426
+ elif "reward_head.weight" in state_dict and state_dict["reward_head.weight"].shape[0] == 1:
427
+ _broadcast_tensor(lm_head_weight, "reward_head.weight")
428
+ print_rank_0("load lm_head from value_head weight")
429
+ else:
430
+ _broadcast_tensor(None, "lm_head.weight")
431
+ print_rank_0("fail to match lm_head in value_model")
432
+
433
+ else:
434
+ _broadcast_tp_shard_tensor(lm_head_weight, "lm_head.weight")
435
+
436
+ dist.barrier()
437
+ # Broadcast weights inside data parallel groups
438
+ for wrapped_model in wrapped_models:
439
+ broadcast_params(wrapped_model)
440
+
441
+ torch.cuda.empty_cache()
442
+ print_rank_0(f"loading megatron ckpt done, time elapsed {time.time() - start_time}s")
verl/models/qwen2/megatron/checkpoint_utils/qwen2_saver.py ADDED
@@ -0,0 +1,436 @@
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import time
16
+
17
+ import torch
18
+ import torch.distributed as dist
19
+ from megatron.core import mpu
20
+ from megatron.core.distributed import DistributedDataParallel as LocalDDP
21
+ from megatron.core.transformer.module import Float16Module
22
+ from torch.nn.parallel import DistributedDataParallel as torchDDP
23
+
24
+ from verl.utils.megatron_utils import print_rank_0, unwrap_model
25
+
26
+
27
+ def _megatron_calc_global_rank(tp_rank: int = 0, dp_rank: int = 0, pp_rank: int = 0):
28
+ """given TP,DP,PP rank to get the global rank."""
29
+
30
+ tp_size = mpu.get_tensor_model_parallel_world_size()
31
+ dp_size = mpu.get_data_parallel_world_size()
32
+ pp_size = mpu.get_pipeline_model_parallel_world_size()
33
+ assert tp_size * dp_size * pp_size == torch.distributed.get_world_size(), f"{tp_size} x {dp_size} x {pp_size} != {torch.distributed.get_world_size()}"
34
+ # We only support TP-DP-PP grouping, for correctness when resharding
35
+ return (pp_rank * dp_size + dp_rank) * tp_size + tp_rank
36
+
37
+
38
+ def _megatron_calc_layer_map(config):
39
+ """Calculate the mapping of global layer_idx to local layer_idx
40
+ Returns:
41
+ layer_map (Dict: int -> tuple(int, int, int)):
42
+ mapping from the global layer index to
43
+ a tuple of (pp_rank, virtual_pp_rank, layer_idx inside model)
44
+ """
45
+ from megatron.core import mpu
46
+
47
+ pp_size = mpu.get_pipeline_model_parallel_world_size()
48
+ virtual_pp_size = mpu.get_virtual_pipeline_model_parallel_world_size() or 1
49
+
50
+ layer_map = dict()
51
+ num_layers_per_model = config.num_hidden_layers // pp_size // virtual_pp_size
52
+ assert num_layers_per_model * pp_size * virtual_pp_size == config.num_hidden_layers
53
+
54
+ for pp_rank_idx in range(pp_size):
55
+ for virtual_pp_rank_idx in range(virtual_pp_size):
56
+ layer_offset = virtual_pp_rank_idx * (config.num_hidden_layers // virtual_pp_size) + pp_rank_idx * num_layers_per_model
57
+ for layer_idx in range(num_layers_per_model):
58
+ layer_map[layer_offset + layer_idx] = (
59
+ pp_rank_idx,
60
+ virtual_pp_rank_idx,
61
+ layer_idx,
62
+ )
63
+ return layer_map
64
+
65
+
66
+ def merge_megatron_ckpt_qwen2(wrapped_models, config, dtype, is_value_model=False, tie_word_embeddings=False):
67
+ """Merge sharded parameters of a Megatron module into a merged checkpoint.
68
+
69
+ Args:
70
+ wrapped_models (list of megatron.core.distributed.DistributedDataParallel):
71
+ The local DDP wrapped megatron modules.
72
+ config (PretrainedConfig):
73
+ HF config for model
74
+ dtype: model params type
75
+ is_value_model: whether the model is a value model
76
+ tie_word_embeddings: tie_word_embeddings
77
+ Returns:
78
+ state_dict (dict):
79
+ The merged state_dict in rank 0, and an empty dictionary in other ranks.
80
+ """
81
+ start_time = time.time()
82
+
83
+ def _get_gpt_model(model):
84
+ return model
85
+
86
+ dp_rank = mpu.get_data_parallel_rank()
87
+ pp_size = mpu.get_pipeline_model_parallel_world_size()
88
+ pp_rank = mpu.get_pipeline_model_parallel_rank()
89
+ virtual_pp_size = mpu.get_virtual_pipeline_model_parallel_world_size() or 1
90
+ mp_group = mpu.get_model_parallel_group()
91
+
92
+ if dist.get_rank() == 0:
93
+ assert mp_group.rank() == 0, f"mp_rank:[{mp_group.rank()}] != 0 on rank #0"
94
+ assert pp_rank == 0, f"pp_rank:[{pp_rank}] != 0 on rank #0"
95
+ assert dp_rank == 0, f"dp_rank:[{dp_rank}] != 0 on rank #0"
96
+
97
+ if not isinstance(wrapped_models, (list, tuple)):
98
+ wrapped_models = [wrapped_models]
99
+
100
+ assert len(wrapped_models) == virtual_pp_size
101
+ num_layers_per_model = config.num_hidden_layers // pp_size // virtual_pp_size
102
+ assert num_layers_per_model * pp_size * virtual_pp_size == config.num_hidden_layers
103
+
104
+ models = [None] * len(wrapped_models)
105
+
106
+ for i, wrapped_model in enumerate(wrapped_models):
107
+ models[i] = unwrap_model(wrapped_model, (torchDDP, LocalDDP, Float16Module))
108
+ assert len(models[i].model.layers) == num_layers_per_model, "len model layers {} not equal to num_layers_per_model {}".format(len(models[i].model.layers), num_layers_per_model)
109
+
110
+ state_dict = dict()
111
+
112
+ def _get_cpu_tensor(tensor: torch.Tensor):
113
+ if tensor is None:
114
+ return None
115
+ if tensor.device == torch.device("cpu"):
116
+ return tensor.detach().clone()
117
+ return tensor.detach().cpu()
118
+
119
+ def _broadcast_tensor(tensor, name, src_pp_rank) -> None:
120
+ """broadcast tensor across mp_group"""
121
+ nonlocal state_dict
122
+ nonlocal mp_group
123
+ src_rank = _megatron_calc_global_rank(tp_rank=0, dp_rank=0, pp_rank=src_pp_rank)
124
+
125
+ if torch.distributed.get_rank() == src_rank:
126
+ if tensor is None:
127
+ weight = None
128
+ tensor_shape = None
129
+ else:
130
+ weight = tensor
131
+ tensor_shape = weight.shape
132
+ else:
133
+ weight = None
134
+ tensor_shape = None
135
+
136
+ obj_list = [tensor_shape]
137
+ dist.broadcast_object_list(obj_list, src=src_rank, group=mp_group)
138
+ tensor_shape = obj_list[0]
139
+
140
+ if tensor_shape is None:
141
+ # all or none ranks in the mp_group should reach here
142
+ print_rank_0(f"tensor:[{name}] not exist, skip collect")
143
+ return
144
+
145
+ if weight is None:
146
+ weight = torch.empty(
147
+ tensor_shape,
148
+ dtype=dtype,
149
+ device=torch.cuda.current_device(),
150
+ requires_grad=False,
151
+ )
152
+
153
+ dist.broadcast(weight, src=src_rank, group=mp_group)
154
+
155
+ if torch.distributed.get_rank() == 0:
156
+ state_dict[name] = _get_cpu_tensor(weight)
157
+
158
+ def _broadcast_tp_shard_tensor(tensor, name, src_pp_rank, concat_dim=0, mutate_func=None) -> None:
159
+ """broadcast tensor in tp shards across mp_group"""
160
+ nonlocal state_dict
161
+ nonlocal mp_group
162
+ tp_size = mpu.get_tensor_model_parallel_world_size()
163
+ src_rank = _megatron_calc_global_rank(tp_rank=0, dp_rank=0, pp_rank=src_pp_rank)
164
+
165
+ chunk_shape = tensor.shape if torch.distributed.get_rank() == src_rank else None
166
+
167
+ obj_list = [chunk_shape]
168
+ dist.broadcast_object_list(obj_list, src=src_rank, group=mp_group)
169
+ chunk_shape = obj_list[0]
170
+ if chunk_shape is None:
171
+ # all or none ranks in the mp_group should reach here
172
+ print_rank_0(f"tp_shard tensor:[{name}] not exist, skip collecting")
173
+ return
174
+
175
+ buffer_tensor = torch.empty(
176
+ chunk_shape,
177
+ dtype=dtype,
178
+ device=torch.cuda.current_device(),
179
+ requires_grad=False,
180
+ )
181
+
182
+ chunk_tensors = [None] * tp_size
183
+
184
+ for i in range(tp_size):
185
+ cur_src_rank = _megatron_calc_global_rank(tp_rank=i, dp_rank=0, pp_rank=src_pp_rank)
186
+ sync_tensor = tensor if torch.distributed.get_rank() == cur_src_rank else buffer_tensor
187
+ dist.broadcast(sync_tensor, src=cur_src_rank, group=mp_group)
188
+
189
+ if torch.distributed.get_rank() == 0:
190
+ chunk_tensors[i] = _get_cpu_tensor(sync_tensor)
191
+
192
+ if torch.distributed.get_rank() == 0:
193
+ full_tensor = torch.concat(chunk_tensors, dim=concat_dim)
194
+ if mutate_func is not None:
195
+ full_tensor = mutate_func(full_tensor)
196
+ state_dict[name] = full_tensor
197
+
198
+ def _broadcast_tp_shard_tensor_gate_up(tensor, gate_name, up_name, src_pp_rank) -> None:
199
+ """broadcast tensor in tp shards across mp_group"""
200
+ nonlocal state_dict
201
+ nonlocal mp_group
202
+ tp_size = mpu.get_tensor_model_parallel_world_size()
203
+ src_rank = _megatron_calc_global_rank(tp_rank=0, dp_rank=0, pp_rank=src_pp_rank)
204
+
205
+ chunk_shape = tensor.shape if torch.distributed.get_rank() == src_rank else None
206
+
207
+ obj_list = [chunk_shape]
208
+ dist.broadcast_object_list(obj_list, src=src_rank, group=mp_group)
209
+ chunk_shape = obj_list[0]
210
+ if chunk_shape is None:
211
+ # all or none ranks in the mp_group should reach here
212
+ print_rank_0(f"tp_shard tensor:[{gate_name, up_name}] not exist, skip collecting")
213
+ return
214
+
215
+ buffer_tensor = torch.empty(
216
+ chunk_shape,
217
+ dtype=dtype,
218
+ device=torch.cuda.current_device(),
219
+ requires_grad=False,
220
+ )
221
+
222
+ chunk_tensors = [None] * tp_size
223
+
224
+ for i in range(tp_size):
225
+ cur_src_rank = _megatron_calc_global_rank(tp_rank=i, dp_rank=0, pp_rank=src_pp_rank)
226
+ sync_tensor = tensor if torch.distributed.get_rank() == cur_src_rank else buffer_tensor
227
+ dist.broadcast(sync_tensor, src=cur_src_rank, group=mp_group)
228
+
229
+ if torch.distributed.get_rank() == 0:
230
+ chunk_tensors[i] = _get_cpu_tensor(sync_tensor)
231
+
232
+ if torch.distributed.get_rank() == 0:
233
+ full_tensor = torch.concat(chunk_tensors, dim=0)
234
+ intermediate_size_tp = config.intermediate_size // tp_size
235
+ gate_weight_list = []
236
+ up_weight_list = []
237
+ for i in range(tp_size):
238
+ gate_up_weight_tp = full_tensor[intermediate_size_tp * 2 * i : intermediate_size_tp * 2 * (i + 1)]
239
+ gate_weight_tp = gate_up_weight_tp[:intermediate_size_tp]
240
+ up_weight_tp = gate_up_weight_tp[intermediate_size_tp:]
241
+ gate_weight_list.append(gate_weight_tp)
242
+ up_weight_list.append(up_weight_tp)
243
+
244
+ state_dict[gate_name] = torch.cat(gate_weight_list, dim=0)
245
+ state_dict[up_name] = torch.cat(up_weight_list, dim=0)
246
+
247
+ def _broadcast_tp_shard_tensor_qkv(tensor, q_name, k_name, v_name, src_pp_rank):
248
+ """broadcast tensor in tp shards across mp_group"""
249
+ nonlocal state_dict
250
+ nonlocal mp_group
251
+ tp_size = mpu.get_tensor_model_parallel_world_size()
252
+ src_rank = _megatron_calc_global_rank(tp_rank=0, dp_rank=0, pp_rank=src_pp_rank)
253
+
254
+ chunk_shape = tensor.shape if torch.distributed.get_rank() == src_rank else None
255
+
256
+ obj_list = [chunk_shape]
257
+ dist.broadcast_object_list(obj_list, src=src_rank, group=mp_group)
258
+ chunk_shape = obj_list[0]
259
+ if chunk_shape is None:
260
+ # all or none ranks in the mp_group should reach here
261
+ print_rank_0(f"tp_shard tensor:[{q_name}] not exist, skip collecting")
262
+ return
263
+
264
+ buffer_tensor = torch.empty(
265
+ chunk_shape,
266
+ dtype=dtype,
267
+ device=torch.cuda.current_device(),
268
+ requires_grad=False,
269
+ )
270
+
271
+ chunk_tensors = [None] * tp_size
272
+
273
+ for i in range(tp_size):
274
+ cur_src_rank = _megatron_calc_global_rank(tp_rank=i, dp_rank=0, pp_rank=src_pp_rank)
275
+ sync_tensor = tensor if torch.distributed.get_rank() == cur_src_rank else buffer_tensor
276
+ dist.broadcast(sync_tensor, src=cur_src_rank, group=mp_group)
277
+
278
+ if torch.distributed.get_rank() == 0:
279
+ chunk_tensors[i] = _get_cpu_tensor(sync_tensor)
280
+
281
+ if torch.distributed.get_rank() == 0:
282
+ full_tensor = torch.concat(chunk_tensors, dim=0)
283
+ q_weight_list = []
284
+ k_weight_list = []
285
+ v_weight_list = []
286
+ hidden_size_per_head = config.hidden_size // config.num_attention_heads
287
+
288
+ if config.num_key_value_heads >= tp_size:
289
+ q_size_tp = config.hidden_size // tp_size
290
+ kv_size_tp = hidden_size_per_head * config.num_key_value_heads // tp_size
291
+ total_size = q_size_tp + 2 * kv_size_tp
292
+ for i in range(tp_size):
293
+ qkv_part = full_tensor[i * total_size : (i + 1) * total_size]
294
+ q_part = qkv_part[:q_size_tp]
295
+ k_part = qkv_part[q_size_tp : q_size_tp + kv_size_tp]
296
+ v_part = qkv_part[q_size_tp + kv_size_tp : total_size]
297
+ q_weight_list.append(q_part)
298
+ k_weight_list.append(k_part)
299
+ v_weight_list.append(v_part)
300
+ else:
301
+ q_size_tp = config.hidden_size // tp_size
302
+ kv_size_tp = hidden_size_per_head
303
+ total_size = q_size_tp + 2 * kv_size_tp
304
+ for i in range(tp_size):
305
+ qkv_part = full_tensor[i * total_size : (i + 1) * total_size]
306
+ q_part = qkv_part[:q_size_tp]
307
+ k_part = qkv_part[q_size_tp : q_size_tp + kv_size_tp]
308
+ v_part = qkv_part[q_size_tp + kv_size_tp : total_size]
309
+ q_weight_list.append(q_part)
310
+ if i * config.num_key_value_heads % tp_size == 0:
311
+ k_weight_list.append(k_part)
312
+ v_weight_list.append(v_part)
313
+
314
+ state_dict[q_name] = torch.cat(q_weight_list, dim=0)
315
+ state_dict[k_name] = torch.cat(k_weight_list, dim=0)
316
+ state_dict[v_name] = torch.cat(v_weight_list, dim=0)
317
+
318
+ # empty cache before collecting weights
319
+ torch.cuda.empty_cache()
320
+ # Embeddings
321
+ # -------------------
322
+ if dp_rank == 0:
323
+ # Embeddings
324
+ # -------------------
325
+ print_rank_0("collecting embeddings...")
326
+ gpt_model_module = _get_gpt_model(models[0])
327
+ _broadcast_tp_shard_tensor(
328
+ gpt_model_module.model.embed_tokens.weight if pp_rank == 0 else None,
329
+ "model.embed_tokens.weight",
330
+ src_pp_rank=0,
331
+ )
332
+
333
+ # Transformer layers
334
+ # -------------------
335
+ layer_map = _megatron_calc_layer_map(config)
336
+ for layer in range(config.num_hidden_layers):
337
+ print_rank_0(f"collecting layer #{layer}...")
338
+ layer_name = f"model.layers.{layer}"
339
+ src_pp_rank, src_virtual_pp_rank, src_layer_idx = layer_map[layer]
340
+
341
+ gpt_model_module = _get_gpt_model(models[src_virtual_pp_rank])
342
+ sync_layer = gpt_model_module.model.layers[src_layer_idx]
343
+
344
+ _broadcast_tensor(
345
+ sync_layer.input_layernorm.weight,
346
+ f"{layer_name}.input_layernorm.weight",
347
+ src_pp_rank=src_pp_rank,
348
+ )
349
+
350
+ _broadcast_tp_shard_tensor_qkv(
351
+ sync_layer.self_attn.qkv_proj.weight,
352
+ f"{layer_name}.self_attn.q_proj.weight",
353
+ f"{layer_name}.self_attn.k_proj.weight",
354
+ f"{layer_name}.self_attn.v_proj.weight",
355
+ src_pp_rank=src_pp_rank,
356
+ )
357
+
358
+ _broadcast_tp_shard_tensor_qkv(
359
+ sync_layer.self_attn.qkv_proj.bias,
360
+ f"{layer_name}.self_attn.q_proj.bias",
361
+ f"{layer_name}.self_attn.k_proj.bias",
362
+ f"{layer_name}.self_attn.v_proj.bias",
363
+ src_pp_rank=src_pp_rank,
364
+ )
365
+
366
+ _broadcast_tp_shard_tensor(
367
+ sync_layer.self_attn.o_proj.weight,
368
+ f"{layer_name}.self_attn.o_proj.weight",
369
+ concat_dim=1,
370
+ src_pp_rank=src_pp_rank,
371
+ )
372
+
373
+ _broadcast_tensor(
374
+ sync_layer.post_attention_layernorm.weight,
375
+ f"{layer_name}.post_attention_layernorm.weight",
376
+ src_pp_rank=src_pp_rank,
377
+ )
378
+
379
+ _broadcast_tp_shard_tensor_gate_up(
380
+ sync_layer.mlp.gate_up_proj.weight,
381
+ f"{layer_name}.mlp.gate_proj.weight",
382
+ f"{layer_name}.mlp.up_proj.weight",
383
+ src_pp_rank=src_pp_rank,
384
+ )
385
+
386
+ _broadcast_tp_shard_tensor(
387
+ sync_layer.mlp.down_proj.weight,
388
+ f"{layer_name}.mlp.down_proj.weight",
389
+ concat_dim=1,
390
+ src_pp_rank=src_pp_rank,
391
+ )
392
+
393
+ # Final Layernorm
394
+ # -------------------
395
+ print_rank_0("collecting final layernorm...")
396
+ gpt_model_module = _get_gpt_model(models[-1])
397
+ _broadcast_tensor(
398
+ getattr(gpt_model_module.model.norm, "weight", None),
399
+ "model.norm.weight",
400
+ src_pp_rank=pp_size - 1,
401
+ )
402
+
403
+ if tie_word_embeddings:
404
+ print_rank_0("tie word embedding skip load lm_head...")
405
+ else:
406
+ print_rank_0("collecting lm_head...")
407
+
408
+ if is_value_model:
409
+ _broadcast_tensor(
410
+ gpt_model_module.lm_head.weight if pp_rank == pp_size - 1 else None,
411
+ "lm_head.weight",
412
+ src_pp_rank=pp_size - 1,
413
+ )
414
+ _broadcast_tensor(
415
+ gpt_model_module.reward_head.weight if pp_rank == pp_size - 1 and getattr(gpt_model_module, "reward_head", None) is not None else None,
416
+ "reward_head.weight",
417
+ src_pp_rank=pp_size - 1,
418
+ )
419
+
420
+ else:
421
+ _broadcast_tp_shard_tensor(
422
+ getattr(gpt_model_module.lm_head, "weight", None) if pp_rank == pp_size - 1 else None,
423
+ "lm_head.weight",
424
+ src_pp_rank=pp_size - 1,
425
+ )
426
+
427
+ dist.barrier()
428
+
429
+ torch.cuda.empty_cache()
430
+ if torch.distributed.get_rank() == 0:
431
+ for k, v in state_dict.items():
432
+ if dtype != v.dtype:
433
+ state_dict[k] = v.to(dtype)
434
+
435
+ print_rank_0(f"merge megatron ckpt done, time elapsed {time.time() - start_time}s")
436
+ return state_dict
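For readers unfamiliar with Megatron's rank layout, the bookkeeping behind `_megatron_calc_global_rank` and `_megatron_calc_layer_map` above reduces to simple arithmetic. A standalone worked example (pure Python; the parallel sizes are made up for illustration):

```python
# Worked example of the rank and layer-map arithmetic used by the loader/saver above.
# Sizes are illustrative only; no Megatron is required to follow the math.
tp_size, dp_size, pp_size, virtual_pp_size = 2, 2, 2, 1
num_hidden_layers = 8

def global_rank(tp_rank, dp_rank, pp_rank):
    # Same formula as _megatron_calc_global_rank: TP varies fastest, then DP, then PP.
    return (pp_rank * dp_size + dp_rank) * tp_size + tp_rank

assert global_rank(tp_rank=1, dp_rank=0, pp_rank=1) == 5  # (1*2 + 0)*2 + 1

# Global layer index -> (pp_rank, virtual_pp_rank, local layer index),
# following the same loop structure as _megatron_calc_layer_map.
num_layers_per_model = num_hidden_layers // pp_size // virtual_pp_size
layer_map = {}
for pp_rank in range(pp_size):
    for vpp_rank in range(virtual_pp_size):
        offset = vpp_rank * (num_hidden_layers // virtual_pp_size) + pp_rank * num_layers_per_model
        for local_idx in range(num_layers_per_model):
            layer_map[offset + local_idx] = (pp_rank, vpp_rank, local_idx)

assert layer_map[5] == (1, 0, 1)  # global layer 5 lives on pp_rank 1 as its local layer 1
```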
verl/models/qwen2/megatron/layers/__init__.py ADDED
@@ -0,0 +1,20 @@
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from .parallel_attention import ParallelQwen2Attention
16
+ from .parallel_decoder import ParallelQwen2DecoderLayer, ParallelQwen2DecoderLayerRmPad
17
+ from .parallel_mlp import ParallelQwen2MLP
18
+ from .parallel_rmsnorm import ParallelQwen2RMSNorm
19
+
20
+ __all__ = ["ParallelQwen2Attention", "ParallelQwen2DecoderLayer", "ParallelQwen2DecoderLayerRmPad", "ParallelQwen2MLP", "ParallelQwen2RMSNorm"]