Yuchan committed
Commit cc78280 · verified · 1 Parent(s): 6075db7

Update Model_torch.py

Files changed (1)
  1. Model_torch.py +169 -189
Model_torch.py CHANGED
@@ -1,15 +1,23 @@
- import torch
  import torch.nn as nn
  import torch.nn.functional as F
- from torch.utils.data import Dataset, DataLoader
- import numpy as np
  import sentencepiece as spm
  import requests
- import os

  TOKENIZER_PATH = "ko_unigram.model"
- DATA_PATH = "corpus.txt"  # text file with 36M sentences
- max_len = 128
  # ===============================
  # 1️⃣ File download
  # ===============================
@@ -19,215 +27,187 @@ def download_file(url, save_path):
      with open(save_path, "wb") as f:
          for chunk in r.iter_content(8192*2):
              f.write(chunk)
-     print(f"✅ {save_path} saved")

  if not os.path.exists(TOKENIZER_PATH):
      download_file(
          "https://huggingface.co/Yuchan5386/inlam-100m/resolve/main/ko_unigram.model?download=true",
-         TOKENIZER_PATH
      )
  if not os.path.exists(DATA_PATH):
      download_file(
          "https://huggingface.co/datasets/Yuchan5386/1/resolve/main/shuffled_corpus.txt?download=true",
-         DATA_PATH
      )
  # ===============================
- # SentencePiece
  # ===============================
- sp = spm.SentencePieceProcessor("ko_unigram.model")
-
  pad_id = sp.piece_to_id("<pad>") if sp.piece_to_id("<pad>") != -1 else 0
- start_id = sp.piece_to_id("<start>")
- end_id = sp.piece_to_id("<end>")
  vocab_size = sp.get_piece_size()
- max_len = 512
- batch_size = 32
- device = 'cuda' if torch.cuda.is_available() else 'cpu'

- def text_to_ids(text):
-     return sp.encode(text, out_type=int)

- def ids_to_text(ids):
-     return sp.decode(ids)

  # ===============================
- # Dataset
  # ===============================
- class TextDataset(Dataset):
-     def __init__(self, file_path, num_lines=None):
-         self.lines = []
-         with open(file_path, "r", encoding="utf-8") as f:
-             for i, line in enumerate(f):
-                 if num_lines is not None and i >= num_lines:
-                     break
-                 line = line.strip()
-                 if line:
-                     self.lines.append(line)
-
-     def __len__(self):
-         return len(self.lines)
-
-     def __getitem__(self, idx):
-         text = self.lines[idx]
-         ids = text_to_ids(text)[:max_len-1]
-         full_input = ids + [end_id]
-         pad_len = max_len - len(full_input)
-         full_input += [pad_id]*pad_len
-         target = full_input[1:] + [pad_id]
-         return torch.tensor(full_input, dtype=torch.long), torch.tensor(target, dtype=torch.long)
-
- dataset = TextDataset("corpus.txt", num_lines=100000)
- dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

  # ===============================
- # Model definition
  # ===============================
- class SwiGLU(nn.Module):
-     def __init__(self, d_model):
          super().__init__()
-         self.W = nn.Linear(d_model, 3500)
-         self.W1 = nn.Linear(1750, d_model)
-     def forward(self, x):
-         x = self.W(x.float())
-         a,b = x.chunk(2, dim=-1)
-         return self.W1(F.silu(a)*b).to(x.dtype)

- class SparseCausalAttention(nn.Module):
-     def __init__(self, num_heads, head_dim, window_size=8):
          super().__init__()
-         self.num_heads = num_heads
-         self.head_dim = head_dim
-         self.window_size = window_size
-         self.q = nn.Linear(head_dim*num_heads, num_heads*head_dim)
-         self.k = nn.Linear(head_dim*num_heads, num_heads*head_dim)
-         self.v = nn.Linear(head_dim*num_heads, num_heads*head_dim)
-         self.out = nn.Linear(num_heads*head_dim, head_dim*num_heads)

      def forward(self, x):
-         B,L,D = x.shape
-         q = self.q(x).view(B,L,self.num_heads,self.head_dim).transpose(1,2)
-         k = self.k(x).view(B,L,self.num_heads,self.head_dim).transpose(1,2)
-         v = self.v(x).view(B,L,self.num_heads,self.head_dim).transpose(1,2)
-         q = q / (self.head_dim ** 0.5)
-
-         attn_scores = torch.matmul(q, k.transpose(-2,-1))
-         mask = torch.tril(torch.ones(L,L, device=x.device))
-         band_mask = torch.triu(mask, -self.window_size)
-         attn_scores = attn_scores.masked_fill(band_mask==0, float('-inf'))
-         attn_probs = F.softmax(attn_scores, dim=-1)
-         out = torch.matmul(attn_probs, v)
-         out = out.transpose(1,2).reshape(B,L,D)
-         return self.out(out)
-
- class Lo(nn.Module):
-     def __init__(self,d_model):
-         super().__init__()
-         self.d = nn.Linear(d_model,64)
-         self.w = nn.Linear(64,d_model)
-         self.norm = nn.LayerNorm(d_model)
-     def forward(self,x):
-         return self.norm(self.w(F.silu(self.d(x))) + x)
-
- class Block(nn.Module):
-     def __init__(self,d_model):
-         super().__init__()
-         self.attn = SparseCausalAttention(num_heads=2, head_dim=64)
-         self.glu = SwiGLU(d_model)
-         self.norm = nn.LayerNorm(d_model)
-         self.lo = Lo(d_model)
-     def forward(self,x):
-         x = self.attn(x)
-         x = self.norm(self.glu(x)+x)
-         x = self.lo(x)
-         return x
-
- class ReLM(nn.Module):
-     def __init__(self,vocab_size,max_seq_len,d_model,n_layers):
-         super().__init__()
-         self.token_embedding = nn.Embedding(vocab_size,d_model)
-         self.pos_embedding = nn.Embedding(max_seq_len,d_model)
-         self.blocks = nn.ModuleList([Block(d_model) for _ in range(n_layers)])
-         self.ln_f = nn.LayerNorm(d_model)
-         self.d_model = d_model
-
-     def forward(self,x):
-         B,L = x.shape
-         positions = torch.arange(L,device=x.device).unsqueeze(0)
-         x = self.token_embedding(x) + self.pos_embedding(positions)
          for block in self.blocks:
              x = block(x)
          x = self.ln_f(x)
-         logits = x @ self.token_embedding.weight.T
-         return logits
-
- # Model, optimizer, scheduler, loss function
- model = ReLM(vocab_size, max_len, 128, 2).to(device)
- optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)
- scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9)
- loss_fn = nn.CrossEntropyLoss(ignore_index=pad_id)
-
- # Compile to a static graph
- model = torch.compile(model, mode="default")
-
- scaler = torch.cuda.amp.GradScaler()
- epochs = 1
- for epoch in range(epochs):
-     model.train()
-     total_loss = 0
-     for step, (x, y) in enumerate(dataloader):
-         x, y = x.to(device), y.to(device)
-         optimizer.zero_grad()
-
-         with torch.cuda.amp.autocast():  # mixed precision
-             logits = model(x)
-             loss = loss_fn(logits.view(-1, vocab_size), y.view(-1))
-
-         scaler.scale(loss).backward()
-         scaler.unscale_(optimizer)
-         torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
-         scaler.step(optimizer)
-         scaler.update()
-
-         total_loss += loss.item()
-         if step % 100 == 0:
-             print(f"Epoch {epoch+1}, Step {step}, Loss: {loss.item():.4f}")
-
-     scheduler.step()
-     print(f"Epoch {epoch+1} done, average loss: {total_loss/len(dataloader):.4f}")
-
- torch.save(model.state_dict(), "relm_model.pth")
- print("✅ Model saved!")
-
- # ===============================
- # Top-p sampling generation
- # ===============================
- def generate_text_topp(model, prompt, max_len=150, max_gen=150, p=0.9, temperature=0.6, min_len=20):
-     model.eval()
-     model_input = text_to_ids(f"<start> {prompt}")
-     model_input = model_input[:max_len]
-     generated = list(model_input)
-     with torch.no_grad():
-         for step in range(max_gen):
-             input_seq = generated[-max_len:] if len(generated)>max_len else generated
-             input_tensor = torch.tensor([input_seq + [pad_id]*(max_len-len(input_seq))], device=device)
-             logits = model(input_tensor)
-             next_logits = logits[0,len(input_seq)-1]
-             next_logits[end_id] -= 5.0
-             next_logits[pad_id] -= 10.0
-             probs = F.softmax(next_logits/temperature, dim=-1).cpu().numpy()
-             sorted_indices = np.argsort(probs)[::-1]
-             sorted_probs = probs[sorted_indices]
-             cumulative_probs = np.cumsum(sorted_probs)
-             cutoff = np.searchsorted(cumulative_probs,p)
-             top_indices = sorted_indices[:cutoff+1]
-             top_probs = sorted_probs[:cutoff+1]
-             top_probs /= top_probs.sum()
-             next_token = np.random.choice(top_indices, p=top_probs)
-             if next_token==end_id and len(generated)>=min_len:
-                 break
-             generated.append(int(next_token))
-     return ids_to_text(generated)
-
- # Test
- print("\n===== Generation result =====")
- print(generate_text_topp(model, "지난 2년 동안 출연연이 국가가 필요한 연구를", p=0.9))
+ import os, json, random, numpy as np, torch
  import torch.nn as nn
  import torch.nn.functional as F
+ from torch.utils.data import IterableDataset, DataLoader
  import sentencepiece as spm
  import requests

+ # ===============================
+ # 0️⃣ Environment setup
+ # ===============================
  TOKENIZER_PATH = "ko_unigram.model"
+ DATA_PATH = "corpus.txt"
+ MAX_LEN = 128
+ EMBED_DIM = 384
+ LATENT_DIM = 384
+ BATCH_SIZE = 384
+ NEGATIVE_RATIO = 1
+
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
  # ===============================
  # 1️⃣ File download
  # ===============================
      with open(save_path, "wb") as f:
          for chunk in r.iter_content(8192*2):
              f.write(chunk)
+     print(f"Saved {save_path}")

  if not os.path.exists(TOKENIZER_PATH):
      download_file(
          "https://huggingface.co/Yuchan5386/inlam-100m/resolve/main/ko_unigram.model?download=true",
+         TOKENIZER_PATH,
      )
  if not os.path.exists(DATA_PATH):
      download_file(
          "https://huggingface.co/datasets/Yuchan5386/1/resolve/main/shuffled_corpus.txt?download=true",
+         DATA_PATH,
      )
+
  # ===============================
+ # 2️⃣ Tokenizer preparation
  # ===============================
+ sp = spm.SentencePieceProcessor(TOKENIZER_PATH)
  pad_id = sp.piece_to_id("<pad>") if sp.piece_to_id("<pad>") != -1 else 0
  vocab_size = sp.get_piece_size()

+ def encode_sentence(sentence, max_len=MAX_LEN):
+     return sp.encode(sentence, out_type=int)[:max_len]

+ def pad_sentence(tokens):
+     return tokens + [pad_id] * (MAX_LEN - len(tokens))

  # ===============================
+ # 3️⃣ Streaming Dataset
  # ===============================
+ class PairStream(IterableDataset):
+     def __init__(self, txt_path, negative_ratio):
+         self.sentences = [line.strip() for line in open(txt_path, encoding="utf-8") if line.strip()]
+         self.neg_ratio = negative_ratio
+
+     def __iter__(self):
+         while True:
+             for s1 in self.sentences:
+                 x1 = pad_sentence(encode_sentence(s1))
+                 yield (torch.tensor(x1), torch.tensor(x1), torch.tensor(1.0))
+                 for _ in range(self.neg_ratio):
+                     s2 = random.choice(self.sentences)
+                     x2 = pad_sentence(encode_sentence(s2))
+                     yield (torch.tensor(x1), torch.tensor(x2), torch.tensor(0.0))
+
+ stream_ds = PairStream(DATA_PATH, NEGATIVE_RATIO)
+ loader = DataLoader(stream_ds, batch_size=BATCH_SIZE)

  # ===============================
+ # 4️⃣ Sentence Encoder definition
  # ===============================
+ class EncoderBlock(nn.Module):
+     def __init__(self, embed_dim, latent_dim):
          super().__init__()
+         self.mha = nn.MultiheadAttention(embed_dim, num_heads=8, batch_first=True)
+         self.WB = nn.Linear(embed_dim, embed_dim * 3)
+         self.W = nn.Linear(embed_dim * 3 // 2, embed_dim)
+         self.ln1 = nn.LayerNorm(embed_dim)
+         self.ln2 = nn.LayerNorm(embed_dim)
+         self.ln3 = nn.LayerNorm(embed_dim)

+     def forward(self, x):
+         x1 = self.ln1(x)
+         attn, _ = self.mha(x1, x1, x1)
+         x = attn + x
+         x2 = self.ln2(x)
+         w = self.WB(x2)
+         a, b = torch.chunk(w, 2, dim=-1)
+         g = F.silu(a) * b
+         out = self.W(g)
+         return self.ln3(out) + x
+
+ class SentenceEncoder(nn.Module):
+     def __init__(self, vocab_size, embed_dim, latent_dim, max_len):
          super().__init__()
+         self.embed = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_id)
+         self.pos = nn.Embedding(max_len, embed_dim)
+         self.blocks = nn.ModuleList([EncoderBlock(embed_dim, latent_dim) for _ in range(2)])
+         self.ln_f = nn.LayerNorm(embed_dim)
+         self.latent = nn.Linear(embed_dim, latent_dim)

      def forward(self, x):
+         b, l = x.shape
+         pos_ids = torch.arange(l, device=x.device).unsqueeze(0).expand(b, l)
+         x = self.embed(x) + self.pos(pos_ids)
          for block in self.blocks:
              x = block(x)
          x = self.ln_f(x)
+         x = x.mean(dim=1)
+         return torch.tanh(self.latent(x))
+
+ encoder = SentenceEncoder(vocab_size, EMBED_DIM, LATENT_DIM, MAX_LEN).to(device)
+
+ # ===============================
+ # 5️⃣ Cosine + Contrastive Loss
+ # ===============================
+ def cosine_sim(v1, v2, eps=1e-8):
+     dot = (v1 * v2).sum(dim=-1)
+     norm = v1.norm(dim=-1) * v2.norm(dim=-1) + eps
+     return dot / norm
+
+ def contrastive_loss(pred, label, margin=0.7):
+     dist = 1 - pred
+     pos_loss = label * dist.pow(2)
+     neg_loss = (1 - label) * (torch.clamp(margin - dist, min=0).pow(2))
+     return (pos_loss + neg_loss).mean()
+
+ optimizer = torch.optim.Adam(encoder.parameters(), lr=1e-5)
+
+ encoder = torch.compile(encoder)
+ cosine_sim = torch.compile(cosine_sim)
+ contrastive_loss = torch.compile(contrastive_loss)
+ # ===============================
+ # 6️⃣ Training loop
+ # ===============================
+ steps_per_epoch = 23119910 // BATCH_SIZE
+
+ from tqdm import tqdm
+
+ encoder.train()
+
+ progress = tqdm(range(steps_per_epoch), desc="Training", ncols=120)
+
+ for step, batch in zip(progress, loader):
+     x1, x2, y = [b.to(device) for b in batch]
+
+     # forward
+     v1 = encoder(x1)
+     v2 = encoder(x2)
+     pred = cosine_sim(v1, v2)
+
+     loss = contrastive_loss(pred, y)
+
+     # backward
+     optimizer.zero_grad()
+     loss.backward()
+     optimizer.step()
+
+     # 📉 show the loss in the tqdm bar
+     progress.set_postfix({"loss": f"{loss.item():.4f}"})
+
+ # ===============================
+ # 7️⃣ Build vectors for retrieval
+ # ===============================
+ LIMIT = 4000
+ prompts = []
+ for i, line in enumerate(open(DATA_PATH, "r", encoding="utf-8")):
+     if i >= LIMIT: break
+     line = line.strip()
+     if line:
+         prompts.append(line)
+
+ @torch.no_grad()
+ def get_sentence_vector(sentence):
+     tokens = pad_sentence(encode_sentence(sentence))
+     x = torch.tensor([tokens]).to(device)
+     return encoder(x).cpu().numpy()[0]
+
+ if os.path.exists("corpus_vectors.npy"):
+     corpus_vectors = np.load("corpus_vectors.npy")
+ else:
+     corpus_vectors = np.stack([get_sentence_vector(p) for p in prompts]).astype(np.float16)
+     np.save("corpus_vectors.npy", corpus_vectors)
+
+ corpus_norms = np.linalg.norm(corpus_vectors, axis=1)
+
+ # ===============================
+ # 8️⃣ Search function
+ # ===============================
+ def search(query, top_k=3):
+     q_vec = get_sentence_vector(query).astype(np.float16)
+     sims = corpus_vectors @ q_vec
+     sims /= (corpus_norms * np.linalg.norm(q_vec) + 1e-8)
+     top_idx = np.argsort(sims)[::-1][:top_k]
+     return [(prompts[i], float(sims[i])) for i in top_idx]
+
+ # ===============================
+ # 🔟 Test
+ # ===============================
+ query = "점심이나 저녁을 우리와 함께 먹을 건가요?"
+ results = search(query)
+ for p, s in results:
+     print(f"Prompt: {p}\nSimilarity: {s:.3f}\n---")
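For reference only, not part of this commit: a minimal standalone sketch of how the cosine + contrastive objective added in the new Model_torch.py behaves on one positive and one negative pair. The cosine_sim and contrastive_loss functions are copied from the diff above; the toy vectors and labels are invented purely for illustration.

    import torch

    # Copied from the new Model_torch.py
    def cosine_sim(v1, v2, eps=1e-8):
        dot = (v1 * v2).sum(dim=-1)
        norm = v1.norm(dim=-1) * v2.norm(dim=-1) + eps
        return dot / norm

    # Copied from the new Model_torch.py: margin-based contrastive loss on cosine distance
    def contrastive_loss(pred, label, margin=0.7):
        dist = 1 - pred                          # cosine distance
        pos_loss = label * dist.pow(2)           # label 1: pull cosine similarity toward 1
        neg_loss = (1 - label) * (torch.clamp(margin - dist, min=0).pow(2))  # label 0: push distance past the margin
        return (pos_loss + neg_loss).mean()

    # Toy 2-d "sentence vectors" (hypothetical values, for illustration only)
    anchor = torch.tensor([[1.0, 0.0], [1.0, 0.0]])
    other  = torch.tensor([[0.9, 0.1], [-0.5, 0.8]])  # row 0: near-duplicate, row 1: unrelated
    label  = torch.tensor([1.0, 0.0])

    pred = cosine_sim(anchor, other)       # roughly [0.99, -0.53]
    print(contrastive_loss(pred, label))   # near zero: both pairs already behave as their labels require

With margin=0.7, a negative pair contributes loss only while its cosine distance (1 - cos) is below the margin, i.e. while its cosine similarity stays above 0.3; once pushed past that point it stops contributing, so the objective separates mismatched sentence pairs without forcing them to exactly opposite directions.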