Added DAPT files
This commit is contained in:
parent 7df8ae203f
commit 7fa325eda9
@@ -0,0 +1,347 @@
# %%
from huggingface_hub import InferenceClient

client = InferenceClient("meta-llama/Llama-3.1-8B", token="hf_xxx")  # replace with your HF token
resp = client.text_generation(
    "Write a haiku about GPUs",
    max_new_tokens=128,
    temperature=0.7,
)
print(resp)

# %%
# Minimize on-disk writes (avoid "No space left on device")
import os, tempfile, datasets, transformers

# Use a small temp dir for caches or disable dataset cache writes
TMP_DIR = tempfile.mkdtemp(prefix="hf_tmp_")
os.environ["HF_HOME"] = TMP_DIR
os.environ["HF_DATASETS_CACHE"] = os.path.join(TMP_DIR, "datasets_cache")
os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1"

os.environ["HF_TOKEN"] = "hf_xxx"  # replace with your HF token
os.environ["HUGGINGFACE_HUB_TOKEN"] = os.environ["HF_TOKEN"]

# Keep map results in memory to avoid materializing to disk
datasets.disable_caching()
print({
    "HF_HOME": os.environ.get("HF_HOME"),
    "HF_DATASETS_CACHE": os.environ.get("HF_DATASETS_CACHE"),
    "caching_disabled": True,
    "PYTORCH_CUDA_ALLOC_CONF": os.environ.get("PYTORCH_CUDA_ALLOC_CONF"),
})
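
# Optional addition (not in the original notebook): the report above shows
# PYTORCH_CUDA_ALLOC_CONF, but nothing in the notebook sets it. On recent
# PyTorch builds, expandable segments can reduce CUDA allocator fragmentation;
# setting it here, before torch is imported, keeps the report meaningful on reruns.
os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")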

# %%
# If needed, install dependencies. Uncomment the next cell to run once.
# %pip -q install -U transformers datasets accelerate peft
# For CUDA QLoRA only (Linux/NVIDIA):
# %pip -q install bitsandbytes

import os
import platform
import torch

# Detect environment
USE_CUDA = torch.cuda.is_available()
USE_MPS = (not USE_CUDA) and torch.backends.mps.is_available()
BF16_OK = USE_CUDA and torch.cuda.is_bf16_supported()
USE_QLORA = USE_CUDA  # QLoRA requires CUDA + bitsandbytes; set False on macOS/CPU

# Disable QLoRA automatically if bitsandbytes is not installed
try:
    import importlib.metadata as _ilmd
    _ = _ilmd.version("bitsandbytes")
except Exception:
    if USE_QLORA:
        print("bitsandbytes not found; disabling QLoRA (falling back to standard LoRA)")
    USE_QLORA = False

DEVICE = (
    torch.device("cuda") if USE_CUDA else (torch.device("mps") if USE_MPS else torch.device("cpu"))
)

print({
    "cuda": USE_CUDA,
    "mps": USE_MPS,
    "bf16_ok": BF16_OK,
    "use_qlora": USE_QLORA,
    "device": str(DEVICE),
    "python": platform.python_version(),
})

# %%
from datasets import load_dataset
from typing import Optional

# Paths and config
PARQUET_PATH = "/u/v/d/vdhanuka/defeatbeta-api-main/stock_earning_call_transcripts.parquet"
TEXT_COLUMN: Optional[str] = None  # override to force a column, else auto

raw_ds = load_dataset("parquet", data_files={"train": PARQUET_PATH})["train"]
SAMPLE_FRACTION = 0.2  # use 20% random subset for faster DAPT
SAMPLE_SEED = int(os.environ.get("SAMPLE_SEED", "42"))
if SAMPLE_FRACTION < 1.0:
    split = raw_ds.train_test_split(test_size=1.0 - SAMPLE_FRACTION, seed=SAMPLE_SEED, shuffle=True)
    raw_ds = split["train"]
    try:
        print(f"Randomly sampled {int(SAMPLE_FRACTION*100)}% subset; size: {len(raw_ds)}")
    except Exception:
        pass
print("Columns:", raw_ds.column_names)
print(raw_ds[0])

# If schema has nested `transcripts` (array of structs with speaker/content),
# flatten into a single text field for DAPT.
if "transcripts" in raw_ds.column_names:
    def flatten_segments(example):
        segments = example.get("transcripts") or []
        lines = []
        for seg in segments:
            if not seg:
                continue
            speaker = seg.get("speaker")
            content = seg.get("content")
            if content is None:
                continue
            if speaker and len(str(speaker)) > 0:
                lines.append(f"{speaker}: {content}")
            else:
                lines.append(str(content))
        example["__flattened_text"] = "\n".join(lines)
        return example

    raw_ds = raw_ds.map(flatten_segments)
    # Prefer flattened text unless user overrides
    if TEXT_COLUMN is None:
        TEXT_COLUMN = "__flattened_text"

# Auto-detect a reasonable text column if still unknown
if TEXT_COLUMN is None:
    preferred = ["__flattened_text", "text", "transcript", "content", "body", "cleaned_text", "utterance", "raw_text"]
    for p in preferred:
        exact = [c for c in raw_ds.column_names if c.lower() == p]
        if len(exact) > 0:
            TEXT_COLUMN = exact[0]
            break

if TEXT_COLUMN is None:
    # fallback to first string-like column
    for name, feature in raw_ds.features.items():
        if getattr(feature, "dtype", "") in ("string", "large_string"):
            TEXT_COLUMN = name
            break

if TEXT_COLUMN is None:
    TEXT_COLUMN = raw_ds.column_names[0]

print("Using text column:", TEXT_COLUMN)

# Filter empty
ds = raw_ds.filter(lambda x: x.get(TEXT_COLUMN) is not None and len(str(x[TEXT_COLUMN])) > 0)
print(ds)
print("Example text:", str(ds[0][TEXT_COLUMN])[:400])

# %%
from transformers import AutoTokenizer

MODEL_ID = "meta-llama/Llama-3.1-8B"
BLOCK_SIZE = 512  # lowered to reduce activation memory on 10–12 GB GPUs

# Load tokenizer
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
# Avoid long-sequence warnings during tokenization; packing enforces BLOCK_SIZE later
try:
    tokenizer.model_max_length = 1_000_000_000
except Exception:
    pass

def tokenize_examples(batch):
    return tokenizer(batch[TEXT_COLUMN], add_special_tokens=False, truncation=False)

print("Tokenizing dataset (this may take a while)...")
tok_ds = ds.map(tokenize_examples, batched=True, remove_columns=[c for c in ds.column_names if c != TEXT_COLUMN])

# Pack tokens into fixed blocks
def group_texts(examples):
    concatenated = []
    for ids in examples["input_ids"]:
        concatenated.extend(ids + [tokenizer.eos_token_id])
    total_length = (len(concatenated) // BLOCK_SIZE) * BLOCK_SIZE
    if total_length == 0:
        return {"input_ids": [], "labels": []}
    input_ids = [concatenated[i:i + BLOCK_SIZE] for i in range(0, total_length, BLOCK_SIZE)]
    return {"input_ids": input_ids, "labels": [x.copy() for x in input_ids]}

lm_ds = tok_ds.map(group_texts, batched=True, remove_columns=tok_ds.column_names)
print(lm_ds)
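
# %%
# Quick sanity check (added sketch, not in the original notebook): every packed
# example is exactly BLOCK_SIZE tokens, so the total number of training tokens
# is simply n_blocks * BLOCK_SIZE.
n_blocks = len(lm_ds)
print({"packed_blocks": n_blocks, "approx_train_tokens": n_blocks * BLOCK_SIZE})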

# %%
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

OUTPUT_DIR = "/u/v/d/vdhanuka/llama3_8b_dapt_transcripts_lora"
LEARNING_RATE = 2e-4
EPOCHS = 1
PER_DEVICE_BATCH = 1
GRAD_ACCUM = 32

bnb_config = None
if USE_QLORA:
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16 if BF16_OK else torch.float16,
        bnb_4bit_use_double_quant=True,
    )

# Prefer FlashAttention-2 on CUDA if available; else fall back to SDPA
attn_impl = "sdpa"
if USE_CUDA:
    try:
        import flash_attn  # noqa: F401
        attn_impl = "flash_attention_2"
    except Exception:
        pass

# Constrain loading to GPU memory to avoid CPU/disk offload with 4-bit
load_kwargs = {}
if USE_CUDA:
    try:
        total_bytes = torch.cuda.get_device_properties(0).total_memory
        total_gib = max(1, int(total_bytes / (1024 ** 3)))
        reserve_gib = 1
        max_gib = max(1, total_gib - reserve_gib)
        load_kwargs["device_map"] = "auto"
        load_kwargs["max_memory"] = {0: f"{max_gib}GiB"}
    except Exception:
        load_kwargs["device_map"] = "auto"

print("Loading base model...")
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16 if BF16_OK else (torch.float16 if USE_CUDA else torch.float32),
    quantization_config=bnb_config if USE_QLORA else None,
    attn_implementation=attn_impl,
    low_cpu_mem_usage=True,
    **load_kwargs,
)

if USE_QLORA:
    model = prepare_model_for_kbit_training(model)

lora_cfg = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
)
model = get_peft_model(model, lora_cfg)

# Reduce training memory footprint
model.config.use_cache = False
try:
    model.enable_input_require_grads()
except Exception:
    pass
try:
    model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={"use_reentrant": False})
except Exception:
    model.gradient_checkpointing_enable()

print(model)
try:
    print("Device map:", getattr(model, "hf_device_map", None))
except Exception:
    pass

# %%
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling

collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False, pad_to_multiple_of=8)

args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=PER_DEVICE_BATCH,
    gradient_accumulation_steps=GRAD_ACCUM,
    learning_rate=LEARNING_RATE,
    logging_steps=10,
    save_steps=200,
    save_total_limit=2,
    save_strategy="steps",
    bf16=BF16_OK,
    fp16=(USE_CUDA and not BF16_OK),
    tf32=True,
    gradient_checkpointing=True,
    remove_unused_columns=False,
    dataloader_num_workers=2,
    optim="paged_adamw_8bit" if USE_QLORA else "adamw_torch",
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,
    weight_decay=0.0,
    save_safetensors=True,
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=lm_ds,
    data_collator=collator,
)

# Free any stale allocations before training
import gc, torch; gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

from pathlib import Path

def _latest_checkpoint_dir(base_dir: str):
    try:
        dirs = [p for p in Path(base_dir).glob("checkpoint-*") if p.is_dir()]
        if not dirs:
            return None
        def step_num(p: Path):
            try:
                return int(p.name.split("-")[-1])
            except Exception:
                return -1
        latest = max(dirs, key=step_num)
        return str(latest)
    except Exception:
        return None

latest_ckpt = _latest_checkpoint_dir(OUTPUT_DIR)
print("Resume checkpoint:", latest_ckpt or "<none; starting fresh>")
trainer.train(resume_from_checkpoint=latest_ckpt if latest_ckpt else None)

# %%
# Save adapter + tokenizer, then run a quick inference via HF Inference API
from peft import PeftModel

# Save
trainer.model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print(f"Saved PEFT adapter and tokenizer to {OUTPUT_DIR}")

# Hosted inference via Hugging Face Inference API (no GPU weights needed here)
print("Running inference via Hugging Face Inference API...")
from huggingface_hub import InferenceClient

hf_token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_HUB_TOKEN")
client = InferenceClient("meta-llama/Llama-3.1-8B", token=hf_token)

resp = client.text_generation(
    "Write a haiku about GPUs",
    max_new_tokens=128,
    temperature=0.7,
)
print(resp)
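
# %%
# Local sanity check with the trained adapter (added sketch, not in the original
# notebook): the hosted Inference API call above queries only the base model and
# never exercises the LoRA weights saved to OUTPUT_DIR, so here we reuse the
# in-memory PEFT model from the trainer for a quick local generation.
prompt = "Summarize the key risks discussed in a typical earnings call:"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
model.eval()
with torch.no_grad():
    out = model.generate(**inputs, max_new_tokens=64, do_sample=False)
print(tokenizer.decode(out[0], skip_special_tokens=True))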

File diff suppressed because one or more lines are too long

@@ -0,0 +1,304 @@
"""
|
||||
DAPT (Domain-Adaptive Pretraining) for Llama 3.1 on Earnings Call Transcripts
|
||||
|
||||
This script mirrors the notebook logic from `DAPT_Llama31_Transcripts.ipynb`.
|
||||
It performs continued pretraining (causal LM objective) of Llama 3.1 using a
|
||||
local Parquet file containing earnings call transcripts.
|
||||
|
||||
What you'll get:
|
||||
- Environment-adaptive setup (CUDA, MPS, CPU) with automatic LoRA/QLoRA selection
|
||||
- Robust dataset loading from Parquet and text-column auto-detection
|
||||
- Efficient token packing into fixed-length sequences
|
||||
- PEFT LoRA (and QLoRA on CUDA) training pipeline with Transformers Trainer
|
||||
- Save adapters and quick inference sanity check
|
||||
|
||||
Notes:
|
||||
- Accept the Llama 3.1 license on Hugging Face and authenticate before training.
|
||||
- On macOS (MPS), QLoRA is disabled (no bitsandbytes). We use standard LoRA with float16/float32.
|
||||
- For best performance, use a CUDA GPU and enable QLoRA.
|
||||
"""
|
||||
|
||||
# Install required libraries (run manually if needed):
|
||||
# pip install -U transformers datasets accelerate peft sentencepiece protobuf
|
||||
# For CUDA QLoRA only (Linux/NVIDIA):
|
||||
# pip install bitsandbytes
|
||||
|
||||
# Minimize on-disk writes (avoid "No space left on device")
|
||||
import os
|
||||
import tempfile
|
||||
import datasets
|
||||
import transformers
|
||||
|
||||
# Use a small temp dir for caches or disable dataset cache writes
|
||||
TMP_DIR = tempfile.mkdtemp(prefix="hf_tmp_")
|
||||
os.environ["HF_HOME"] = TMP_DIR
|
||||
os.environ["HF_DATASETS_CACHE"] = os.path.join(TMP_DIR, "datasets_cache")
|
||||
os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1"
|
||||
|
||||
os.environ["HF_TOKEN"] = "hf_token"
|
||||
os.environ["HUGGINGFACE_HUB_TOKEN"] = os.environ["HF_TOKEN"]
|
||||
|
||||
# Keep map results in memory to avoid materializing to disk
|
||||
datasets.disable_caching()
|
||||
print({
|
||||
"HF_HOME": os.environ.get("HF_HOME"),
|
||||
"HF_DATASETS_CACHE": os.environ.get("HF_DATASETS_CACHE"),
|
||||
"caching_disabled": True,
|
||||
})
|
||||
|
||||
# If needed, install dependencies (do manually if missing):
|
||||
# pip install -U transformers datasets accelerate peft
|
||||
# For CUDA QLoRA only (Linux/NVIDIA):
|
||||
# pip install bitsandbytes
|
||||
|
||||
import platform
|
||||
import torch
|
||||
|
||||
# Detect environment
|
||||
USE_CUDA = torch.cuda.is_available()
|
||||
USE_MPS = (not USE_CUDA) and torch.backends.mps.is_available()
|
||||
BF16_OK = USE_CUDA and torch.cuda.is_bf16_supported()
|
||||
USE_QLORA = USE_CUDA # QLoRA requires CUDA + bitsandbytes; set False on macOS/CPU
|
||||
# Disable QLoRA automatically if bitsandbytes is not installed
|
||||
try:
|
||||
import importlib.metadata as _ilmd
|
||||
_ = _ilmd.version("bitsandbytes")
|
||||
except Exception:
|
||||
if USE_QLORA:
|
||||
print("bitsandbytes not found; disabling QLoRA (falling back to standard LoRA)")
|
||||
USE_QLORA = False
|
||||
|
||||
DEVICE = (
|
||||
torch.device("cuda") if USE_CUDA else (torch.device("mps") if USE_MPS else torch.device("cpu"))
|
||||
)
|
||||
|
||||
print({
|
||||
"cuda": USE_CUDA,
|
||||
"mps": USE_MPS,
|
||||
"bf16_ok": BF16_OK,
|
||||
"use_qlora": USE_QLORA,
|
||||
"device": str(DEVICE),
|
||||
"python": platform.python_version(),
|
||||
})
|
||||
|
||||
from datasets import load_datasets
|
||||
from typing import Optional
|
||||
import pandas as pds
|
||||
|
||||
# Paths and config
|
||||
# Update this to the actual Parquet path on your system
|
||||
PARQUET_PATH = "stock_earning_call_transcripts.parquet"
|
||||
TEXT_COLUMN: Optional[str] = None # override to force a column, else auto
|
||||
|
||||
raw_ds = load_dataset("parquet", data_files={"train": PARQUET_PATH})["train"]
|
||||
print("Columns:", raw_ds.column_names)
|
||||
print(raw_ds[0])
|
||||
|
||||
# If schema has nested `transcripts` (array of structs with speaker/content),
|
||||
# flatten into a single text field for DAPT.
|
||||
if "transcripts" in raw_ds.column_names:
|
||||
def flatten_segments(example):
|
||||
segments = example.get("transcripts") or []
|
||||
lines = []
|
||||
for seg in segments:
|
||||
if not seg:
|
||||
continue
|
||||
|
||||
speaker = seg.get("speaker")
|
||||
content = seg.get("content")
|
||||
if content is None:
|
||||
continue
|
||||
if speaker and len(str(speaker)) > 0:
|
||||
lines.append(f"{speaker}: {content}")
|
||||
else:
|
||||
lines.append(str(content))
|
||||
example["__flattened_text"] = "\n".join(lines)
|
||||
return example
|
||||
|
||||
raw_ds = raw_ds.map(flatten_segments)
|
||||
# Prefer flattened text unless user overrides
|
||||
if TEXT_COLUMN is None:
|
||||
TEXT_COLUMN = "__flattened_text"
|
||||
|
||||
# Auto-detect a reasonable text column if still unknown
|
||||
if TEXT_COLUMN is None:
|
||||
preferred = [
|
||||
"__flattened_text",
|
||||
"text",
|
||||
"transcript",
|
||||
"content",
|
||||
"body",
|
||||
"cleaned_text",
|
||||
"utterance",
|
||||
"raw_text",
|
||||
]
|
||||
for p in preferred:
|
||||
exact = [c for c in raw_ds.column_names if c.lower() == p]
|
||||
if len(exact) > 0:
|
||||
TEXT_COLUMN = exact[0]
|
||||
break
|
||||
|
||||
if TEXT_COLUMN is None:
|
||||
# fallback to first string-like column
|
||||
for name, feature in raw_ds.features.items():
|
||||
if getattr(feature, "dtype", "") in ("string", "large_string"):
|
||||
TEXT_COLUMN = name
|
||||
break
|
||||
|
||||
if TEXT_COLUMN is None:
|
||||
TEXT_COLUMN = raw_ds.column_names[0]
|
||||
|
||||
print("Using text column:", TEXT_COLUMN)
|
||||
|
||||
# Filter empty
|
||||
ds = raw_ds.filter(lambda x: x.get(TEXT_COLUMN) is not None and len(str(x[TEXT_COLUMN])) > 0)
|
||||
print(ds)
|
||||
print("Example text:", str(ds[0][TEXT_COLUMN])[:400])

from transformers import AutoTokenizer

MODEL_ID = "meta-llama/Llama-3.1-8B"
BLOCK_SIZE = 1024  # use 512–1024 for QLoRA on 10–12 GB GPUs

# Load tokenizer
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
# Avoid long-sequence warnings during tokenization; packing enforces BLOCK_SIZE later
try:
    tokenizer.model_max_length = 1_000_000_000
except Exception:
    pass

def tokenize_examples(batch):
    return tokenizer(batch[TEXT_COLUMN], add_special_tokens=False, truncation=False)

print("Tokenizing dataset (this may take a while)...")
tok_ds = ds.map(
    tokenize_examples,
    batched=True,
    remove_columns=[c for c in ds.column_names if c != TEXT_COLUMN],
)

# Pack tokens into fixed blocks
def group_texts(examples):
    concatenated = []
    for ids in examples["input_ids"]:
        concatenated.extend(ids + [tokenizer.eos_token_id])
    total_length = (len(concatenated) // BLOCK_SIZE) * BLOCK_SIZE
    if total_length == 0:
        return {"input_ids": [], "labels": []}
    input_ids = [
        concatenated[i : i + BLOCK_SIZE] for i in range(0, total_length, BLOCK_SIZE)
    ]
    return {"input_ids": input_ids, "labels": [x.copy() for x in input_ids]}

lm_ds = tok_ds.map(group_texts, batched=True, remove_columns=tok_ds.column_names)
print(lm_ds)

from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

OUTPUT_DIR = "llama31_dapt_transcripts_lora"
LEARNING_RATE = 2e-4
EPOCHS = 1
PER_DEVICE_BATCH = 1
GRAD_ACCUM = 32

bnb_config = None
if USE_QLORA:
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16 if BF16_OK else torch.float16,
        bnb_4bit_use_double_quant=True,
    )

print("Loading base model...")
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",
    torch_dtype=torch.bfloat16
    if BF16_OK
    else (torch.float16 if USE_CUDA else torch.float32),
    quantization_config=bnb_config if USE_QLORA else None,
)

if USE_QLORA:
    model = prepare_model_for_kbit_training(model)

lora_cfg = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
)
model = get_peft_model(model, lora_cfg)

print(model)

from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling

collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=PER_DEVICE_BATCH,
    gradient_accumulation_steps=GRAD_ACCUM,
    learning_rate=LEARNING_RATE,
    logging_steps=10,
    save_steps=200,
    save_total_limit=2,
    save_strategy="steps",
    bf16=BF16_OK,
    fp16=(USE_CUDA and not BF16_OK),
    optim="paged_adamw_8bit" if USE_QLORA else "adamw_torch",
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,
    weight_decay=0.0,
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=lm_ds,
    data_collator=collator,
)

# Resume from the latest checkpoint only if one already exists in OUTPUT_DIR;
# passing resume_from_checkpoint=True unconditionally fails on a fresh run.
from pathlib import Path

has_ckpt = any(p.is_dir() for p in Path(OUTPUT_DIR).glob("checkpoint-*"))
trainer.train(resume_from_checkpoint=True if has_ckpt else None)

# Save adapter + tokenizer, and run a tiny inference sanity check
from peft import PeftModel

# Save
trainer.model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print(f"Saved PEFT adapter and tokenizer to {OUTPUT_DIR}")

# Hosted inference via Hugging Face Inference API
print("Running inference via Hugging Face Inference API...")
from huggingface_hub import InferenceClient

hf_token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_HUB_TOKEN")
client = InferenceClient("meta-llama/Llama-3.1-8B-Instruct", token=hf_token)

resp = client.text_generation(
    "Write a haiku about GPUs",
    max_new_tokens=128,
    temperature=0.7,
)
print(resp)
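
# Optional follow-up (added sketch, not part of the original script): when the base
# model was loaded in full or half precision (QLoRA disabled), the LoRA weights can
# be merged into the base weights for standalone deployment without PEFT at runtime.
if not USE_QLORA:
    merged = model.merge_and_unload()  # folds the adapter deltas into the base weights
    merged.save_pretrained(OUTPUT_DIR + "_merged")
    tokenizer.save_pretrained(OUTPUT_DIR + "_merged")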

@@ -0,0 +1,497 @@
#!/usr/bin/env python3
"""
DAPT Model Evaluation Script

Evaluates a Domain-Adaptive Pretrained (DAPT) Llama 3.1 model against the baseline
Llama 3.1 model on the stock earnings call transcripts dataset.

Computes perplexity scores to measure model performance on domain-specific data.
"""

import os
import sys
import time
import argparse
from typing import List, Optional

import numpy as np
import torch
from datasets import load_dataset
from peft import PeftModel
from tqdm import tqdm
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
)


class DAPTEvaluator:
    """Evaluator for DAPT model vs baseline perplexity comparison"""

    def __init__(
        self,
        model_id: str = "meta-llama/Llama-3.1-8B",
        dapt_model_path: str = "/u/v/d/vdhanuka/llama3_8b_dapt_transcripts_lora",
        dataset_path: str = "/u/v/d/vdhanuka/defeatbeta-api-main/stock_earning_call_transcripts.parquet",
        sample_size: Optional[int] = None,
        sample_percentage: Optional[float] = None,
        max_length: int = 1024,
        use_qlora: bool = True,
        device: Optional[str] = None,
    ):
        """
        Initialize the evaluator.

        Args:
            model_id: HuggingFace model ID for the base model
            dapt_model_path: Path to trained DAPT LoRA adapters
            dataset_path: Path to evaluation dataset
            sample_size: Number of samples to evaluate (mutually exclusive with sample_percentage)
            sample_percentage: Fraction of the dataset to evaluate (0.0-1.0, mutually exclusive with sample_size)
            max_length: Maximum sequence length for evaluation
            use_qlora: Whether to use QLoRA quantization
            device: Device to use (auto-detected if None)
        """
        self.model_id = model_id
        self.dapt_model_path = dapt_model_path
        self.dataset_path = dataset_path
        self.sample_size = sample_size
        self.sample_percentage = sample_percentage
        self.max_length = max_length
        self.use_qlora = use_qlora

        # Auto-detect device
        if device is None:
            self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        else:
            self.device = torch.device(device)

        # Hugging Face token from environment
        self.hf_token = (
            os.getenv("HUGGING_FACE_HUB_TOKEN")
            or os.getenv("HF_TOKEN")
            or os.getenv("HUGGINGFACEHUB_API_TOKEN")
        )

        # Initialize models and tokenizer
        self.tokenizer = None
        self.baseline_model = None
        self.dapt_model = None
        self.eval_texts: Optional[List[str]] = None

        print("🚀 Initializing DAPT Evaluator")
        print(f" Model: {model_id}")
        print(f" DAPT Path: {dapt_model_path}")
        print(f" Dataset: {dataset_path}")
        print(f" Device: {self.device}")
        if self.hf_token:
            print(" HF token: detected in environment")
        else:
            print(" HF token: not found (anonymous access)")
        if sample_percentage is not None:
            print(f" Sample Percentage: {sample_percentage*100:.1f}%")
        else:
            print(f" Sample Size: {sample_size}")
        print(f" Use QLoRA: {use_qlora}")

    def setup_tokenizer(self):
        """Load and configure tokenizer"""
        print("\n🔧 Loading tokenizer...")
        self.tokenizer = AutoTokenizer.from_pretrained(
            self.model_id,
            use_fast=True,
            token=self.hf_token,
        )
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token
        print(f" Vocab size: {self.tokenizer.vocab_size}")
        return self.tokenizer

    def load_dataset(self):
        """Load and preprocess evaluation dataset"""
        print("\n📊 Loading evaluation dataset...")
        try:
            ds = load_dataset("parquet", data_files={"eval": self.dataset_path})["eval"]
            print(f" Dataset loaded: {len(ds)} examples")
            print(f" Columns: {ds.column_names}")

            # Flatten transcripts if needed (same logic as training)
            if "transcripts" in ds.column_names:
                print(" Flattening transcript data...")

                def flatten_segments(example):
                    segments = example.get("transcripts") or []
                    lines = []
                    for seg in segments:
                        if not seg:
                            continue
                        speaker = seg.get("speaker")
                        content = seg.get("content")
                        if content is None:
                            continue
                        if speaker and len(str(speaker)) > 0:
                            lines.append(f"{speaker}: {content}")
                        else:
                            lines.append(str(content))
                    example["text"] = "\n".join(lines)
                    return example

                ds = ds.map(flatten_segments, desc="Flattening transcripts")
                text_column = "text"
            else:
                # Auto-detect text column
                preferred = ["text", "transcript", "content", "body", "cleaned_text"]
                text_column = None
                for p in preferred:
                    if p in ds.column_names:
                        text_column = p
                        break
                if text_column is None:
                    text_column = ds.column_names[0]

            print(f" Using text column: {text_column}")

            # Determine sample size
            total_samples = len(ds)
            if self.sample_percentage is not None:
                # Use percentage of dataset
                sample_size = int(total_samples * self.sample_percentage)
                sample_size = max(1, sample_size)
                print(f" Using {self.sample_percentage*100:.1f}% of dataset = {sample_size} samples")
            else:
                # Use fixed sample size; handle None before min() so the
                # default (1000) applies when no size was given
                sample_size = self.sample_size
                if sample_size is None:
                    sample_size = min(1000, total_samples)
                else:
                    sample_size = min(sample_size, total_samples)
                if sample_size < 1:
                    sample_size = 1

            # Get random sample for more representative evaluation
            indices = np.random.choice(total_samples, sample_size, replace=False)
            sample_ds = ds.select(indices)

            # Filter out empty or very short texts
            def is_valid_text(example):
                text = example.get(text_column, "")
                return text is not None and len(str(text).strip()) > 50

            sample_ds = sample_ds.filter(is_valid_text)
            self.eval_texts = [ex[text_column] for ex in sample_ds]

            print(f" Sampled {len(self.eval_texts)} valid texts for evaluation")
            avg_chars = float(np.mean([len(t) for t in self.eval_texts])) if len(self.eval_texts) > 0 else 0.0
            print(f" Average text length: {avg_chars:.0f} characters")

            return self.eval_texts

        except Exception as e:
            print(f"❌ Error loading dataset: {e}")
            raise

    def setup_quantization(self):
        """Setup quantization configuration"""
        if not self.use_qlora or not torch.cuda.is_available():
            return None

        try:
            bnb_config = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_quant_type="nf4",
                bnb_4bit_compute_dtype=torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16,
                bnb_4bit_use_double_quant=True,
            )
            print(" Using 4-bit quantization (QLoRA)")
            return bnb_config
        except Exception:
            print(" BitsAndBytes not available, using standard precision")
            return None

    def load_baseline_model(self):
        """Load the baseline Llama 3.1 model"""
        print("\n🏗️ Loading baseline model...")
        bnb_config = self.setup_quantization()

        torch_dtype = (
            torch.bfloat16
            if torch.cuda.is_available() and torch.cuda.is_bf16_supported()
            else torch.float16 if torch.cuda.is_available() else torch.float32
        )

        self.baseline_model = AutoModelForCausalLM.from_pretrained(
            self.model_id,
            device_map="auto" if torch.cuda.is_available() else None,
            torch_dtype=torch_dtype,
            quantization_config=bnb_config,
            low_cpu_mem_usage=True,
            token=self.hf_token,
        )
        self.baseline_model.eval()
        print(" Baseline model loaded successfully")
        return self.baseline_model

    def load_dapt_model(self):
        """Load the DAPT model with LoRA adapters"""
        print("\n🎯 Loading DAPT model...")
        if not os.path.exists(self.dapt_model_path):
            print(f"❌ DAPT model path not found: {self.dapt_model_path}")
            return None

        try:
            bnb_config = self.setup_quantization()
            torch_dtype = (
                torch.bfloat16
                if torch.cuda.is_available() and torch.cuda.is_bf16_supported()
                else torch.float16 if torch.cuda.is_available() else torch.float32
            )

            # Load base model
            dapt_base_model = AutoModelForCausalLM.from_pretrained(
                self.model_id,
                device_map="auto" if torch.cuda.is_available() else None,
                torch_dtype=torch_dtype,
                quantization_config=bnb_config,
                low_cpu_mem_usage=True,
                token=self.hf_token,
            )

            # Load LoRA adapters
            self.dapt_model = PeftModel.from_pretrained(dapt_base_model, self.dapt_model_path)
            self.dapt_model.eval()
            print(" DAPT model loaded successfully")
            return self.dapt_model

        except Exception as e:
            print(f"❌ Error loading DAPT model: {e}")
            return None

    def compute_perplexity(self, model, texts: List[str]) -> float:
        """
        Compute perplexity for a model on given texts.

        Args:
            model: The language model to evaluate
            texts: List of text strings

        Returns:
            Perplexity score
        """
        model.eval()
        total_loss = 0.0
        total_tokens = 0

        with torch.no_grad():
            for text in tqdm(texts, desc="Computing perplexity", unit="text"):
                # Tokenize
                encodings = self.tokenizer(
                    text,
                    return_tensors="pt",
                    truncation=True,
                    max_length=self.max_length,
                    padding=False,
                )

                input_ids = encodings.input_ids.to(self.device)

                if len(input_ids[0]) <= 1:
                    continue

                # Create labels (same as input_ids for causal LM)
                labels = input_ids.clone()

                # Forward pass
                outputs = model(input_ids=input_ids, labels=labels)
                loss = outputs.loss

                # Accumulate loss weighted by sequence length
                seq_len = len(input_ids[0])
                total_loss += loss.item() * seq_len
                total_tokens += seq_len

        if total_tokens == 0:
            return float("inf")

        # Compute average loss and perplexity
        avg_loss = total_loss / total_tokens
        perplexity = float(np.exp(avg_loss))

        return perplexity
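
    # (Added note, not in the original script.) The value computed above is
    #   PPL = exp( sum_i(loss_i * len_i) / sum_i(len_i) ),
    # i.e. the exponential of the length-weighted mean cross-entropy per token
    # over the evaluation texts; lower is better.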

    def evaluate_models(self):
        """Evaluate both baseline and DAPT models"""
        if self.eval_texts is None:
            raise ValueError("Evaluation texts not loaded. Call load_dataset() first.")

        results = {}

        # Evaluate baseline model
        if self.baseline_model is None:
            self.load_baseline_model()

        print("\n📈 Evaluating BASELINE model...")
        start_time = time.time()
        baseline_ppl = self.compute_perplexity(self.baseline_model, self.eval_texts)
        baseline_time = time.time() - start_time
        results["baseline"] = {
            "perplexity": baseline_ppl,
            "eval_time": baseline_time,
        }
        print(f" Perplexity: {baseline_ppl:.4f}")
        print(f" Evaluation time: {baseline_time:.2f} seconds")

        # Evaluate DAPT model
        if self.dapt_model is None:
            self.dapt_model = self.load_dapt_model()

        if self.dapt_model is not None:
            print("\n📈 Evaluating DAPT model...")
            start_time = time.time()
            dapt_ppl = self.compute_perplexity(self.dapt_model, self.eval_texts)
            dapt_time = time.time() - start_time
            results["dapt"] = {
                "perplexity": dapt_ppl,
                "eval_time": dapt_time,
            }
            print(f" Perplexity: {dapt_ppl:.4f}")
            print(f" Evaluation time: {dapt_time:.2f} seconds")
        else:
            print("\n⚠️ DAPT model not available for evaluation")
            results["dapt"] = None

        return results

    def print_results(self, results):
        """Print formatted evaluation results"""
        print("\n" + "=" * 70)
        print("🎯 EVALUATION RESULTS")
        print("=" * 70)

        print(f"Dataset: {self.dataset_path}")
        print(f"Samples evaluated: {len(self.eval_texts)}")
        print(f"Max sequence length: {self.max_length}")
        print()

        if "baseline" in results and results["baseline"]:
            baseline_ppl = results["baseline"]["perplexity"]
            baseline_time = results["baseline"]["eval_time"]
            print("BASELINE LLAMA 3.1:")
            print(f" Perplexity: {baseline_ppl:.4f}")
            print(f" Evaluation time: {baseline_time:.2f} seconds")
        if "dapt" in results and results["dapt"]:
            dapt_ppl = results["dapt"]["perplexity"]
            dapt_time = results["dapt"]["eval_time"]
            print("\nDAPT MODEL:")
            print(f" Perplexity: {dapt_ppl:.4f}")
            print(f" Evaluation time: {dapt_time:.2f} seconds")
            # Comparison
            if "baseline" in results and results["baseline"]:
                baseline_ppl = results["baseline"]["perplexity"]
                improvement = ((baseline_ppl - dapt_ppl) / baseline_ppl) * 100.0
                print("\nCOMPARISON:")
                print(f" Baseline PPL: {baseline_ppl:.4f}")
                print(f" DAPT PPL: {dapt_ppl:.4f}")
                print(f" Improvement: {improvement:.2f}%")
                if dapt_ppl < baseline_ppl:
                    print("✅ SUCCESS: DAPT model outperforms baseline!")
                    print(" The domain-adaptive pretraining improved performance on earnings call data.")
                else:
                    print("⚠️ NOTE: DAPT model does not outperform baseline")
                    print(" Consider adjusting training parameters or dataset.")
        else:
            print("\n❌ DAPT model evaluation failed or not available")

        print("\n" + "-" * 70)
        print("📝 INTERPRETATION")
        print("-" * 70)
        print("Perplexity measures how well the model predicts the next token in sequences.")
        print("Lower perplexity = better predictive performance on the domain.")
        print("Typical perplexity ranges: 10-100+ (lower is better)")
        print()
        print("Earnings call transcripts contain specialized financial language,")
        print("so domain adaptation should ideally reduce perplexity compared to baseline.")

    def run_evaluation(self):
        """Run the complete evaluation pipeline"""
        try:
            # Setup
            self.setup_tokenizer()
            self.load_dataset()
            self.load_baseline_model()
            self.load_dapt_model()

            # Evaluate
            results = self.evaluate_models()

            # Display results
            self.print_results(results)

            return results

        except Exception as e:
            print(f"❌ Evaluation failed: {e}")
            import traceback
            traceback.print_exc()
            return None


def main():
    """Main function with command line argument parsing"""
    parser = argparse.ArgumentParser(description="Evaluate DAPT model vs baseline perplexity")
    parser.add_argument("--model-id", default="meta-llama/Llama-3.1-8B", help="Base model ID")
    parser.add_argument(
        "--dapt-path",
        default="/u/v/d/vdhanuka/llama3_8b_dapt_transcripts_lora",
        help="Path to DAPT LoRA adapters",
    )
    parser.add_argument(
        "--dataset",
        default="/u/v/d/vdhanuka/defeatbeta-api-main/stock_earning_call_transcripts.parquet",
        help="Path to evaluation dataset",
    )
    parser.add_argument(
        "--sample-size",
        type=int,
        default=None,
        help="Number of samples to evaluate (mutually exclusive with --sample-percentage)",
    )
    parser.add_argument(
        "--sample-percentage",
        type=float,
        default=None,
        help="Percentage of dataset to evaluate (0.0-1.0, mutually exclusive with --sample-size)",
    )
    parser.add_argument("--max-length", type=int, default=1024, help="Maximum sequence length")
    parser.add_argument("--no-qlora", action="store_true", help="Disable QLoRA quantization")
    parser.add_argument("--device", default=None, help="Device to use (cuda/cpu)")

    args = parser.parse_args()

    # Validate mutually exclusive arguments
    if args.sample_size is not None and args.sample_percentage is not None:
        parser.error("--sample-size and --sample-percentage are mutually exclusive")
    if args.sample_size is None and args.sample_percentage is None:
        args.sample_size = 1000  # Default to 1000 samples

    # Create evaluator
    evaluator = DAPTEvaluator(
        model_id=args.model_id,
        dapt_model_path=args.dapt_path,
        dataset_path=args.dataset,
        sample_size=args.sample_size,
        sample_percentage=args.sample_percentage,
        max_length=args.max_length,
        use_qlora=not args.no_qlora,
        device=args.device,
    )

    # Run evaluation
    results = evaluator.run_evaluation()
    return results


if __name__ == "__main__":
    # Set random seed for reproducible sampling
    np.random.seed(42)
    torch.manual_seed(42)

    main()
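
# Example invocations (added for reference; the script filename below is an
# assumption, substitute whatever name this file is saved under):
#   python dapt_eval.py --sample-percentage 0.05
#   python dapt_eval.py --sample-size 200 --max-length 1024 --no-qlora --device cpu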