# %%
# Quick check that the Hugging Face Inference API is reachable with your token.
from huggingface_hub import InferenceClient

client = InferenceClient("meta-llama/Llama-3.1-8B", token="hf_xxx")  # replace with your HF token
resp = client.text_generation(
    "Write a haiku about GPUs",
    max_new_tokens=128,
    temperature=0.7,
)
print(resp)

# %%
# Minimize on-disk writes (avoid "No space left on device")
import os, tempfile

# Use a small temp dir for caches or disable dataset cache writes.
# Set the cache env vars BEFORE importing datasets/transformers, since both
# libraries resolve their cache paths at import time.
TMP_DIR = tempfile.mkdtemp(prefix="hf_tmp_")
os.environ["HF_HOME"] = TMP_DIR
os.environ["HF_DATASETS_CACHE"] = os.path.join(TMP_DIR, "datasets_cache")
os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1"

os.environ["HF_TOKEN"] = "hf_xxx"  # replace with your HF token
os.environ["HUGGINGFACE_HUB_TOKEN"] = os.environ["HF_TOKEN"]

import datasets
import transformers

# Keep map results in memory to avoid materializing to disk
datasets.disable_caching()
print({
    "HF_HOME": os.environ.get("HF_HOME"),
    "HF_DATASETS_CACHE": os.environ.get("HF_DATASETS_CACHE"),
    "caching_disabled": True,
    "PYTORCH_CUDA_ALLOC_CONF": os.environ.get("PYTORCH_CUDA_ALLOC_CONF"),
})

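# %%
# The diagnostics above print PYTORCH_CUDA_ALLOC_CONF, but nothing in this notebook sets it.
# As an optional, assumed tweak for small GPUs, the CUDA caching allocator can use expandable
# segments to reduce fragmentation; uncomment if you hit fragmentation-related OOMs.
# os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")
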
# %%
# If needed, install dependencies. Uncomment the lines below and run once.
# %pip -q install -U transformers datasets accelerate peft
# For CUDA QLoRA only (Linux/NVIDIA):
# %pip -q install bitsandbytes

import os
import platform
import torch

# Detect environment
USE_CUDA = torch.cuda.is_available()
USE_MPS = (not USE_CUDA) and torch.backends.mps.is_available()
BF16_OK = USE_CUDA and torch.cuda.is_bf16_supported()
USE_QLORA = USE_CUDA  # QLoRA requires CUDA + bitsandbytes; set False on macOS/CPU

# Disable QLoRA automatically if bitsandbytes is not installed
try:
    import importlib.metadata as _ilmd
    _ = _ilmd.version("bitsandbytes")
except Exception:
    if USE_QLORA:
        print("bitsandbytes not found; disabling QLoRA (falling back to standard LoRA)")
    USE_QLORA = False

DEVICE = (
    torch.device("cuda") if USE_CUDA else (torch.device("mps") if USE_MPS else torch.device("cpu"))
)

print({
    "cuda": USE_CUDA,
    "mps": USE_MPS,
    "bf16_ok": BF16_OK,
    "use_qlora": USE_QLORA,
    "device": str(DEVICE),
    "python": platform.python_version(),
})

# %%
from datasets import load_dataset
from typing import Optional
import getpass

# Paths and config
# Get current username dynamically
current_user = getpass.getuser()
PARQUET_PATH = f"/u/v/d/{current_user}/defeatbeta-api-main/stock_earning_call_transcripts.parquet"
TEXT_COLUMN: Optional[str] = None  # override to force a column, else auto-detect

raw_ds = load_dataset("parquet", data_files={"train": PARQUET_PATH})["train"]
SAMPLE_FRACTION = 0.2  # use a 20% random subset for faster DAPT
SAMPLE_SEED = int(os.environ.get("SAMPLE_SEED", "42"))
if SAMPLE_FRACTION < 1.0:
    split = raw_ds.train_test_split(test_size=1.0 - SAMPLE_FRACTION, seed=SAMPLE_SEED, shuffle=True)
    raw_ds = split["train"]
    try:
        print(f"Randomly sampled {int(SAMPLE_FRACTION*100)}% subset; size: {len(raw_ds)}")
    except Exception:
        pass
print("Columns:", raw_ds.column_names)
print(raw_ds[0])

# If the schema has a nested `transcripts` column (array of structs with speaker/content),
# flatten it into a single text field for DAPT.
if "transcripts" in raw_ds.column_names:
    def flatten_segments(example):
        segments = example.get("transcripts") or []
        lines = []
        for seg in segments:
            if not seg:
                continue
            speaker = seg.get("speaker")
            content = seg.get("content")
            if content is None:
                continue
            if speaker and len(str(speaker)) > 0:
                lines.append(f"{speaker}: {content}")
            else:
                lines.append(str(content))
        example["__flattened_text"] = "\n".join(lines)
        return example

    raw_ds = raw_ds.map(flatten_segments)
    # Prefer flattened text unless the user overrides
    if TEXT_COLUMN is None:
        TEXT_COLUMN = "__flattened_text"

# Auto-detect a reasonable text column if still unknown
if TEXT_COLUMN is None:
    preferred = ["__flattened_text", "text", "transcript", "content", "body", "cleaned_text", "utterance", "raw_text"]
    for p in preferred:
        exact = [c for c in raw_ds.column_names if c.lower() == p]
        if len(exact) > 0:
            TEXT_COLUMN = exact[0]
            break

if TEXT_COLUMN is None:
    # Fall back to the first string-like column
    for name, feature in raw_ds.features.items():
        if getattr(feature, "dtype", "") in ("string", "large_string"):
            TEXT_COLUMN = name
            break

if TEXT_COLUMN is None:
    TEXT_COLUMN = raw_ds.column_names[0]

print("Using text column:", TEXT_COLUMN)

# Filter out empty rows
ds = raw_ds.filter(lambda x: x.get(TEXT_COLUMN) is not None and len(str(x[TEXT_COLUMN])) > 0)
print(ds)
print("Example text:", str(ds[0][TEXT_COLUMN])[:400])

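# %%
# Optional sanity check (not required for training): rough character-length statistics for
# the selected text column, computed on up to 1,000 rows. Helps judge whether the 20%
# subset above still provides enough DAPT text.
_probe = ds.select(range(min(1000, len(ds))))[TEXT_COLUMN]
_lengths = sorted(len(str(t)) for t in _probe)
print({
    "probed_rows": len(_lengths),
    "median_chars": _lengths[len(_lengths) // 2] if _lengths else 0,
    "max_chars": _lengths[-1] if _lengths else 0,
})
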
# %%
from transformers import AutoTokenizer

MODEL_ID = "meta-llama/Llama-3.1-8B"
BLOCK_SIZE = 512  # lowered to reduce activation memory on 10–12 GB GPUs

# Load tokenizer
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
# Avoid long-sequence warnings during tokenization; packing enforces BLOCK_SIZE later
try:
    tokenizer.model_max_length = 1_000_000_000
except Exception:
    pass

def tokenize_examples(batch):
    return tokenizer(batch[TEXT_COLUMN], add_special_tokens=False, truncation=False)

print("Tokenizing dataset (this may take a while)...")
tok_ds = ds.map(tokenize_examples, batched=True, remove_columns=[c for c in ds.column_names if c != TEXT_COLUMN])

# Pack tokens into fixed blocks
def group_texts(examples):
    concatenated = []
    for ids in examples["input_ids"]:
        concatenated.extend(ids + [tokenizer.eos_token_id])
    total_length = (len(concatenated) // BLOCK_SIZE) * BLOCK_SIZE
    if total_length == 0:
        return {"input_ids": [], "labels": []}
    input_ids = [concatenated[i:i+BLOCK_SIZE] for i in range(0, total_length, BLOCK_SIZE)]
    return {"input_ids": input_ids, "labels": [x.copy() for x in input_ids]}

lm_ds = tok_ds.map(group_texts, batched=True, remove_columns=tok_ds.column_names)
print(lm_ds)

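# %%
# Optional sanity check on the packing step above: every example should be exactly
# BLOCK_SIZE tokens, with labels mirroring input_ids (standard causal-LM packing).
if len(lm_ds) > 0:
    _sample = lm_ds.select(range(min(32, len(lm_ds))))
    _lens = {len(ids) for ids in _sample["input_ids"]}
    assert _lens == {BLOCK_SIZE}, f"unexpected block lengths: {_lens}"
    assert _sample[0]["labels"] == _sample[0]["input_ids"]
    print(f"{len(lm_ds)} packed blocks of {BLOCK_SIZE} tokens "
          f"(~{len(lm_ds) * BLOCK_SIZE / 1e6:.1f}M training tokens)")
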
# %%
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# Use the current username (already defined above)
OUTPUT_DIR = f"/u/v/d/{current_user}/llama3_8b_dapt_transcripts_lora"
LEARNING_RATE = 2e-4
EPOCHS = 1
PER_DEVICE_BATCH = 1
GRAD_ACCUM = 32

bnb_config = None
if USE_QLORA:
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16 if BF16_OK else torch.float16,
        bnb_4bit_use_double_quant=True,
    )

# Prefer FlashAttention-2 on CUDA if available; else fall back to SDPA
attn_impl = "sdpa"
if USE_CUDA:
    try:
        import flash_attn  # noqa: F401
        attn_impl = "flash_attention_2"
    except Exception:
        pass

# Constrain loading to GPU memory to avoid CPU/disk offload with 4-bit weights
load_kwargs = {}
if USE_CUDA:
    try:
        total_bytes = torch.cuda.get_device_properties(0).total_memory
        total_gib = max(1, int(total_bytes / (1024 ** 3)))
        reserve_gib = 1
        max_gib = max(1, total_gib - reserve_gib)
        load_kwargs["device_map"] = "auto"
        load_kwargs["max_memory"] = {0: f"{max_gib}GiB"}
    except Exception:
        load_kwargs["device_map"] = "auto"

print("Loading base model...")
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16 if BF16_OK else (torch.float16 if USE_CUDA else torch.float32),
    quantization_config=bnb_config if USE_QLORA else None,
    attn_implementation=attn_impl,
    low_cpu_mem_usage=True,
    **load_kwargs,
)

if USE_QLORA:
    model = prepare_model_for_kbit_training(model)

lora_cfg = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
)
model = get_peft_model(model, lora_cfg)

# Reduce training memory footprint
model.config.use_cache = False
try:
    model.enable_input_require_grads()
except Exception:
    pass
try:
    model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={"use_reentrant": False})
except Exception:
    model.gradient_checkpointing_enable()

print(model)
try:
    print("Device map:", getattr(model, "hf_device_map", None))
except Exception:
    pass

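# %%
# Quick look at the LoRA footprint: print_trainable_parameters() is provided by PEFT and
# reports how many parameters the adapters add relative to the frozen base model.
model.print_trainable_parameters()
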
# %%
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling

collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False, pad_to_multiple_of=8)

args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=PER_DEVICE_BATCH,
    gradient_accumulation_steps=GRAD_ACCUM,
    learning_rate=LEARNING_RATE,
    logging_steps=10,
    save_steps=200,
    save_total_limit=2,
    save_strategy="steps",
    bf16=BF16_OK,
    fp16=(USE_CUDA and not BF16_OK),
    tf32=True if BF16_OK else None,  # TF32 needs an Ampere+ GPU; leave unset elsewhere
    gradient_checkpointing=True,
    remove_unused_columns=False,
    dataloader_num_workers=2,
    optim="paged_adamw_8bit" if USE_QLORA else "adamw_torch",
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,
    weight_decay=0.0,
    save_safetensors=True,
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=lm_ds,
    data_collator=collator,
)

# Free any stale allocations before training
import gc
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

from pathlib import Path

def _latest_checkpoint_dir(base_dir: str):
    try:
        dirs = [p for p in Path(base_dir).glob("checkpoint-*") if p.is_dir()]
        if not dirs:
            return None
        def step_num(p: Path):
            try:
                return int(p.name.split("-")[-1])
            except Exception:
                return -1
        latest = max(dirs, key=step_num)
        return str(latest)
    except Exception:
        return None

latest_ckpt = _latest_checkpoint_dir(OUTPUT_DIR)
print("Resume checkpoint:", latest_ckpt or "<none; starting fresh>")
trainer.train(resume_from_checkpoint=latest_ckpt if latest_ckpt else None)

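# %%
# Optional: inspect the most recent logged training metrics from the Trainer's state
# (log_history holds one dict per logging/eval/save event).
for _rec in trainer.state.log_history[-5:]:
    if "loss" in _rec:
        print({k: _rec[k] for k in ("step", "loss", "learning_rate") if k in _rec})
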
# %%
# Save adapter + tokenizer, then run a quick inference via the HF Inference API
from peft import PeftModel

# Save
trainer.model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print(f"Saved PEFT adapter and tokenizer to {OUTPUT_DIR}")

# Hosted inference via the Hugging Face Inference API (no local GPU weights needed here).
# Note: this queries the base model on the Hub, not the freshly trained adapter.
print("Running inference via Hugging Face Inference API...")
from huggingface_hub import InferenceClient

hf_token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_HUB_TOKEN")
client = InferenceClient("meta-llama/Llama-3.1-8B", token=hf_token)

resp = client.text_generation(
    "Write a haiku about GPUs",
    max_new_tokens=128,
    temperature=0.7,
)
print(resp)
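# %%
# A minimal sketch of local inference with the freshly saved adapter (not part of the run
# above). It assumes enough memory to reload the base model in half precision; for a QLoRA
# setup you would pass the same BitsAndBytesConfig instead of torch_dtype. Uncomment to try.
# from transformers import AutoModelForCausalLM, AutoTokenizer
# from peft import PeftModel
#
# base = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype=torch.float16, device_map="auto")
# tuned = PeftModel.from_pretrained(base, OUTPUT_DIR)
# tok = AutoTokenizer.from_pretrained(OUTPUT_DIR)
# prompt = "Summarize the key risks discussed on this earnings call:"
# inputs = tok(prompt, return_tensors="pt").to(tuned.device)
# out = tuned.generate(**inputs, max_new_tokens=64)
# print(tok.decode(out[0], skip_special_tokens=True))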