TradingAgents/Finllama/DAPT_Llama31_Transcripts.py

"""
DAPT (Domain-Adaptive Pretraining) for Llama 3.1 on Earnings Call Transcripts
This script mirrors the notebook logic from `DAPT_Llama31_Transcripts.ipynb`.
It performs continued pretraining (causal LM objective) of Llama 3.1 using a
local Parquet file containing earnings call transcripts.
What you'll get:
- Environment-adaptive setup (CUDA, MPS, CPU) with automatic LoRA/QLoRA selection
- Robust dataset loading from Parquet and text-column auto-detection
- Efficient token packing into fixed-length sequences
- PEFT LoRA (and QLoRA on CUDA) training pipeline with Transformers Trainer
- Save adapters and quick inference sanity check
Notes:
- Accept the Llama 3.1 license on Hugging Face and authenticate before training.
- On macOS (MPS), QLoRA is disabled (no bitsandbytes). We use standard LoRA with float16/float32.
- For best performance, use a CUDA GPU and enable QLoRA.
"""
# Install required libraries (run manually if needed):
# pip install -U transformers datasets accelerate peft sentencepiece protobuf
# For CUDA QLoRA only (Linux/NVIDIA):
# pip install bitsandbytes
# Minimize on-disk writes (avoid "No space left on device")
import os
import tempfile
import datasets
import transformers
# Use a small temp dir for caches or disable dataset cache writes
TMP_DIR = tempfile.mkdtemp(prefix="hf_tmp_")
os.environ["HF_HOME"] = TMP_DIR
os.environ["HF_DATASETS_CACHE"] = os.path.join(TMP_DIR, "datasets_cache")
os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1"
os.environ["HF_TOKEN"] = "hf_token"
os.environ["HUGGINGFACE_HUB_TOKEN"] = os.environ["HF_TOKEN"]
# Keep map results in memory to avoid materializing to disk
datasets.disable_caching()
print({
    "HF_HOME": os.environ.get("HF_HOME"),
    "HF_DATASETS_CACHE": os.environ.get("HF_DATASETS_CACHE"),
    "caching_disabled": True,
})
import platform
import torch
# Detect environment
USE_CUDA = torch.cuda.is_available()
USE_MPS = (not USE_CUDA) and torch.backends.mps.is_available()
BF16_OK = USE_CUDA and torch.cuda.is_bf16_supported()
USE_QLORA = USE_CUDA # QLoRA requires CUDA + bitsandbytes; set False on macOS/CPU
# Disable QLoRA automatically if bitsandbytes is not installed
try:
    import importlib.metadata as _ilmd
    _ = _ilmd.version("bitsandbytes")
except Exception:
    if USE_QLORA:
        print("bitsandbytes not found; disabling QLoRA (falling back to standard LoRA)")
    USE_QLORA = False
DEVICE = (
    torch.device("cuda") if USE_CUDA else (torch.device("mps") if USE_MPS else torch.device("cpu"))
)
print({
    "cuda": USE_CUDA,
    "mps": USE_MPS,
    "bf16_ok": BF16_OK,
    "use_qlora": USE_QLORA,
    "device": str(DEVICE),
    "python": platform.python_version(),
})
from datasets import load_dataset
from typing import Optional
import pandas as pd
# Paths and config
# Update this to the actual Parquet path on your system
PARQUET_PATH = "stock_earning_call_transcripts.parquet"
TEXT_COLUMN: Optional[str] = None # override to force a column, else auto
raw_ds = load_dataset("parquet", data_files={"train": PARQUET_PATH})["train"]
print("Columns:", raw_ds.column_names)
print(raw_ds[0])
# If schema has nested `transcripts` (array of structs with speaker/content),
# flatten into a single text field for DAPT.
if "transcripts" in raw_ds.column_names:
def flatten_segments(example):
segments = example.get("transcripts") or []
lines = []
for seg in segments:
if not seg:
continue
speaker = seg.get("speaker")
content = seg.get("content")
if content is None:
continue
if speaker and len(str(speaker)) > 0:
lines.append(f"{speaker}: {content}")
else:
lines.append(str(content))
example["__flattened_text"] = "\n".join(lines)
return example
raw_ds = raw_ds.map(flatten_segments)
# Prefer flattened text unless user overrides
if TEXT_COLUMN is None:
TEXT_COLUMN = "__flattened_text"
# Auto-detect a reasonable text column if still unknown
if TEXT_COLUMN is None:
    preferred = [
        "__flattened_text",
        "text",
        "transcript",
        "content",
        "body",
        "cleaned_text",
        "utterance",
        "raw_text",
    ]
    for p in preferred:
        exact = [c for c in raw_ds.column_names if c.lower() == p]
        if len(exact) > 0:
            TEXT_COLUMN = exact[0]
            break
    if TEXT_COLUMN is None:
        # Fall back to the first string-like column
        for name, feature in raw_ds.features.items():
            if getattr(feature, "dtype", "") in ("string", "large_string"):
                TEXT_COLUMN = name
                break
    if TEXT_COLUMN is None:
        TEXT_COLUMN = raw_ds.column_names[0]
print("Using text column:", TEXT_COLUMN)
# Filter empty
ds = raw_ds.filter(lambda x: x.get(TEXT_COLUMN) is not None and len(str(x[TEXT_COLUMN])) > 0)
print(ds)
print("Example text:", str(ds[0][TEXT_COLUMN])[:400])
from transformers import AutoTokenizer
MODEL_ID = "meta-llama/Llama-3.1-8B"
BLOCK_SIZE = 1024  # use 512-1024 for QLoRA on 10-12 GB GPUs
# Load tokenizer
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
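# (Llama tokenizers ship without a dedicated pad token; reusing EOS for padding is the common
# workaround and is harmless here because packed blocks all end up exactly BLOCK_SIZE tokens long.)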
# Avoid long-sequence warnings during tokenization; packing enforces BLOCK_SIZE later
try:
    tokenizer.model_max_length = 1_000_000_000
except Exception:
    pass
def tokenize_examples(batch):
    return tokenizer(batch[TEXT_COLUMN], add_special_tokens=False, truncation=False)
print("Tokenizing dataset (this may take a while)...")
tok_ds = ds.map(
    tokenize_examples,
    batched=True,
    remove_columns=[c for c in ds.column_names if c != TEXT_COLUMN],
)
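# At this point each row holds a variable-length list of token ids (plus attention_mask) for one
# transcript; the packing step below concatenates them and slices fixed BLOCK_SIZE windows.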
# Pack tokens into fixed blocks
def group_texts(examples):
    concatenated = []
    for ids in examples["input_ids"]:
        concatenated.extend(ids + [tokenizer.eos_token_id])
    total_length = (len(concatenated) // BLOCK_SIZE) * BLOCK_SIZE
    if total_length == 0:
        return {"input_ids": [], "labels": []}
    input_ids = [
        concatenated[i : i + BLOCK_SIZE] for i in range(0, total_length, BLOCK_SIZE)
    ]
    return {"input_ids": input_ids, "labels": [x.copy() for x in input_ids]}
lm_ds = tok_ds.map(group_texts, batched=True, remove_columns=tok_ds.column_names)
print(lm_ds)
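# Optional, illustrative sanity check on the packed dataset: every example should be exactly
# BLOCK_SIZE tokens, with labels equal to input_ids (the causal-LM loss shifts them internally).
if len(lm_ds) > 0:
    _first = lm_ds[0]
    assert len(_first["input_ids"]) == BLOCK_SIZE, "unexpected block length"
    assert _first["labels"] == _first["input_ids"], "labels should mirror input_ids"
    print(f"Packed {len(lm_ds)} blocks of {BLOCK_SIZE} tokens")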
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
OUTPUT_DIR = "llama31_dapt_transcripts_lora"
LEARNING_RATE = 2e-4
EPOCHS = 1
PER_DEVICE_BATCH = 1
GRAD_ACCUM = 32
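# Effective batch size per optimizer step = PER_DEVICE_BATCH * GRAD_ACCUM = 1 * 32 = 32 packed
# blocks per device, i.e. roughly 32 * BLOCK_SIZE = 32,768 tokens per update at BLOCK_SIZE=1024.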
bnb_config = None
if USE_QLORA:
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16 if BF16_OK else torch.float16,
        bnb_4bit_use_double_quant=True,
    )
print("Loading base model...")
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",
    torch_dtype=(
        torch.bfloat16 if BF16_OK else (torch.float16 if USE_CUDA else torch.float32)
    ),
    quantization_config=bnb_config if USE_QLORA else None,
)
if USE_QLORA:
    model = prepare_model_for_kbit_training(model)
lora_cfg = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
)
model = get_peft_model(model, lora_cfg)
print(model)
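# Optional: PEFT's summary of how many parameters the LoRA adapters actually train
# (typically well under 1% of the base model).
model.print_trainable_parameters()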
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling
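# mlm=False gives a plain causal-LM collator: it pads batches and copies input_ids into labels,
# with padding positions set to -100 so they are ignored by the loss.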
collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=PER_DEVICE_BATCH,
    gradient_accumulation_steps=GRAD_ACCUM,
    learning_rate=LEARNING_RATE,
    logging_steps=10,
    save_steps=200,
    save_total_limit=2,
    save_strategy="steps",
    bf16=BF16_OK,
    fp16=(USE_CUDA and not BF16_OK),
    optim="paged_adamw_8bit" if USE_QLORA else "adamw_torch",
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,
    weight_decay=0.0,
    report_to="none",
)
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=lm_ds,
    data_collator=collator,
)
# Resume from the latest checkpoint in OUTPUT_DIR if one exists; otherwise start fresh
# (passing resume_from_checkpoint=True unconditionally raises an error on a first run).
from transformers.trainer_utils import get_last_checkpoint
last_ckpt = get_last_checkpoint(OUTPUT_DIR) if os.path.isdir(OUTPUT_DIR) else None
trainer.train(resume_from_checkpoint=last_ckpt)
# Save adapter + tokenizer, and run a tiny inference sanity check
# Save
trainer.model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print(f"Saved PEFT adapter and tokenizer to {OUTPUT_DIR}")
# Hosted inference via the Hugging Face Inference API.
# NOTE: this queries the hosted base Instruct model, not the LoRA adapter trained above;
# it is only a quick connectivity/sanity check.
print("Running inference via Hugging Face Inference API...")
from huggingface_hub import InferenceClient
hf_token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_HUB_TOKEN")
client = InferenceClient("meta-llama/Llama-3.1-8B-Instruct", token=hf_token)
resp = client.text_generation(
    "Write a haiku about GPUs",
    max_new_tokens=128,
    temperature=0.7,
)
print(resp)