TradingAgents/Finllama/DAPT_Llama31_Transcripts.py

"""
DAPT (Domain-Adaptive Pretraining) for Llama 3.1 on Earnings Call Transcripts
This script mirrors the notebook logic from `DAPT_Llama31_Transcripts.ipynb`.
It performs continued pretraining (causal LM objective) of Llama 3.1 using a
local Parquet file containing earnings call transcripts.
What you'll get:
- Environment-adaptive setup (CUDA, MPS, CPU) with automatic LoRA/QLoRA selection
- Robust dataset loading from Parquet and text-column auto-detection
- Efficient token packing into fixed-length sequences
- PEFT LoRA (and QLoRA on CUDA) training pipeline with Transformers Trainer
- Save adapters and quick inference sanity check
Notes:
- Accept the Llama 3.1 license on Hugging Face and authenticate before training.
- On macOS (MPS), QLoRA is disabled (no bitsandbytes). We use standard LoRA with float16/float32.
- For best performance, use a CUDA GPU and enable QLoRA.
"""
# Install required libraries (run manually if needed):
# pip install -U transformers datasets accelerate peft sentencepiece protobuf
# For CUDA QLoRA only (Linux/NVIDIA):
# pip install bitsandbytes
# Minimize on-disk writes (avoid "No space left on device")
import os
import tempfile
import datasets
import transformers
# Use a small temp dir for caches or disable dataset cache writes
TMP_DIR = tempfile.mkdtemp(prefix="hf_tmp_")
os.environ["HF_HOME"] = TMP_DIR
os.environ["HF_DATASETS_CACHE"] = os.path.join(TMP_DIR, "datasets_cache")
os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1"
os.environ["HF_TOKEN"] = "hf_token"
os.environ["HUGGINGFACE_HUB_TOKEN"] = os.environ["HF_TOKEN"]
# Keep map results in memory to avoid materializing to disk
datasets.disable_caching()
print({
    "HF_HOME": os.environ.get("HF_HOME"),
    "HF_DATASETS_CACHE": os.environ.get("HF_DATASETS_CACHE"),
    "caching_disabled": True,
})
import platform
import torch
# Detect environment
USE_CUDA = torch.cuda.is_available()
USE_MPS = (not USE_CUDA) and torch.backends.mps.is_available()
BF16_OK = USE_CUDA and torch.cuda.is_bf16_supported()
USE_QLORA = USE_CUDA # QLoRA requires CUDA + bitsandbytes; set False on macOS/CPU
# Disable QLoRA automatically if bitsandbytes is not installed
try:
    import importlib.metadata as _ilmd
    _ = _ilmd.version("bitsandbytes")
except Exception:
    if USE_QLORA:
        print("bitsandbytes not found; disabling QLoRA (falling back to standard LoRA)")
    USE_QLORA = False
DEVICE = (
    torch.device("cuda") if USE_CUDA else (torch.device("mps") if USE_MPS else torch.device("cpu"))
)
print({
    "cuda": USE_CUDA,
    "mps": USE_MPS,
    "bf16_ok": BF16_OK,
    "use_qlora": USE_QLORA,
    "device": str(DEVICE),
    "python": platform.python_version(),
})
from datasets import load_dataset
from typing import Optional
import pandas as pd
# Paths and config
# Update this to the actual Parquet path on your system
PARQUET_PATH = "stock_earning_call_transcripts.parquet"
TEXT_COLUMN: Optional[str] = None # override to force a column, else auto
raw_ds = load_dataset("parquet", data_files={"train": PARQUET_PATH})["train"]
print("Columns:", raw_ds.column_names)
print(raw_ds[0])
# If schema has nested `transcripts` (array of structs with speaker/content),
# flatten into a single text field for DAPT.
if "transcripts" in raw_ds.column_names:
def flatten_segments(example):
segments = example.get("transcripts") or []
lines = []
for seg in segments:
if not seg:
continue
speaker = seg.get("speaker")
content = seg.get("content")
if content is None:
continue
if speaker and len(str(speaker)) > 0:
lines.append(f"{speaker}: {content}")
else:
lines.append(str(content))
example["__flattened_text"] = "\n".join(lines)
return example
raw_ds = raw_ds.map(flatten_segments)
# Prefer flattened text unless user overrides
if TEXT_COLUMN is None:
TEXT_COLUMN = "__flattened_text"
# Auto-detect a reasonable text column if still unknown
if TEXT_COLUMN is None:
    preferred = [
        "__flattened_text",
        "text",
        "transcript",
        "content",
        "body",
        "cleaned_text",
        "utterance",
        "raw_text",
    ]
    for p in preferred:
        exact = [c for c in raw_ds.column_names if c.lower() == p]
        if len(exact) > 0:
            TEXT_COLUMN = exact[0]
            break
    if TEXT_COLUMN is None:
        # Fall back to the first string-like column
        for name, feature in raw_ds.features.items():
            if getattr(feature, "dtype", "") in ("string", "large_string"):
                TEXT_COLUMN = name
                break
    if TEXT_COLUMN is None:
        TEXT_COLUMN = raw_ds.column_names[0]
print("Using text column:", TEXT_COLUMN)
# Filter empty
ds = raw_ds.filter(lambda x: x.get(TEXT_COLUMN) is not None and len(str(x[TEXT_COLUMN])) > 0)
print(ds)
print("Example text:", str(ds[0][TEXT_COLUMN])[:400])
from transformers import AutoTokenizer
MODEL_ID = "meta-llama/Llama-3.1-8B"
BLOCK_SIZE = 1024  # use 512-1024 for QLoRA on 10-12 GB GPUs
# Load tokenizer
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
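# (Llama tokenizers ship without a dedicated pad token; reusing EOS for padding is the common
# workaround and is harmless here because packed blocks all end up exactly BLOCK_SIZE tokens long.)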
# Avoid long-sequence warnings during tokenization; packing enforces BLOCK_SIZE later
try:
    tokenizer.model_max_length = 1_000_000_000
except Exception:
    pass
def tokenize_examples(batch):
    return tokenizer(batch[TEXT_COLUMN], add_special_tokens=False, truncation=False)
print("Tokenizing dataset (this may take a while)...")
tok_ds = ds.map(
    tokenize_examples,
    batched=True,
    remove_columns=[c for c in ds.column_names if c != TEXT_COLUMN],
)
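# At this point each row holds a variable-length list of token ids (plus attention_mask) for one
# transcript; the packing step below concatenates them and slices fixed BLOCK_SIZE windows.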
# Pack tokens into fixed blocks
def group_texts(examples):
    concatenated = []
    for ids in examples["input_ids"]:
        concatenated.extend(ids + [tokenizer.eos_token_id])
    total_length = (len(concatenated) // BLOCK_SIZE) * BLOCK_SIZE
    if total_length == 0:
        return {"input_ids": [], "labels": []}
    input_ids = [
        concatenated[i : i + BLOCK_SIZE] for i in range(0, total_length, BLOCK_SIZE)
    ]
    return {"input_ids": input_ids, "labels": [x.copy() for x in input_ids]}
lm_ds = tok_ds.map(group_texts, batched=True, remove_columns=tok_ds.column_names)
print(lm_ds)
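# Optional, illustrative sanity check on the packed dataset: every example should be exactly
# BLOCK_SIZE tokens, with labels equal to input_ids (the causal-LM loss shifts them internally).
if len(lm_ds) > 0:
    _first = lm_ds[0]
    assert len(_first["input_ids"]) == BLOCK_SIZE, "unexpected block length"
    assert _first["labels"] == _first["input_ids"], "labels should mirror input_ids"
    print(f"Packed {len(lm_ds)} blocks of {BLOCK_SIZE} tokens")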
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
OUTPUT_DIR = "llama31_dapt_transcripts_lora"
LEARNING_RATE = 2e-4
EPOCHS = 1
PER_DEVICE_BATCH = 1
GRAD_ACCUM = 32
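# Effective batch size per optimizer step = PER_DEVICE_BATCH * GRAD_ACCUM = 1 * 32 = 32 packed
# blocks per device, i.e. roughly 32 * BLOCK_SIZE = 32,768 tokens per update at BLOCK_SIZE=1024.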
bnb_config = None
if USE_QLORA:
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16 if BF16_OK else torch.float16,
        bnb_4bit_use_double_quant=True,
    )
print("Loading base model...")
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",
    torch_dtype=(
        torch.bfloat16 if BF16_OK else (torch.float16 if USE_CUDA else torch.float32)
    ),
    quantization_config=bnb_config if USE_QLORA else None,
)
if USE_QLORA:
    model = prepare_model_for_kbit_training(model)
lora_cfg = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
)
model = get_peft_model(model, lora_cfg)
print(model)
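# Optional: PEFT's summary of how many parameters the LoRA adapters actually train
# (typically well under 1% of the base model).
model.print_trainable_parameters()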
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling
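# mlm=False gives a plain causal-LM collator: it pads batches and copies input_ids into labels,
# with padding positions set to -100 so they are ignored by the loss.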
collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=PER_DEVICE_BATCH,
    gradient_accumulation_steps=GRAD_ACCUM,
    learning_rate=LEARNING_RATE,
    logging_steps=10,
    save_steps=200,
    save_total_limit=2,
    save_strategy="steps",
    bf16=BF16_OK,
    fp16=(USE_CUDA and not BF16_OK),
    optim="paged_adamw_8bit" if USE_QLORA else "adamw_torch",
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,
    weight_decay=0.0,
    report_to="none",
)
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=lm_ds,
    data_collator=collator,
)
# Resume from the latest checkpoint in OUTPUT_DIR if one exists; otherwise start fresh
# (passing resume_from_checkpoint=True unconditionally raises an error on a first run).
from transformers.trainer_utils import get_last_checkpoint
last_ckpt = get_last_checkpoint(OUTPUT_DIR) if os.path.isdir(OUTPUT_DIR) else None
trainer.train(resume_from_checkpoint=last_ckpt)
# Save adapter + tokenizer, and run a tiny inference sanity check
# Save
trainer.model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print(f"Saved PEFT adapter and tokenizer to {OUTPUT_DIR}")
# Hosted inference via the Hugging Face Inference API.
# NOTE: this queries the hosted base Instruct model, not the LoRA adapter trained above;
# it is only a quick connectivity/sanity check.
print("Running inference via Hugging Face Inference API...")
from huggingface_hub import InferenceClient
hf_token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_HUB_TOKEN")
client = InferenceClient("meta-llama/Llama-3.1-8B-Instruct", token=hf_token)
resp = client.text_generation(
    "Write a haiku about GPUs",
    max_new_tokens=128,
    temperature=0.7,
)
print(resp)