# %%
from huggingface_hub import InferenceClient

client = InferenceClient("meta-llama/Llama-3.1-8B", token="hf_xxx")  # replace with your HF token
resp = client.text_generation(
    "Write a haiku about GPUs",
    max_new_tokens=128,
    temperature=0.7,
)
print(resp)

# %%
# Minimize on-disk writes (avoid "No space left on device")
import os, tempfile, datasets, transformers

# Use a small temp dir for caches or disable dataset cache writes
TMP_DIR = tempfile.mkdtemp(prefix="hf_tmp_")
os.environ["HF_HOME"] = TMP_DIR
os.environ["HF_DATASETS_CACHE"] = os.path.join(TMP_DIR, "datasets_cache")
os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1"

os.environ["HF_TOKEN"] = "hf_xxx"  # replace with your HF token
os.environ["HUGGINGFACE_HUB_TOKEN"] = os.environ["HF_TOKEN"]
# Keep map results in memory to avoid materializing to disk
datasets.disable_caching()
print({
    "HF_HOME": os.environ.get("HF_HOME"),
    "HF_DATASETS_CACHE": os.environ.get("HF_DATASETS_CACHE"),
    "caching_disabled": True,
    "PYTORCH_CUDA_ALLOC_CONF": os.environ.get("PYTORCH_CUDA_ALLOC_CONF"),
})

# %%
# If needed, install dependencies. Uncomment the next cell to run once.
# %pip -q install -U transformers datasets accelerate peft
# For CUDA QLoRA only (Linux/NVIDIA):
# %pip -q install bitsandbytes

import os
import platform
import torch

# Detect environment
USE_CUDA = torch.cuda.is_available()
USE_MPS = (not USE_CUDA) and torch.backends.mps.is_available()
BF16_OK = USE_CUDA and torch.cuda.is_bf16_supported()
USE_QLORA = USE_CUDA  # QLoRA requires CUDA + bitsandbytes; set False on macOS/CPU

# Disable QLoRA automatically if bitsandbytes is not installed
try:
    import importlib.metadata as _ilmd
    _ = _ilmd.version("bitsandbytes")
except Exception:
    if USE_QLORA:
        print("bitsandbytes not found; disabling QLoRA (falling back to standard LoRA)")
    USE_QLORA = False

DEVICE = (
    torch.device("cuda") if USE_CUDA else (torch.device("mps") if USE_MPS else torch.device("cpu"))
)

print({
    "cuda": USE_CUDA,
    "mps": USE_MPS,
    "bf16_ok": BF16_OK,
    "use_qlora": USE_QLORA,
    "device": str(DEVICE),
    "python": platform.python_version(),
})

# %%
from datasets import load_dataset
from typing import Optional
import getpass

# Paths and config
# Get current username dynamically
current_user = getpass.getuser()
PARQUET_PATH = f"/u/v/d/{current_user}/defeatbeta-api-main/stock_earning_call_transcripts.parquet"
TEXT_COLUMN: Optional[str] = None  # override to force a column, else auto

raw_ds = load_dataset("parquet", data_files={"train": PARQUET_PATH})["train"]
SAMPLE_FRACTION = 0.2  # use 20% random subset for faster DAPT
SAMPLE_SEED = int(os.environ.get("SAMPLE_SEED", "42"))
if SAMPLE_FRACTION < 1.0:
    split = raw_ds.train_test_split(test_size=1.0 - SAMPLE_FRACTION, seed=SAMPLE_SEED, shuffle=True)
    raw_ds = split["train"]
    try:
        print(f"Randomly sampled {int(SAMPLE_FRACTION*100)}% subset; size: {len(raw_ds)}")
    except Exception:
        pass
print("Columns:", raw_ds.column_names)
print(raw_ds[0])

# If schema has nested `transcripts` (array of structs with speaker/content),
# flatten into a single text field for DAPT.
if "transcripts" in raw_ds.column_names:
    def flatten_segments(example):
        segments = example.get("transcripts") or []
        lines = []
        for seg in segments:
            if not seg:
                continue
            speaker = seg.get("speaker")
            content = seg.get("content")
            if content is None:
                continue
            if speaker and len(str(speaker)) > 0:
                lines.append(f"{speaker}: {content}")
            else:
                lines.append(str(content))
        example["__flattened_text"] = "\n".join(lines)
        return example

    raw_ds = raw_ds.map(flatten_segments)
    # Prefer flattened text unless user overrides
    if TEXT_COLUMN is None:
        TEXT_COLUMN = "__flattened_text"

# Auto-detect a reasonable text column if still unknown
if TEXT_COLUMN is None:
    preferred = ["__flattened_text","text","transcript","content","body","cleaned_text","utterance","raw_text"]
    for p in preferred:
        exact = [c for c in raw_ds.column_names if c.lower() == p]
        if len(exact) > 0:
            TEXT_COLUMN = exact[0]
            break

if TEXT_COLUMN is None:
    # fallback to first string-like column
    for name, feature in raw_ds.features.items():
        if getattr(feature, "dtype", "") in ("string", "large_string"):
            TEXT_COLUMN = name
            break

if TEXT_COLUMN is None:
    TEXT_COLUMN = raw_ds.column_names[0]

print("Using text column:", TEXT_COLUMN)

# Filter empty
ds = raw_ds.filter(lambda x: x.get(TEXT_COLUMN) is not None and len(str(x[TEXT_COLUMN])) > 0)
print(ds)
print("Example text:", str(ds[0][TEXT_COLUMN])[:400])

# %%
from transformers import AutoTokenizer

MODEL_ID = "meta-llama/Llama-3.1-8B"
BLOCK_SIZE = 512  # lowered to reduce activation memory on 10–12 GB GPUs

# Load tokenizer
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
# Avoid long-sequence warnings during tokenization; packing enforces BLOCK_SIZE later
try:
    tokenizer.model_max_length = 1_000_000_000
except Exception:
    pass

def tokenize_examples(batch):
    return tokenizer(batch[TEXT_COLUMN], add_special_tokens=False, truncation=False)

print("Tokenizing dataset (this may take a while)...")
tok_ds = ds.map(tokenize_examples, batched=True, remove_columns=[c for c in ds.column_names if c != TEXT_COLUMN])

# Pack tokens into fixed blocks
def group_texts(examples):
    concatenated = []
    for ids in examples["input_ids"]:
        concatenated.extend(ids + [tokenizer.eos_token_id])
    total_length = (len(concatenated) // BLOCK_SIZE) * BLOCK_SIZE
    if total_length == 0:
        return {"input_ids": [], "labels": []}
    input_ids = [concatenated[i:i+BLOCK_SIZE] for i in range(0, total_length, BLOCK_SIZE)]
    return {"input_ids": input_ids, "labels": [x.copy() for x in input_ids]}

lm_ds = tok_ds.map(group_texts, batched=True, remove_columns=tok_ds.column_names)
print(lm_ds)

# %%
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# Use the current username (already defined above)
OUTPUT_DIR = f"/u/v/d/{current_user}/llama3_8b_dapt_transcripts_lora"
LEARNING_RATE = 2e-4
EPOCHS = 1
PER_DEVICE_BATCH = 1
GRAD_ACCUM = 32

bnb_config = None
if USE_QLORA:
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16 if BF16_OK else torch.float16,
        bnb_4bit_use_double_quant=True,
    )

# Prefer FlashAttention-2 on CUDA if available; else fall back to SDPA
attn_impl = "sdpa"
if USE_CUDA:
    try:
        import flash_attn  # noqa: F401
        attn_impl = "flash_attention_2"
    except Exception:
        pass

# Constrain loading to GPU memory to avoid CPU/disk offload with 4-bit
load_kwargs = {}
if USE_CUDA:
    try:
        total_bytes = torch.cuda.get_device_properties(0).total_memory
        total_gib = max(1, int(total_bytes / (1024 ** 3)))
        reserve_gib = 1
        max_gib = max(1, total_gib - reserve_gib)
        load_kwargs["device_map"] = "auto"
        load_kwargs["max_memory"] = {0: f"{max_gib}GiB"}
    except Exception:
        load_kwargs["device_map"] = "auto"

print("Loading base model...")
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    dtype=torch.bfloat16 if BF16_OK else (torch.float16 if USE_CUDA else torch.float32),
    quantization_config=bnb_config if USE_QLORA else None,
    attn_implementation=attn_impl,
    low_cpu_mem_usage=True,
    **load_kwargs,
)

if USE_QLORA:
    model = prepare_model_for_kbit_training(model)

lora_cfg = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"],
)
model = get_peft_model(model, lora_cfg)

# Reduce training memory footprint
model.config.use_cache = False
try:
    model.enable_input_require_grads()
except Exception:
    pass
try:
    model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={"use_reentrant": False})
except Exception:
    model.gradient_checkpointing_enable()

print(model)
try:
    print("Device map:", getattr(model, "hf_device_map", None))
except Exception:
    pass

# %%
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling

collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False, pad_to_multiple_of=8)

args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=PER_DEVICE_BATCH,
    gradient_accumulation_steps=GRAD_ACCUM,
    learning_rate=LEARNING_RATE,
    logging_steps=10,
    save_steps=200,
    save_total_limit=2,
    save_strategy="steps",
    bf16=BF16_OK,
    fp16=(USE_CUDA and not BF16_OK),
    tf32=True,
    gradient_checkpointing=True,
    remove_unused_columns=False,
    dataloader_num_workers=2,
    optim="paged_adamw_8bit" if USE_QLORA else "adamw_torch",
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,
    weight_decay=0.0,
    save_safetensors=True,
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=lm_ds,
    data_collator=collator,
)

# Free any stale allocations before training
import gc, torch; gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

from pathlib import Path

def _latest_checkpoint_dir(base_dir: str):
    try:
        dirs = [p for p in Path(base_dir).glob("checkpoint-*") if p.is_dir()]
        if not dirs:
            return None
        def step_num(p: Path):
            try:
                return int(p.name.split("-")[-1])
            except Exception:
                return -1
        latest = max(dirs, key=step_num)
        return str(latest)
    except Exception:
        return None

latest_ckpt = _latest_checkpoint_dir(OUTPUT_DIR)
print("Resume checkpoint:", latest_ckpt or "<none; starting fresh>")
trainer.train(resume_from_checkpoint=latest_ckpt if latest_ckpt else None)

# %%
# Save adapter + tokenizer, then run a quick inference via HF Inference API
from peft import PeftModel

# Save
trainer.model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print(f"Saved PEFT adapter and tokenizer to {OUTPUT_DIR}")

# Hosted inference via Hugging Face Inference API (no GPU weights needed here)
print("Running inference via Hugging Face Inference API...")
from huggingface_hub import InferenceClient

hf_token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_HUB_TOKEN")
client = InferenceClient("meta-llama/Llama-3.1-8B", token=hf_token)

resp = client.text_generation(
    "Write a haiku about GPUs",
    max_new_tokens=128,
    temperature=0.7,
)
print(resp)