prospector/tooling/eval/format_lora.py

#!/usr/bin/env python3
"""Format the gated, prospect-first labels into a classifier SFT dataset for LoRA.

Input:  <DATA_DIR>/sweep_labels.json — {context, is_prospect, move, trace} rows from
        the identity-gated cold-prospect re-sweep (the clean corpus).
Output: <DATA_DIR>/lora_train.jsonl + lora_eval.jsonl — chat-format SFT examples
        (system + user context -> assistant JSON {is_prospect, move, trace}), split
        90/10 stratified by move so rare classes appear in both.

The classifier learns the prospect-first CoT: commit is_prospect, then the move,
then a one-sentence trace — exactly the runtime contract. Deterministic split (no
RNG): every Nth row per move goes to eval.
"""
import json, os
from collections import defaultdict

# Working directory for the input labels and the generated splits. The DATA_DIR
# environment variable overrides the default ".data" folder beside this script.
DATA = os.environ.get("DATA_DIR", os.path.join(os.path.dirname(__file__), ".data"))

# Read explicitly as UTF-8 rather than the platform-default encoding, and use a
# context manager so the handle is closed as soon as parsing finishes.
with open(os.path.join(DATA, "sweep_labels.json"), encoding="utf-8") as f:
    rows = json.load(f)

# System prompt used as the first message of every SFT example built below.
SYSTEM = ("Classify the last message to Quinn (a touring companion) for her prospecting engine. "
          "FIRST decide is_prospect — someone saying \"hi\" is usually a prospect, but a friend, "
          "vendor, existing client, or bot is NOT (is_prospect is false iff the move is "
          "existing_client/personal/vendor/spam). Then the move, then a one-sentence trace. "
          "Output ONLY JSON: {\"is_prospect\": <bool>, \"move\": \"<class>\", \"trace\": \"<one sentence>\"}.")

def example(r):
    """Build one chat-format SFT example from a labelled row.

    Args:
        r: Mapping with "context", "is_prospect" and "move" keys, plus an
            optional "trace".

    Returns:
        A dict ``{"messages": [system, user, assistant]}``. The assistant
        content is the JSON-encoded target whose keys are emitted in
        is_prospect, move, trace order.

    Raises:
        KeyError: If "context", "is_prospect" or "move" is missing from ``r``.
    """
    target = {
        "is_prospect": bool(r["is_prospect"]),
        "move": r["move"],
        # `or ""` covers both a missing key and an explicit null, so the target
        # never carries `"trace": null` in place of a string.
        "trace": r.get("trace") or "",
    }
    return {"messages": [
        {"role": "system", "content": SYSTEM},
        {"role": "user", "content": r["context"]},
        # ensure_ascii=False keeps non-ASCII text readable instead of \u-escaped.
        {"role": "assistant", "content": json.dumps(target, ensure_ascii=False)},
    ]}

# Bucket the usable rows by their move label, preserving input order within
# each bucket. Rows whose "context" or "move" is missing or empty are skipped.
by_move = defaultdict(list)
for row in rows:
    if not row.get("context") or not row.get("move"):
        continue
    by_move[row["move"]].append(row)

# Deterministic stratified split (no RNG): within each move, the rows at
# positions 0, 10, 20, ... are held out for eval and the rest go to train.
train, evalset = [], []
for items in by_move.values():
    # A move with a single row cannot appear in both splits. Keep it in train so
    # the class is still learned, rather than existing only in the eval set.
    hold_out = len(items) > 1
    for i, r in enumerate(items):
        (evalset if hold_out and i % 10 == 0 else train).append(example(r))

# Write one JSON object per line. The encoding is pinned to UTF-8 because
# ensure_ascii=False emits raw non-ASCII characters; with the platform-default
# encoding the write can raise UnicodeEncodeError or produce a non-UTF-8 file.
for filename, examples in (("lora_train.jsonl", train), ("lora_eval.jsonl", evalset)):
    with open(os.path.join(DATA, filename), "w", encoding="utf-8") as f:
        f.writelines(json.dumps(e, ensure_ascii=False) + "\n" for e in examples)

# One-line summary: split sizes, number of distinct moves, and the train path.
summary = f"train={len(train)} eval={len(evalset)} across {len(by_move)} moves -> {DATA}/lora_train.jsonl"
print(summary)

# Row counts per move (both splits combined), largest class first. The sort is
# stable, so moves of equal size keep their first-seen order.
largest_first = sorted(by_move.items(), key=lambda kv: len(kv[1]), reverse=True)
print("per-move:", {name: len(members) for name, members in largest_first})