mts1b-llm — public API surface
LLM router + 35 personas + semantic cache + cost ledger + governor.
Top-level
from mts1b_llm import LLM
llm = LLM()
Plain completion
response = await llm.complete(
persona="equities_analyst",
prompt="Summarize today's NVDA earnings.",
context={"earnings_release": "...", "consensus": {...}},
max_tokens=500,
temperature=0.2,
)
# Response(text=str, model=str, cached=bool, cost_usd=float, latency_ms=int,
# input_tokens=int, output_tokens=int)
print(response.text)
print(f"cost ${response.cost_usd:.4f}, latency {response.latency_ms}ms, cached={response.cached}")
Structured output
from pydantic import BaseModel
class TradeIdea(BaseModel):
symbol: str
side: str
rationale: str
confidence: float
ideas: list[TradeIdea] = await llm.complete(
persona="quant_screener",
prompt="Suggest 3 long ideas from the Russell 1000 today.",
output_schema=TradeIdea,
n=3,
)
# Output is validated against the schema automatically; malformed JSON is auto-repaired
Persona-scoped
from mts1b_llm.personas import persona
# CRO veto
cro = persona("CRO")
veto = await cro.veto_order(order=..., context=..., timeout=5.0)
# VetoDecision(veto=bool, confidence=float, reasoning=str)
# Equities analyst summary
analyst = persona("equities_analyst")
summary = await analyst.summarize_earnings(release=..., consensus=...)
Persona registry
Personas are defined as YAML files in personas/*.yaml:
name: my_persona
description: What this persona does
default_model: claude-sonnet-4-5
fallback_models: [gpt-4-turbo, claude-opus-4-7]
temperature: 0.2
max_tokens: 1000
system_prompt: |
You are ...
tools:
- get_recent_drawdown
- get_positions
budget_usd_per_day: 5.0
scorers:
- latency_under_2s
- format_compliance
from mts1b_llm.personas import PERSONA_REGISTRY, reload_personas
# Discover
print(sorted(PERSONA_REGISTRY))
# Reload after editing YAML (no service restart needed)
await reload_personas()
Provider failover
# Configured in mts1b.config
providers:
- name: anthropic
api_key: ${ANTHROPIC_API_KEY}
priority: 1
- name: openai
api_key: ${OPENAI_API_KEY}
priority: 2
- name: google
api_key: ${GOOGLE_API_KEY}
priority: 3
- name: local
base_url: http://gpu1.local:8000
models: [llama-3-70b-instruct]
If the primary provider fails (rate limit, 5xx), the router falls back to the next provider in priority order.
Cost ledger
from mts1b_llm.ledger import CostLedger
ledger = CostLedger()
report = await ledger.summary(window="7d", group_by="persona")
print(report)
# Per-call query (requires: from datetime import date)
async for entry in ledger.entries(since=date.today()):
print(entry.persona, entry.model, entry.cost_usd, entry.cached)
CLI:
mts mts1b-llm cost --window 7d --group-by persona
mts mts1b-llm cache stats --window 7d
mts mts1b-llm providers status
Cache
from mts1b_llm.cache import semantic_cache
# Manual check
hit = await semantic_cache.lookup(persona="X", prompt="Y", threshold=0.95)
# Typical hit rate: 60-80% on stable workloads
Governor
from mts1b_llm.governor import BudgetGovernor
gov = BudgetGovernor()
status = await gov.check(persona="CRO")
# BudgetStatus(persona="CRO", daily_usd_spent=3.95, daily_usd_cap=5.0,
# pct_spent=0.79, action="continue")
# When pct_spent > 0.8: downshift to a cheaper model
# When pct_spent >= 1.0: refuse new calls and return a canned fallback
CLI:
mts mts1b-llm budget set --persona CRO --daily-usd 10
mts mts1b-llm budget show
Evals
mts mts1b-llm evals run --persona CRO --suite veto-cases
# 50 fixture cases; raises a regression alert if the pass rate is below 90%