feat: wire real LLM runner via third-one or OpenAI-compatible API

2026-04-28 15:48:37 +09:00
parent 9b0bc172ef
commit dced20a9af
8 changed files with 486 additions and 5 deletions
--- a/internal/workflows/llm_runner.go
+++ b/internal/workflows/llm_runner.go
@@ -0,0 +1,133 @@
+package workflows
+
+import (
+	"context"
+	"encoding/json"
+	"errors"
+	"fmt"
+	"strings"
+
+	"tutor/internal/llm"
+)
+
+// LLMRunner runs the workflow steps defined in this file by sending prompts
+// through an llm.Client and decoding the JSON replies.
+type LLMRunner struct {
+	// client is the chat client every method calls ChatJSON on.
+	client *llm.Client
+}
+
+// NewLLMRunner returns an LLMRunner that issues its requests through client.
+func NewLLMRunner(client *llm.Client) *LLMRunner {
+	runner := new(LLMRunner)
+	runner.client = client
+	return runner
+}
+
+// DiagnoseJobSeeker asks the model for an initial readiness assessment of
+// input and decodes the reply into a DiagnosticResult.
+//
+// A failed chat call is wrapped with the "diagnose_job_seeker" prefix and an
+// undecodable reply with "diagnose_job_seeker parse"; in both cases the zero
+// DiagnosticResult is returned.
+func (r *LLMRunner) DiagnoseJobSeeker(ctx context.Context, input DiagnosticInput) (DiagnosticResult, error) {
+	var (
+		system = diagnoseSystemPrompt()
+		user   = diagnoseUserPrompt(input)
+	)
+	reply, chatErr := r.client.ChatJSON(ctx, system, user, true)
+	if chatErr != nil {
+		return DiagnosticResult{}, fmt.Errorf("diagnose_job_seeker: %w", chatErr)
+	}
+
+	diagnosis := DiagnosticResult{}
+	parseErr := extractJSON(reply, &diagnosis)
+	if parseErr != nil {
+		return DiagnosticResult{}, fmt.Errorf("diagnose_job_seeker parse: %w", parseErr)
+	}
+	return diagnosis, nil
+}
+
+func (r *LLMRunner) GradeInterviewAnswer(ctx context.Context, input GradeAnswerInput) (GradedAnswer, error) {
+	raw, err := r.client.ChatJSON(ctx, gradeAnswerSystemPrompt(), gradeAnswerUserPrompt(input), true)
+	if err != nil {
+		return GradedAnswer{}, fmt.Errorf("grade_interview_answer: %w", err)
+	}
+
+	var result GradedAnswer
+	if err := extractJSON(raw, &result); err != nil {
+		return GradedAnswer{}, fmt.Errorf("grade_interview_answer parse: %w", err)
+	}
+
+	result.UserID = input.UserID
+	result.AnswerID = input.AnswerID
+	result.QuestionID = input.QuestionID
+	return result, nil
+}
+
+func (r *LLMRunner) ExtractLearningMemory(ctx context.Context, grade GradedAnswer) (MemoryUpdateCandidate, error) {
+	raw, err := r.client.ChatJSON(ctx, extractMemorySystemPrompt(), extractMemoryUserPrompt(grade), true)
+	if err != nil {
+		return MemoryUpdateCandidate{}, fmt.Errorf("extract_learning_memory: %w", err)
+	}
+
+	candidate := MemoryUpdateCandidate{
+		UserID:         grade.UserID,
+		SourceAnswerID: grade.AnswerID,
+	}
+	if err := extractJSON(raw, &candidate); err != nil {
+		return MemoryUpdateCandidate{}, fmt.Errorf("extract_learning_memory parse: %w", err)
+	}
+	return candidate, nil
+}
+
+// SelectNextChallenge asks the model to pick the learner's next challenge and
+// decodes the reply into a NextChallenge.
+//
+// The user prompt carries the JSON encoding of the whole input in the
+// "Learner mastery" slot and a small {"track", "user_id"} object in the
+// "Learner profile" slot, so the model actually sees the learner state it is
+// asked to reason about.
+// NOTE(review): if NextChallengeInput has dedicated mastery/profile fields,
+// pass those to nextChallengeUserPrompt instead of the whole input.
+//
+// UserID and Track on the result are copied from input after decoding.
+// Encoding failures are wrapped with "select_next_challenge encode", chat
+// failures with "select_next_challenge" and decode failures with
+// "select_next_challenge parse"; all return the zero NextChallenge.
+func (r *LLMRunner) SelectNextChallenge(ctx context.Context, input NextChallengeInput) (NextChallenge, error) {
+	mastery, err := json.Marshal(input)
+	if err != nil {
+		return NextChallenge{}, fmt.Errorf("select_next_challenge encode: %w", err)
+	}
+	profile, err := json.Marshal(map[string]any{
+		"user_id": input.UserID,
+		"track":   input.Track,
+	})
+	if err != nil {
+		return NextChallenge{}, fmt.Errorf("select_next_challenge encode: %w", err)
+	}
+
+	raw, err := r.client.ChatJSON(ctx, nextChallengeSystemPrompt(), nextChallengeUserPrompt(string(mastery), string(profile)), true)
+	if err != nil {
+		return NextChallenge{}, fmt.Errorf("select_next_challenge: %w", err)
+	}
+
+	var next NextChallenge
+	if err := extractJSON(raw, &next); err != nil {
+		return NextChallenge{}, fmt.Errorf("select_next_challenge parse: %w", err)
+	}
+	// The identifiers come from the request, never from the model reply.
+	next.UserID = input.UserID
+	next.Track = input.Track
+	return next, nil
+}
+
+func (r *LLMRunner) UpdateReadinessMap(ctx context.Context, input ReadinessUpdateInput) (ReadinessUpdate, error) {
+	raw, err := r.client.ChatJSON(ctx, readinessUpdateSystemPrompt(), readinessUpdateUserPrompt(input), true)
+	if err != nil {
+		return ReadinessUpdate{}, fmt.Errorf("update_readiness_map: %w", err)
+	}
+
+	var update ReadinessUpdate
+	if err := extractJSON(raw, &update); err != nil {
+		return ReadinessUpdate{}, fmt.Errorf("update_readiness_map parse: %w", err)
+	}
+	update.UserID = input.UserID
+	update.Track = input.Track
+	return update, nil
+}
+
+// extractJSON decodes a model reply into target.
+//
+// Surrounding whitespace is trimmed first. If the reply contains a Markdown
+// code fence, the content of the first fenced block is decoded instead of the
+// whole reply, so both "```json ... ```" and "Here is the result:" followed by
+// a fenced block are accepted. Valid bare JSON is unaffected: a JSON document
+// can never contain a line that begins with a fence, so stripCodeFences
+// returns it unchanged.
+//
+// On failure the json error is wrapped and followed by the first 200 bytes of
+// the text that was decoded, to aid debugging.
+func extractJSON(raw string, target any) error {
+	clean := strings.TrimSpace(raw)
+	if strings.Contains(clean, "```") {
+		clean = stripCodeFences(clean)
+	}
+	if err := json.Unmarshal([]byte(clean), target); err != nil {
+		return fmt.Errorf("%w: %s", err, firstBytes(clean, 200))
+	}
+	return nil
+}
+
+// errCodeFence is a sentinel error with the message "code fence".
+// NOTE(review): nothing in the visible part of this file references it.
+// Confirm it is used elsewhere in the package; if it is removed, the "errors"
+// import must be removed with it.
+var errCodeFence = errors.New("code fence")
+
+// stripCodeFences returns the lines of input that lie between the first line
+// starting with "```" (after trimming whitespace) and the next such line.
+//
+// With no fence line the input is returned unchanged. With only an opening
+// fence everything after it is returned. Lines are joined back with "\n".
+func stripCodeFences(input string) string {
+	lines := strings.Split(input, "\n")
+	first, last := 0, len(lines)
+	opened := false
+	for idx := range lines {
+		if !strings.HasPrefix(strings.TrimSpace(lines[idx]), "```") {
+			continue
+		}
+		if opened {
+			// Closing fence: stop before this line.
+			last = idx
+			break
+		}
+		// Opening fence: content begins on the following line.
+		opened = true
+		first = idx + 1
+	}
+	return strings.Join(lines[first:last], "\n")
+}
+
+// firstBytes returns input unchanged when it is at most limit bytes long;
+// otherwise it returns a prefix of at most limit bytes followed by "...".
+//
+// The cut is moved back (by up to three bytes) so that it never lands in the
+// middle of a multi-byte UTF-8 sequence, which would put an invalid rune into
+// the error message this helper feeds. A negative limit is treated as 0.
+func firstBytes(input string, limit int) string {
+	if limit < 0 {
+		limit = 0
+	}
+	if len(input) <= limit {
+		return input
+	}
+
+	cut := limit
+	// 10xxxxxx marks a UTF-8 continuation byte; a rune has at most three of
+	// them, so bound the back-off in case input is not valid UTF-8.
+	for back := 0; back < 3 && cut > 0 && input[cut]&0xC0 == 0x80; back++ {
+		cut--
+	}
+	return input[:cut] + "..."
+}
--- a/internal/workflows/prompts.go
+++ b/internal/workflows/prompts.go
@@ -0,0 +1,180 @@
+package workflows
+
+import (
+	"encoding/json"
+	"fmt"
+)
+
+// gradeAnswerSystemPrompt returns the fixed system prompt for the grading
+// step: the JSON schema the model must emit followed by the scoring rules.
+//
+// The text is returned as a plain literal rather than through fmt.Sprintf, so
+// a '%' added to the prompt later is sent to the model verbatim.
+func gradeAnswerSystemPrompt() string {
+	return `You are an expert technical interviewer grading a candidate's answer. Output valid JSON matching this schema:
+
+{
+  "user_id": "string",
+  "answer_id": "string",
+  "question_id": "string",
+  "concepts": [{"id": "string", "label": "string", "track": "string"}],
+  "scores": {
+    "correctness": 0,
+    "depth": 0,
+    "communication": 0,
+    "production_judgment": 0
+  },
+  "overall": "miss|partial|solid|strong",
+  "strengths": ["string"],
+  "gaps": ["string"],
+  "evidence": [{"kind": "answer|grading|source|session|asset", "id": "string", "quote": "string", "confidence": 0.0}],
+  "misconception_candidates": [{"label": "string", "description": "string", "evidence": [], "confidence": 0.0}],
+  "follow_up": {"needed": true, "question": "string", "purpose": "clarify|repair|stretch|pressure_test"}
+}
+
+Scoring rules:
+- scores: 1-4 integer scale (1=inadequate, 2=surface, 3=solid, 4=strong).
+- correctness: factual accuracy
+- depth: covers tradeoffs, edge cases, production context
+- communication: clarity, structure, conciseness
+- production_judgment: practical experience signals in the answer
+- overall: "miss" if mostly wrong, "partial" if some correct parts, "solid" if mostly correct with depth, "strong" if comprehensive and production-ready.
+- evidence: always include at least one EvidenceRef with kind "grading", quote from the answer, and confidence 0.5-1.0.
+- follow_up.needed: true unless the answer is "strong" and complete. Set purpose to "clarify" for unclear answers, "repair" for misconceptions, "stretch" to test depth, "pressure_test" for strong answers.
+- misconception_candidates: list any detected wrong mental models.
+
+Respond with ONLY the JSON object, no markdown fences.`
+}
+
+// gradeAnswerUserPrompt builds the user message for the grading step: a fixed
+// instruction followed by the JSON encoding of input.
+//
+// The json.Marshal error is discarded; if encoding fails the instruction is
+// returned with an empty payload.
+func gradeAnswerUserPrompt(input GradeAnswerInput) string {
+	encoded, _ := json.Marshal(input) // error dropped: this helper can only return a string
+	return fmt.Sprintf("Grade this interview answer: %s", encoded)
+}
+
+// extractMemorySystemPrompt returns the fixed system prompt for the memory
+// extraction step: the JSON schema for the updates followed by the rules that
+// map a grading onto memory updates.
+//
+// The text is returned as a plain literal rather than through fmt.Sprintf, so
+// a '%' added to the prompt later is sent to the model verbatim.
+func extractMemorySystemPrompt() string {
+	return `You are a learner memory extraction agent. From a graded interview answer, produce memory updates. Output valid JSON matching this schema:
+
+{
+  "updates": [
+    {
+      "kind": "concept_mastery|misconception|intervention|review_schedule",
+      "concept": {"id": "string", "label": "string", "track": "string"},
+      "proposed_state": "unknown|fragile|improving|interview_ready|strong_signal",
+      "summary": "string",
+      "evidence": [{"kind": "grading", "id": "string", "quote": "string", "confidence": 0.0}],
+      "confidence": 0.0,
+      "durability": "tentative|confirmed"
+    }
+  ]
+}
+
+Rules:
+- For every concept in the grading, create a concept_mastery update with proposed_state derived from overall grade: "miss"→fragile, "partial"→improving, "solid"→interview_ready, "strong"→strong_signal.
+- If follow_up.needed is true and overall is "miss" or "partial", add a misconception update (kind="misconception") for each concept with proposed_state "fragile".
+- If follow_up.needed is true, add an intervention update (kind="intervention") for each concept with the follow_up question as summary.
+- If the answer shows gaps, add a review_schedule update (kind="review_schedule") for each concept with a review reason.
+- Confidence: 0.5-0.7 for tentative, 0.8-1.0 for confirmed. Durability: "confirmed" only for "strong" overall.
+
+Respond with ONLY the JSON object, no markdown fences.`
+}
+
+// extractMemoryUserPrompt builds the user message for the memory extraction
+// step: a fixed instruction followed by the JSON encoding of grade.
+//
+// The json.Marshal error is discarded; if encoding fails the instruction is
+// returned with an empty payload.
+func extractMemoryUserPrompt(grade GradedAnswer) string {
+	encoded, _ := json.Marshal(grade) // error dropped: this helper can only return a string
+	return fmt.Sprintf("Extract memory updates from this graded answer: %s", encoded)
+}
+
+// nextChallengeSystemPrompt returns the fixed system prompt for the challenge
+// selection step: the JSON schema of the chosen challenge followed by the
+// selection rules.
+//
+// The text is returned as a plain literal rather than through fmt.Sprintf, so
+// a '%' added to the prompt later is sent to the model verbatim.
+func nextChallengeSystemPrompt() string {
+	return `You are a challenge selection agent. Given learner memory state, select the next challenge. Output valid JSON matching this schema:
+
+{
+  "concept": {"id": "string", "label": "string", "track": "string"},
+  "ladder_level": "define|tradeoffs|debug|design_constraints|interview_pressure",
+  "question": "string",
+  "rationale": "string",
+  "difficulty_action": "lower|hold|raise|recover",
+  "evidence": [{"kind": "grading", "id": "string", "quote": "string", "confidence": 0.0}]
+}
+
+Rules:
+- Pick the concept with the weakest readiness state.
+- ladder_level: fragile→define, improving→tradeoffs, interview_ready→design_constraints, strong_signal→interview_pressure.
+- difficulty_action: fragile→recover, improving→hold, interview_ready+→raise.
+- Generate one concrete interview question for the selected concept at the appropriate ladder level.
+- rationale: explain why this concept and level was chosen.
+- evidence: reference the concept's existing evidence.
+
+Respond with ONLY the JSON object, no markdown fences.`
+}
+
+func nextChallengeUserPrompt(masteryJSON, profileJSON string) string {
+	return fmt.Sprintf(`Learner mastery: %s
+
+Learner profile: %s
+
+Select the next challenge for this learner.`, masteryJSON, profileJSON)
+}
+
+// diagnoseSystemPrompt returns the fixed system prompt for the initial
+// diagnostic step: the JSON schema of the assessment followed by the rules
+// for filling it in.
+//
+// The text is returned as a plain literal rather than through fmt.Sprintf, so
+// a '%' added to the prompt later is sent to the model verbatim.
+func diagnoseSystemPrompt() string {
+	return `You are a diagnostic interview agent. Given a job seeker's profile, produce an initial readiness assessment. Output valid JSON matching this schema:
+
+{
+  "user_id": "string",
+  "track": "string",
+  "target_role": "string",
+  "stack": ["string"],
+  "initial_readiness": "unknown|fragile|improving|interview_ready|strong_signal",
+  "concept_findings": [
+    {
+      "concept": {"id": "string", "label": "string", "track": "string"},
+      "readiness": "unknown|fragile|improving|interview_ready|strong_signal",
+      "reason": "string",
+      "evidence": []
+    }
+  ],
+  "recommended_next_concepts": [{"id": "string", "label": "string", "track": "string"}]
+}
+
+Rules:
+- initial_readiness: default to "unknown" unless you have strong signals from the profile.
+- For each concept, estimate readiness based on the stack and target role. Default to "unknown" if no strong signal.
+- recommended_next_concepts: pick up to 3 concepts to start with.
+- evidence: always empty for initial diagnostic (no answers yet).
+
+Respond with ONLY the JSON object, no markdown fences.`
+}
+
+// diagnoseUserPrompt builds the user message for the diagnostic step: a fixed
+// instruction followed by the JSON encoding of input.
+//
+// The json.Marshal error is discarded; if encoding fails the instruction is
+// returned with an empty payload.
+func diagnoseUserPrompt(input DiagnosticInput) string {
+	encoded, _ := json.Marshal(input) // error dropped: this helper can only return a string
+	return fmt.Sprintf("Assess initial readiness for this job seeker: %s", encoded)
+}
+
+// readinessUpdateSystemPrompt returns the fixed system prompt for the
+// readiness update step: the JSON schema for concept updates and unlocks
+// followed by the rules that govern them.
+//
+// The text is returned as a plain literal rather than through fmt.Sprintf, so
+// a '%' added to the prompt later is sent to the model verbatim.
+func readinessUpdateSystemPrompt() string {
+	return `You are a readiness update agent. Given learner memory state, produce readiness deltas and unlocks. Output valid JSON matching this schema:
+
+{
+  "concept_updates": [
+    {
+      "concept": {"id": "string", "label": "string", "track": "string"},
+      "previous": "unknown|fragile|improving|interview_ready|strong_signal",
+      "next": "unknown|fragile|improving|interview_ready|strong_signal",
+      "reason": "string",
+      "evidence": [{"kind": "grading", "id": "string", "quote": "string", "confidence": 0.0}]
+    }
+  ],
+  "unlocks": [
+    {
+      "kind": "boss_question|review_card|portfolio_entry",
+      "label": "string",
+      "reason": "string"
+    }
+  ]
+}
+
+Rules:
+- For each concept, determine if the readiness state should change based on evidence quality and quantity.
+- Unlock boss_question when 3+ concepts are at interview_ready or strong_signal.
+- Unlock review_card when concepts have misconceptions that need revisiting.
+- Unlock portfolio_entry when a concept reaches strong_signal.
+
+Respond with ONLY the JSON object, no markdown fences.`
+}
+
+// readinessUpdateUserPrompt builds the user message for the readiness update
+// step: a fixed instruction followed by the JSON encoding of input.
+//
+// The json.Marshal error is discarded; if encoding fails the instruction is
+// returned with an empty payload.
+func readinessUpdateUserPrompt(input ReadinessUpdateInput) string {
+	encoded, _ := json.Marshal(input) // error dropped: this helper can only return a string
+	return fmt.Sprintf("Analyze readiness updates for: %s", encoded)
+}