feat: add file upload for materials (PDF/DOCX) with ingestion pipeline

This commit is contained in:
root
2026-04-29 15:52:35 +09:00
parent 518370b93e
commit 7f503326f9
51 changed files with 4712 additions and 27 deletions

View File

@@ -50,6 +50,7 @@ func (h Handler) Routes() http.Handler {
mux.HandleFunc("GET /api/v1/learners/{userID}/readiness-map", h.getReadinessMap)
mux.HandleFunc("GET /api/v1/learners/{userID}/next-challenge", h.getNextChallenge)
mux.HandleFunc("POST /api/v1/materials", h.ingestMaterial)
mux.HandleFunc("POST /api/v1/materials/upload", h.uploadMaterial)
mux.HandleFunc("GET /api/v1/ontology", h.getOntology)
mux.HandleFunc("POST /api/v1/teaching-assets/prompts", h.generateTeachingAssetPrompt)
mux.HandleFunc("GET /api/v1/teaching-assets", h.getTeachingAssets)

View File

@@ -0,0 +1,67 @@
package httpapi
import (
	"errors"
	"io"
	"net/http"
	"strings"

	"tutor/internal/ingestion"
	"tutor/internal/ontology"
)
// uploadMaterial handles a multipart upload of a study material file.
//
// The file is read from the "file" form field, parsed by the ingestion package
// based on its filename, and passed to the ontology service. An optional
// "title" form field overrides the title returned by the parser.
//
// Responses:
//   - 201 with the ingest result on success
//   - 400 for a malformed form, a missing file field, an unsupported format,
//     or an ingest error
//   - 404 when no ontology service is configured
//   - 413 when the request body exceeds the upload size limit
//   - 500 when the file cannot be read or parsed
func (h Handler) uploadMaterial(w http.ResponseWriter, r *http.Request) {
	// maxUploadBytes caps the whole request body. It is also passed to
	// ParseMultipartForm as the in-memory threshold.
	const maxUploadBytes = 32 << 20

	if h.ontology == nil {
		writeError(w, http.StatusNotFound, "ontology not configured")
		return
	}

	// Without a hard cap, ParseMultipartForm only limits what is kept in
	// memory, and io.ReadAll below would read an arbitrarily large part.
	r.Body = http.MaxBytesReader(w, r.Body, maxUploadBytes)
	if err := r.ParseMultipartForm(maxUploadBytes); err != nil {
		var tooLarge *http.MaxBytesError
		if errors.As(err, &tooLarge) {
			writeError(w, http.StatusRequestEntityTooLarge, "uploaded file too large")
			return
		}
		writeError(w, http.StatusBadRequest, "invalid multipart form")
		return
	}

	file, header, err := r.FormFile("file")
	if err != nil {
		writeError(w, http.StatusBadRequest, "file field required")
		return
	}
	defer file.Close()

	if !ingestion.IsSupported(header.Filename) {
		writeError(w, http.StatusBadRequest, "unsupported file format; supported: .md, .markdown, .pdf, .docx")
		return
	}

	data, err := io.ReadAll(file)
	if err != nil {
		writeError(w, http.StatusInternalServerError, "failed to read file")
		return
	}

	result, err := ingestion.ParseFromBytes(header.Filename, data)
	if err != nil {
		// NOTE(review): classifying by error text is brittle; prefer a
		// sentinel or typed error if the ingestion package exposes one.
		status := http.StatusInternalServerError
		if strings.Contains(err.Error(), "unsupported") {
			status = http.StatusBadRequest
		}
		writeError(w, status, "parse error: "+err.Error())
		return
	}

	// An explicit form title wins over the one returned by the parser.
	title := r.FormValue("title")
	if title == "" {
		title = result.Title
	}

	ingestResult, err := h.ontology.Ingest(ontology.IngestInput{
		Title:      title,
		SourceType: result.Format,
		Body:       result.Body,
	})
	if err != nil {
		writeError(w, http.StatusBadRequest, err.Error())
		return
	}
	writeJSON(w, http.StatusCreated, ingestResult)
}

View File

@@ -0,0 +1,221 @@
package httpapi
import (
"bytes"
"encoding/json"
"fmt"
"io"
"mime/multipart"
"net/http"
"net/http/httptest"
"strings"
"testing"
"tutor/internal/config"
"tutor/internal/interview"
"tutor/internal/learnermemory"
"tutor/internal/ontology"
"tutor/internal/progression"
"tutor/internal/teachingassets"
"tutor/internal/workflows"
)
// TestUploadMaterialMarkdown uploads a small Markdown file and expects a 201
// response whose ontology snapshot contains at least one concept.
func TestUploadMaterialMarkdown(t *testing.T) {
	memory := learnermemory.NewService(learnermemory.NewMemoryStore())
	service := interview.NewService(interview.NewMemoryStore(), workflows.NewStubRunner(), memory)
	progress := progression.NewService(memory)
	onto := ontology.NewService(ontology.NewMemoryStore())
	assets := teachingassets.NewService(teachingassets.NewMemoryStore(), onto, "gpt-image-v2")
	handler := NewHandler(config.Config{Environment: "test"}, service, memory, progress, onto, assets)
	routes := handler.Routes()

	// Fail fast on fixture errors so a broken request body is not reported
	// as an unexpected HTTP status.
	var buf bytes.Buffer
	w := multipart.NewWriter(&buf)
	part, err := w.CreateFormFile("file", "notes.md")
	if err != nil {
		t.Fatalf("create form file: %v", err)
	}
	if _, err := io.Copy(part, strings.NewReader("# Backend notes\nIdempotent API retries need transactions.")); err != nil {
		t.Fatalf("write form file: %v", err)
	}
	if err := w.Close(); err != nil {
		t.Fatalf("close multipart writer: %v", err)
	}

	req := httptest.NewRequest(http.MethodPost, "/api/v1/materials/upload", &buf)
	req.Header.Set("Content-Type", w.FormDataContentType())
	rec := httptest.NewRecorder()
	routes.ServeHTTP(rec, req)

	if rec.Code != http.StatusCreated {
		t.Fatalf("status = %d, body = %s", rec.Code, rec.Body.String())
	}
	var result ontology.IngestResult
	decodeJSON(t, rec.Body, &result)
	if len(result.Snapshot.Concepts) == 0 {
		t.Fatal("expected concept candidates after md upload")
	}
}
// TestUploadMaterialPDF uploads a file named notes.pdf whose content is not a
// valid PDF and expects the handler to answer 500.
func TestUploadMaterialPDF(t *testing.T) {
	memory := learnermemory.NewService(learnermemory.NewMemoryStore())
	service := interview.NewService(interview.NewMemoryStore(), workflows.NewStubRunner(), memory)
	progress := progression.NewService(memory)
	onto := ontology.NewService(ontology.NewMemoryStore())
	assets := teachingassets.NewService(teachingassets.NewMemoryStore(), onto, "gpt-image-v2")
	handler := NewHandler(config.Config{Environment: "test"}, service, memory, progress, onto, assets)
	routes := handler.Routes()

	// Fail fast on fixture errors so a broken request body is not reported
	// as an unexpected HTTP status.
	var buf bytes.Buffer
	w := multipart.NewWriter(&buf)
	part, err := w.CreateFormFile("file", "notes.pdf")
	if err != nil {
		t.Fatalf("create form file: %v", err)
	}
	if _, err := io.Copy(part, strings.NewReader("not a real pdf")); err != nil {
		t.Fatalf("write form file: %v", err)
	}
	if err := w.Close(); err != nil {
		t.Fatalf("close multipart writer: %v", err)
	}

	req := httptest.NewRequest(http.MethodPost, "/api/v1/materials/upload", &buf)
	req.Header.Set("Content-Type", w.FormDataContentType())
	rec := httptest.NewRecorder()
	routes.ServeHTTP(rec, req)

	if rec.Code != http.StatusInternalServerError {
		t.Fatalf("expected 500 for invalid PDF, got %d: %s", rec.Code, rec.Body.String())
	}
}
// TestUploadMaterialUnsupportedFormat uploads a .txt file and expects the
// handler to reject it with 400.
func TestUploadMaterialUnsupportedFormat(t *testing.T) {
	memory := learnermemory.NewService(learnermemory.NewMemoryStore())
	service := interview.NewService(interview.NewMemoryStore(), workflows.NewStubRunner(), memory)
	progress := progression.NewService(memory)
	onto := ontology.NewService(ontology.NewMemoryStore())
	assets := teachingassets.NewService(teachingassets.NewMemoryStore(), onto, "gpt-image-v2")
	handler := NewHandler(config.Config{Environment: "test"}, service, memory, progress, onto, assets)
	routes := handler.Routes()

	// Fail fast on fixture errors so a broken request body is not reported
	// as an unexpected HTTP status.
	var buf bytes.Buffer
	w := multipart.NewWriter(&buf)
	part, err := w.CreateFormFile("file", "notes.txt")
	if err != nil {
		t.Fatalf("create form file: %v", err)
	}
	if _, err := io.Copy(part, strings.NewReader("plain text")); err != nil {
		t.Fatalf("write form file: %v", err)
	}
	if err := w.Close(); err != nil {
		t.Fatalf("close multipart writer: %v", err)
	}

	req := httptest.NewRequest(http.MethodPost, "/api/v1/materials/upload", &buf)
	req.Header.Set("Content-Type", w.FormDataContentType())
	rec := httptest.NewRecorder()
	routes.ServeHTTP(rec, req)

	if rec.Code != http.StatusBadRequest {
		t.Fatalf("expected 400 for unsupported format, got %d: %s", rec.Code, rec.Body.String())
	}
}
// TestUploadMaterialMissingFile posts an empty multipart form (no "file"
// field) and expects the handler to answer 400.
func TestUploadMaterialMissingFile(t *testing.T) {
	memory := learnermemory.NewService(learnermemory.NewMemoryStore())
	service := interview.NewService(interview.NewMemoryStore(), workflows.NewStubRunner(), memory)
	progress := progression.NewService(memory)
	onto := ontology.NewService(ontology.NewMemoryStore())
	assets := teachingassets.NewService(teachingassets.NewMemoryStore(), onto, "gpt-image-v2")
	handler := NewHandler(config.Config{Environment: "test"}, service, memory, progress, onto, assets)
	routes := handler.Routes()

	// The form has no parts; closing the writer only emits the final boundary.
	var buf bytes.Buffer
	w := multipart.NewWriter(&buf)
	if err := w.Close(); err != nil {
		t.Fatalf("close multipart writer: %v", err)
	}

	req := httptest.NewRequest(http.MethodPost, "/api/v1/materials/upload", &buf)
	req.Header.Set("Content-Type", w.FormDataContentType())
	rec := httptest.NewRecorder()
	routes.ServeHTTP(rec, req)

	if rec.Code != http.StatusBadRequest {
		t.Fatalf("expected 400 for missing file, got %d: %s", rec.Code, rec.Body.String())
	}
}
// TestUploadMaterialWithCustomTitle uploads a Markdown file together with a
// "title" form field and expects the ingested material to carry that title.
func TestUploadMaterialWithCustomTitle(t *testing.T) {
	memory := learnermemory.NewService(learnermemory.NewMemoryStore())
	service := interview.NewService(interview.NewMemoryStore(), workflows.NewStubRunner(), memory)
	progress := progression.NewService(memory)
	onto := ontology.NewService(ontology.NewMemoryStore())
	assets := teachingassets.NewService(teachingassets.NewMemoryStore(), onto, "gpt-image-v2")
	handler := NewHandler(config.Config{Environment: "test"}, service, memory, progress, onto, assets)
	routes := handler.Routes()

	// Fail fast on fixture errors so a broken request body is not reported
	// as an unexpected HTTP status or a wrong title.
	var buf bytes.Buffer
	w := multipart.NewWriter(&buf)
	if err := w.WriteField("title", "Custom Title"); err != nil {
		t.Fatalf("write title field: %v", err)
	}
	part, err := w.CreateFormFile("file", "notes.md")
	if err != nil {
		t.Fatalf("create form file: %v", err)
	}
	if _, err := io.Copy(part, strings.NewReader("Cache invalidation with TTL.")); err != nil {
		t.Fatalf("write form file: %v", err)
	}
	if err := w.Close(); err != nil {
		t.Fatalf("close multipart writer: %v", err)
	}

	req := httptest.NewRequest(http.MethodPost, "/api/v1/materials/upload", &buf)
	req.Header.Set("Content-Type", w.FormDataContentType())
	rec := httptest.NewRecorder()
	routes.ServeHTTP(rec, req)

	if rec.Code != http.StatusCreated {
		t.Fatalf("status = %d, body = %s", rec.Code, rec.Body.String())
	}
	var result ontology.IngestResult
	decodeJSON(t, rec.Body, &result)
	if result.Material.Title != "Custom Title" {
		t.Fatalf("title = %q, want %q", result.Material.Title, "Custom Title")
	}
}
// TestUploadMaterialOntologyNotConfigured builds a handler with nil services
// and expects the upload endpoint to answer 404.
func TestUploadMaterialOntologyNotConfigured(t *testing.T) {
	handler := NewHandler(config.Config{Environment: "test"}, nil, nil, nil, nil, nil)
	routes := handler.Routes()

	// Fail fast on fixture errors so a broken request body is not reported
	// as an unexpected HTTP status.
	var buf bytes.Buffer
	w := multipart.NewWriter(&buf)
	part, err := w.CreateFormFile("file", "notes.md")
	if err != nil {
		t.Fatalf("create form file: %v", err)
	}
	if _, err := io.Copy(part, strings.NewReader("# test")); err != nil {
		t.Fatalf("write form file: %v", err)
	}
	if err := w.Close(); err != nil {
		t.Fatalf("close multipart writer: %v", err)
	}

	req := httptest.NewRequest(http.MethodPost, "/api/v1/materials/upload", &buf)
	req.Header.Set("Content-Type", w.FormDataContentType())
	rec := httptest.NewRecorder()
	routes.ServeHTTP(rec, req)

	if rec.Code != http.StatusNotFound {
		t.Fatalf("expected 404, got %d: %s", rec.Code, rec.Body.String())
	}
}
// decodeJSON decodes one JSON value from r into v and fails the calling test
// immediately if decoding returns an error.
func decodeJSON(t *testing.T, r io.Reader, v interface{}) {
	t.Helper()
	dec := json.NewDecoder(r)
	err := dec.Decode(v)
	if err == nil {
		return
	}
	t.Fatalf("decode error: %v", err)
}
// TestUploadMaterialMarkdownFrontmatter uploads a Markdown file that starts
// with a YAML frontmatter block and expects a 201 response whose snapshot
// includes a concept with ID "http-idempotency".
func TestUploadMaterialMarkdownFrontmatter(t *testing.T) {
	const doc = "---\ntitle: Study Notes\ntags:\n - backend\n - go\n---\n\n# HTTP Idempotency\n\nIdempotent API retries need transactions for correctness."

	memory := learnermemory.NewService(learnermemory.NewMemoryStore())
	service := interview.NewService(interview.NewMemoryStore(), workflows.NewStubRunner(), memory)
	progress := progression.NewService(memory)
	onto := ontology.NewService(ontology.NewMemoryStore())
	assets := teachingassets.NewService(teachingassets.NewMemoryStore(), onto, "gpt-image-v2")
	handler := NewHandler(config.Config{Environment: "test"}, service, memory, progress, onto, assets)
	routes := handler.Routes()

	// Fail fast on fixture errors so a broken request body is not reported
	// as an unexpected HTTP status.
	var buf bytes.Buffer
	w := multipart.NewWriter(&buf)
	part, err := w.CreateFormFile("file", "study-notes.md")
	if err != nil {
		t.Fatalf("create form file: %v", err)
	}
	// Fprint writes the document verbatim; no formatting is needed.
	if _, err := fmt.Fprint(part, doc); err != nil {
		t.Fatalf("write form file: %v", err)
	}
	if err := w.Close(); err != nil {
		t.Fatalf("close multipart writer: %v", err)
	}

	req := httptest.NewRequest(http.MethodPost, "/api/v1/materials/upload", &buf)
	req.Header.Set("Content-Type", w.FormDataContentType())
	rec := httptest.NewRecorder()
	routes.ServeHTTP(rec, req)

	if rec.Code != http.StatusCreated {
		t.Fatalf("status = %d, body = %s", rec.Code, rec.Body.String())
	}
	var result ontology.IngestResult
	decodeJSON(t, rec.Body, &result)
	if len(result.Snapshot.Concepts) == 0 {
		t.Fatal("expected concepts from markdown with frontmatter")
	}
	for _, c := range result.Snapshot.Concepts {
		if c.Concept.ID == "http-idempotency" {
			return
		}
	}
	t.Fatalf("expected http-idempotency concept, got concepts: %v", result.Snapshot.Concepts)
}