feat: add file upload for materials (PDF/DOCX) with ingestion pipeline
This commit is contained in:
@@ -50,6 +50,7 @@ func (h Handler) Routes() http.Handler {
|
||||
mux.HandleFunc("GET /api/v1/learners/{userID}/readiness-map", h.getReadinessMap)
|
||||
mux.HandleFunc("GET /api/v1/learners/{userID}/next-challenge", h.getNextChallenge)
|
||||
mux.HandleFunc("POST /api/v1/materials", h.ingestMaterial)
|
||||
mux.HandleFunc("POST /api/v1/materials/upload", h.uploadMaterial)
|
||||
mux.HandleFunc("GET /api/v1/ontology", h.getOntology)
|
||||
mux.HandleFunc("POST /api/v1/teaching-assets/prompts", h.generateTeachingAssetPrompt)
|
||||
mux.HandleFunc("GET /api/v1/teaching-assets", h.getTeachingAssets)
|
||||
|
||||
67
internal/httpapi/material_upload.go
Normal file
67
internal/httpapi/material_upload.go
Normal file
@@ -0,0 +1,67 @@
|
||||
package httpapi
|
||||
|
||||
import (
|
||||
"io"
|
||||
"net/http"
|
||||
"strings"
|
||||
|
||||
"tutor/internal/ingestion"
|
||||
"tutor/internal/ontology"
|
||||
)
|
||||
|
||||
func (h Handler) uploadMaterial(w http.ResponseWriter, r *http.Request) {
|
||||
if h.ontology == nil {
|
||||
writeError(w, http.StatusNotFound, "ontology not configured")
|
||||
return
|
||||
}
|
||||
|
||||
if err := r.ParseMultipartForm(32 << 20); err != nil {
|
||||
writeError(w, http.StatusBadRequest, "invalid multipart form")
|
||||
return
|
||||
}
|
||||
|
||||
file, header, err := r.FormFile("file")
|
||||
if err != nil {
|
||||
writeError(w, http.StatusBadRequest, "file field required")
|
||||
return
|
||||
}
|
||||
defer file.Close()
|
||||
|
||||
if !ingestion.IsSupported(header.Filename) {
|
||||
writeError(w, http.StatusBadRequest, "unsupported file format; supported: .md, .markdown, .pdf, .docx")
|
||||
return
|
||||
}
|
||||
|
||||
data, err := io.ReadAll(file)
|
||||
if err != nil {
|
||||
writeError(w, http.StatusInternalServerError, "failed to read file")
|
||||
return
|
||||
}
|
||||
|
||||
result, err := ingestion.ParseFromBytes(header.Filename, data)
|
||||
if err != nil {
|
||||
if strings.Contains(err.Error(), "unsupported") {
|
||||
writeError(w, http.StatusBadRequest, "parse error: "+err.Error())
|
||||
return
|
||||
}
|
||||
writeError(w, http.StatusInternalServerError, "parse error: "+err.Error())
|
||||
return
|
||||
}
|
||||
|
||||
title := r.FormValue("title")
|
||||
if title == "" {
|
||||
title = result.Title
|
||||
}
|
||||
|
||||
ingestResult, err := h.ontology.Ingest(ontology.IngestInput{
|
||||
Title: title,
|
||||
SourceType: result.Format,
|
||||
Body: result.Body,
|
||||
})
|
||||
if err != nil {
|
||||
writeError(w, http.StatusBadRequest, err.Error())
|
||||
return
|
||||
}
|
||||
|
||||
writeJSON(w, http.StatusCreated, ingestResult)
|
||||
}
|
||||
221
internal/httpapi/material_upload_test.go
Normal file
221
internal/httpapi/material_upload_test.go
Normal file
@@ -0,0 +1,221 @@
|
||||
package httpapi
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"mime/multipart"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"tutor/internal/config"
|
||||
"tutor/internal/interview"
|
||||
"tutor/internal/learnermemory"
|
||||
"tutor/internal/ontology"
|
||||
"tutor/internal/progression"
|
||||
"tutor/internal/teachingassets"
|
||||
"tutor/internal/workflows"
|
||||
)
|
||||
|
||||
func TestUploadMaterialMarkdown(t *testing.T) {
|
||||
memory := learnermemory.NewService(learnermemory.NewMemoryStore())
|
||||
service := interview.NewService(interview.NewMemoryStore(), workflows.NewStubRunner(), memory)
|
||||
progress := progression.NewService(memory)
|
||||
onto := ontology.NewService(ontology.NewMemoryStore())
|
||||
assets := teachingassets.NewService(teachingassets.NewMemoryStore(), onto, "gpt-image-v2")
|
||||
handler := NewHandler(config.Config{Environment: "test"}, service, memory, progress, onto, assets)
|
||||
routes := handler.Routes()
|
||||
|
||||
var buf bytes.Buffer
|
||||
w := multipart.NewWriter(&buf)
|
||||
part, _ := w.CreateFormFile("file", "notes.md")
|
||||
io.Copy(part, strings.NewReader("# Backend notes\nIdempotent API retries need transactions."))
|
||||
w.Close()
|
||||
|
||||
req := httptest.NewRequest(http.MethodPost, "/api/v1/materials/upload", &buf)
|
||||
req.Header.Set("Content-Type", w.FormDataContentType())
|
||||
rec := httptest.NewRecorder()
|
||||
routes.ServeHTTP(rec, req)
|
||||
|
||||
if rec.Code != http.StatusCreated {
|
||||
t.Fatalf("status = %d, body = %s", rec.Code, rec.Body.String())
|
||||
}
|
||||
|
||||
var result ontology.IngestResult
|
||||
decodeJSON(t, rec.Body, &result)
|
||||
if len(result.Snapshot.Concepts) == 0 {
|
||||
t.Fatal("expected concept candidates after md upload")
|
||||
}
|
||||
}
|
||||
|
||||
func TestUploadMaterialPDF(t *testing.T) {
|
||||
memory := learnermemory.NewService(learnermemory.NewMemoryStore())
|
||||
service := interview.NewService(interview.NewMemoryStore(), workflows.NewStubRunner(), memory)
|
||||
progress := progression.NewService(memory)
|
||||
onto := ontology.NewService(ontology.NewMemoryStore())
|
||||
assets := teachingassets.NewService(teachingassets.NewMemoryStore(), onto, "gpt-image-v2")
|
||||
handler := NewHandler(config.Config{Environment: "test"}, service, memory, progress, onto, assets)
|
||||
routes := handler.Routes()
|
||||
|
||||
var buf bytes.Buffer
|
||||
w := multipart.NewWriter(&buf)
|
||||
part, _ := w.CreateFormFile("file", "notes.pdf")
|
||||
io.Copy(part, strings.NewReader("not a real pdf"))
|
||||
w.Close()
|
||||
|
||||
req := httptest.NewRequest(http.MethodPost, "/api/v1/materials/upload", &buf)
|
||||
req.Header.Set("Content-Type", w.FormDataContentType())
|
||||
rec := httptest.NewRecorder()
|
||||
routes.ServeHTTP(rec, req)
|
||||
|
||||
if rec.Code != http.StatusInternalServerError {
|
||||
t.Fatalf("expected 500 for invalid PDF, got %d: %s", rec.Code, rec.Body.String())
|
||||
}
|
||||
}
|
||||
|
||||
func TestUploadMaterialUnsupportedFormat(t *testing.T) {
|
||||
memory := learnermemory.NewService(learnermemory.NewMemoryStore())
|
||||
service := interview.NewService(interview.NewMemoryStore(), workflows.NewStubRunner(), memory)
|
||||
progress := progression.NewService(memory)
|
||||
onto := ontology.NewService(ontology.NewMemoryStore())
|
||||
assets := teachingassets.NewService(teachingassets.NewMemoryStore(), onto, "gpt-image-v2")
|
||||
handler := NewHandler(config.Config{Environment: "test"}, service, memory, progress, onto, assets)
|
||||
routes := handler.Routes()
|
||||
|
||||
var buf bytes.Buffer
|
||||
w := multipart.NewWriter(&buf)
|
||||
part, _ := w.CreateFormFile("file", "notes.txt")
|
||||
io.Copy(part, strings.NewReader("plain text"))
|
||||
w.Close()
|
||||
|
||||
req := httptest.NewRequest(http.MethodPost, "/api/v1/materials/upload", &buf)
|
||||
req.Header.Set("Content-Type", w.FormDataContentType())
|
||||
rec := httptest.NewRecorder()
|
||||
routes.ServeHTTP(rec, req)
|
||||
|
||||
if rec.Code != http.StatusBadRequest {
|
||||
t.Fatalf("expected 400 for unsupported format, got %d: %s", rec.Code, rec.Body.String())
|
||||
}
|
||||
}
|
||||
|
||||
func TestUploadMaterialMissingFile(t *testing.T) {
|
||||
memory := learnermemory.NewService(learnermemory.NewMemoryStore())
|
||||
service := interview.NewService(interview.NewMemoryStore(), workflows.NewStubRunner(), memory)
|
||||
progress := progression.NewService(memory)
|
||||
onto := ontology.NewService(ontology.NewMemoryStore())
|
||||
assets := teachingassets.NewService(teachingassets.NewMemoryStore(), onto, "gpt-image-v2")
|
||||
handler := NewHandler(config.Config{Environment: "test"}, service, memory, progress, onto, assets)
|
||||
routes := handler.Routes()
|
||||
|
||||
var buf bytes.Buffer
|
||||
w := multipart.NewWriter(&buf)
|
||||
w.Close()
|
||||
|
||||
req := httptest.NewRequest(http.MethodPost, "/api/v1/materials/upload", &buf)
|
||||
req.Header.Set("Content-Type", w.FormDataContentType())
|
||||
rec := httptest.NewRecorder()
|
||||
routes.ServeHTTP(rec, req)
|
||||
|
||||
if rec.Code != http.StatusBadRequest {
|
||||
t.Fatalf("expected 400 for missing file, got %d: %s", rec.Code, rec.Body.String())
|
||||
}
|
||||
}
|
||||
|
||||
func TestUploadMaterialWithCustomTitle(t *testing.T) {
|
||||
memory := learnermemory.NewService(learnermemory.NewMemoryStore())
|
||||
service := interview.NewService(interview.NewMemoryStore(), workflows.NewStubRunner(), memory)
|
||||
progress := progression.NewService(memory)
|
||||
onto := ontology.NewService(ontology.NewMemoryStore())
|
||||
assets := teachingassets.NewService(teachingassets.NewMemoryStore(), onto, "gpt-image-v2")
|
||||
handler := NewHandler(config.Config{Environment: "test"}, service, memory, progress, onto, assets)
|
||||
routes := handler.Routes()
|
||||
|
||||
var buf bytes.Buffer
|
||||
w := multipart.NewWriter(&buf)
|
||||
w.WriteField("title", "Custom Title")
|
||||
part, _ := w.CreateFormFile("file", "notes.md")
|
||||
io.Copy(part, strings.NewReader("Cache invalidation with TTL."))
|
||||
w.Close()
|
||||
|
||||
req := httptest.NewRequest(http.MethodPost, "/api/v1/materials/upload", &buf)
|
||||
req.Header.Set("Content-Type", w.FormDataContentType())
|
||||
rec := httptest.NewRecorder()
|
||||
routes.ServeHTTP(rec, req)
|
||||
|
||||
if rec.Code != http.StatusCreated {
|
||||
t.Fatalf("status = %d, body = %s", rec.Code, rec.Body.String())
|
||||
}
|
||||
|
||||
var result ontology.IngestResult
|
||||
decodeJSON(t, rec.Body, &result)
|
||||
if result.Material.Title != "Custom Title" {
|
||||
t.Fatalf("title = %q, want %q", result.Material.Title, "Custom Title")
|
||||
}
|
||||
}
|
||||
|
||||
func TestUploadMaterialOntologyNotConfigured(t *testing.T) {
|
||||
handler := NewHandler(config.Config{Environment: "test"}, nil, nil, nil, nil, nil)
|
||||
routes := handler.Routes()
|
||||
|
||||
var buf bytes.Buffer
|
||||
w := multipart.NewWriter(&buf)
|
||||
part, _ := w.CreateFormFile("file", "notes.md")
|
||||
io.Copy(part, strings.NewReader("# test"))
|
||||
w.Close()
|
||||
|
||||
req := httptest.NewRequest(http.MethodPost, "/api/v1/materials/upload", &buf)
|
||||
req.Header.Set("Content-Type", w.FormDataContentType())
|
||||
rec := httptest.NewRecorder()
|
||||
routes.ServeHTTP(rec, req)
|
||||
|
||||
if rec.Code != http.StatusNotFound {
|
||||
t.Fatalf("expected 404, got %d: %s", rec.Code, rec.Body.String())
|
||||
}
|
||||
}
|
||||
|
||||
// decodeJSON decodes a single JSON value from r into v and aborts the calling
// test when decoding fails.
func decodeJSON(t *testing.T, r io.Reader, v any) {
	t.Helper()

	dec := json.NewDecoder(r)
	err := dec.Decode(v)
	if err == nil {
		return
	}
	t.Fatalf("decode error: %v", err)
}
|
||||
|
||||
func TestUploadMaterialMarkdownFrontmatter(t *testing.T) {
|
||||
memory := learnermemory.NewService(learnermemory.NewMemoryStore())
|
||||
service := interview.NewService(interview.NewMemoryStore(), workflows.NewStubRunner(), memory)
|
||||
progress := progression.NewService(memory)
|
||||
onto := ontology.NewService(ontology.NewMemoryStore())
|
||||
assets := teachingassets.NewService(teachingassets.NewMemoryStore(), onto, "gpt-image-v2")
|
||||
handler := NewHandler(config.Config{Environment: "test"}, service, memory, progress, onto, assets)
|
||||
routes := handler.Routes()
|
||||
|
||||
var buf bytes.Buffer
|
||||
w := multipart.NewWriter(&buf)
|
||||
part, _ := w.CreateFormFile("file", "study-notes.md")
|
||||
io.Copy(part, strings.NewReader(fmt.Sprintf("---\ntitle: Study Notes\ntags:\n - backend\n - go\n---\n\n# HTTP Idempotency\n\nIdempotent API retries need transactions for correctness.")))
|
||||
w.Close()
|
||||
|
||||
req := httptest.NewRequest(http.MethodPost, "/api/v1/materials/upload", &buf)
|
||||
req.Header.Set("Content-Type", w.FormDataContentType())
|
||||
rec := httptest.NewRecorder()
|
||||
routes.ServeHTTP(rec, req)
|
||||
|
||||
if rec.Code != http.StatusCreated {
|
||||
t.Fatalf("status = %d, body = %s", rec.Code, rec.Body.String())
|
||||
}
|
||||
|
||||
var result ontology.IngestResult
|
||||
decodeJSON(t, rec.Body, &result)
|
||||
if len(result.Snapshot.Concepts) == 0 {
|
||||
t.Fatal("expected concepts from markdown with frontmatter")
|
||||
}
|
||||
for _, c := range result.Snapshot.Concepts {
|
||||
if c.Concept.ID == "http-idempotency" {
|
||||
return
|
||||
}
|
||||
}
|
||||
t.Fatalf("expected http-idempotency concept, got concepts: %v", result.Snapshot.Concepts)
|
||||
}
|
||||
75
internal/ingestion/ingestion.go
Normal file
75
internal/ingestion/ingestion.go
Normal file
@@ -0,0 +1,75 @@
|
||||
package ingestion
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// Result is the outcome of parsing one source document.
type Result struct {
	Title  string // file name without its extension, as set by ParseFile
	Body   string // extracted text with surrounding whitespace trimmed
	Format string // lower-cased file extension without the leading dot
}
|
||||
|
||||
var parsers = map[string]func(string) (string, error){
|
||||
".md": ParseMarkdown,
|
||||
".markdown": ParseMarkdown,
|
||||
".pdf": ParsePDF,
|
||||
".docx": ParseDOCX,
|
||||
}
|
||||
|
||||
func ParseFile(path string) (Result, error) {
|
||||
ext := strings.ToLower(filepath.Ext(path))
|
||||
parse, ok := parsers[ext]
|
||||
if !ok {
|
||||
return Result{}, fmt.Errorf("unsupported file format: %s", ext)
|
||||
}
|
||||
|
||||
body, err := parse(path)
|
||||
if err != nil {
|
||||
return Result{}, fmt.Errorf("parse %s: %w", ext, err)
|
||||
}
|
||||
|
||||
title := strings.TrimSuffix(filepath.Base(path), ext)
|
||||
return Result{
|
||||
Title: title,
|
||||
Body: strings.TrimSpace(body),
|
||||
Format: ext[1:],
|
||||
}, nil
|
||||
}
|
||||
|
||||
func IsSupported(path string) bool {
|
||||
ext := strings.ToLower(filepath.Ext(path))
|
||||
_, ok := parsers[ext]
|
||||
return ok
|
||||
}
|
||||
|
||||
func SupportedExtensions() []string {
|
||||
exts := make([]string, 0, len(parsers))
|
||||
for ext := range parsers {
|
||||
exts = append(exts, ext)
|
||||
}
|
||||
return exts
|
||||
}
|
||||
|
||||
func ParseFromBytes(filename string, data []byte) (Result, error) {
|
||||
safe := filepath.Base(filename)
|
||||
if safe == "." || safe == string(filepath.Separator) {
|
||||
return Result{}, fmt.Errorf("invalid filename: %q", filename)
|
||||
}
|
||||
|
||||
tmpDir, err := os.MkdirTemp("", "ingestion-*")
|
||||
if err != nil {
|
||||
return Result{}, fmt.Errorf("create temp dir: %w", err)
|
||||
}
|
||||
defer os.RemoveAll(tmpDir)
|
||||
|
||||
tmpPath := filepath.Join(tmpDir, safe)
|
||||
if err := os.WriteFile(tmpPath, data, 0644); err != nil {
|
||||
return Result{}, fmt.Errorf("write temp file: %w", err)
|
||||
}
|
||||
|
||||
return ParseFile(tmpPath)
|
||||
}
|
||||
265
internal/ingestion/ingestion_test.go
Normal file
265
internal/ingestion/ingestion_test.go
Normal file
@@ -0,0 +1,265 @@
|
||||
package ingestion
|
||||
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestParseMarkdown(t *testing.T) {
|
||||
content := `---
|
||||
title: test
|
||||
tags: [go]
|
||||
---
|
||||
# Hello
|
||||
|
||||
This is a test document with idempotent APIs.`
|
||||
path := writeTempFile(t, "test.md", content)
|
||||
defer os.Remove(path)
|
||||
|
||||
result, err := ParseFile(path)
|
||||
if err != nil {
|
||||
t.Fatalf("ParseFile error: %v", err)
|
||||
}
|
||||
if result.Title != "test" {
|
||||
t.Fatalf("title = %q, want %q", result.Title, "test")
|
||||
}
|
||||
if !strings.Contains(result.Body, "idempotent") {
|
||||
t.Fatal("body should contain idempotent")
|
||||
}
|
||||
if strings.Contains(result.Body, "---") {
|
||||
t.Fatal("body should not contain frontmatter")
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseMarkdownNoFrontmatter(t *testing.T) {
|
||||
content := `# Notes
|
||||
|
||||
Database indexes speed up queries.`
|
||||
path := writeTempFile(t, "notes.md", content)
|
||||
defer os.Remove(path)
|
||||
|
||||
result, err := ParseFile(path)
|
||||
if err != nil {
|
||||
t.Fatalf("ParseFile error: %v", err)
|
||||
}
|
||||
if !strings.Contains(result.Body, "Database indexes") {
|
||||
t.Fatalf("body = %q, want to contain %q", result.Body, "Database indexes")
|
||||
}
|
||||
}
|
||||
|
||||
func TestParsePDF(t *testing.T) {
|
||||
content := `%PDF-1.4 fake content`
|
||||
path := writeTempFile(t, "test.pdf", content)
|
||||
defer os.Remove(path)
|
||||
|
||||
result, err := ParseFile(path)
|
||||
if err == nil {
|
||||
t.Fatal("expected error for invalid PDF")
|
||||
}
|
||||
if result.Body != "" {
|
||||
t.Fatalf("expected empty body for invalid PDF, got %q", result.Body)
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseDOCX(t *testing.T) {
|
||||
path := writeTempFile(t, "test.docx", "not a real docx")
|
||||
defer os.Remove(path)
|
||||
|
||||
result, err := ParseFile(path)
|
||||
if err == nil {
|
||||
t.Fatal("expected error for invalid docx")
|
||||
}
|
||||
if result.Body != "" {
|
||||
t.Fatalf("expected empty body for invalid docx, got %q", result.Body)
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseUnsupportedFormat(t *testing.T) {
|
||||
path := writeTempFile(t, "test.txt", "plain text")
|
||||
defer os.Remove(path)
|
||||
|
||||
_, err := ParseFile(path)
|
||||
if err == nil {
|
||||
t.Fatal("expected error for unsupported format")
|
||||
}
|
||||
}
|
||||
|
||||
func TestIsSupported(t *testing.T) {
|
||||
cases := []struct {
|
||||
path string
|
||||
want bool
|
||||
}{
|
||||
{"doc.md", true},
|
||||
{"doc.markdown", true},
|
||||
{"doc.pdf", true},
|
||||
{"doc.docx", true},
|
||||
{"doc.txt", false},
|
||||
{"doc.html", false},
|
||||
}
|
||||
for _, c := range cases {
|
||||
got := IsSupported(c.path)
|
||||
if got != c.want {
|
||||
t.Errorf("IsSupported(%q) = %v, want %v", c.path, got, c.want)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseFromBytes(t *testing.T) {
|
||||
content := "# Hello\nThis is markdown with cache invalidation."
|
||||
result, err := ParseFromBytes("test.md", []byte(content))
|
||||
if err != nil {
|
||||
t.Fatalf("ParseFromBytes error: %v", err)
|
||||
}
|
||||
if result.Title != "test" {
|
||||
t.Fatalf("title = %q, want %q", result.Title, "test")
|
||||
}
|
||||
if !strings.Contains(result.Body, "cache invalidation") {
|
||||
t.Fatal("body should contain cache invalidation")
|
||||
}
|
||||
}
|
||||
|
||||
func TestSupportedExtensions(t *testing.T) {
|
||||
exts := SupportedExtensions()
|
||||
if len(exts) == 0 {
|
||||
t.Fatal("expected at least one supported extension")
|
||||
}
|
||||
hasMD := false
|
||||
hasPDF := false
|
||||
hasDOCX := false
|
||||
for _, ext := range exts {
|
||||
switch ext {
|
||||
case ".md":
|
||||
hasMD = true
|
||||
case ".markdown":
|
||||
hasMD = true
|
||||
case ".pdf":
|
||||
hasPDF = true
|
||||
case ".docx":
|
||||
hasDOCX = true
|
||||
}
|
||||
}
|
||||
if !hasMD {
|
||||
t.Error("expected .md or .markdown in supported extensions")
|
||||
}
|
||||
if !hasPDF {
|
||||
t.Error("expected .pdf in supported extensions")
|
||||
}
|
||||
if !hasDOCX {
|
||||
t.Error("expected .docx in supported extensions")
|
||||
}
|
||||
}
|
||||
|
||||
func TestStripFrontmatter(t *testing.T) {
|
||||
cases := []struct {
|
||||
input string
|
||||
want string
|
||||
}{
|
||||
{
|
||||
input: "---\ntitle: test\n---\nbody text",
|
||||
want: "body text",
|
||||
},
|
||||
{
|
||||
input: "no frontmatter",
|
||||
want: "no frontmatter",
|
||||
},
|
||||
{
|
||||
input: "---\nno end marker",
|
||||
want: "---\nno end marker",
|
||||
},
|
||||
{
|
||||
input: "---\ntags:\n - go\n - testing\n---\nreal body",
|
||||
want: "real body",
|
||||
},
|
||||
{
|
||||
input: "---\ntitle: YAML with --- inside\n---\nbody after",
|
||||
want: "body after",
|
||||
},
|
||||
}
|
||||
for _, c := range cases {
|
||||
got := stripFrontmatter(c.input)
|
||||
if strings.TrimSpace(got) != strings.TrimSpace(c.want) {
|
||||
t.Errorf("stripFrontmatter(%q) = %q, want %q", c.input, got, c.want)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestStripFrontmatterMultilineYAML(t *testing.T) {
|
||||
input := "---\ntitle: Study Notes\ndescription: |\n This block contains --- as part of YAML\n and more content.\n---\nbody content"
|
||||
got := stripFrontmatter(input)
|
||||
want := "body content"
|
||||
if strings.TrimSpace(got) != strings.TrimSpace(want) {
|
||||
t.Errorf("stripFrontmatter with multiline YAML = %q, want %q", got, want)
|
||||
}
|
||||
}
|
||||
|
||||
func TestStripFrontmatterCRLF(t *testing.T) {
|
||||
input := "---\r\ntitle: test\r\n---\r\nbody with CRLF"
|
||||
got := stripFrontmatter(input)
|
||||
want := "body with CRLF"
|
||||
if strings.TrimSpace(got) != strings.TrimSpace(want) {
|
||||
t.Errorf("stripFrontmatter CRLF = %q, want %q", got, want)
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseFromBytesPathTraversal(t *testing.T) {
|
||||
filename := "../../etc/passwd"
|
||||
_, err := ParseFromBytes(filename, []byte("malicious"))
|
||||
if err == nil {
|
||||
t.Fatal("expected error for path traversal filename")
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseFromBytesInvalidFilename(t *testing.T) {
|
||||
for _, name := range []string{"", ".", string(filepath.Separator)} {
|
||||
_, err := ParseFromBytes(name, []byte("content"))
|
||||
if err == nil {
|
||||
t.Errorf("expected error for filename %q", name)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestIsSupportedCaseInsensitive(t *testing.T) {
|
||||
cases := []string{"DOC.MD", "File.PDF", "Notes.DOCX", "FILE.MARKDOWN"}
|
||||
for _, name := range cases {
|
||||
if !IsSupported(name) {
|
||||
t.Errorf("IsSupported(%q) should be true (case-insensitive)", name)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseMarkdownCRLF(t *testing.T) {
|
||||
content := "# Hello\r\nCache invalidation with TTL tradeoffs.\r\n"
|
||||
path := writeTempFile(t, "notes.md", content)
|
||||
|
||||
result, err := ParseFile(path)
|
||||
if err != nil {
|
||||
t.Fatalf("ParseFile error: %v", err)
|
||||
}
|
||||
if !strings.Contains(result.Body, "Cache invalidation") {
|
||||
t.Fatalf("body = %q, want to contain %q", result.Body, "Cache invalidation")
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseMarkdownEmptyFile(t *testing.T) {
|
||||
path := writeTempFile(t, "empty.md", "")
|
||||
|
||||
result, err := ParseFile(path)
|
||||
if err != nil {
|
||||
t.Fatalf("ParseFile error: %v", err)
|
||||
}
|
||||
if result.Body != "" {
|
||||
t.Fatalf("expected empty body, got %q", result.Body)
|
||||
}
|
||||
}
|
||||
|
||||
// writeTempFile creates a file called name with the given content inside the
// test's temporary directory and returns its full path. The test is aborted
// when the file cannot be written.
func writeTempFile(t *testing.T, name, content string) string {
	t.Helper()

	target := filepath.Join(t.TempDir(), name)
	err := os.WriteFile(target, []byte(content), 0644)
	if err != nil {
		t.Fatalf("write temp file: %v", err)
	}
	return target
}
|
||||
20
internal/ingestion/parse_docx.go
Normal file
20
internal/ingestion/parse_docx.go
Normal file
@@ -0,0 +1,20 @@
|
||||
package ingestion
|
||||
|
||||
import (
|
||||
"strings"
|
||||
|
||||
"github.com/nguyenthenguyen/docx"
|
||||
)
|
||||
|
||||
func ParseDOCX(path string) (string, error) {
|
||||
reader, err := docx.ReadDocxFile(path)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
defer reader.Close()
|
||||
|
||||
doc := reader.Editable()
|
||||
text := doc.GetContent()
|
||||
|
||||
return strings.TrimSpace(text), nil
|
||||
}
|
||||
57
internal/ingestion/parse_markdown.go
Normal file
57
internal/ingestion/parse_markdown.go
Normal file
@@ -0,0 +1,57 @@
|
||||
package ingestion
|
||||
|
||||
import (
|
||||
"os"
|
||||
"strings"
|
||||
)
|
||||
|
||||
func ParseMarkdown(path string) (string, error) {
|
||||
data, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
content := string(data)
|
||||
content = stripFrontmatter(content)
|
||||
content = strings.TrimSpace(content)
|
||||
return content, nil
|
||||
}
|
||||
|
||||
// stripFrontmatter removes a leading frontmatter block from content.
//
// Leading whitespace is always trimmed. A block is recognised only when the
// first line is exactly "---" (trailing spaces, tabs and carriage returns
// are tolerated) and a later line is exactly "---" (optionally followed by
// carriage returns). When either delimiter is missing, the whitespace-trimmed
// content is returned unchanged. Otherwise everything after the closing
// delimiter is returned, with leading line breaks removed.
func stripFrontmatter(content string) string {
	content = strings.TrimLeft(content, "\n\r\t ")

	// Check the whole first line, not just its prefix: a rule such as "----"
	// or text such as "---title" must not be mistaken for an opening
	// delimiter, or everything up to the next "---" line would be dropped.
	opening := content
	if nl := strings.IndexByte(content, '\n'); nl >= 0 {
		opening = content[:nl]
	}
	if strings.TrimRight(opening, " \t\r") != "---" {
		return content
	}

	rest := content[3:]
	closing := findFMClosing(rest)
	if closing < 0 {
		return content
	}

	return strings.TrimLeft(rest[closing:], "\n\r")
}

// findFMClosing scans s line by line, skipping the text before the first
// newline, and looks for a line that equals "---" once trailing carriage
// returns are removed. It returns the index in s just past that line, or -1
// when no such line exists.
func findFMClosing(s string) int {
	pos := strings.IndexByte(s, '\n')
	for pos >= 0 {
		start := pos + 1
		if start >= len(s) {
			return -1
		}

		line := s[start:]
		next := strings.IndexByte(line, '\n')
		if next >= 0 {
			line = line[:next]
		}

		if strings.TrimRight(line, "\r") == "---" {
			return start + len(line)
		}
		if next < 0 {
			// Last line reached without finding a closing delimiter.
			return -1
		}
		pos = start + next
	}
	return -1
}
|
||||
74
internal/ingestion/parse_pdf.go
Normal file
74
internal/ingestion/parse_pdf.go
Normal file
@@ -0,0 +1,74 @@
|
||||
package ingestion
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"io"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
|
||||
"github.com/pdfcpu/pdfcpu/pkg/api"
|
||||
"github.com/pdfcpu/pdfcpu/pkg/pdfcpu/model"
|
||||
)
|
||||
|
||||
func ParsePDF(path string) (string, error) {
|
||||
ctx, err := api.ReadContextFile(path)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
tmpDir, err := os.MkdirTemp("", "pdf-extract-*")
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
defer os.RemoveAll(tmpDir)
|
||||
|
||||
conf := model.NewDefaultConfiguration()
|
||||
if err := api.ExtractContentFile(path, tmpDir, nil, conf); err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
var buf strings.Builder
|
||||
entries, err := os.ReadDir(tmpDir)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
for _, entry := range entries {
|
||||
if entry.IsDir() {
|
||||
continue
|
||||
}
|
||||
content, err := os.ReadFile(filepath.Join(tmpDir, entry.Name()))
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("read extracted content %s: %w", entry.Name(), err)
|
||||
}
|
||||
buf.WriteString(string(content))
|
||||
buf.WriteString("\n")
|
||||
}
|
||||
|
||||
if buf.Len() == 0 {
|
||||
content, err := extractPageText(ctx)
|
||||
if err == nil {
|
||||
buf.WriteString(content)
|
||||
}
|
||||
}
|
||||
|
||||
return buf.String(), nil
|
||||
}
|
||||
|
||||
func extractPageText(ctx *model.Context) (string, error) {
|
||||
var buf strings.Builder
|
||||
for i := 1; i <= ctx.PageCount; i++ {
|
||||
r, err := api.ExtractPage(ctx, i)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
data, err := io.ReadAll(r)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
buf.Write(data)
|
||||
buf.WriteString("\n")
|
||||
}
|
||||
return buf.String(), nil
|
||||
}
|
||||
@@ -25,6 +25,9 @@ var els = {
|
||||
progressDivider: document.querySelector("#progress-divider"),
|
||||
refreshProgress: document.querySelector("#refresh-progress"),
|
||||
materialForm: document.querySelector("#material-form"),
|
||||
materialFile: document.querySelector("#material-file"),
|
||||
fileNameDisplay: document.querySelector("#file-name"),
|
||||
uploadFileButton: document.querySelector("#upload-file-button"),
|
||||
assetForm: document.querySelector("#asset-form"),
|
||||
ontology: document.querySelector("#ontology"),
|
||||
assetOutput: document.querySelector("#asset-output"),
|
||||
@@ -232,6 +235,55 @@ function renderBlock(el, title, items) {
|
||||
"</ul>";
|
||||
}
|
||||
|
||||
/* ---- File upload ---- */
|
||||
// Mirror the current file selection: show its name and enable the upload
// button only while a file is selected.
els.materialFile.addEventListener("change", function() {
  var selected = els.materialFile.files[0];
  els.fileNameDisplay.textContent = selected ? selected.name : "";
  els.uploadFileButton.disabled = !selected;
});
|
||||
|
||||
// Upload the selected file (plus the optional title) as multipart form data,
// then render the returned ontology snapshot.
els.uploadFileButton.addEventListener("click", function() {
  var file = els.materialFile.files[0];
  if (!file) return;

  clearError();
  setStatus(t("ingestingMaterial"), true);
  els.uploadFileButton.disabled = true;

  var formData = new FormData();
  formData.append("file", file);
  var title = document.querySelector("#material-title").value;
  if (title) formData.append("title", title);

  // Content-Type is left to the browser so that it includes the multipart
  // boundary; only the bearer token is set explicitly.
  var token = localStorage.getItem("tutor_token");
  var headers = {};
  if (token) headers["Authorization"] = "Bearer " + token;

  fetch("/api/v1/materials/upload", { method:"POST", headers:headers, body:formData })
    .then(function(response) {
      // An error response may not be JSON; fall back to an empty object so
      // the user sees the HTTP status instead of a JSON parse error.
      return response.json()
        ["catch"](function() { return {}; })
        .then(function(body) {
          if (!response.ok) throw new Error(body.error || "Upload failed: " + response.status);
          state.ontology = body.snapshot;
          renderOntology();
          setStatus(t("materialIngested", body.material.id));
          els.materialFile.value = "";
          els.fileNameDisplay.textContent = "";
        });
    })
    ["catch"](function(error) {
      showError(error.message); setStatus(t("contentReady"));
    })
    ["finally"](function() {
      // Re-enable only while a file is still selected: after a successful
      // upload the input has been cleared and there is nothing to send.
      els.uploadFileButton.disabled = !els.materialFile.files[0];
    });
});
|
||||
|
||||
/* ---- Progress ---- */
|
||||
els.refreshProgress.addEventListener("click", function() { clearError(); refreshProgress(); });
|
||||
|
||||
|
||||
@@ -74,6 +74,9 @@ var i18n = {
|
||||
questionId: "질문 ID",
|
||||
starting: "시작 중…",
|
||||
grading: "채점 중…",
|
||||
uploadFile: "파일 업로드",
|
||||
uploadAndIngest: "업로드 및 수집",
|
||||
pasteTextToggle: "또는 텍스트 붙여넣기",
|
||||
ingesting: "수집 중…",
|
||||
generating: "생성 중…",
|
||||
questionsSuffix: "개 질문",
|
||||
@@ -159,6 +162,9 @@ var i18n = {
|
||||
questionId: "question id",
|
||||
starting: "Starting…",
|
||||
grading: "Grading…",
|
||||
uploadFile: "Upload file",
|
||||
uploadAndIngest: "Upload & ingest",
|
||||
pasteTextToggle: "Or paste text",
|
||||
ingesting: "Ingesting…",
|
||||
generating: "Generating…",
|
||||
questionsSuffix: "questions",
|
||||
|
||||
@@ -151,15 +151,27 @@
|
||||
<input id="material-source" value="markdown" />
|
||||
</label>
|
||||
</div>
|
||||
<label>
|
||||
<span data-i18n="sourceMaterial">Source material</span>
|
||||
<textarea id="material-body" rows="4">
|
||||
Idempotent API retries need transactions. Cache invalidation uses TTL tradeoffs and database indexes support query plans.</textarea>
|
||||
</label>
|
||||
<button id="material-button" type="submit">
|
||||
<span class="btn-text" data-i18n="ingestMaterial">Ingest material</span>
|
||||
<span class="btn-spinner" aria-hidden="true"></span>
|
||||
</button>
|
||||
<div class="file-upload-row">
|
||||
<label class="file-label">
|
||||
<span data-i18n="uploadFile">Upload file</span>
|
||||
<input id="material-file" type="file" accept=".md,.markdown,.pdf,.docx" />
|
||||
</label>
|
||||
<span id="file-name" class="file-name"></span>
|
||||
<button id="upload-file-button" type="button" class="small-button" data-i18n="uploadAndIngest" disabled>Upload & ingest</button>
|
||||
</div>
|
||||
<details class="paste-toggle">
|
||||
<summary data-i18n="pasteTextToggle">Or paste text</summary>
|
||||
<label class="wide-field">
|
||||
<span data-i18n="sourceMaterial">Source material</span>
|
||||
<textarea id="material-body" rows="5">
|
||||
Idempotent API retries need transactions. Cache invalidation uses TTL tradeoffs and database indexes support query plans.</textarea
|
||||
>
|
||||
</label>
|
||||
<button id="material-button" type="submit">
|
||||
<span class="btn-text" data-i18n="ingestMaterial">Ingest material</span>
|
||||
<span class="btn-spinner" aria-hidden="true"></span>
|
||||
</button>
|
||||
</details>
|
||||
</form>
|
||||
|
||||
<div id="ontology" class="ontology-view empty-state">
|
||||
|
||||
@@ -362,6 +362,88 @@ button.is-loading .btn-spinner { display:inline-block; }
|
||||
margin:0; padding:12px; white-space:pre-wrap; font-size:12px; line-height:1.5; color:var(--text);
|
||||
}
|
||||
|
||||
/* ===== FILE UPLOAD ===== */
|
||||
.file-upload-row {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 10px;
|
||||
grid-column: 1 / -1;
|
||||
}
|
||||
|
||||
.file-label {
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
gap: 4px;
|
||||
font-size: 13px;
|
||||
font-weight: 650;
|
||||
color: var(--muted);
|
||||
}
|
||||
|
||||
.file-label input[type="file"] {
|
||||
padding: 8px;
|
||||
font-size: 12px;
|
||||
border: 1px solid var(--line);
|
||||
border-radius: 6px;
|
||||
background: #fbfcfa;
|
||||
color: var(--text);
|
||||
cursor: pointer;
|
||||
min-width: 200px;
|
||||
}
|
||||
|
||||
.file-label input[type="file"]::file-selector-button {
|
||||
border: 1px solid var(--line);
|
||||
border-radius: 4px;
|
||||
padding: 6px 10px;
|
||||
background: var(--surface);
|
||||
color: var(--text);
|
||||
cursor: pointer;
|
||||
font-size: 12px;
|
||||
font-weight: 650;
|
||||
margin-right: 10px;
|
||||
}
|
||||
|
||||
.file-label input[type="file"]::file-selector-button:hover {
|
||||
background: var(--surface-muted);
|
||||
}
|
||||
|
||||
.file-name {
|
||||
font-size: 12px;
|
||||
color: var(--muted);
|
||||
flex: 1;
|
||||
white-space: nowrap;
|
||||
overflow: hidden;
|
||||
text-overflow: ellipsis;
|
||||
}
|
||||
|
||||
.paste-toggle {
|
||||
grid-column: 1 / -1;
|
||||
border: 1px solid var(--line);
|
||||
border-radius: 6px;
|
||||
padding: 10px 14px;
|
||||
background: #fbfcfa;
|
||||
}
|
||||
|
||||
.paste-toggle summary {
|
||||
cursor: pointer;
|
||||
color: var(--muted);
|
||||
font-size: 12px;
|
||||
font-weight: 650;
|
||||
}
|
||||
|
||||
.paste-toggle[open] summary {
|
||||
margin-bottom: 12px;
|
||||
}
|
||||
|
||||
.paste-toggle .wide-field {
|
||||
grid-column: 1 / -1;
|
||||
}
|
||||
|
||||
.small-button {
|
||||
min-height: 32px;
|
||||
padding: 0 14px;
|
||||
font-size: 12px;
|
||||
}
|
||||
|
||||
/* ===== RESPONSIVE ===== */
|
||||
@media (max-width:900px) {
|
||||
.main-grid { grid-template-columns:1fr; }
|
||||
|
||||
Reference in New Issue
Block a user