feat: add file upload for materials (PDF/DOCX) with ingestion pipeline

This commit is contained in:
root
2026-04-29 15:52:35 +09:00
parent 518370b93e
commit 7f503326f9
51 changed files with 4712 additions and 27 deletions

View File

@@ -0,0 +1,75 @@
package ingestion
import (
"fmt"
"os"
"path/filepath"
"strings"
)
// Result is the normalized outcome of parsing a single document.
type Result struct {
	// Title is derived from the file name (base name without its extension).
	Title string
	// Body is the extracted text with surrounding whitespace trimmed.
	Body string
	// Format is the lowercased file extension without the leading dot.
	Format string
}
var parsers = map[string]func(string) (string, error){
".md": ParseMarkdown,
".markdown": ParseMarkdown,
".pdf": ParsePDF,
".docx": ParseDOCX,
}
// ParseFile extracts text from the file at path using the parser registered
// for its extension. The extension is matched case-insensitively.
//
// On success the Result carries the file's base name without its extension as
// Title, the whitespace-trimmed parser output as Body, and the lowercased
// extension without the dot as Format. An error is returned when no parser is
// registered for the extension or when the parser itself fails; the parser's
// error is wrapped so callers can still inspect it with errors.Is/As.
func ParseFile(path string) (Result, error) {
	// Keep the extension exactly as written on disk: the lowercased form is
	// used for the registry lookup, the original form for trimming the title.
	// Trimming the lowercased form would leave "Notes.MD" untouched.
	rawExt := filepath.Ext(path)
	ext := strings.ToLower(rawExt)

	parse, ok := parsers[ext]
	if !ok {
		return Result{}, fmt.Errorf("unsupported file format: %s", ext)
	}

	body, err := parse(path)
	if err != nil {
		return Result{}, fmt.Errorf("parse %s: %w", ext, err)
	}

	return Result{
		Title:  strings.TrimSuffix(filepath.Base(path), rawExt),
		Body:   strings.TrimSpace(body),
		Format: strings.TrimPrefix(ext, "."),
	}, nil
}
// IsSupported reports whether the extension of path, compared
// case-insensitively, has an entry in the parsers registry.
func IsSupported(path string) bool {
	if _, known := parsers[strings.ToLower(filepath.Ext(path))]; known {
		return true
	}
	return false
}
// SupportedExtensions returns every extension present in the parsers
// registry, each with its leading dot. The slice is built by ranging over a
// map, so the order of the elements is unspecified and may differ per call.
func SupportedExtensions() []string {
	list := make([]string, len(parsers))
	next := 0
	for key := range parsers {
		list[next] = key
		next++
	}
	return list
}
// ParseFromBytes parses an in-memory document, using filename only to pick
// the parser and to derive the title.
//
// The parsers operate on file paths, so data is written to a file inside a
// freshly created temporary directory, parsed with ParseFile, and the
// directory is removed before returning. Only the base name of filename is
// used, so directory components supplied by the caller never influence where
// the temporary file is written.
//
// An error is returned when filename has no usable base name ("", ".", ".."
// or a bare path separator), when the temporary directory or file cannot be
// created, or when ParseFile fails.
func ParseFromBytes(filename string, data []byte) (Result, error) {
	safe := filepath.Base(filename)
	switch safe {
	// filepath.Base maps "" to "." and a run of separators to one separator.
	// ".." survives Base unchanged and would address the temp dir's parent,
	// so it is rejected here instead of failing later with a write error.
	case ".", "..", string(filepath.Separator):
		return Result{}, fmt.Errorf("invalid filename: %q", filename)
	}

	tmpDir, err := os.MkdirTemp("", "ingestion-*")
	if err != nil {
		return Result{}, fmt.Errorf("create temp dir: %w", err)
	}
	defer os.RemoveAll(tmpDir)

	tmpPath := filepath.Join(tmpDir, safe)
	if err := os.WriteFile(tmpPath, data, 0644); err != nil {
		return Result{}, fmt.Errorf("write temp file: %w", err)
	}
	return ParseFile(tmpPath)
}

View File

@@ -0,0 +1,265 @@
package ingestion
import (
"os"
"path/filepath"
"strings"
"testing"
)
// TestParseMarkdown checks that ParseFile derives the title from the file
// name and removes the YAML frontmatter from a Markdown document.
func TestParseMarkdown(t *testing.T) {
	content := `---
title: test
tags: [go]
---
# Hello
This is a test document with idempotent APIs.`
	// writeTempFile creates the file under t.TempDir(), which the testing
	// package removes on its own, so no manual cleanup is needed here.
	path := writeTempFile(t, "test.md", content)

	result, err := ParseFile(path)
	if err != nil {
		t.Fatalf("ParseFile error: %v", err)
	}
	if result.Title != "test" {
		t.Fatalf("title = %q, want %q", result.Title, "test")
	}
	if !strings.Contains(result.Body, "idempotent") {
		t.Fatal("body should contain idempotent")
	}
	if strings.Contains(result.Body, "---") {
		t.Fatal("body should not contain frontmatter")
	}
}
// TestParseMarkdownNoFrontmatter checks that a Markdown file without
// frontmatter keeps its content in the parsed body.
func TestParseMarkdownNoFrontmatter(t *testing.T) {
	content := `# Notes
Database indexes speed up queries.`
	// The file lives under t.TempDir() (see writeTempFile), so the testing
	// package cleans it up; an explicit os.Remove would be redundant.
	path := writeTempFile(t, "notes.md", content)

	result, err := ParseFile(path)
	if err != nil {
		t.Fatalf("ParseFile error: %v", err)
	}
	if !strings.Contains(result.Body, "Database indexes") {
		t.Fatalf("body = %q, want to contain %q", result.Body, "Database indexes")
	}
}
// TestParsePDF checks that a file that merely starts with a PDF header but is
// not a valid PDF is rejected and yields an empty body.
func TestParsePDF(t *testing.T) {
	content := `%PDF-1.4 fake content`
	// No manual cleanup: writeTempFile uses t.TempDir(), which is removed
	// automatically when the test finishes.
	path := writeTempFile(t, "test.pdf", content)

	result, err := ParseFile(path)
	if err == nil {
		t.Fatal("expected error for invalid PDF")
	}
	if result.Body != "" {
		t.Fatalf("expected empty body for invalid PDF, got %q", result.Body)
	}
}
// TestParseDOCX checks that a .docx file whose content is not a real DOCX
// archive is rejected and yields an empty body.
func TestParseDOCX(t *testing.T) {
	// No manual cleanup: writeTempFile uses t.TempDir(), which is removed
	// automatically when the test finishes.
	path := writeTempFile(t, "test.docx", "not a real docx")

	result, err := ParseFile(path)
	if err == nil {
		t.Fatal("expected error for invalid docx")
	}
	if result.Body != "" {
		t.Fatalf("expected empty body for invalid docx, got %q", result.Body)
	}
}
// TestParseUnsupportedFormat checks that ParseFile refuses an extension that
// has no registered parser.
func TestParseUnsupportedFormat(t *testing.T) {
	// No manual cleanup: writeTempFile uses t.TempDir(), which is removed
	// automatically when the test finishes.
	path := writeTempFile(t, "test.txt", "plain text")

	if _, err := ParseFile(path); err == nil {
		t.Fatal("expected error for unsupported format")
	}
}
// TestIsSupported runs IsSupported over a table of file names covering every
// registered extension plus two unregistered ones.
func TestIsSupported(t *testing.T) {
	type expectation struct {
		path string
		want bool
	}
	table := []expectation{
		{path: "doc.md", want: true},
		{path: "doc.markdown", want: true},
		{path: "doc.pdf", want: true},
		{path: "doc.docx", want: true},
		{path: "doc.txt", want: false},
		{path: "doc.html", want: false},
	}
	for i := range table {
		tc := table[i]
		if got := IsSupported(tc.path); got != tc.want {
			t.Errorf("IsSupported(%q) = %v, want %v", tc.path, got, tc.want)
		}
	}
}
// TestParseFromBytes checks the in-memory entry point end to end with a small
// Markdown payload: the title comes from the file name and the body keeps the
// document text.
func TestParseFromBytes(t *testing.T) {
	const markdown = "# Hello\nThis is markdown with cache invalidation."

	parsed, err := ParseFromBytes("test.md", []byte(markdown))
	if err != nil {
		t.Fatalf("ParseFromBytes error: %v", err)
	}
	if got, want := parsed.Title, "test"; got != want {
		t.Fatalf("title = %q, want %q", got, want)
	}
	if found := strings.Contains(parsed.Body, "cache invalidation"); !found {
		t.Fatal("body should contain cache invalidation")
	}
}
// TestSupportedExtensions checks that the advertised extension list is not
// empty and mentions Markdown (either spelling), PDF and DOCX.
func TestSupportedExtensions(t *testing.T) {
	exts := SupportedExtensions()
	if len(exts) == 0 {
		t.Fatal("expected at least one supported extension")
	}

	// Collect the extensions into a set so each expectation is one lookup.
	seen := make(map[string]bool, len(exts))
	for _, ext := range exts {
		seen[ext] = true
	}

	if !seen[".md"] && !seen[".markdown"] {
		t.Error("expected .md or .markdown in supported extensions")
	}
	if !seen[".pdf"] {
		t.Error("expected .pdf in supported extensions")
	}
	if !seen[".docx"] {
		t.Error("expected .docx in supported extensions")
	}
}
// TestStripFrontmatter feeds stripFrontmatter a table of documents with,
// without, and with unterminated frontmatter. Results are compared after
// trimming surrounding whitespace on both sides.
func TestStripFrontmatter(t *testing.T) {
	type sample struct {
		input string
		want  string
	}
	samples := []sample{
		{"---\ntitle: test\n---\nbody text", "body text"},
		{"no frontmatter", "no frontmatter"},
		{"---\nno end marker", "---\nno end marker"},
		{"---\ntags:\n - go\n - testing\n---\nreal body", "real body"},
		{"---\ntitle: YAML with --- inside\n---\nbody after", "body after"},
	}
	for _, s := range samples {
		got := stripFrontmatter(s.input)
		if strings.TrimSpace(got) == strings.TrimSpace(s.want) {
			continue
		}
		t.Errorf("stripFrontmatter(%q) = %q, want %q", s.input, got, s.want)
	}
}
// TestStripFrontmatterMultilineYAML checks that a "---" appearing inside a
// longer frontmatter line is not mistaken for the closing delimiter.
func TestStripFrontmatterMultilineYAML(t *testing.T) {
	const (
		input = "---\ntitle: Study Notes\ndescription: |\n This block contains --- as part of YAML\n and more content.\n---\nbody content"
		want  = "body content"
	)
	got := stripFrontmatter(input)
	if matches := strings.TrimSpace(got) == strings.TrimSpace(want); !matches {
		t.Errorf("stripFrontmatter with multiline YAML = %q, want %q", got, want)
	}
}
// TestStripFrontmatterCRLF checks that frontmatter delimited with Windows
// line endings is still recognised and removed.
func TestStripFrontmatterCRLF(t *testing.T) {
	const (
		input = "---\r\ntitle: test\r\n---\r\nbody with CRLF"
		want  = "body with CRLF"
	)
	got := stripFrontmatter(input)
	if matches := strings.TrimSpace(got) == strings.TrimSpace(want); !matches {
		t.Errorf("stripFrontmatter CRLF = %q, want %q", got, want)
	}
}
// TestParseFromBytesPathTraversal checks how ParseFromBytes treats file names
// that carry parent-directory components.
func TestParseFromBytesPathTraversal(t *testing.T) {
	// Once the directory components are dropped, "passwd" has no supported
	// extension, so the upload must be rejected.
	if _, err := ParseFromBytes("../../etc/passwd", []byte("malicious")); err == nil {
		t.Fatal("expected error for path traversal filename")
	}

	// The case above fails on the extension alone. With a supported extension
	// the upload is parsed, and the title shows that only the base name of
	// the supplied path was used.
	result, err := ParseFromBytes("../../etc/notes.md", []byte("# Notes\ntraversal body"))
	if err != nil {
		t.Fatalf("ParseFromBytes error: %v", err)
	}
	if result.Title != "notes" {
		t.Fatalf("title = %q, want %q", result.Title, "notes")
	}
}
// TestParseFromBytesInvalidFilename checks that file names with no usable
// base name are rejected.
func TestParseFromBytesInvalidFilename(t *testing.T) {
	invalid := []string{"", ".", string(filepath.Separator)}
	for i := 0; i < len(invalid); i++ {
		if _, err := ParseFromBytes(invalid[i], []byte("content")); err == nil {
			t.Errorf("expected error for filename %q", invalid[i])
		}
	}
}
// TestIsSupportedCaseInsensitive checks that upper- and mixed-case extensions
// are accepted by IsSupported.
func TestIsSupportedCaseInsensitive(t *testing.T) {
	names := [...]string{"DOC.MD", "File.PDF", "Notes.DOCX", "FILE.MARKDOWN"}
	for i := range names {
		if IsSupported(names[i]) {
			continue
		}
		t.Errorf("IsSupported(%q) should be true (case-insensitive)", names[i])
	}
}
// TestParseMarkdownCRLF checks that a Markdown file with Windows line endings
// parses and keeps its text.
func TestParseMarkdownCRLF(t *testing.T) {
	const needle = "Cache invalidation"

	path := writeTempFile(t, "notes.md", "# Hello\r\nCache invalidation with TTL tradeoffs.\r\n")
	parsed, err := ParseFile(path)
	if err != nil {
		t.Fatalf("ParseFile error: %v", err)
	}
	if found := strings.Contains(parsed.Body, needle); !found {
		t.Fatalf("body = %q, want to contain %q", parsed.Body, needle)
	}
}
func TestParseMarkdownEmptyFile(t *testing.T) {
path := writeTempFile(t, "empty.md", "")
result, err := ParseFile(path)
if err != nil {
t.Fatalf("ParseFile error: %v", err)
}
if result.Body != "" {
t.Fatalf("expected empty body, got %q", result.Body)
}
}
// writeTempFile creates a file called name holding content inside a fresh
// t.TempDir() directory and returns its full path. The test is failed
// immediately if the file cannot be written.
func writeTempFile(t *testing.T, name, content string) string {
	t.Helper()

	target := filepath.Join(t.TempDir(), name)
	err := os.WriteFile(target, []byte(content), 0644)
	if err != nil {
		t.Fatalf("write temp file: %v", err)
	}
	return target
}

View File

@@ -0,0 +1,20 @@
package ingestion
import (
	"encoding/xml"
	"io"
	"strings"

	"github.com/nguyenthenguyen/docx"
)
// ParseDOCX extracts the plain text of the DOCX file at path.
//
// The docx package hands back the document part as raw XML markup (see its
// GetContent documentation), so the markup is decoded here and only the text
// is kept. An error is returned when the file cannot be opened as a DOCX
// archive or when its document XML is malformed.
func ParseDOCX(path string) (string, error) {
	reader, err := docx.ReadDocxFile(path)
	if err != nil {
		return "", err
	}
	defer reader.Close()

	text, err := docxPlainText(reader.Editable().GetContent())
	if err != nil {
		return "", err
	}
	return strings.TrimSpace(text), nil
}

// docxPlainText walks WordprocessingML markup and returns its visible text.
// Character data is kept only inside text ("t") elements, each closed
// paragraph ("p") ends a line, and tab/break elements inside a run ("r")
// become "\t" and "\n". Elements are matched by local name, so the result
// does not depend on which namespace prefix the producing application used.
func docxPlainText(documentXML string) (string, error) {
	dec := xml.NewDecoder(strings.NewReader(documentXML))

	var (
		out       strings.Builder
		textDepth int // > 0 while inside a text element
		runDepth  int // > 0 while inside a run
	)
	for {
		tok, err := dec.Token()
		if err == io.EOF {
			return out.String(), nil
		}
		if err != nil {
			return "", err
		}

		switch el := tok.(type) {
		case xml.StartElement:
			switch el.Name.Local {
			case "r":
				runDepth++
			case "t":
				textDepth++
			case "tab":
				// Outside a run, "tab" is a tab-stop definition in the
				// paragraph properties, not a tab character.
				if runDepth > 0 {
					out.WriteByte('\t')
				}
			case "br", "cr":
				if runDepth > 0 {
					out.WriteByte('\n')
				}
			}
		case xml.EndElement:
			switch el.Name.Local {
			case "r":
				runDepth--
			case "t":
				textDepth--
			case "p":
				out.WriteByte('\n')
			}
		case xml.CharData:
			// The decoder has already resolved entities such as &amp;.
			if textDepth > 0 {
				out.Write(el)
			}
		}
	}
}

View File

@@ -0,0 +1,57 @@
package ingestion
import (
"os"
"strings"
)
// ParseMarkdown reads the Markdown file at path and returns its text with any
// leading frontmatter removed and surrounding whitespace trimmed. The only
// error it returns is the one from reading the file.
func ParseMarkdown(path string) (string, error) {
	raw, err := os.ReadFile(path)
	if err != nil {
		return "", err
	}
	return strings.TrimSpace(stripFrontmatter(string(raw))), nil
}
// stripFrontmatter removes a leading frontmatter section from content and
// returns what follows it.
//
// A frontmatter section starts on the first non-blank line, which must
// consist of exactly "---" (trailing spaces, tabs and a carriage return are
// tolerated), and ends at the next line that is likewise exactly "---". A
// UTF-8 byte order mark and leading whitespace are dropped before the check.
// When there is no opening delimiter, or no closing one, the content is
// returned with only that leading trimming applied.
func stripFrontmatter(content string) string {
	// Some editors prepend a BOM, which would otherwise hide the delimiter.
	content = strings.TrimPrefix(content, "\uFEFF")
	content = strings.TrimLeft(content, "\n\r\t ")

	// The opening delimiter must be a whole line. A mere "---" prefix also
	// matches things like "-----" or "---text", which are ordinary content.
	opener := content
	if nl := strings.IndexByte(content, '\n'); nl >= 0 {
		opener = content[:nl]
	}
	if strings.TrimRight(opener, " \t\r") != "---" {
		return content
	}

	rest := content[3:]
	closing := findFMClosing(rest)
	if closing < 0 {
		return content
	}
	return strings.TrimLeft(rest[closing:], "\n\r")
}

// findFMClosing locates the closing frontmatter delimiter in s, where s is
// the text immediately after the opening "---". The remainder of the opening
// line is skipped; every following line is compared with "---" after trailing
// spaces, tabs and carriage returns are removed. It returns the offset in s
// just past the closing line (excluding its newline), or -1 when there is no
// closing line.
func findFMClosing(s string) int {
	// pos always indexes the newline that terminates the previous line.
	pos := strings.IndexByte(s, '\n')
	for pos >= 0 && pos+1 < len(s) {
		lineStart := pos + 1
		line := s[lineStart:]
		if end := strings.IndexByte(line, '\n'); end >= 0 {
			line = line[:end]
		}
		if strings.TrimRight(line, " \t\r") == "---" {
			return lineStart + len(line)
		}
		pos = lineStart + len(line)
	}
	return -1
}

View File

@@ -0,0 +1,74 @@
package ingestion
import (
"fmt"
"io"
"os"
"path/filepath"
"strings"
"github.com/pdfcpu/pdfcpu/pkg/api"
"github.com/pdfcpu/pdfcpu/pkg/pdfcpu/model"
)
// ParsePDF extracts the page content of the PDF at path and returns it as one
// string, one extracted file after another, each followed by a newline.
//
// The file is first loaded with api.ReadContextFile, which also serves as the
// validity check. api.ExtractContentFile then writes its output into a
// temporary directory that is removed before returning. If that produced no
// output, extractPageText is used as a best-effort fallback.
//
// NOTE(review): pdfcpu's content extraction appears to emit page content
// streams rather than plain text — confirm the consumer expects that.
func ParsePDF(path string) (string, error) {
	ctx, err := api.ReadContextFile(path)
	if err != nil {
		return "", err
	}

	tmpDir, err := os.MkdirTemp("", "pdf-extract-*")
	if err != nil {
		return "", err
	}
	defer os.RemoveAll(tmpDir)

	conf := model.NewDefaultConfiguration()
	if err := api.ExtractContentFile(path, tmpDir, nil, conf); err != nil {
		return "", err
	}

	entries, err := os.ReadDir(tmpDir)
	if err != nil {
		return "", err
	}

	var buf strings.Builder
	for _, name := range pageOrderedNames(entries) {
		content, err := os.ReadFile(filepath.Join(tmpDir, name))
		if err != nil {
			return "", fmt.Errorf("read extracted content %s: %w", name, err)
		}
		buf.Write(content)
		buf.WriteString("\n")
	}

	if buf.Len() == 0 {
		// Best-effort fallback: a failure here leaves the body empty rather
		// than failing the whole parse.
		if content, err := extractPageText(ctx); err == nil {
			buf.WriteString(content)
		}
	}
	return buf.String(), nil
}

// pageOrderedNames returns the names of the regular entries, shorter names
// first and names of equal length in their original (os.ReadDir, i.e.
// lexicographic) order. os.ReadDir alone would put "..._10" before "..._2".
//
// NOTE(review): assumes the extracted files differ only by a trailing decimal
// page number, so that this ordering equals page order — confirm against
// pdfcpu's output naming.
func pageOrderedNames(entries []os.DirEntry) []string {
	names := make([]string, 0, len(entries))
	longest := 0
	for _, entry := range entries {
		if entry.IsDir() {
			continue
		}
		name := entry.Name()
		names = append(names, name)
		if len(name) > longest {
			longest = len(name)
		}
	}

	// Stable bucket pass by name length; the number of distinct lengths is
	// tiny, so this stays cheap without pulling in a sort dependency.
	ordered := make([]string, 0, len(names))
	for n := 1; n <= longest; n++ {
		for _, name := range names {
			if len(name) == n {
				ordered = append(ordered, name)
			}
		}
	}
	return ordered
}
// extractPageText concatenates whatever api.ExtractPage yields for each page
// of ctx, from page 1 through ctx.PageCount, appending a newline after each
// page. It is best-effort: a page that cannot be extracted or read is
// skipped, and the returned error is always nil.
func extractPageText(ctx *model.Context) (string, error) {
	var out strings.Builder
	for page := 1; page <= ctx.PageCount; page++ {
		if reader, err := api.ExtractPage(ctx, page); err == nil {
			if raw, err := io.ReadAll(reader); err == nil {
				out.Write(raw)
				out.WriteByte('\n')
			}
		}
	}
	return out.String(), nil
}