266 lines
6.2 KiB
Go
266 lines
6.2 KiB
Go
package ingestion
|
|
|
|
import (
|
|
"os"
|
|
"path/filepath"
|
|
"strings"
|
|
"testing"
|
|
)
|
|
|
|
func TestParseMarkdown(t *testing.T) {
|
|
content := `---
|
|
title: test
|
|
tags: [go]
|
|
---
|
|
# Hello
|
|
|
|
This is a test document with idempotent APIs.`
|
|
path := writeTempFile(t, "test.md", content)
|
|
defer os.Remove(path)
|
|
|
|
result, err := ParseFile(path)
|
|
if err != nil {
|
|
t.Fatalf("ParseFile error: %v", err)
|
|
}
|
|
if result.Title != "test" {
|
|
t.Fatalf("title = %q, want %q", result.Title, "test")
|
|
}
|
|
if !strings.Contains(result.Body, "idempotent") {
|
|
t.Fatal("body should contain idempotent")
|
|
}
|
|
if strings.Contains(result.Body, "---") {
|
|
t.Fatal("body should not contain frontmatter")
|
|
}
|
|
}
|
|
|
|
func TestParseMarkdownNoFrontmatter(t *testing.T) {
|
|
content := `# Notes
|
|
|
|
Database indexes speed up queries.`
|
|
path := writeTempFile(t, "notes.md", content)
|
|
defer os.Remove(path)
|
|
|
|
result, err := ParseFile(path)
|
|
if err != nil {
|
|
t.Fatalf("ParseFile error: %v", err)
|
|
}
|
|
if !strings.Contains(result.Body, "Database indexes") {
|
|
t.Fatalf("body = %q, want to contain %q", result.Body, "Database indexes")
|
|
}
|
|
}
|
|
|
|
func TestParsePDF(t *testing.T) {
|
|
content := `%PDF-1.4 fake content`
|
|
path := writeTempFile(t, "test.pdf", content)
|
|
defer os.Remove(path)
|
|
|
|
result, err := ParseFile(path)
|
|
if err == nil {
|
|
t.Fatal("expected error for invalid PDF")
|
|
}
|
|
if result.Body != "" {
|
|
t.Fatalf("expected empty body for invalid PDF, got %q", result.Body)
|
|
}
|
|
}
|
|
|
|
func TestParseDOCX(t *testing.T) {
|
|
path := writeTempFile(t, "test.docx", "not a real docx")
|
|
defer os.Remove(path)
|
|
|
|
result, err := ParseFile(path)
|
|
if err == nil {
|
|
t.Fatal("expected error for invalid docx")
|
|
}
|
|
if result.Body != "" {
|
|
t.Fatalf("expected empty body for invalid docx, got %q", result.Body)
|
|
}
|
|
}
|
|
|
|
func TestParseUnsupportedFormat(t *testing.T) {
|
|
path := writeTempFile(t, "test.txt", "plain text")
|
|
defer os.Remove(path)
|
|
|
|
_, err := ParseFile(path)
|
|
if err == nil {
|
|
t.Fatal("expected error for unsupported format")
|
|
}
|
|
}
|
|
|
|
func TestIsSupported(t *testing.T) {
|
|
cases := []struct {
|
|
path string
|
|
want bool
|
|
}{
|
|
{"doc.md", true},
|
|
{"doc.markdown", true},
|
|
{"doc.pdf", true},
|
|
{"doc.docx", true},
|
|
{"doc.txt", false},
|
|
{"doc.html", false},
|
|
}
|
|
for _, c := range cases {
|
|
got := IsSupported(c.path)
|
|
if got != c.want {
|
|
t.Errorf("IsSupported(%q) = %v, want %v", c.path, got, c.want)
|
|
}
|
|
}
|
|
}
|
|
|
|
func TestParseFromBytes(t *testing.T) {
|
|
content := "# Hello\nThis is markdown with cache invalidation."
|
|
result, err := ParseFromBytes("test.md", []byte(content))
|
|
if err != nil {
|
|
t.Fatalf("ParseFromBytes error: %v", err)
|
|
}
|
|
if result.Title != "test" {
|
|
t.Fatalf("title = %q, want %q", result.Title, "test")
|
|
}
|
|
if !strings.Contains(result.Body, "cache invalidation") {
|
|
t.Fatal("body should contain cache invalidation")
|
|
}
|
|
}
|
|
|
|
func TestSupportedExtensions(t *testing.T) {
|
|
exts := SupportedExtensions()
|
|
if len(exts) == 0 {
|
|
t.Fatal("expected at least one supported extension")
|
|
}
|
|
hasMD := false
|
|
hasPDF := false
|
|
hasDOCX := false
|
|
for _, ext := range exts {
|
|
switch ext {
|
|
case ".md":
|
|
hasMD = true
|
|
case ".markdown":
|
|
hasMD = true
|
|
case ".pdf":
|
|
hasPDF = true
|
|
case ".docx":
|
|
hasDOCX = true
|
|
}
|
|
}
|
|
if !hasMD {
|
|
t.Error("expected .md or .markdown in supported extensions")
|
|
}
|
|
if !hasPDF {
|
|
t.Error("expected .pdf in supported extensions")
|
|
}
|
|
if !hasDOCX {
|
|
t.Error("expected .docx in supported extensions")
|
|
}
|
|
}
|
|
|
|
func TestStripFrontmatter(t *testing.T) {
|
|
cases := []struct {
|
|
input string
|
|
want string
|
|
}{
|
|
{
|
|
input: "---\ntitle: test\n---\nbody text",
|
|
want: "body text",
|
|
},
|
|
{
|
|
input: "no frontmatter",
|
|
want: "no frontmatter",
|
|
},
|
|
{
|
|
input: "---\nno end marker",
|
|
want: "---\nno end marker",
|
|
},
|
|
{
|
|
input: "---\ntags:\n - go\n - testing\n---\nreal body",
|
|
want: "real body",
|
|
},
|
|
{
|
|
input: "---\ntitle: YAML with --- inside\n---\nbody after",
|
|
want: "body after",
|
|
},
|
|
}
|
|
for _, c := range cases {
|
|
got := stripFrontmatter(c.input)
|
|
if strings.TrimSpace(got) != strings.TrimSpace(c.want) {
|
|
t.Errorf("stripFrontmatter(%q) = %q, want %q", c.input, got, c.want)
|
|
}
|
|
}
|
|
}
|
|
|
|
func TestStripFrontmatterMultilineYAML(t *testing.T) {
|
|
input := "---\ntitle: Study Notes\ndescription: |\n This block contains --- as part of YAML\n and more content.\n---\nbody content"
|
|
got := stripFrontmatter(input)
|
|
want := "body content"
|
|
if strings.TrimSpace(got) != strings.TrimSpace(want) {
|
|
t.Errorf("stripFrontmatter with multiline YAML = %q, want %q", got, want)
|
|
}
|
|
}
|
|
|
|
func TestStripFrontmatterCRLF(t *testing.T) {
|
|
input := "---\r\ntitle: test\r\n---\r\nbody with CRLF"
|
|
got := stripFrontmatter(input)
|
|
want := "body with CRLF"
|
|
if strings.TrimSpace(got) != strings.TrimSpace(want) {
|
|
t.Errorf("stripFrontmatter CRLF = %q, want %q", got, want)
|
|
}
|
|
}
|
|
|
|
func TestParseFromBytesPathTraversal(t *testing.T) {
|
|
filename := "../../etc/passwd"
|
|
_, err := ParseFromBytes(filename, []byte("malicious"))
|
|
if err == nil {
|
|
t.Fatal("expected error for path traversal filename")
|
|
}
|
|
}
|
|
|
|
func TestParseFromBytesInvalidFilename(t *testing.T) {
|
|
for _, name := range []string{"", ".", string(filepath.Separator)} {
|
|
_, err := ParseFromBytes(name, []byte("content"))
|
|
if err == nil {
|
|
t.Errorf("expected error for filename %q", name)
|
|
}
|
|
}
|
|
}
|
|
|
|
func TestIsSupportedCaseInsensitive(t *testing.T) {
|
|
cases := []string{"DOC.MD", "File.PDF", "Notes.DOCX", "FILE.MARKDOWN"}
|
|
for _, name := range cases {
|
|
if !IsSupported(name) {
|
|
t.Errorf("IsSupported(%q) should be true (case-insensitive)", name)
|
|
}
|
|
}
|
|
}
|
|
|
|
func TestParseMarkdownCRLF(t *testing.T) {
|
|
content := "# Hello\r\nCache invalidation with TTL tradeoffs.\r\n"
|
|
path := writeTempFile(t, "notes.md", content)
|
|
|
|
result, err := ParseFile(path)
|
|
if err != nil {
|
|
t.Fatalf("ParseFile error: %v", err)
|
|
}
|
|
if !strings.Contains(result.Body, "Cache invalidation") {
|
|
t.Fatalf("body = %q, want to contain %q", result.Body, "Cache invalidation")
|
|
}
|
|
}
|
|
|
|
func TestParseMarkdownEmptyFile(t *testing.T) {
|
|
path := writeTempFile(t, "empty.md", "")
|
|
|
|
result, err := ParseFile(path)
|
|
if err != nil {
|
|
t.Fatalf("ParseFile error: %v", err)
|
|
}
|
|
if result.Body != "" {
|
|
t.Fatalf("expected empty body, got %q", result.Body)
|
|
}
|
|
}
|
|
|
|
// writeTempFile creates a file called name with the given content inside a
// fresh directory obtained from t.TempDir() and returns the file's full path.
// The file is written with mode 0644. If the write fails, the calling test is
// aborted via t.Fatalf; t.Helper() makes that failure point at the caller's
// line rather than at this helper.
func writeTempFile(t *testing.T, name, content string) string {
	t.Helper()

	target := filepath.Join(t.TempDir(), name)
	err := os.WriteFile(target, []byte(content), 0644)
	if err != nil {
		t.Fatalf("write temp file: %v", err)
	}
	return target
}
|