package ingestion

import (
	"os"
	"path/filepath"
	"strings"
	"testing"
)

// TestParseMarkdown checks that ParseFile, given a markdown file that starts
// with a frontmatter section, reports the title "test", keeps the prose in
// Body, and leaves no "---" delimiter in Body.
func TestParseMarkdown(t *testing.T) {
	// Every newline is spelled out so that each "---" delimiter sits on its
	// own line, the same shape the stripFrontmatter fixtures use.
	content := "---\ntitle: test\ntags: [go]\n---\n# Hello\nThis is a test document with idempotent APIs."
	path := writeTempFile(t, "test.md", content)

	result, err := ParseFile(path)
	if err != nil {
		t.Fatalf("ParseFile error: %v", err)
	}
	if result.Title != "test" {
		t.Fatalf("title = %q, want %q", result.Title, "test")
	}
	if !strings.Contains(result.Body, "idempotent") {
		t.Fatal("body should contain idempotent")
	}
	if strings.Contains(result.Body, "---") {
		t.Fatal("body should not contain frontmatter")
	}
}

// TestParseMarkdownNoFrontmatter checks that a markdown file without
// frontmatter is parsed without error and that its text ends up in Body.
func TestParseMarkdownNoFrontmatter(t *testing.T) {
	content := `# Notes Database indexes speed up queries.`
	path := writeTempFile(t, "notes.md", content)

	result, err := ParseFile(path)
	if err != nil {
		t.Fatalf("ParseFile error: %v", err)
	}
	if !strings.Contains(result.Body, "Database indexes") {
		t.Fatalf("body = %q, want to contain %q", result.Body, "Database indexes")
	}
}

// TestParsePDF checks that ParseFile rejects a .pdf file whose content is not
// a real PDF document and returns an empty Body alongside the error.
func TestParsePDF(t *testing.T) {
	content := `%PDF-1.4 fake content`
	path := writeTempFile(t, "test.pdf", content)

	result, err := ParseFile(path)
	if err == nil {
		t.Fatal("expected error for invalid PDF")
	}
	// NOTE(review): result is read after a non-nil error; this assumes
	// ParseFile still returns a usable (non-nil) value on failure. Confirm.
	if result.Body != "" {
		t.Fatalf("expected empty body for invalid PDF, got %q", result.Body)
	}
}

// TestParseDOCX checks that ParseFile rejects a .docx file whose content is
// not a real DOCX document and returns an empty Body alongside the error.
func TestParseDOCX(t *testing.T) {
	path := writeTempFile(t, "test.docx", "not a real docx")

	result, err := ParseFile(path)
	if err == nil {
		t.Fatal("expected error for invalid docx")
	}
	// NOTE(review): result is read after a non-nil error; this assumes
	// ParseFile still returns a usable (non-nil) value on failure. Confirm.
	if result.Body != "" {
		t.Fatalf("expected empty body for invalid docx, got %q", result.Body)
	}
}

// TestParseUnsupportedFormat checks that ParseFile returns an error for a
// .txt file.
func TestParseUnsupportedFormat(t *testing.T) {
	path := writeTempFile(t, "test.txt", "plain text")

	if _, err := ParseFile(path); err == nil {
		t.Fatal("expected error for unsupported format")
	}
}

// TestIsSupported checks IsSupported against a table of lower-case file
// names, covering both accepted and rejected extensions.
func TestIsSupported(t *testing.T) {
	cases := []struct {
		path string
		want bool
	}{
		{"doc.md", true},
		{"doc.markdown", true},
		{"doc.pdf", true},
		{"doc.docx", true},
		{"doc.txt", false},
		{"doc.html", false},
	}

	for _, c := range cases {
		// One subtest per file name so a failure identifies the offending case.
		t.Run(c.path, func(t *testing.T) {
			if got := IsSupported(c.path); got != c.want {
				t.Errorf("IsSupported(%q) = %v, want %v", c.path, got, c.want)
			}
		})
	}
}

// TestParseFromBytes checks that ParseFromBytes, given the name "test.md" and
// in-memory markdown, reports the title "test" and keeps the text in Body.
func TestParseFromBytes(t *testing.T) {
	content := "# Hello\nThis is markdown with cache invalidation."

	result, err := ParseFromBytes("test.md", []byte(content))
	if err != nil {
		t.Fatalf("ParseFromBytes error: %v", err)
	}
	if result.Title != "test" {
		t.Fatalf("title = %q, want %q", result.Title, "test")
	}
	if !strings.Contains(result.Body, "cache invalidation") {
		t.Fatal("body should contain cache invalidation")
	}
}

// TestSupportedExtensions checks that SupportedExtensions returns a non-empty
// list that includes a markdown extension, ".pdf" and ".docx".
func TestSupportedExtensions(t *testing.T) {
	exts := SupportedExtensions()
	if len(exts) == 0 {
		t.Fatal("expected at least one supported extension")
	}

	var hasMD, hasPDF, hasDOCX bool
	for _, ext := range exts {
		switch ext {
		case ".md", ".markdown":
			// Either spelling satisfies the markdown requirement.
			hasMD = true
		case ".pdf":
			hasPDF = true
		case ".docx":
			hasDOCX = true
		}
	}

	if !hasMD {
		t.Error("expected .md or .markdown in supported extensions")
	}
	if !hasPDF {
		t.Error("expected .pdf in supported extensions")
	}
	if !hasDOCX {
		t.Error("expected .docx in supported extensions")
	}
}

// TestStripFrontmatter runs stripFrontmatter over a table of inputs. Results
// are compared after strings.TrimSpace, so surrounding whitespace is not
// significant.
func TestStripFrontmatter(t *testing.T) {
	cases := []struct {
		input string
		want  string
	}{
		{
			input: "---\ntitle: test\n---\nbody text",
			want:  "body text",
		},
		{
			input: "no frontmatter",
			want:  "no frontmatter",
		},
		{
			// An opening delimiter without a closing one is left untouched.
			input: "---\nno end marker",
			want:  "---\nno end marker",
		},
		{
			input: "---\ntags:\n - go\n - testing\n---\nreal body",
			want:  "real body",
		},
		{
			// A "---" in the middle of a line must not end the frontmatter.
			input: "---\ntitle: YAML with --- inside\n---\nbody after",
			want:  "body after",
		},
	}

	for _, c := range cases {
		got := stripFrontmatter(c.input)
		if strings.TrimSpace(got) != strings.TrimSpace(c.want) {
			t.Errorf("stripFrontmatter(%q) = %q, want %q", c.input, got, c.want)
		}
	}
}

// TestStripFrontmatterMultilineYAML checks that a "---" inside a multi-line
// YAML value does not end the frontmatter early.
func TestStripFrontmatterMultilineYAML(t *testing.T) {
	input := "---\ntitle: Study Notes\ndescription: |\n This block contains --- as part of YAML\n and more content.\n---\nbody content"
	want := "body content"

	got := stripFrontmatter(input)
	if strings.TrimSpace(got) != strings.TrimSpace(want) {
		t.Errorf("stripFrontmatter with multiline YAML = %q, want %q", got, want)
	}
}

// TestStripFrontmatterCRLF checks that stripFrontmatter handles input that
// uses CRLF line endings.
func TestStripFrontmatterCRLF(t *testing.T) {
	input := "---\r\ntitle: test\r\n---\r\nbody with CRLF"
	want := "body with CRLF"

	got := stripFrontmatter(input)
	if strings.TrimSpace(got) != strings.TrimSpace(want) {
		t.Errorf("stripFrontmatter CRLF = %q, want %q", got, want)
	}
}

// TestParseFromBytesPathTraversal checks that ParseFromBytes returns an error
// for a file name that contains parent-directory components.
func TestParseFromBytesPathTraversal(t *testing.T) {
	filename := "../../etc/passwd"

	if _, err := ParseFromBytes(filename, []byte("malicious")); err == nil {
		t.Fatal("expected error for path traversal filename")
	}
}

// TestParseFromBytesInvalidFilename checks that ParseFromBytes returns an
// error for an empty name, ".", and a bare path separator.
func TestParseFromBytesInvalidFilename(t *testing.T) {
	for _, name := range []string{"", ".", string(filepath.Separator)} {
		if _, err := ParseFromBytes(name, []byte("content")); err == nil {
			t.Errorf("expected error for filename %q", name)
		}
	}
}

// TestIsSupportedCaseInsensitive checks that IsSupported accepts upper-case
// and mixed-case spellings of the supported extensions.
func TestIsSupportedCaseInsensitive(t *testing.T) {
	cases := []string{"DOC.MD", "File.PDF", "Notes.DOCX", "FILE.MARKDOWN"}

	for _, name := range cases {
		// One subtest per file name so a failure identifies the offending case.
		t.Run(name, func(t *testing.T) {
			if !IsSupported(name) {
				t.Errorf("IsSupported(%q) should be true (case-insensitive)", name)
			}
		})
	}
}

// TestParseMarkdownCRLF checks that ParseFile handles a markdown file with
// CRLF line endings and keeps its text in Body.
func TestParseMarkdownCRLF(t *testing.T) {
	content := "# Hello\r\nCache invalidation with TTL tradeoffs.\r\n"
	path := writeTempFile(t, "notes.md", content)

	result, err := ParseFile(path)
	if err != nil {
		t.Fatalf("ParseFile error: %v", err)
	}
	if !strings.Contains(result.Body, "Cache invalidation") {
		t.Fatalf("body = %q, want to contain %q", result.Body, "Cache invalidation")
	}
}

// TestParseMarkdownEmptyFile checks that ParseFile accepts an empty markdown
// file and returns an empty Body without an error.
func TestParseMarkdownEmptyFile(t *testing.T) {
	path := writeTempFile(t, "empty.md", "")

	result, err := ParseFile(path)
	if err != nil {
		t.Fatalf("ParseFile error: %v", err)
	}
	if result.Body != "" {
		t.Fatalf("expected empty body, got %q", result.Body)
	}
}

// writeTempFile writes content to a file called name inside a directory
// obtained from t.TempDir and returns the file's full path. It fails the test
// immediately if the write fails. The testing package removes that directory
// when the test finishes, so callers do not need to delete the file.
func writeTempFile(t *testing.T, name, content string) string {
	t.Helper()

	path := filepath.Join(t.TempDir(), name)
	if err := os.WriteFile(path, []byte(content), 0644); err != nil {
		t.Fatalf("write temp file: %v", err)
	}
	return path
}