feat: add file upload for materials (PDF/DOCX) with ingestion pipeline
This commit is contained in:
57
internal/ingestion/parse_markdown.go
Normal file
57
internal/ingestion/parse_markdown.go
Normal file
@@ -0,0 +1,57 @@
|
||||
package ingestion
|
||||
|
||||
import (
|
||||
"os"
|
||||
"strings"
|
||||
)
|
||||
|
||||
func ParseMarkdown(path string) (string, error) {
|
||||
data, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
content := string(data)
|
||||
content = stripFrontmatter(content)
|
||||
content = strings.TrimSpace(content)
|
||||
return content, nil
|
||||
}
|
||||
|
||||
func stripFrontmatter(content string) string {
|
||||
content = strings.TrimLeft(content, "\n\r\t ")
|
||||
if !strings.HasPrefix(content, "---") {
|
||||
return content
|
||||
}
|
||||
|
||||
rest := content[3:]
|
||||
closing := findFMClosing(rest)
|
||||
if closing < 0 {
|
||||
return content
|
||||
}
|
||||
|
||||
return strings.TrimLeft(rest[closing:], "\n\r")
|
||||
}
|
||||
|
||||
// findFMClosing locates the closing front matter fence in s.
//
// s is expected to be the text that follows the opening "---", so everything
// up to and including the first newline (the remainder of the opening line)
// is skipped. Each subsequent line is then checked; a line counts as the
// closing fence when it equals "---" after trailing spaces, tabs and "\r"
// are removed.
//
// It returns the byte offset in s immediately after the closing fence line's
// content (that is, the position of its terminating "\n", or len(s) when the
// fence is the last line without a newline). It returns -1 when s has no
// newline or no closing fence is found.
func findFMClosing(s string) int {
	first := strings.IndexByte(s, '\n')
	if first < 0 {
		return -1
	}

	for start := first + 1; start < len(s); {
		// end is the offset of this line's "\n", or len(s) for a final
		// unterminated line.
		end := len(s)
		if rel := strings.IndexByte(s[start:], '\n'); rel >= 0 {
			end = start + rel
		}

		// Tolerate CRLF line endings and stray trailing blanks after the
		// fence, which editors commonly leave behind.
		if strings.TrimRight(s[start:end], " \t\r") == "---" {
			return end
		}

		start = end + 1
	}
	return -1
}
|
||||
Reference in New Issue
Block a user