feat: add file upload for materials (PDF/DOCX) with ingestion pipeline
This commit is contained in:
74
internal/ingestion/parse_pdf.go
Normal file
74
internal/ingestion/parse_pdf.go
Normal file
@@ -0,0 +1,74 @@
|
||||
package ingestion
|
||||
|
||||
import (
	"fmt"
	"io"
	"os"
	"path/filepath"
	"sort"
	"strconv"
	"strings"

	"github.com/pdfcpu/pdfcpu/pkg/api"
	"github.com/pdfcpu/pdfcpu/pkg/pdfcpu/model"
)
|
||||
|
||||
func ParsePDF(path string) (string, error) {
|
||||
ctx, err := api.ReadContextFile(path)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
tmpDir, err := os.MkdirTemp("", "pdf-extract-*")
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
defer os.RemoveAll(tmpDir)
|
||||
|
||||
conf := model.NewDefaultConfiguration()
|
||||
if err := api.ExtractContentFile(path, tmpDir, nil, conf); err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
var buf strings.Builder
|
||||
entries, err := os.ReadDir(tmpDir)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
for _, entry := range entries {
|
||||
if entry.IsDir() {
|
||||
continue
|
||||
}
|
||||
content, err := os.ReadFile(filepath.Join(tmpDir, entry.Name()))
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("read extracted content %s: %w", entry.Name(), err)
|
||||
}
|
||||
buf.WriteString(string(content))
|
||||
buf.WriteString("\n")
|
||||
}
|
||||
|
||||
if buf.Len() == 0 {
|
||||
content, err := extractPageText(ctx)
|
||||
if err == nil {
|
||||
buf.WriteString(content)
|
||||
}
|
||||
}
|
||||
|
||||
return buf.String(), nil
|
||||
}
|
||||
|
||||
func extractPageText(ctx *model.Context) (string, error) {
|
||||
var buf strings.Builder
|
||||
for i := 1; i <= ctx.PageCount; i++ {
|
||||
r, err := api.ExtractPage(ctx, i)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
data, err := io.ReadAll(r)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
buf.Write(data)
|
||||
buf.WriteString("\n")
|
||||
}
|
||||
return buf.String(), nil
|
||||
}
|
||||
Reference in New Issue
Block a user