package ingestion

import (
	"fmt"
	"io"
	"os"
	"path/filepath"
	"sort"
	"strconv"
	"strings"

	"github.com/pdfcpu/pdfcpu/pkg/api"
	"github.com/pdfcpu/pdfcpu/pkg/pdfcpu/model"
)

// ParsePDF extracts the content of the PDF file at path and returns it as a
// single string.
//
// It asks pdfcpu to dump the document's content into a temporary directory
// (one file per page), then concatenates those files in page order, each
// followed by a newline. If that yields nothing at all, it falls back to
// extractPageText on the already-loaded document context.
//
// An error is returned when the PDF cannot be read, the temporary directory
// cannot be created, the extraction fails, or an extracted file cannot be
// read. The fallback is best-effort and never produces an error.
func ParsePDF(path string) (string, error) {
	ctx, err := api.ReadContextFile(path)
	if err != nil {
		return "", fmt.Errorf("read pdf %q: %w", path, err)
	}

	tmpDir, err := os.MkdirTemp("", "pdf-extract-*")
	if err != nil {
		return "", fmt.Errorf("create temp dir: %w", err)
	}
	// Cleanup is best-effort; a failure to remove the scratch directory must
	// not mask a successful extraction.
	defer os.RemoveAll(tmpDir)

	conf := model.NewDefaultConfiguration()
	if err := api.ExtractContentFile(path, tmpDir, nil, conf); err != nil {
		return "", fmt.Errorf("extract content of %q: %w", path, err)
	}

	entries, err := os.ReadDir(tmpDir)
	if err != nil {
		return "", fmt.Errorf("list extracted content: %w", err)
	}

	names := make([]string, 0, len(entries))
	for _, entry := range entries {
		if entry.IsDir() {
			continue
		}
		names = append(names, entry.Name())
	}
	// os.ReadDir sorts lexically, which would place "..._page_10" before
	// "..._page_2". Re-sort numerically so pages are concatenated in document
	// order.
	sortByPageNumber(names)

	var buf strings.Builder
	for _, name := range names {
		content, err := os.ReadFile(filepath.Join(tmpDir, name))
		if err != nil {
			return "", fmt.Errorf("read extracted content %s: %w", name, err)
		}
		buf.Write(content)
		buf.WriteString("\n")
	}

	// Nothing was written to the scratch directory: fall back to per-page
	// extraction. The fallback is deliberately best-effort, so its error (if
	// any) is not propagated.
	if buf.Len() == 0 {
		content, err := extractPageText(ctx)
		if err == nil {
			buf.WriteString(content)
		}
	}

	return buf.String(), nil
}

// sortByPageNumber orders file names by the decimal number that ends their
// stem (the name without its extension). Names without such a number are
// placed after all numbered names; ties are broken lexically.
func sortByPageNumber(names []string) {
	sort.SliceStable(names, func(i, j int) bool {
		pi, okI := trailingPageNumber(names[i])
		pj, okJ := trailingPageNumber(names[j])
		switch {
		case okI != okJ:
			return okI
		case okI && pi != pj:
			return pi < pj
		default:
			return names[i] < names[j]
		}
	})
}

// trailingPageNumber returns the decimal number that ends name's stem, e.g.
// 12 for "doc_Content_page_12.txt". The boolean is false when the stem does
// not end in digits or the digits do not fit in an int.
func trailingPageNumber(name string) (int, bool) {
	stem := strings.TrimSuffix(name, filepath.Ext(name))

	start := len(stem)
	for start > 0 && stem[start-1] >= '0' && stem[start-1] <= '9' {
		start--
	}
	if start == len(stem) {
		return 0, false
	}

	n, err := strconv.Atoi(stem[start:])
	if err != nil {
		// Only reachable for a digit run too long to fit in an int.
		return 0, false
	}
	return n, true
}

// extractPageText walks pages 1..ctx.PageCount, reads everything
// api.ExtractPage yields for each page, and returns the results joined with
// trailing newlines.
//
// Pages that fail to extract or to read are skipped silently, so the returned
// error is always nil; it is kept in the signature for callers that check it.
//
// NOTE(review): the bytes are taken verbatim from api.ExtractPage; whether
// they are plain text is not established here — confirm against pdfcpu.
func extractPageText(ctx *model.Context) (string, error) {
	var buf strings.Builder
	for i := 1; i <= ctx.PageCount; i++ {
		r, err := api.ExtractPage(ctx, i)
		if err != nil {
			continue // best-effort: skip pages that cannot be extracted
		}
		data, err := io.ReadAll(r)
		if err != nil {
			continue // best-effort: drop a page whose stream cannot be read
		}
		buf.Write(data)
		buf.WriteString("\n")
	}
	return buf.String(), nil
}