Files
tutor-service/internal/ingestion/parse_pdf.go

75 lines
1.4 KiB
Go

package ingestion
import (
	"fmt"
	"io"
	"os"
	"path/filepath"
	"sort"
	"strconv"
	"strings"

	"github.com/pdfcpu/pdfcpu/pkg/api"
	"github.com/pdfcpu/pdfcpu/pkg/pdfcpu/model"
)
// ParsePDF extracts the content of the PDF at path and returns it as a single
// string.
//
// It asks pdfcpu to dump the document's content into a temporary directory,
// then concatenates every regular file found there, ordered by the page number
// at the end of each file name, writing a newline after each file. If no file
// was produced it falls back to extractPageText on the parsed document. The
// temporary directory is removed before ParsePDF returns.
//
// A non-nil error is returned when the PDF cannot be read, the temporary
// directory cannot be created or listed, pdfcpu's extraction fails, an
// extracted file cannot be read back, or the fallback fails. The returned
// string is empty whenever the error is non-nil.
func ParsePDF(path string) (string, error) {
	// The parsed context is only consumed by the fallback at the end; reading
	// it first also makes an unreadable PDF fail before any temp directory is
	// created.
	ctx, err := api.ReadContextFile(path)
	if err != nil {
		return "", fmt.Errorf("read pdf %q: %w", path, err)
	}

	tmpDir, err := os.MkdirTemp("", "pdf-extract-*")
	if err != nil {
		return "", fmt.Errorf("create temp dir for pdf extraction: %w", err)
	}
	defer os.RemoveAll(tmpDir)

	conf := model.NewDefaultConfiguration()
	if err := api.ExtractContentFile(path, tmpDir, nil, conf); err != nil {
		return "", fmt.Errorf("extract content of %q: %w", path, err)
	}

	entries, err := os.ReadDir(tmpDir)
	if err != nil {
		return "", fmt.Errorf("list extracted content: %w", err)
	}

	// os.ReadDir sorts by file name, which places "..._10" before "..._2".
	// Re-order by the trailing number so pages are concatenated in document
	// order. The sort is stable, so names without a trailing number keep
	// their original relative order.
	// NOTE(review): assumes pdfcpu names per-page files with the page number
	// as the last component before the extension — confirm for the pinned
	// pdfcpu version.
	sort.SliceStable(entries, func(i, j int) bool {
		return extractedPageNumber(entries[i].Name()) < extractedPageNumber(entries[j].Name())
	})

	var buf strings.Builder
	for _, entry := range entries {
		if entry.IsDir() {
			continue
		}
		content, err := os.ReadFile(filepath.Join(tmpDir, entry.Name()))
		if err != nil {
			return "", fmt.Errorf("read extracted content %s: %w", entry.Name(), err)
		}
		buf.Write(content)
		buf.WriteString("\n")
	}

	if buf.Len() > 0 {
		return buf.String(), nil
	}

	// Every file contributes at least a newline, so an empty buffer means no
	// file was extracted at all: fall back to page-by-page extraction.
	text, err := extractPageText(ctx)
	if err != nil {
		return "", fmt.Errorf("extract page text of %q: %w", path, err)
	}
	return text, nil
}

// extractedPageNumber returns the decimal number that ends name once its
// extension is removed, or -1 when there is no such number or it does not
// fit in an int.
func extractedPageNumber(name string) int {
	stem := strings.TrimSuffix(name, filepath.Ext(name))
	start := len(stem)
	for start > 0 && stem[start-1] >= '0' && stem[start-1] <= '9' {
		start--
	}
	if start == len(stem) {
		return -1
	}
	n, err := strconv.Atoi(stem[start:])
	if err != nil {
		return -1
	}
	return n
}
// extractPageText concatenates, in page order, the bytes that api.ExtractPage
// yields for pages 1 through ctx.PageCount, writing a newline after each page.
//
// Extraction is best-effort: a page that cannot be extracted or read is
// skipped. A non-nil error is returned only when ctx is nil, or when at least
// one page failed and no page succeeded; in the latter case the error wraps
// the first failure. A document with zero pages yields ("", nil).
//
// NOTE(review): api.ExtractPage appears to produce a single-page PDF stream
// rather than plain text, so the result may contain PDF data — confirm
// against the pdfcpu documentation for the version in use.
func extractPageText(ctx *model.Context) (string, error) {
	if ctx == nil {
		return "", fmt.Errorf("extract page text: nil pdf context")
	}

	var (
		buf       strings.Builder
		firstErr  error
		extracted int
	)
	for i := 1; i <= ctx.PageCount; i++ {
		r, err := api.ExtractPage(ctx, i)
		if err != nil {
			// Remember only the first failure; later pages may still work.
			if firstErr == nil {
				firstErr = fmt.Errorf("extract page %d: %w", i, err)
			}
			continue
		}
		data, err := io.ReadAll(r)
		if err != nil {
			if firstErr == nil {
				firstErr = fmt.Errorf("read page %d: %w", i, err)
			}
			continue
		}
		buf.Write(data)
		buf.WriteString("\n")
		extracted++
	}

	// Partial results are still returned without error; only a total failure
	// is reported so callers can tell it apart from an empty document.
	if extracted == 0 && firstErr != nil {
		return "", fmt.Errorf("none of %d pages could be extracted: %w", ctx.PageCount, firstErr)
	}
	return buf.String(), nil
}