75 lines
1.4 KiB
Go
75 lines
1.4 KiB
Go
package ingestion
|
|
|
|
import (
	"fmt"
	"io"
	"os"
	"path/filepath"
	"sort"
	"strconv"
	"strings"

	"github.com/pdfcpu/pdfcpu/pkg/api"
	"github.com/pdfcpu/pdfcpu/pkg/pdfcpu/model"
)
|
|
|
|
func ParsePDF(path string) (string, error) {
|
|
ctx, err := api.ReadContextFile(path)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
|
|
tmpDir, err := os.MkdirTemp("", "pdf-extract-*")
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
defer os.RemoveAll(tmpDir)
|
|
|
|
conf := model.NewDefaultConfiguration()
|
|
if err := api.ExtractContentFile(path, tmpDir, nil, conf); err != nil {
|
|
return "", err
|
|
}
|
|
|
|
var buf strings.Builder
|
|
entries, err := os.ReadDir(tmpDir)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
|
|
for _, entry := range entries {
|
|
if entry.IsDir() {
|
|
continue
|
|
}
|
|
content, err := os.ReadFile(filepath.Join(tmpDir, entry.Name()))
|
|
if err != nil {
|
|
return "", fmt.Errorf("read extracted content %s: %w", entry.Name(), err)
|
|
}
|
|
buf.WriteString(string(content))
|
|
buf.WriteString("\n")
|
|
}
|
|
|
|
if buf.Len() == 0 {
|
|
content, err := extractPageText(ctx)
|
|
if err == nil {
|
|
buf.WriteString(content)
|
|
}
|
|
}
|
|
|
|
return buf.String(), nil
|
|
}
|
|
|
|
func extractPageText(ctx *model.Context) (string, error) {
|
|
var buf strings.Builder
|
|
for i := 1; i <= ctx.PageCount; i++ {
|
|
r, err := api.ExtractPage(ctx, i)
|
|
if err != nil {
|
|
continue
|
|
}
|
|
data, err := io.ReadAll(r)
|
|
if err != nil {
|
|
continue
|
|
}
|
|
buf.Write(data)
|
|
buf.WriteString("\n")
|
|
}
|
|
return buf.String(), nil
|
|
}
|