// DocumentParserService.java
package com.taxonomy.provenance.service;
import com.taxonomy.dto.DocumentParseResult;
import com.taxonomy.dto.RequirementCandidate;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.stereotype.Service;
import org.springframework.web.multipart.MultipartFile;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.text.BreakIterator;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
/**
* Parses uploaded PDF and DOCX documents into requirement candidates.
*
* <p>This first-stage parser performs:
* <ul>
* <li>Raw text extraction</li>
* <li>Section/heading detection</li>
* <li>Requirement candidate splitting (paragraph-based)</li>
* </ul>
*
* <p>It does <em>not</em> attempt to fully interpret legal semantics.
*/
@Service
public class DocumentParserService {

    private static final Logger log = LoggerFactory.getLogger(DocumentParserService.class);

    /** Minimum length for a paragraph to be considered a requirement candidate. */
    private static final int MIN_CANDIDATE_LENGTH = 40;

    /** Maximum length for a single candidate text; longer paragraphs are split. */
    private static final int MAX_CANDIDATE_LENGTH = 2000;

    /** Maximum characters for the raw text preview stored on the result. */
    private static final int RAW_TEXT_PREVIEW_LENGTH = 2000;

    private static final String MIME_PDF = "application/pdf";
    private static final String MIME_DOCX =
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document";

    /**
     * Pattern to detect likely section headings: "§ 1", "Art. 2", "3.1 Title",
     * Roman numerals ("IV. "), and Kapitel/Chapter/Section/Abschnitt/Artikel N.
     */
    private static final Pattern HEADING_PATTERN = Pattern.compile(
            "^(?:§\\s*\\d+|Art\\.?\\s*\\d+|\\d+\\.\\d*\\s+[A-ZÄÖÜ]|[IVXLCDM]+\\.\\s+|Kapitel\\s+\\d+|" +
            "Chapter\\s+\\d+|Section\\s+\\d+|Abschnitt\\s+\\d+|Artikel\\s+\\d+).*",
            Pattern.CASE_INSENSITIVE);

    /**
     * Pattern to detect [H1], [H2], etc. markers injected during DOCX extraction.
     * The captured level is capped at two digits so {@code Integer.parseInt} can
     * never overflow on adversarial input; real levels are clamped to 1..9 anyway
     * (see {@link #extractHeadingLevel}).
     */
    private static final Pattern H_MARKER_PATTERN = Pattern.compile(
            "^\\[H(\\d{1,2})]\\s*(.+)$");

    /** Pattern to extract the first digit run from a style ID (e.g. "Heading2" → 2). */
    private static final Pattern STYLE_LEVEL_PATTERN = Pattern.compile("\\d+");

    /** Represents a detected heading with its hierarchical level (1 = top level). */
    record HeadingMatch(int level, String text) {}

    /**
     * Parses an uploaded document and extracts requirement candidates.
     *
     * @param file the uploaded PDF or DOCX file
     * @return the parse result with metadata, a raw-text preview and the candidates
     * @throws IOException if the file cannot be read or its type is unsupported
     */
    public DocumentParseResult parse(MultipartFile file) throws IOException {
        String contentType = detectMimeType(file);
        String rawText;
        int pageCount;
        if (MIME_PDF.equals(contentType)) {
            // PDFBox 3.x entry point: Loader.loadPDF replaces PDDocument.load
            try (PDDocument pdf = Loader.loadPDF(file.getBytes())) {
                pageCount = pdf.getNumberOfPages();
                rawText = new PDFTextStripper().getText(pdf);
            }
        } else if (MIME_DOCX.equals(contentType)) {
            try (InputStream in = file.getInputStream();
                 XWPFDocument doc = new XWPFDocument(in)) {
                pageCount = docxPageCount(doc);
                rawText = extractDocxText(doc);
            }
        } else {
            throw new IOException("Unsupported file type: " + contentType
                    + ". Only PDF and DOCX files are supported.");
        }
        List<RequirementCandidate> candidates = extractCandidates(rawText);
        DocumentParseResult result = new DocumentParseResult();
        result.setFileName(file.getOriginalFilename());
        result.setMimeType(contentType);
        result.setTotalPages(pageCount);
        result.setRawTextPreview(rawText.length() > RAW_TEXT_PREVIEW_LENGTH
                ? rawText.substring(0, RAW_TEXT_PREVIEW_LENGTH) + "…"
                : rawText);
        result.setCandidates(candidates);
        result.setWarnings(new ArrayList<>());
        if (candidates.isEmpty()) {
            result.getWarnings().add("No requirement candidates were extracted from this document.");
        }
        log.info("Parsed document '{}': {} pages, {} candidates extracted",
                file.getOriginalFilename(), pageCount, candidates.size());
        return result;
    }

    /**
     * Reads the page count from the DOCX extended properties, falling back to 1
     * when the property is absent or non-positive (minimal DOCX files often
     * omit it entirely, which surfaces as an exception from POI).
     */
    private static int docxPageCount(XWPFDocument doc) {
        try {
            int pages = doc.getProperties().getExtendedProperties()
                    .getUnderlyingProperties().getPages();
            return pages > 0 ? pages : 1;
        } catch (Exception ignored) {
            // Page count unavailable in minimal DOCX files
            return 1;
        }
    }

    /**
     * Computes a SHA-256 content hash (lowercase hex) for deduplication.
     *
     * @param content the raw file bytes
     * @return the 64-character hex digest
     */
    public String computeContentHash(byte[] content) {
        try {
            MessageDigest digest = MessageDigest.getInstance("SHA-256");
            return HexFormat.of().formatHex(digest.digest(content));
        } catch (NoSuchAlgorithmException e) {
            // SHA-256 is mandatory for every JVM; this is effectively unreachable.
            throw new IllegalStateException("SHA-256 not available", e);
        }
    }

    // ── Private helpers ────────────────────────────────────────────────────────

    /**
     * Resolves the effective MIME type from the declared content type, falling
     * back to the file extension. Returns the declared type (or octet-stream)
     * when neither matches, so {@link #parse} can report it in its error message.
     */
    private String detectMimeType(MultipartFile file) {
        String ct = file.getContentType();
        if (ct != null && !ct.isBlank()) {
            if (ct.contains("pdf")) return MIME_PDF;
            if (ct.contains("wordprocessingml") || ct.contains("docx")) return MIME_DOCX;
        }
        String name = file.getOriginalFilename();
        if (name != null) {
            // Locale.ROOT: extension matching must not depend on the default
            // locale (e.g. Turkish dotless-i would break ".PDF" detection).
            String lower = name.toLowerCase(Locale.ROOT);
            if (lower.endsWith(".pdf")) return MIME_PDF;
            if (lower.endsWith(".docx")) return MIME_DOCX;
        }
        return ct != null ? ct : "application/octet-stream";
    }

    /**
     * Extracts the paragraph text of a DOCX document, injecting [Hn] markers in
     * front of paragraphs whose style is a heading so that heading levels survive
     * the flattening to plain text. Paragraphs are separated by blank lines to
     * match the splitting done in {@link #extractCandidates}.
     */
    private String extractDocxText(XWPFDocument doc) {
        StringBuilder sb = new StringBuilder();
        for (XWPFParagraph para : doc.getParagraphs()) {
            String text = para.getText();
            if (text == null || text.isBlank()) continue;
            String styleId = para.getStyleID();
            if (styleId != null && isHeadingStyle(styleId)) {
                int level = extractHeadingLevel(styleId);
                sb.append("[H").append(level).append("] ").append(text).append("\n\n");
            } else {
                sb.append(text).append("\n\n");
            }
        }
        return sb.toString();
    }

    /**
     * Checks whether a DOCX style ID represents a heading style.
     * Matches English ("Heading1"), German ("Überschrift1" / "berschrift1"
     * where the umlaut may be stripped by some XML serialisers), and
     * common variations.
     */
    private static boolean isHeadingStyle(String styleId) {
        String lower = styleId.toLowerCase(Locale.ROOT);
        return lower.startsWith("heading")
                || lower.startsWith("berschrift") // Ü stripped by some XML encodings
                || lower.startsWith("überschrift");
    }

    /**
     * Extracts the numeric heading level from a style ID.
     * "Heading2" → 2, "berschrift3" → 3, fallback → 1.
     * The value is clamped to 1..9 so the injected [Hn] marker always stays
     * re-parseable by {@link #H_MARKER_PATTERN}, and an oversized digit run
     * (which would overflow {@code Integer.parseInt}) falls back to 1.
     */
    private static int extractHeadingLevel(String styleId) {
        Matcher m = STYLE_LEVEL_PATTERN.matcher(styleId);
        if (!m.find()) return 1;
        try {
            return Math.max(1, Math.min(9, Integer.parseInt(m.group())));
        } catch (NumberFormatException e) {
            return 1; // digit run too long to fit an int
        }
    }

    /**
     * Splits raw text into requirement candidates based on paragraph boundaries,
     * heading detection (including [H1]/[H2] markers from DOCX styles), and
     * hierarchical section-path tracking.
     *
     * @param rawText the flattened document text (paragraphs separated by blank lines)
     * @return the ordered list of candidates, each tagged with its section path
     */
    public List<RequirementCandidate> extractCandidates(String rawText) {
        List<RequirementCandidate> candidates = new ArrayList<>();
        String[] paragraphs = rawText.split("\\n\\s*\\n");
        // Stack of (level, heading text); head is the deepest heading seen so far.
        Deque<Map.Entry<Integer, String>> headingStack = new ArrayDeque<>();
        String currentSectionPath = null;
        int index = 0;
        for (String para : paragraphs) {
            String trimmed = para.strip();
            if (trimmed.isEmpty()) continue;
            // Detect headings (with level from [H] markers or regex heuristics)
            HeadingMatch heading = detectHeading(trimmed);
            if (heading != null) {
                // Pop all headings at the same or deeper level
                while (!headingStack.isEmpty() && headingStack.peek().getKey() >= heading.level()) {
                    headingStack.pop();
                }
                headingStack.push(Map.entry(heading.level(), heading.text()));
                currentSectionPath = buildSectionPath(headingStack);
                continue;
            }
            // Skip very short paragraphs (page numbers, headers, footers)
            if (trimmed.length() < MIN_CANDIDATE_LENGTH) continue;
            // Split at sentence boundaries instead of truncating
            if (trimmed.length() > MAX_CANDIDATE_LENGTH) {
                for (String sub : splitAtSentenceBoundaries(trimmed, MAX_CANDIDATE_LENGTH)) {
                    candidates.add(new RequirementCandidate(index++, currentSectionPath, sub, null));
                }
            } else {
                candidates.add(new RequirementCandidate(index++, currentSectionPath, trimmed, null));
            }
        }
        return candidates;
    }

    // ── Heading detection helpers ──────────────────────────────────────────────

    /**
     * Tries to detect a heading in the given text, returning a {@link HeadingMatch}
     * with level and clean text, or {@code null} if the text is not a heading.
     *
     * <p>Recognition order:
     * <ol>
     *   <li>[H1]/[H2] markers injected during DOCX style-based extraction</li>
     *   <li>Regex-based heading patterns (§, Chapter, Section, …)</li>
     *   <li>All-caps short lines</li>
     * </ol>
     */
    HeadingMatch detectHeading(String text) {
        if (text.length() > 200) return null;
        // 1. [H1], [H2], … markers from DOCX extraction (level capped at 2 digits)
        Matcher hm = H_MARKER_PATTERN.matcher(text);
        if (hm.matches()) {
            int level = Integer.parseInt(hm.group(1));
            return new HeadingMatch(level, hm.group(2).strip());
        }
        // 2. Regex-based heading patterns with inferred level
        if (HEADING_PATTERN.matcher(text).matches()) {
            return new HeadingMatch(inferRegexHeadingLevel(text), text);
        }
        // 3. All-caps short lines — Locale.ROOT so the comparison is independent
        //    of the default locale (e.g. Turkish 'i' → 'İ' would misfire).
        if (text.length() < 80
                && text.equals(text.toUpperCase(Locale.ROOT))
                && text.matches(".*[A-ZÄÖÜ].*")) {
            return new HeadingMatch(1, text);
        }
        return null;
    }

    /**
     * Infers a hierarchical heading level from a regex-matched heading.
     * Chapter/Kapitel → 1, Section/Abschnitt → 2, numbered (X.Y) → dot-depth + 1,
     * everything else → 1.
     */
    private static int inferRegexHeadingLevel(String text) {
        String lower = text.toLowerCase(Locale.ROOT);
        if (lower.startsWith("chapter") || lower.startsWith("kapitel")) return 1;
        if (lower.startsWith("section") || lower.startsWith("abschnitt")) return 2;
        // Numbered sections: "3.2 …" → level 2, "3.2.1 …" → level 3
        if (text.matches("^\\d+\\.\\d+\\.\\d+.*")) return 3;
        if (text.matches("^\\d+\\.\\d+.*")) return 2;
        return 1;
    }

    /**
     * Builds a hierarchical section path from the heading stack.
     * E.g. "§ 3 Datenschutz &gt; Abs. 2 Verarbeitung".
     *
     * <p>The stack is sorted by level because {@link ArrayDeque#stream()}
     * iterates head-to-tail (deepest level first), but we need the
     * shallowest level first in the output path.
     */
    private static String buildSectionPath(Deque<Map.Entry<Integer, String>> headingStack) {
        return headingStack.stream()
                .sorted(Comparator.comparingInt(Map.Entry::getKey))
                .map(Map.Entry::getValue)
                .collect(Collectors.joining(" > "));
    }

    // ── Sentence-boundary splitting ────────────────────────────────────────────

    /**
     * Splits text at sentence boundaries so that each resulting chunk is at most
     * {@code maxLen} characters long. This avoids the text loss caused by simple
     * truncation. Whitespace-only fragments are dropped so no empty candidate
     * text is ever produced.
     *
     * @param text   the paragraph text to split (assumed non-blank)
     * @param maxLen maximum chunk length in characters
     * @return the non-blank, stripped chunks in document order
     */
    List<String> splitAtSentenceBoundaries(String text, int maxLen) {
        List<String> result = new ArrayList<>();
        // NOTE(review): sentence rules are fixed to German — confirm this is
        // intended for mixed-language corpora.
        BreakIterator sentIter = BreakIterator.getSentenceInstance(Locale.GERMAN);
        sentIter.setText(text);
        StringBuilder current = new StringBuilder();
        int start = sentIter.first();
        for (int end = sentIter.next(); end != BreakIterator.DONE; end = sentIter.next()) {
            String sentence = text.substring(start, end);
            // Flush the accumulated chunk before it would exceed maxLen
            if (current.length() + sentence.length() > maxLen && !current.isEmpty()) {
                addIfNotBlank(result, current.toString());
                current = new StringBuilder();
            }
            if (sentence.length() > maxLen && current.isEmpty()) {
                // Hard-split individual sentences that exceed maxLen on their own
                for (int i = 0; i < sentence.length(); i += maxLen) {
                    addIfNotBlank(result, sentence.substring(i, Math.min(i + maxLen, sentence.length())));
                }
            } else {
                current.append(sentence);
            }
            start = end;
        }
        addIfNotBlank(result, current.toString());
        return result;
    }

    /** Strips the chunk and appends it, skipping whitespace-only fragments. */
    private static void addIfNotBlank(List<String> result, String chunk) {
        String stripped = chunk.strip();
        if (!stripped.isEmpty()) {
            result.add(stripped);
        }
    }
}