DocumentAnalysisService.java

package com.taxonomy.provenance.service;

import com.taxonomy.analysis.service.LlmService;
import com.taxonomy.catalog.model.TaxonomyNode;
import com.taxonomy.catalog.service.TaxonomyService;
import com.taxonomy.dto.AiExtractedCandidate;
import com.taxonomy.dto.RegulationArchitectureMatch;
import com.taxonomy.shared.service.PromptTemplateService;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.stereotype.Service;
import tools.jackson.core.type.TypeReference;
import tools.jackson.databind.ObjectMapper;

import java.util.Collections;
import java.util.List;

/**
 * AI-powered document analysis service that provides:
 * <ul>
 *   <li>LLM-assisted extraction of requirement candidates from document text</li>
 *   <li>Direct regulation-to-architecture taxonomy mapping via LLM</li>
 * </ul>
 *
 * <p>Falls back gracefully when the LLM is unavailable or returns invalid responses.
 */
@Service
public class DocumentAnalysisService {

    private static final Logger log = LoggerFactory.getLogger(DocumentAnalysisService.class);

    /** Maximum characters of document text sent to the LLM in a single prompt. */
    private static final int MAX_DOCUMENT_LENGTH = 15_000;

    private final PromptTemplateService promptTemplateService;
    private final LlmService llmService;
    private final TaxonomyService taxonomyService;
    private final ObjectMapper objectMapper;

    public DocumentAnalysisService(PromptTemplateService promptTemplateService,
                                    LlmService llmService,
                                    TaxonomyService taxonomyService,
                                    ObjectMapper objectMapper) {
        this.promptTemplateService = promptTemplateService;
        this.llmService = llmService;
        this.taxonomyService = taxonomyService;
        this.objectMapper = objectMapper;
    }

    /**
     * Extracts requirement candidates from document text using the LLM.
     * Falls back to an empty list if the LLM is unavailable or returns invalid JSON.
     *
     * @param documentText raw document text
     * @param sourceType   "REGULATION" uses the specialized regulation extraction prompt;
     *                     all other values use the general extraction prompt
     * @return list of AI-extracted requirement candidates
     */
    public List<AiExtractedCandidate> extractWithAi(String documentText, String sourceType) {
        if (!llmService.isAvailable()) {
            log.warn("LLM is not available — AI extraction skipped");
            return Collections.emptyList();
        }

        String promptCode = "REGULATION".equalsIgnoreCase(sourceType)
                ? "extract-regulation" : "extract-default";
        String prompt = promptTemplateService.renderExtractionPrompt(
                promptCode, truncateIfNeeded(documentText));

        log.info("Calling LLM for AI extraction (prompt code: {})", promptCode);
        String response = llmService.callLlmRaw(prompt);
        return parseExtractionResponse(response);
    }

    /**
     * Maps regulation text directly to architecture taxonomy nodes using the LLM.
     * Returns direct node matches with confidence and paragraph references.
     *
     * @param regulationText raw regulation text
     * @return list of regulation-to-architecture matches
     */
    public List<RegulationArchitectureMatch> mapRegulationToArchitecture(String regulationText) {
        if (!llmService.isAvailable()) {
            log.warn("LLM is not available — regulation mapping skipped");
            return Collections.emptyList();
        }

        String nodeList = buildFullNodeList();
        String prompt = promptTemplateService.renderRegulationMappingPrompt(
                "reg-map-default", truncateIfNeeded(regulationText), nodeList);

        log.info("Calling LLM for regulation-to-architecture mapping");
        String response = llmService.callLlmRaw(prompt);
        return parseRegulationMappingResponse(response);
    }

    /**
     * Extracts requirement candidates from document text using the LLM,
     * enhanced with parent-section context for better understanding.
     *
     * <p>The parent context is prepended to the prompt so the LLM can
     * understand the chunk in its broader document context.
     *
     * @param chunkText     the text of the specific chunk
     * @param parentContext the surrounding section context
     * @param sourceType    "REGULATION" or other
     * @return list of AI-extracted requirement candidates
     */
    public List<AiExtractedCandidate> extractWithAiContextual(String chunkText,
                                                               String parentContext,
                                                               String sourceType) {
        if (!llmService.isAvailable()) {
            log.warn("LLM is not available — AI extraction skipped");
            return Collections.emptyList();
        }

        String contextualText = parentContext != null && !parentContext.isBlank()
                ? "Context (parent section): " + parentContext + "\n\n"
                  + "Text to analyse:\n" + chunkText
                : chunkText;

        return extractWithAi(contextualText, sourceType);
    }

    // ── Internal helpers ──────────────────────────────────────────────────────

    List<AiExtractedCandidate> parseExtractionResponse(String response) {
        if (response == null || response.isBlank()) {
            log.warn("Empty LLM response for AI extraction");
            return Collections.emptyList();
        }
        try {
            String json = extractJsonArray(response);
            return objectMapper.readValue(json, new TypeReference<List<AiExtractedCandidate>>() {});
        } catch (Exception e) {
            log.warn("Failed to parse AI extraction response: {}", e.getMessage());
            return Collections.emptyList();
        }
    }

    List<RegulationArchitectureMatch> parseRegulationMappingResponse(String response) {
        if (response == null || response.isBlank()) {
            log.warn("Empty LLM response for regulation mapping");
            return Collections.emptyList();
        }
        try {
            String json = extractJsonArray(response);
            return objectMapper.readValue(json, new TypeReference<List<RegulationArchitectureMatch>>() {});
        } catch (Exception e) {
            log.warn("Failed to parse regulation mapping response: {}", e.getMessage());
            return Collections.emptyList();
        }
    }

    String truncateIfNeeded(String text) {
        if (text == null) return "";
        if (text.length() <= MAX_DOCUMENT_LENGTH) return text;
        log.info("Document text truncated from {} to {} characters", text.length(), MAX_DOCUMENT_LENGTH);
        return text.substring(0, MAX_DOCUMENT_LENGTH) + "\n\n[... truncated ...]";
    }

    private String buildFullNodeList() {
        List<TaxonomyNode> roots = taxonomyService.getRootNodes();
        StringBuilder sb = new StringBuilder();
        for (TaxonomyNode root : roots) {
            appendNodeTree(sb, root, 0);
        }
        return sb.toString();
    }

    private void appendNodeTree(StringBuilder sb, TaxonomyNode node, int depth) {
        sb.append("  ".repeat(depth))
          .append(node.getCode()).append(": ").append(node.getName()).append("\n");
        List<TaxonomyNode> children = taxonomyService.getChildrenOf(node.getCode());
        for (TaxonomyNode child : children) {
            appendNodeTree(sb, child, depth + 1);
        }
    }

    /**
     * Extracts a JSON array from a text response that may contain markdown code fences
     * or surrounding prose.
     */
    private String extractJsonArray(String text) {
        // Strip markdown code fences
        String stripped = text.strip();
        if (stripped.startsWith("```")) {
            int firstNewline = stripped.indexOf('\n');
            if (firstNewline > 0) {
                stripped = stripped.substring(firstNewline + 1);
            }
            if (stripped.endsWith("```")) {
                stripped = stripped.substring(0, stripped.length() - 3);
            }
            stripped = stripped.strip();
        }
        // Find outermost JSON array
        int start = stripped.indexOf('[');
        int end = stripped.lastIndexOf(']');
        if (start >= 0 && end > start) {
            return stripped.substring(start, end + 1);
        }
        return stripped;
    }
}