ChunkingStrategySelector.java

package com.taxonomy.provenance.service;

import com.taxonomy.dto.ChunkingStrategy;
import com.taxonomy.shared.service.LocalEmbeddingService;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.stereotype.Service;

/**
 * Selects the best {@link ChunkingStrategy} for a given raw text based on its
 * structural characteristics and the availability of the embedding service.
 *
 * <ul>
 *   <li>{@link ChunkingStrategy#STRUCTURAL} — chosen when the text has ≥3
 *       recognisable headings (sections, paragraphs, articles).</li>
 *   <li>{@link ChunkingStrategy#SEMANTIC} — chosen as a fallback when headings
 *       are scarce but the local embedding service is available.</li>
 *   <li>{@link ChunkingStrategy#PARAGRAPH_BASED} — last-resort fallback when
 *       neither structural nor semantic chunking is feasible.</li>
 * </ul>
 */
@Service
public class ChunkingStrategySelector {

    private static final Logger log = LoggerFactory.getLogger(ChunkingStrategySelector.class);

    /** Minimum number of detected headings for structural chunking. */
    private static final int STRUCTURAL_HEADING_THRESHOLD = 3;

    private final DocumentParserService parserService;
    private final LocalEmbeddingService embeddingService;

    public ChunkingStrategySelector(DocumentParserService parserService,
                                     LocalEmbeddingService embeddingService) {
        this.parserService = parserService;
        this.embeddingService = embeddingService;
    }

    /**
     * Selects the optimal chunking strategy for the given raw text.
     *
     * @param rawText the raw text extracted from a document
     * @return the recommended chunking strategy
     */
    public ChunkingStrategy selectStrategy(String rawText) {
        if (rawText == null || rawText.isBlank()) {
            return ChunkingStrategy.PARAGRAPH_BASED;
        }

        int headingCount = countHeadings(rawText);

        if (headingCount >= STRUCTURAL_HEADING_THRESHOLD) {
            log.debug("Selected STRUCTURAL strategy ({} headings detected)", headingCount);
            return ChunkingStrategy.STRUCTURAL;
        }

        if (embeddingService.isAvailable()) {
            log.debug("Selected SEMANTIC strategy ({} headings, embedding available)", headingCount);
            return ChunkingStrategy.SEMANTIC;
        }

        log.debug("Selected PARAGRAPH_BASED strategy ({} headings, embedding unavailable)", headingCount);
        return ChunkingStrategy.PARAGRAPH_BASED;
    }

    /**
     * Counts the number of headings in the raw text by scanning for heading
     * patterns using the parser's detection logic.
     */
    int countHeadings(String rawText) {
        int count = 0;
        String[] paragraphs = rawText.split("\\n\\s*\\n");
        for (String para : paragraphs) {
            String trimmed = para.strip();
            if (!trimmed.isEmpty() && parserService.detectHeading(trimmed) != null) {
                count++;
            }
        }
        return count;
    }
}