// StructuredDocumentParser.java
package com.taxonomy.provenance.service;
import com.taxonomy.dto.DocumentSection;
import org.springframework.stereotype.Service;
import java.util.ArrayDeque;
import java.util.Deque;
import java.util.regex.Pattern;
/**
 * Builds a hierarchical {@link DocumentSection} tree from raw text.
 *
 * <p>Re-uses the heading detection logic of {@link DocumentParserService} to
 * identify section boundaries, then organises the paragraphs under a
 * parent–child tree structure. The tree is suitable for hierarchical chunking
 * and context-aware LLM prompts.
 *
 * <p>Paragraphs that are not recognised as headings and are shorter than
 * {@code MIN_PARAGRAPH_LENGTH} characters (after stripping) are not added to
 * the tree.
 */
@Service
public class StructuredDocumentParser {

    /** Minimum stripped length a non-heading paragraph needs in order to be kept. */
    private static final int MIN_PARAGRAPH_LENGTH = 40;

    /**
     * Paragraph separator: two newlines with optional whitespace in between.
     * Compiled once instead of on every {@link #parse(String)} call.
     */
    private static final Pattern PARAGRAPH_SEPARATOR = Pattern.compile("\\n\\s*\\n");

    /** Supplies the heading detection used to find section boundaries. */
    private final DocumentParserService parserService;

    /**
     * Creates a parser that delegates heading detection to the given service.
     *
     * @param parserService the service whose {@code detectHeading} is called for every paragraph
     */
    public StructuredDocumentParser(DocumentParserService parserService) {
        this.parserService = parserService;
    }

    /**
     * Parses raw text into a hierarchical document tree.
     *
     * <p>The text is split into paragraphs on blank lines. A paragraph for which
     * heading detection returns a match opens a new section, attached to the
     * nearest open section with a strictly smaller level. Every other paragraph
     * of at least {@code MIN_PARAGRAPH_LENGTH} characters is added to the
     * section that is currently open; shorter ones are skipped.
     *
     * @param rawText the raw text (with double-newline paragraph separators);
     *                may be {@code null} or blank
     * @return the root {@link DocumentSection} (level 0) containing the full tree;
     *         a root without children or paragraphs when {@code rawText} is
     *         {@code null} or blank
     */
    public DocumentSection parse(String rawText) {
        DocumentSection root = new DocumentSection(0, "Document Root");
        root.setSectionPath("Document Root");
        if (rawText == null || rawText.isBlank()) {
            return root;
        }

        // Currently open sections, innermost on top; the root always stays at the bottom.
        Deque<DocumentSection> openSections = new ArrayDeque<>();
        openSections.push(root);

        for (String paragraph : PARAGRAPH_SEPARATOR.split(rawText)) {
            String trimmed = paragraph.strip();
            if (trimmed.isEmpty()) {
                continue;
            }

            // A null result means the paragraph is not a heading.
            DocumentParserService.HeadingMatch heading = parserService.detectHeading(trimmed);
            if (heading != null) {
                DocumentSection section = new DocumentSection(heading.level(), heading.text());

                // Close sections until the top of the stack has a strictly smaller
                // level; the size guard keeps the root from ever being popped.
                while (openSections.size() > 1
                        && openSections.peek().getLevel() >= heading.level()) {
                    openSections.pop();
                }

                DocumentSection parent = openSections.peek();
                section.setSectionPath(buildSectionPath(parent, heading.text()));
                parent.getChildren().add(section);
                openSections.push(section);
            } else if (trimmed.length() >= MIN_PARAGRAPH_LENGTH) {
                openSections.peek().getParagraphs().add(trimmed);
            }
        }
        return root;
    }

    /**
     * Builds the section path for a new section placed under {@code parent}.
     *
     * @param parent  the section the new section is attached to
     * @param heading the heading text of the new section
     * @return {@code heading} alone when the parent is at level 0, otherwise the
     *         parent's section path and the heading joined by {@code " > "}
     */
    private static String buildSectionPath(DocumentSection parent, String heading) {
        if (parent.getLevel() == 0) {
            return heading;
        }
        return parent.getSectionPath() + " > " + heading;
    }
}