// RelationSeedParser.java

package com.taxonomy.catalog.service;

import com.taxonomy.dto.RelationSeedRow;
import com.taxonomy.model.RelationType;
import com.taxonomy.model.SeedType;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Locale;

/**
 * Parses the relation seed CSV file into {@link RelationSeedRow} records.
 *
 * <p>The parser supports both the legacy 4-column format
 * ({@code SourceCode,TargetCode,RelationType,Description}) and the extended
 * 10-column format that adds {@code SourceStandard}, {@code SourceReference},
 * {@code Confidence}, {@code SeedType}, {@code ReviewRequired}, and
 * {@code Status}. Only the first three columns are required; every later
 * column is optional and falls back to a default when absent or blank.
 *
 * <p>Rows that cannot be parsed are logged as warnings and skipped.
 * The parser does not throw exceptions for individual malformed rows.
 *
 * <p><b>Limitation:</b> lines are split on every comma; quoted fields are not
 * recognised. A comma inside a value (for example in a description) therefore
 * shifts the following columns. Anything beyond the tenth column is kept as
 * part of the {@code Status} value.
 */
public final class RelationSeedParser {

    private static final Logger log = LoggerFactory.getLogger(RelationSeedParser.class);

    /** Maximum number of columns in the extended CSV format. */
    private static final int EXTENDED_COLUMN_COUNT = 10;

    /** Minimum number of columns required for a valid row. */
    private static final int MIN_COLUMN_COUNT = 3;

    /** Confidence used when the column is absent, blank, or not a usable number. */
    private static final double DEFAULT_CONFIDENCE = 1.0;

    /** Status used when the column is absent or blank. */
    private static final String DEFAULT_STATUS = "accepted";

    private RelationSeedParser() {
        // utility class
    }

    /**
     * Parse the relation seed CSV from the given input stream.
     *
     * <p>The first line is treated as a header and discarded without being
     * inspected. Blank lines are ignored. The stream is closed when this
     * method returns.
     *
     * @param inputStream the CSV input stream (UTF-8 encoded, with header row)
     * @return an unmodifiable list of parsed seed rows; never {@code null}
     * @throws IOException if the stream cannot be read
     */
    public static List<RelationSeedRow> parse(InputStream inputStream) throws IOException {
        List<RelationSeedRow> rows = new ArrayList<>();
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(inputStream, StandardCharsets.UTF_8))) {
            // Consume the header row; a missing header means the file is empty.
            if (reader.readLine() == null) {
                log.warn("Relation seed CSV is empty.");
                return Collections.emptyList();
            }

            // The header was line 1, so the first data line is reported as line 2.
            int lineNumber = 1;
            String line;
            while ((line = reader.readLine()) != null) {
                lineNumber++;
                if (line.isBlank()) {
                    continue;
                }

                RelationSeedRow row = parseLine(line, lineNumber);
                if (row != null) {
                    rows.add(row);
                }
            }
        }
        return Collections.unmodifiableList(rows);
    }

    /**
     * Parse a single CSV line into a {@link RelationSeedRow}.
     *
     * <p>A line is rejected (with a warning) when it has fewer than three
     * columns, when any of the first three columns is blank, or when the
     * relation type does not name a {@link RelationType} constant
     * (case-insensitive). Problems in optional columns never reject the line;
     * the affected column falls back to its default instead.
     *
     * @param line       the raw CSV line
     * @param lineNumber the 1-based line number (for error messages)
     * @return the parsed row, or {@code null} if the line is malformed
     */
    static RelationSeedRow parseLine(String line, int lineNumber) {
        // A positive limit keeps trailing empty columns and caps the array at ten entries.
        String[] parts = line.split(",", EXTENDED_COLUMN_COUNT);
        if (parts.length < MIN_COLUMN_COUNT) {
            log.warn("Relation seed CSV line {}: too few columns ({}) — skipping.", lineNumber, parts.length);
            return null;
        }

        String sourceCode = parts[0].trim();
        String targetCode = parts[1].trim();
        String typeStr = parts[2].trim();

        if (sourceCode.isEmpty() || targetCode.isEmpty() || typeStr.isEmpty()) {
            log.warn("Relation seed CSV line {}: empty required field — skipping.", lineNumber);
            return null;
        }

        // Parse relation type. Locale.ROOT keeps the case mapping independent of the
        // JVM default locale (e.g. Turkish maps 'i' to dotted 'İ', breaking valueOf).
        RelationType relationType;
        try {
            relationType = RelationType.valueOf(typeStr.toUpperCase(Locale.ROOT));
        } catch (IllegalArgumentException e) {
            log.warn("Relation seed CSV line {}: unknown relation type '{}' — skipping.", lineNumber, typeStr);
            return null;
        }

        // Column 4: Description (optional)
        String description = safeGet(parts, 3);

        // Column 5: SourceStandard (optional)
        String sourceStandard = safeGet(parts, 4);

        // Column 6: SourceReference (optional)
        String sourceReference = safeGet(parts, 5);

        // Column 7: Confidence (optional, default 1.0)
        double confidence = parseConfidence(safeGet(parts, 6), lineNumber);

        // Column 8: SeedType (optional, default TYPE_DEFAULT)
        SeedType seedType = parseSeedType(safeGet(parts, 7), lineNumber);

        // Column 9: ReviewRequired (optional, default false)
        boolean reviewRequired = parseBoolean(safeGet(parts, 8), lineNumber);

        // Column 10: Status (optional, default "accepted").
        // safeGet already maps blank values to null, so a null check is sufficient.
        String status = safeGet(parts, 9);
        if (status == null) {
            status = DEFAULT_STATUS;
        }

        return new RelationSeedRow(
                sourceCode, targetCode, relationType, description,
                sourceStandard, sourceReference, confidence,
                seedType, reviewRequired, status);
    }

    /**
     * Return the trimmed value of the column at {@code index}.
     *
     * @param parts the split columns of one line
     * @param index the zero-based column index
     * @return the trimmed value, or {@code null} if the column is absent or blank
     */
    private static String safeGet(String[] parts, int index) {
        if (index >= parts.length) {
            return null;
        }
        String value = parts[index].trim();
        return value.isEmpty() ? null : value;
    }

    /**
     * Parse the confidence column.
     *
     * <p>Values outside {@code [0.0, 1.0]} (including infinities) are clamped
     * into the range with a warning. Values that are not numbers, including the
     * literal {@code NaN} which {@link Double#parseDouble(String)} accepts, are
     * replaced by the default with a warning.
     *
     * @param value      the trimmed column value, or {@code null} if absent
     * @param lineNumber the 1-based line number (for warnings)
     * @return a confidence in {@code [0.0, 1.0]}; {@code 1.0} when absent or invalid
     */
    private static double parseConfidence(String value, int lineNumber) {
        if (value == null) {
            return DEFAULT_CONFIDENCE;
        }
        double confidence;
        try {
            confidence = Double.parseDouble(value);
        } catch (NumberFormatException e) {
            log.warn("Relation seed CSV line {}: invalid confidence '{}' — using default 1.0.",
                    lineNumber, value);
            return DEFAULT_CONFIDENCE;
        }
        // NaN compares false against both bounds, so it must be rejected explicitly
        // or it would slip through the range check below.
        if (Double.isNaN(confidence)) {
            log.warn("Relation seed CSV line {}: invalid confidence '{}' — using default 1.0.",
                    lineNumber, value);
            return DEFAULT_CONFIDENCE;
        }
        if (confidence < 0.0 || confidence > 1.0) {
            log.warn("Relation seed CSV line {}: confidence {} out of [0.0, 1.0] range — clamping.",
                    lineNumber, confidence);
            return Math.max(0.0, Math.min(1.0, confidence));
        }
        return confidence;
    }

    /**
     * Parse the seed type column (case-insensitive enum constant name).
     *
     * @param value      the trimmed column value, or {@code null} if absent
     * @param lineNumber the 1-based line number (for warnings)
     * @return the matching {@link SeedType}; {@link SeedType#TYPE_DEFAULT} when
     *         absent or unknown
     */
    private static SeedType parseSeedType(String value, int lineNumber) {
        if (value == null) {
            return SeedType.TYPE_DEFAULT;
        }
        try {
            // Locale.ROOT: see the relation type parsing in parseLine.
            return SeedType.valueOf(value.toUpperCase(Locale.ROOT));
        } catch (IllegalArgumentException e) {
            log.warn("Relation seed CSV line {}: unknown seed type '{}' — using TYPE_DEFAULT.",
                    lineNumber, value);
            return SeedType.TYPE_DEFAULT;
        }
    }

    /**
     * Parse the review-required column.
     *
     * <p>Only {@code true} (any case) yields {@code true}. Any other non-blank
     * value except {@code false} is reported with a warning, in line with the
     * other optional-column parsers, and treated as {@code false}.
     *
     * @param value      the trimmed column value, or {@code null} if absent
     * @param lineNumber the 1-based line number (for warnings)
     * @return {@code true} only if the value equals {@code "true"} ignoring case
     */
    private static boolean parseBoolean(String value, int lineNumber) {
        if (value == null) {
            return false;
        }
        if ("true".equalsIgnoreCase(value)) {
            return true;
        }
        if (!"false".equalsIgnoreCase(value)) {
            log.warn("Relation seed CSV line {}: unrecognized review-required value '{}' — using default false.",
                    lineNumber, value);
        }
        return false;
    }
}