// CsvKeywordAnalyzer.java
package com.taxonomy.shared.config;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.pattern.PatternTokenizer;
import java.util.regex.Pattern;
import com.taxonomy.architecture.model.ArchitectureCommitIndex;
/**
 * Custom Lucene {@link Analyzer} for comma/semicolon-separated keyword fields.
 *
 * <p>Used for {@code affectedElementIds} (comma-separated) and
 * {@code affectedRelationIds} (semicolon-separated) fields in
 * {@link com.taxonomy.architecture.model.ArchitectureCommitIndex}.
 *
 * <p>Splits on commas and semicolons, strips the whitespace around every
 * entry, and lowercases so that individual IDs like {@code "CP-1023"} become
 * searchable tokens. For example {@code " CP-1023 ; CP-7 "} produces the
 * tokens {@code "cp-1023"} and {@code "cp-7"}. Whitespace inside an entry is
 * preserved.
 */
public class CsvKeywordAnalyzer extends Analyzer {

    /**
     * Text that separates two entries. Three alternatives are treated as
     * delimiter text so that no token keeps surrounding whitespace:
     * <ul>
     *   <li>{@code \A\s+} - whitespace at the very start of the value,</li>
     *   <li>{@code \s*[,;]\s*} - a comma or semicolon together with the
     *       whitespace on both sides of it,</li>
     *   <li>{@code \s+\z} - whitespace at the very end of the value.</li>
     * </ul>
     */
    private static final Pattern DELIMITER =
            Pattern.compile("\\A\\s+|\\s*[,;]\\s*|\\s+\\z");

    /** {@link PatternTokenizer} group value that selects split-on-pattern mode. */
    private static final int SPLIT_ON_PATTERN = -1;

    /**
     * Builds the index/query-time chain: split on {@link #DELIMITER}, then
     * lowercase every token.
     *
     * @param fieldName name of the field being analyzed; not used, every
     *                  field gets the same chain
     * @return the tokenizer wrapped in a {@link LowerCaseFilter}
     */
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
        // In split mode the tokenizer skips zero-length pieces, so adjacent
        // delimiters and leading/trailing delimiter matches produce no tokens.
        Tokenizer tokenizer = new PatternTokenizer(DELIMITER, SPLIT_ON_PATTERN);
        TokenStream filter = new LowerCaseFilter(tokenizer);
        return new TokenStreamComponents(tokenizer, filter);
    }

    /**
     * Applies the same lowercasing to terms that are normalized without being
     * tokenized (for example wildcard or prefix query terms), so they can
     * match the lowercased tokens produced by
     * {@link #createComponents(String)}.
     *
     * @param fieldName name of the field the term belongs to; not used
     * @param in        the single-term stream to normalize
     * @return {@code in} wrapped in a {@link LowerCaseFilter}
     */
    @Override
    protected TokenStream normalize(String fieldName, TokenStream in) {
        return new LowerCaseFilter(in);
    }
}