DslAnalyzer.java
package com.taxonomy.shared.config;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import com.taxonomy.dsl.parser.DslTokenizer;
/**
* Custom Lucene {@link Analyzer} for DSL-tokenized text.
*
* <p>The {@link com.taxonomy.dsl.parser.DslTokenizer} produces space-separated
* tokens with category prefixes ({@code STRUCT:element}, {@code REL:REALIZES},
* {@code DOM:Capability}) and raw identifiers ({@code CP-1023}). This analyzer:
* <ol>
* <li>Splits on whitespace (tokens are already pre-tokenized)</li>
* <li>Lowercases for case-insensitive matching</li>
* </ol>
*
* <p>Prefixed tokens remain intact so that queries like {@code "rel:realizes"}
* perform precise faceted searches.
*/
public class DslAnalyzer extends Analyzer {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new WhitespaceTokenizer();
TokenStream filter = new LowerCaseFilter(tokenizer);
return new TokenStreamComponents(tokenizer, filter);
}
}