JavaSourceAnalysisConfigurer.java
/*******************************************************************************
* Copyright (c) 2026 Carsten Hammer.
*
* This program and the accompanying materials
* are made available under the terms of the Eclipse Public License 2.0
* which accompanies this distribution, and is available at
* https://www.eclipse.org/legal/epl-2.0/
*
* SPDX-License-Identifier: EPL-2.0
*
* Contributors:
* Carsten Hammer
*******************************************************************************/
package org.eclipse.jgit.storage.hibernate.search;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.hibernate.search.backend.lucene.analysis.LuceneAnalysisConfigurationContext;
import org.hibernate.search.backend.lucene.analysis.LuceneAnalysisConfigurer;
/**
* Configures Lucene analyzers tailored for Java source code search.
* <p>
* This configurer defines named analyzers for CamelCase-aware identifier
* search, file path and package search, commit message analysis,
* dot-qualified fully qualified name search, and ECJ-based Java source
* tokenization. It is registered with Hibernate Search via the
* {@code hibernate.search.backend.analysis.configurer} property.
* </p>
*/
public class JavaSourceAnalysisConfigurer implements LuceneAnalysisConfigurer {
@Override
public void configure(LuceneAnalysisConfigurationContext context) {
// generateWordParts + preserveOriginal: indexes both the original
// identifier and its CamelCase parts (e.g. "StringBuilder" →
// "StringBuilder", "String", "Builder")
context.analyzer("javaIdentifier").custom() //$NON-NLS-1$
.tokenizer("standard") //$NON-NLS-1$
.tokenFilter("wordDelimiterGraph") //$NON-NLS-1$
.param("splitOnCaseChange", "1") //$NON-NLS-1$ //$NON-NLS-2$
.param("generateWordParts", "1") //$NON-NLS-1$ //$NON-NLS-2$
.param("preserveOriginal", "1") //$NON-NLS-1$ //$NON-NLS-2$
.tokenFilter("lowercase"); //$NON-NLS-1$
context.analyzer("javaPath").custom() //$NON-NLS-1$
.tokenizer("standard") //$NON-NLS-1$
.tokenFilter("wordDelimiterGraph") //$NON-NLS-1$
.param("splitOnCaseChange", "1") //$NON-NLS-1$ //$NON-NLS-2$
.tokenFilter("lowercase"); //$NON-NLS-1$
context.analyzer("commitMessage").custom() //$NON-NLS-1$
.tokenizer("standard") //$NON-NLS-1$
.tokenFilter("wordDelimiterGraph") //$NON-NLS-1$
.param("splitOnCaseChange", "1") //$NON-NLS-1$ //$NON-NLS-2$
.param("preserveOriginal", "1") //$NON-NLS-1$ //$NON-NLS-2$
.tokenFilter("lowercase") //$NON-NLS-1$
.tokenFilter("stop"); //$NON-NLS-1$
context.analyzer("dotQualifiedName").custom() //$NON-NLS-1$
.tokenizer("keyword") //$NON-NLS-1$
.tokenFilter("patternReplace") //$NON-NLS-1$
.param("pattern", "\\.") //$NON-NLS-1$ //$NON-NLS-2$
.param("replacement", " ") //$NON-NLS-1$ //$NON-NLS-2$
.tokenFilter("lowercase"); //$NON-NLS-1$
// ECJ Scanner-based analyzer for Java source code
context.analyzer("javaSourceEcj") //$NON-NLS-1$
.instance(new Analyzer() {
@Override
protected TokenStreamComponents createComponents(
String fieldName) {
EcjTokenizer tokenizer = new EcjTokenizer();
TokenStream filter = new EcjTokenFilter(tokenizer);
filter = new LowerCaseFilter(filter);
return new TokenStreamComponents(tokenizer, filter);
}
});
// Generic analyzer for non-Java text content
context.analyzer("genericContent").custom() //$NON-NLS-1$
.tokenizer("standard") //$NON-NLS-1$
.tokenFilter("wordDelimiterGraph") //$NON-NLS-1$
.param("splitOnCaseChange", "1") //$NON-NLS-1$ //$NON-NLS-2$
.param("preserveOriginal", "1") //$NON-NLS-1$ //$NON-NLS-2$
.tokenFilter("lowercase") //$NON-NLS-1$
.tokenFilter("stop"); //$NON-NLS-1$
// XML content analyzer
context.analyzer("xmlContent").custom() //$NON-NLS-1$
.tokenizer("standard") //$NON-NLS-1$
.tokenFilter("wordDelimiterGraph") //$NON-NLS-1$
.param("splitOnCaseChange", "1") //$NON-NLS-1$ //$NON-NLS-2$
.tokenFilter("lowercase"); //$NON-NLS-1$
// Maven coordinate analyzer
context.analyzer("mavenCoordinate").custom() //$NON-NLS-1$
.tokenizer("keyword") //$NON-NLS-1$
.tokenFilter("patternReplace") //$NON-NLS-1$
.param("pattern", ":") //$NON-NLS-1$ //$NON-NLS-2$
.param("replacement", " ") //$NON-NLS-1$ //$NON-NLS-2$
.tokenFilter("lowercase"); //$NON-NLS-1$
}
}