EcjTokenFilter.java

/*******************************************************************************
 * Copyright (c) 2026 Carsten Hammer.
 *
 * This program and the accompanying materials
 * are made available under the terms of the Eclipse Public License 2.0
 * which accompanies this distribution, and is available at
 * https://www.eclipse.org/legal/epl-2.0/
 *
 * SPDX-License-Identifier: EPL-2.0
 *
 * Contributors:
 *     Carsten Hammer
 *******************************************************************************/
package org.eclipse.jgit.storage.hibernate.search;

import java.io.IOException;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;

/**
 * A Lucene {@link TokenFilter} that post-processes the typed tokens emitted by
 * {@link EcjTokenizer}.
 * <p>
 * Behaviour per token type:
 * </p>
 * <ul>
 * <li>{@code EcjTokenizer.TYPE_OPERATOR} and
 * {@code EcjTokenizer.TYPE_ANNOTATION} tokens are always dropped.</li>
 * <li>{@code EcjTokenizer.TYPE_COMMENT} tokens are dropped unless the filter
 * was created with {@code indexComments == true}.</li>
 * <li>{@code EcjTokenizer.TYPE_IDENTIFIER} tokens are emitted unchanged. When
 * {@link #splitCamelCase(String)} yields more than one part, each part is
 * emitted afterwards as an extra token with a position increment of 0
 * (via {@link PositionIncrementAttribute}) and with the type and offsets of
 * the full identifier.</li>
 * <li>Tokens of any other type (keywords, literals, ...) pass through
 * unchanged.</li>
 * </ul>
 * <p>
 * This filter never rewrites the term text of an incoming token; string
 * literal terms in particular are forwarded exactly as received. The position
 * increments of dropped tokens are discarded rather than added to the next
 * emitted token.
 * </p>
 */
public final class EcjTokenFilter extends TokenFilter {

	private final CharTermAttribute termAttribute = addAttribute(
			CharTermAttribute.class);

	private final TypeAttribute typeAttribute = addAttribute(
			TypeAttribute.class);

	private final PositionIncrementAttribute positionAttribute = addAttribute(
			PositionIncrementAttribute.class);

	private final OffsetAttribute offsetAttribute = addAttribute(
			OffsetAttribute.class);

	/** Whether comment tokens are forwarded instead of dropped. */
	private final boolean indexComments;

	/**
	 * CamelCase parts still to be emitted for the last identifier, or
	 * {@code null} when nothing is queued. While non-null,
	 * {@link #nextPart} is always a valid index into this array.
	 */
	private String[] queuedParts;

	/** Index of the next entry of {@link #queuedParts} to emit. */
	private int nextPart;

	/** Token type of the identifier the queued parts belong to. */
	private String queuedType;

	/** Start offset of the identifier the queued parts belong to. */
	private int queuedStartOffset;

	/** End offset of the identifier the queued parts belong to. */
	private int queuedEndOffset;

	/**
	 * Create a new ECJ token filter that drops comment tokens.
	 *
	 * @param input
	 *            the upstream token stream
	 */
	public EcjTokenFilter(TokenStream input) {
		this(input, false);
	}

	/**
	 * Create a new ECJ token filter.
	 *
	 * @param input
	 *            the upstream token stream
	 * @param indexComments
	 *            {@code true} to forward comment tokens, {@code false} to
	 *            drop them
	 */
	public EcjTokenFilter(TokenStream input, boolean indexComments) {
		super(input);
		this.indexComments = indexComments;
	}

	/**
	 * Advance to the next token to be indexed.
	 * <p>
	 * Queued CamelCase parts of the previous identifier are served first;
	 * otherwise upstream tokens are consumed until one is found that is not
	 * dropped.
	 * </p>
	 *
	 * @return {@code true} if a token was produced, {@code false} once the
	 *         upstream is exhausted
	 * @throws IOException
	 *             if the upstream token stream fails
	 */
	@Override
	public boolean incrementToken() throws IOException {
		if (queuedParts != null) {
			emitQueuedPart();
			return true;
		}

		while (input.incrementToken()) {
			String tokenType = typeAttribute.type();
			if (isDropped(tokenType)) {
				continue;
			}
			if (EcjTokenizer.TYPE_IDENTIFIER.equals(tokenType)) {
				// The full identifier is the current token and is returned
				// as-is; its parts (if any) follow on subsequent calls.
				queuePartsOfCurrentIdentifier(tokenType);
			}
			return true;
		}
		return false;
	}

	/**
	 * Reset this filter and discard any CamelCase parts that are still
	 * queued.
	 *
	 * @throws IOException
	 *             if resetting the upstream token stream fails
	 */
	@Override
	public void reset() throws IOException {
		super.reset();
		queuedParts = null;
		nextPart = 0;
	}

	/**
	 * Split an identifier at every position where an upper-case character
	 * directly follows a lower-case character, e.g. {@code "fooBarBaz"}
	 * becomes {@code ["foo", "Bar", "Baz"]}. Runs of upper-case characters
	 * are not split, so {@code "parseXMLFile"} becomes
	 * {@code ["parse", "XMLFile"]} and {@code "HTMLParser"} stays whole.
	 *
	 * @param identifier
	 *            the identifier to split, may be {@code null}
	 * @return the parts in order; an empty array for a {@code null} or empty
	 *         identifier, a single-element array when there is no split
	 *         point
	 */
	public static String[] splitCamelCase(String identifier) {
		if (identifier == null || identifier.isEmpty()) {
			return new String[0];
		}
		java.util.List<String> pieces = new java.util.ArrayList<>();
		int pieceStart = 0;
		char previous = identifier.charAt(0);
		for (int pos = 1; pos < identifier.length(); pos++) {
			char current = identifier.charAt(pos);
			if (Character.isLowerCase(previous)
					&& Character.isUpperCase(current)) {
				pieces.add(identifier.substring(pieceStart, pos));
				pieceStart = pos;
			}
			previous = current;
		}
		pieces.add(identifier.substring(pieceStart));
		return pieces.toArray(new String[0]);
	}

	/**
	 * Tell whether a token of the given type is removed from the stream.
	 *
	 * @param tokenType
	 *            the type reported by the upstream token
	 * @return {@code true} for operators, for annotation tokens (per the
	 *         original author's note this type marks the '@' symbol itself)
	 *         and, unless comment indexing is enabled, for comments
	 */
	private boolean isDropped(String tokenType) {
		boolean unwantedComment = !indexComments
				&& EcjTokenizer.TYPE_COMMENT.equals(tokenType);
		return EcjTokenizer.TYPE_OPERATOR.equals(tokenType)
				|| unwantedComment
				|| EcjTokenizer.TYPE_ANNOTATION.equals(tokenType);
	}

	/**
	 * Split the current identifier term and, if it has more than one part,
	 * remember the parts together with the identifier's type and offsets so
	 * that later calls can emit them.
	 *
	 * @param tokenType
	 *            the type of the current identifier token
	 */
	private void queuePartsOfCurrentIdentifier(String tokenType) {
		String[] pieces = splitCamelCase(termAttribute.toString());
		if (pieces.length <= 1) {
			return;
		}
		queuedParts = pieces;
		nextPart = 0;
		queuedType = tokenType;
		queuedStartOffset = offsetAttribute.startOffset();
		queuedEndOffset = offsetAttribute.endOffset();
	}

	/**
	 * Publish the next queued CamelCase part as the current token, stacked
	 * on the position of the identifier it came from, and release the queue
	 * once its last part has been emitted.
	 */
	private void emitQueuedPart() {
		String part = queuedParts[nextPart++];
		clearAttributes();
		termAttribute.setEmpty().append(part);
		typeAttribute.setType(queuedType);
		// Increment 0 keeps the part at the same position as the identifier.
		positionAttribute.setPositionIncrement(0);
		offsetAttribute.setOffset(queuedStartOffset, queuedEndOffset);
		if (nextPart >= queuedParts.length) {
			queuedParts = null;
		}
	}
}