EcjTokenizer.java
/*******************************************************************************
* Copyright (c) 2026 Carsten Hammer.
*
* This program and the accompanying materials
* are made available under the terms of the Eclipse Public License 2.0
* which accompanies this distribution, and is available at
* https://www.eclipse.org/legal/epl-2.0/
*
* SPDX-License-Identifier: EPL-2.0
*
* Contributors:
* Carsten Hammer
*******************************************************************************/
package org.eclipse.jgit.storage.hibernate.search;
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.eclipse.jdt.core.compiler.InvalidInputException;
import org.eclipse.jdt.internal.compiler.parser.Scanner;
import org.eclipse.jdt.internal.compiler.parser.TerminalToken;
/**
* A Lucene {@link Tokenizer} that uses ECJ's {@link Scanner} to produce
* lexically correct Java tokens.
* <p>
* Each emitted token carries a {@link TypeAttribute} distinguishing keywords,
* identifiers, string literals, number literals, comments, annotations, and
* operators. This enables downstream filters to apply different processing
* strategies per token type.
* </p>
*/
public final class EcjTokenizer extends Tokenizer {
/** Type attribute value for Java keywords. */
public static final String TYPE_KEYWORD = "KEYWORD"; //$NON-NLS-1$
/** Type attribute value for Java identifiers. */
public static final String TYPE_IDENTIFIER = "IDENTIFIER"; //$NON-NLS-1$
/** Type attribute value for string literals. */
public static final String TYPE_STRING_LITERAL = "STRING_LITERAL"; //$NON-NLS-1$
/** Type attribute value for numeric literals. */
public static final String TYPE_NUMBER_LITERAL = "NUMBER_LITERAL"; //$NON-NLS-1$
/** Type attribute value for comments. */
public static final String TYPE_COMMENT = "COMMENT"; //$NON-NLS-1$
/** Type attribute value for annotations. */
public static final String TYPE_ANNOTATION = "ANNOTATION"; //$NON-NLS-1$
/** Type attribute value for operators and punctuation. */
public static final String TYPE_OPERATOR = "OPERATOR"; //$NON-NLS-1$
private final CharTermAttribute termAttr = addAttribute(
CharTermAttribute.class);
private final OffsetAttribute offsetAttr = addAttribute(
OffsetAttribute.class);
private final TypeAttribute typeAttr = addAttribute(TypeAttribute.class);
private Scanner scanner;
private char[] sourceChars;
/**
* Create a new ECJ-based tokenizer.
*/
public EcjTokenizer() {
super();
}
@Override
public void reset() throws IOException {
super.reset();
sourceChars = readFully(input);
scanner = new Scanner();
scanner.tokenizeComments = true;
scanner.recordLineSeparator = false;
scanner.setSource(sourceChars);
}
@Override
public boolean incrementToken() throws IOException {
clearAttributes();
if (scanner == null) {
return false;
}
while (true) {
TerminalToken token;
try {
token = scanner.getNextToken();
} catch (InvalidInputException e) {
// Graceful degradation on syntax errors: skip the bad token
continue;
}
if (token == TerminalToken.TokenNameEOF) {
return false;
}
String type = classifyToken(token);
if (type == null) {
// Skip whitespace and tokens with no search value
continue;
}
int start = scanner.getCurrentTokenStartPosition();
int end = scanner.getCurrentTokenEndPosition() + 1;
String text = tokenText(token, start, end);
if (text.isEmpty()) {
continue;
}
termAttr.setEmpty().append(text);
offsetAttr.setOffset(correctOffset(start),
correctOffset(Math.min(end, sourceChars.length)));
typeAttr.setType(type);
return true;
}
}
private String tokenText(TerminalToken token, int start, int end) {
if (isStringLiteral(token)) {
// Strip surrounding quotes from string/char literals
if (end - start >= 2) {
return new String(sourceChars, start + 1, end - start - 2);
}
}
if (isAnnotation(token)) {
// For annotations, skip the '@' prefix
if (start + 1 < end && start + 1 < sourceChars.length) {
return scanner.getCurrentTokenString();
}
}
return new String(sourceChars, start, Math.min(end - start,
sourceChars.length - start));
}
private static String classifyToken(TerminalToken token) {
if (token == TerminalToken.TokenNameWHITESPACE) {
return null;
}
if (token == TerminalToken.TokenNameIdentifier) {
return TYPE_IDENTIFIER;
}
if (isKeyword(token)) {
return TYPE_KEYWORD;
}
if (isStringLiteral(token)) {
return TYPE_STRING_LITERAL;
}
if (isNumberLiteral(token)) {
return TYPE_NUMBER_LITERAL;
}
if (isComment(token)) {
return TYPE_COMMENT;
}
if (isAnnotation(token)) {
return TYPE_ANNOTATION;
}
if (isOperatorOrPunctuation(token)) {
return TYPE_OPERATOR;
}
return TYPE_OPERATOR;
}
private static boolean isKeyword(TerminalToken token) {
String name = token.name();
// Java keywords are named TokenNameXxx where xxx is the keyword
// e.g. TokenNamepublic, TokenNameclass, TokenNameint, etc.
if (name.startsWith("TokenName") //$NON-NLS-1$
&& name.length() > "TokenName".length()) { //$NON-NLS-1$
char c = name.charAt("TokenName".length()); //$NON-NLS-1$
// Keywords start with lowercase after "TokenName"
// Identifiers and literals start with uppercase
if (Character.isLowerCase(c)
&& token != TerminalToken.TokenNameIdentifier) {
return true;
}
}
return false;
}
private static boolean isStringLiteral(TerminalToken token) {
return token == TerminalToken.TokenNameStringLiteral
|| token == TerminalToken.TokenNameCharacterLiteral
|| token == TerminalToken.TokenNameTextBlock
|| token == TerminalToken.TokenNameSingleQuoteStringLiteral;
}
private static boolean isNumberLiteral(TerminalToken token) {
return token == TerminalToken.TokenNameIntegerLiteral
|| token == TerminalToken.TokenNameLongLiteral
|| token == TerminalToken.TokenNameFloatingPointLiteral
|| token == TerminalToken.TokenNameDoubleLiteral;
}
private static boolean isComment(TerminalToken token) {
return token == TerminalToken.TokenNameCOMMENT_LINE
|| token == TerminalToken.TokenNameCOMMENT_BLOCK
|| token == TerminalToken.TokenNameCOMMENT_JAVADOC
|| token == TerminalToken.TokenNameCOMMENT_MARKDOWN;
}
private static boolean isAnnotation(TerminalToken token) {
return token == TerminalToken.TokenNameAT;
}
private static boolean isOperatorOrPunctuation(TerminalToken token) {
String name = token.name();
return name.startsWith("TokenName") //$NON-NLS-1$
&& name.length() > "TokenName".length() //$NON-NLS-1$
&& Character.isUpperCase(
name.charAt("TokenName".length())); //$NON-NLS-1$
}
private static char[] readFully(Reader reader) throws IOException {
StringBuilder sb = new StringBuilder(4096);
char[] buf = new char[4096];
int n;
while ((n = reader.read(buf)) != -1) {
sb.append(buf, 0, n);
}
return sb.toString().toCharArray();
}
}