BlobIndexer.java
/*******************************************************************************
* Copyright (c) 2026 Carsten Hammer.
*
* This program and the accompanying materials
* are made available under the terms of the Eclipse Public License 2.0
* which accompanies this distribution, and is available at
* https://www.eclipse.org/legal/epl-2.0/
*
* SPDX-License-Identifier: EPL-2.0
*
* Contributors:
* Carsten Hammer
*******************************************************************************/
package org.eclipse.jgit.storage.hibernate.service;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.eclipse.jgit.lib.ObjectId;
import org.eclipse.jgit.lib.ObjectLoader;
import org.eclipse.jgit.lib.ObjectReader;
import org.eclipse.jgit.lib.Repository;
import org.eclipse.jgit.revwalk.RevCommit;
import org.eclipse.jgit.revwalk.RevWalk;
import org.eclipse.jgit.storage.hibernate.entity.FilePathHistory;
import org.eclipse.jgit.storage.hibernate.entity.JavaBlobIndex;
import org.eclipse.jgit.storage.hibernate.search.BlobIndexData;
import org.eclipse.jgit.storage.hibernate.search.EmbeddingService;
import org.eclipse.jgit.storage.hibernate.search.FileTypeStrategy;
import org.eclipse.jgit.storage.hibernate.search.FileTypeStrategyRegistry;
import org.eclipse.jgit.treewalk.TreeWalk;
import org.hibernate.Session;
import org.hibernate.SessionFactory;
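/**
* Indexes blob content from Git commits into {@link JavaBlobIndex}
* entities.
* <p>
* Complements {@link CommitIndexer} by walking each commit's tree and
* extracting structural metadata from each file using the
* {@link FileTypeStrategy} that the {@link FileTypeStrategyRegistry} selects
* for the file path. All file types are indexed unless the environment
* variable {@code JGIT_INDEX_ALL_FILE_TYPES} is set to {@code false}, in
* which case only {@code .java} files are indexed. Files with a skipped
* (binary) extension, blobs containing NUL bytes and blobs larger than the
* configured maximum size are skipped. Entities are persisted in
* configurable batches.
* </p>
*/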
public class BlobIndexer {
/** Logger shared by all instances of this class. */
private static final Logger LOG = Logger
.getLogger(BlobIndexer.class.getName());
/** Default maximum blob size in bytes (1 MB). */
public static final int DEFAULT_MAX_BLOB_SIZE = 1024 * 1024;
/** Default batch size for persist operations. */
public static final int DEFAULT_BATCH_SIZE = 50;
/**
* Number of leading bytes that {@link #isBinaryContent(byte[])} inspects
* for NUL bytes.
*/
private static final int BINARY_CHECK_SIZE = 8192;
/**
* Lower-case file extensions, including the leading dot, that are skipped
* by extension alone. Used when {@code JGIT_INDEX_SKIP_EXTENSIONS} is not
* set or is empty.
*/
private static final Set<String> BINARY_EXTENSIONS = Set.of(
".class", ".jar", ".png", ".jpg", ".jpeg", ".gif", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$
".zip", ".tar", ".gz", ".bz2", ".pdf", ".so", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$
".dll", ".exe", ".ico", ".war", ".ear"); //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$
/** Source of the Hibernate sessions used for every read and write. */
private final SessionFactory sessionFactory;
/** Repository name stored on every entity and used to filter queries. */
private final String repositoryName;
/** Selects the {@link FileTypeStrategy} used to extract data for a path. */
private final FileTypeStrategyRegistry strategyRegistry;
/** Blobs whose size exceeds this number of bytes are not indexed. */
private final int maxBlobSize;
/** Number of pending entities that triggers a persist transaction. */
private final int batchSize;
/** Produces the semantic embedding stored with each indexed blob. */
private final EmbeddingService embeddingService;
/**
 * Create a blob indexer whose limits come from the process environment.
 * <p>
 * The maximum blob size is read from {@code JGIT_INDEX_MAX_BLOB_SIZE} and
 * the batch size from {@code JGIT_INDEX_BATCH_SIZE}; the class defaults are
 * used when a variable is absent or unusable. A new
 * {@link EmbeddingService} is created for this indexer.
 * </p>
 *
 * @param sessionFactory
 *            the Hibernate session factory
 * @param repositoryName
 *            the repository name for partitioning
 */
public BlobIndexer(SessionFactory sessionFactory,
		String repositoryName) {
	// The four-argument constructor supplies the new EmbeddingService.
	this(sessionFactory, repositoryName, getMaxBlobSizeFromEnv(),
			getBatchSizeFromEnv());
}
/**
* Create a new blob indexer with custom size and batch settings.
* <p>
* A new {@link EmbeddingService} is created for this indexer; use the
* five-argument constructor to supply a specific one.
* </p>
*
* @param sessionFactory
* the Hibernate session factory
* @param repositoryName
* the repository name for partitioning
* @param maxBlobSize
* the maximum blob size in bytes to index; larger blobs are skipped
* @param batchSize
* number of entities to persist per transaction batch
*/
public BlobIndexer(SessionFactory sessionFactory,
String repositoryName, int maxBlobSize, int batchSize) {
this(sessionFactory, repositoryName, maxBlobSize, batchSize,
new EmbeddingService());
}
/**
 * Create a new blob indexer with every setting supplied by the caller.
 * <p>
 * A new {@link FileTypeStrategyRegistry} is created for this indexer.
 * </p>
 *
 * @param sessionFactory
 *            the Hibernate session factory
 * @param repositoryName
 *            the repository name for partitioning
 * @param maxBlobSize
 *            the maximum blob size in bytes to index
 * @param batchSize
 *            number of entities to persist per transaction batch
 * @param embeddingService
 *            the embedding service for semantic vector generation
 */
public BlobIndexer(SessionFactory sessionFactory,
		String repositoryName, int maxBlobSize, int batchSize,
		EmbeddingService embeddingService) {
	// Collaborators first, then the numeric limits.
	this.sessionFactory = sessionFactory;
	this.embeddingService = embeddingService;
	this.strategyRegistry = new FileTypeStrategyRegistry();

	this.repositoryName = repositoryName;
	this.batchSize = batchSize;
	this.maxBlobSize = maxBlobSize;
}
/**
 * Index the blobs found in a commit's tree.
 * <p>
 * The tree is walked recursively. When file path history is enabled, one
 * {@link FilePathHistory} row is recorded for every tree entry. An entry's
 * content is then indexed unless one of the following holds:
 * </p>
 * <ul>
 * <li>it is a submodule link (its object is a commit that normally lives
 * in another repository, so it cannot be opened as a blob),</li>
 * <li>its extension is in the skip list,</li>
 * <li>only {@code .java} files are enabled and it is not one,</li>
 * <li>its blob OID is already indexed for this repository,</li>
 * <li>it is larger than the configured maximum size, or</li>
 * <li>its leading bytes contain a NUL byte.</li>
 * </ul>
 * <p>
 * The file-type switch, history switch and skip list are read from the
 * environment once per call. Entities are persisted in batches of the
 * configured size, with a final flush after the walk.
 * </p>
 *
 * @param repo
 *            the repository to read objects from
 * @param commitId
 *            the commit object ID whose tree will be walked
 * @return the number of blobs newly indexed by this call
 * @throws IOException
 *             if an error occurs reading objects
 */
public int indexCommitBlobs(Repository repo, ObjectId commitId)
		throws IOException {
	String commitName = commitId.name();
	LOG.log(Level.INFO, "Starting blob indexing for commit {0} in {1}", //$NON-NLS-1$
			new Object[] { commitName, repositoryName });
	Set<String> alreadyIndexed = loadIndexedBlobOids();
	List<JavaBlobIndex> batch = new ArrayList<>();
	List<FilePathHistory> historyBatch = new ArrayList<>();
	int count = 0;
	try (RevWalk rw = new RevWalk(repo)) {
		RevCommit commit = rw.parseCommit(commitId);
		// Look the author up once instead of once per derived field.
		org.eclipse.jgit.lib.PersonIdent author = commit.getAuthorIdent();
		String commitAuthor = author != null ? author.getName() : null;
		java.time.Instant commitDate = author != null
				? author.getWhenAsInstant() : null;
		try (ObjectReader reader = repo.newObjectReader();
				TreeWalk tw = new TreeWalk(reader)) {
			tw.addTree(commit.getTree());
			tw.setRecursive(true);
			boolean allFileTypes = isAllFileTypesEnabled();
			boolean trackHistory = isFilePathHistoryEnabled();
			Set<String> skipExts = getSkipExtensions();
			while (tw.next()) {
				String path = tw.getPathString();
				ObjectId entryId = tw.getObjectId(0);
				String blobOid = entryId.name();
				// Track file path history if enabled
				if (trackHistory) {
					FilePathHistory fph = new FilePathHistory();
					fph.setRepositoryName(repositoryName);
					fph.setCommitObjectId(commitName);
					fph.setFilePath(path);
					fph.setBlobObjectId(blobOid);
					fph.setFileType(detectFileType(path));
					fph.setCommitTime(commitDate);
					historyBatch.add(fph);
					if (historyBatch.size() >= batchSize) {
						persistHistoryBatch(historyBatch);
						historyBatch.clear();
					}
				}
				// A gitlink points at a commit of a submodule, not at a
				// blob of this repository; opening it would fail with a
				// MissingObjectException and abort the whole run.
				if (org.eclipse.jgit.lib.FileMode.GITLINK
						.equals(tw.getRawMode(0))) {
					continue;
				}
				if (isBinaryExtension(path, skipExts)) {
					continue;
				}
				if (!allFileTypes
						&& !path.endsWith(".java")) { //$NON-NLS-1$
					continue;
				}
				// Cheap in-memory dedupe before touching the object store.
				if (alreadyIndexed.contains(blobOid)) {
					continue;
				}
				ObjectLoader loader = reader.open(entryId);
				long blobSize = loader.getSize();
				if (blobSize > maxBlobSize) {
					LOG.log(Level.FINE,
							"Skipping blob too large ({0} bytes): {1}", //$NON-NLS-1$
							new Object[] { Long.valueOf(blobSize), path });
					continue;
				}
				// The size was checked above, so pass the limit explicitly:
				// this also works when the loader reports the object as
				// "large" for streaming purposes.
				byte[] bytes = loader.getCachedBytes(maxBlobSize);
				if (isBinaryContent(bytes)) {
					LOG.log(Level.FINE,
							"Skipping binary blob: {0}", path); //$NON-NLS-1$
					continue;
				}
				String source = new String(bytes,
						StandardCharsets.UTF_8);
				FileTypeStrategy strategy = strategyRegistry
						.getStrategy(path);
				BlobIndexData blobData = strategy.extract(source,
						path);
				JavaBlobIndex idx = toBlobIndex(blobData, path,
						blobOid, commitName, commitAuthor,
						commitDate);
				batch.add(idx);
				// Remember the OID so the same content at another path in
				// this tree is not indexed twice.
				alreadyIndexed.add(blobOid);
				count++;
				if (batch.size() >= batchSize) {
					persistBatch(batch);
					batch.clear();
				}
			}
		}
	}
	// Flush whatever did not fill a complete batch.
	if (!batch.isEmpty()) {
		persistBatch(batch);
	}
	if (!historyBatch.isEmpty()) {
		persistHistoryBatch(historyBatch);
	}
	LOG.log(Level.INFO,
			"Completed blob indexing for commit {0}: {1} blobs indexed", //$NON-NLS-1$
			new Object[] { commitName, Integer.valueOf(count) });
	return count;
}
/**
 * Persist a batch of file path history rows in a single transaction.
 * <p>
 * If persisting or committing fails, the transaction is rolled back before
 * the original exception is rethrown, so the session is never closed with
 * a transaction still open. A failure of the rollback itself is attached
 * to the original exception as a suppressed exception.
 * </p>
 *
 * @param entities
 *            the rows to persist; an empty list is a no-op
 */
private void persistHistoryBatch(List<FilePathHistory> entities) {
	if (entities.isEmpty()) {
		return;
	}
	try (Session session = sessionFactory.openSession()) {
		session.beginTransaction();
		try {
			for (FilePathHistory fph : entities) {
				session.persist(fph);
			}
			session.getTransaction().commit();
		} catch (RuntimeException e) {
			try {
				if (session.getTransaction().isActive()) {
					session.getTransaction().rollback();
				}
			} catch (RuntimeException rollbackFailure) {
				// Keep the root cause; report the rollback problem with it.
				e.addSuppressed(rollbackFailure);
			}
			throw e;
		}
	}
}
/**
 * Derive a file type from the extension of a path's last segment.
 * <p>
 * Only the file name after the last {@code '/'} is inspected, so a dot in a
 * directory name is never taken for an extension. The extension is
 * lower-cased with {@link java.util.Locale#ROOT} so the result does not
 * depend on the JVM's default locale.
 * </p>
 *
 * @param path
 *            the {@code '/'}-separated path of the tree entry
 * @return the lower-case extension without the dot, or {@code "unknown"}
 *         if the file name has no dot or ends with one
 */
private static String detectFileType(String path) {
	String fileName = path.substring(path.lastIndexOf('/') + 1);
	int dot = fileName.lastIndexOf('.');
	if (dot >= 0 && dot < fileName.length() - 1) {
		return fileName.substring(dot + 1)
				.toLowerCase(java.util.Locale.ROOT);
	}
	return "unknown"; //$NON-NLS-1$
}
/**
 * Persist a batch of blob index entities in a single transaction.
 * <p>
 * If persisting or committing fails, the transaction is rolled back before
 * the original exception is rethrown, so the session is never closed with
 * a transaction still open. A failure of the rollback itself is attached
 * to the original exception as a suppressed exception.
 * </p>
 *
 * @param entities
 *            the entities to persist; an empty list is a no-op
 */
private void persistBatch(List<JavaBlobIndex> entities) {
	if (entities.isEmpty()) {
		return;
	}
	try (Session session = sessionFactory.openSession()) {
		session.beginTransaction();
		try {
			for (JavaBlobIndex idx : entities) {
				session.persist(idx);
			}
			session.getTransaction().commit();
		} catch (RuntimeException e) {
			try {
				if (session.getTransaction().isActive()) {
					session.getTransaction().rollback();
				}
			} catch (RuntimeException rollbackFailure) {
				// Keep the root cause; report the rollback problem with it.
				e.addSuppressed(rollbackFailure);
			}
			throw e;
		}
	}
}
/**
 * Fetch, in one query, the OIDs of all blobs already indexed for this
 * repository.
 * <p>
 * Deduplication is by blob OID only: identical content is indexed once no
 * matter how many commits reference it, and the {@code commitObjectId}
 * kept on the {@link JavaBlobIndex} is that of the first commit in which
 * the blob was encountered.
 * </p>
 * <p>
 * Loading the set up front replaces one lookup query per blob during
 * indexing. For repositories with a very large number of indexed blobs the
 * set may use a significant amount of memory.
 * </p>
 *
 * @return a new, modifiable set of blob OID hex strings already in the
 *         index
 */
private Set<String> loadIndexedBlobOids() {
	final String hql = "SELECT j.blobObjectId FROM JavaBlobIndex j WHERE j.repositoryName = :repo"; //$NON-NLS-1$
	try (Session session = sessionFactory.openSession()) {
		Set<String> indexed = new HashSet<>();
		indexed.addAll(session.createQuery(hql, String.class)
				.setParameter("repo", repositoryName) //$NON-NLS-1$
				.getResultList());
		return indexed;
	}
}
/**
 * Heuristically decide whether content is binary.
 * <p>
 * Content counts as binary when a NUL byte occurs within its first
 * {@code BINARY_CHECK_SIZE} bytes (8 KB); bytes beyond that window are not
 * examined.
 * </p>
 *
 * @param bytes
 *            the file content
 * @return {@code true} if the content appears to be binary
 */
public static boolean isBinaryContent(byte[] bytes) {
	final int limit = bytes.length < BINARY_CHECK_SIZE ? bytes.length
			: BINARY_CHECK_SIZE;
	int pos = 0;
	while (pos < limit) {
		if (bytes[pos] == 0) {
			return true;
		}
		pos++;
	}
	return false;
}
/**
 * Check whether a path's extension is in the set of extensions to skip.
 * <p>
 * The extension is the part of the file name from its last dot onward,
 * including the dot. A dot that belongs to a directory name is ignored.
 * The extension is lower-cased with {@link java.util.Locale#ROOT} so that
 * matching does not depend on the JVM's default locale.
 * </p>
 *
 * @param path
 *            the {@code '/'}-separated path of the tree entry
 * @param skipExts
 *            lower-case extensions, each including the leading dot
 * @return {@code true} if the file name has an extension contained in
 *         {@code skipExts}
 */
private static boolean isBinaryExtension(String path,
		Set<String> skipExts) {
	int dot = path.lastIndexOf('.');
	// No dot at all, or the last dot sits in a directory name.
	if (dot < 0 || dot < path.lastIndexOf('/')) {
		return false;
	}
	String extension = path.substring(dot)
			.toLowerCase(java.util.Locale.ROOT);
	return skipExts.contains(extension);
}
/**
 * Read the maximum blob size from {@code JGIT_INDEX_MAX_BLOB_SIZE}.
 * <p>
 * Surrounding whitespace is ignored. A value that is not an integer, or
 * that is negative, is rejected with a warning instead of being silently
 * ignored, and the default is used.
 * </p>
 *
 * @return the configured size in bytes, or
 *         {@link #DEFAULT_MAX_BLOB_SIZE} if the variable is unset, blank
 *         or invalid
 */
private static int getMaxBlobSizeFromEnv() {
	String val = System.getenv("JGIT_INDEX_MAX_BLOB_SIZE"); //$NON-NLS-1$
	if (val == null || val.trim().isEmpty()) {
		return DEFAULT_MAX_BLOB_SIZE;
	}
	try {
		int parsed = Integer.parseInt(val.trim());
		if (parsed >= 0) {
			return parsed;
		}
	} catch (NumberFormatException ignored) {
		// Reported by the warning below, together with negative values.
	}
	LOG.log(Level.WARNING,
			"Ignoring invalid JGIT_INDEX_MAX_BLOB_SIZE value {0}; using default {1}", //$NON-NLS-1$
			new Object[] { val, String.valueOf(DEFAULT_MAX_BLOB_SIZE) });
	return DEFAULT_MAX_BLOB_SIZE;
}
/**
 * Read the persist batch size from {@code JGIT_INDEX_BATCH_SIZE}.
 * <p>
 * Surrounding whitespace is ignored. A value that is not an integer, or
 * that is smaller than one, is rejected with a warning instead of being
 * silently ignored, and the default is used.
 * </p>
 *
 * @return the configured batch size, or {@link #DEFAULT_BATCH_SIZE} if
 *         the variable is unset, blank or invalid
 */
private static int getBatchSizeFromEnv() {
	String val = System.getenv("JGIT_INDEX_BATCH_SIZE"); //$NON-NLS-1$
	if (val == null || val.trim().isEmpty()) {
		return DEFAULT_BATCH_SIZE;
	}
	try {
		int parsed = Integer.parseInt(val.trim());
		if (parsed >= 1) {
			return parsed;
		}
	} catch (NumberFormatException ignored) {
		// Reported by the warning below, together with values below one.
	}
	LOG.log(Level.WARNING,
			"Ignoring invalid JGIT_INDEX_BATCH_SIZE value {0}; using default {1}", //$NON-NLS-1$
			new Object[] { val, String.valueOf(DEFAULT_BATCH_SIZE) });
	return DEFAULT_BATCH_SIZE;
}
/**
 * Tell whether all file types are indexed rather than only {@code .java}
 * files.
 *
 * @return {@code false} only if {@code JGIT_INDEX_ALL_FILE_TYPES} is set
 *         to {@code "false"} (compared ignoring case); {@code true} for
 *         any other value or when the variable is unset
 */
private static boolean isAllFileTypesEnabled() {
	// equalsIgnoreCase(null) is false, so an unset variable means enabled.
	String configured = System.getenv("JGIT_INDEX_ALL_FILE_TYPES"); //$NON-NLS-1$
	boolean disabled = "false".equalsIgnoreCase(configured); //$NON-NLS-1$
	return !disabled;
}
/**
 * Tell whether a file path history row is recorded for each tree entry.
 *
 * @return {@code false} only if {@code JGIT_INDEX_FILE_PATH_HISTORY} is
 *         set to {@code "false"} (compared ignoring case); {@code true}
 *         for any other value or when the variable is unset
 */
private static boolean isFilePathHistoryEnabled() {
	// equalsIgnoreCase(null) is false, so an unset variable means enabled.
	String configured = System.getenv("JGIT_INDEX_FILE_PATH_HISTORY"); //$NON-NLS-1$
	boolean disabled = "false".equalsIgnoreCase(configured); //$NON-NLS-1$
	return !disabled;
}
/**
 * Determine which file extensions are skipped without reading the blob.
 * <p>
 * When {@code JGIT_INDEX_SKIP_EXTENSIONS} is set to a non-empty value it
 * is parsed as a comma-separated list that replaces the built-in list.
 * Each entry is trimmed and lower-cased with
 * {@link java.util.Locale#ROOT}; blank entries are dropped. Entries may be
 * written with or without the leading dot ({@code png} and {@code .png}
 * are equivalent), because extensions are matched including the dot.
 * </p>
 *
 * @return the configured extensions, each with a leading dot, or the
 *         built-in {@code BINARY_EXTENSIONS} if the variable is unset or
 *         empty
 */
private static Set<String> getSkipExtensions() {
	String val = System.getenv("JGIT_INDEX_SKIP_EXTENSIONS"); //$NON-NLS-1$
	if (val == null || val.isEmpty()) {
		return BINARY_EXTENSIONS;
	}
	Set<String> exts = new HashSet<>();
	for (String raw : val.split(",")) { //$NON-NLS-1$
		String ext = raw.trim().toLowerCase(java.util.Locale.ROOT);
		if (ext.isEmpty()) {
			continue;
		}
		// Without the dot an entry could never match a file extension.
		exts.add(ext.charAt(0) == '.' ? ext : "." + ext); //$NON-NLS-1$
	}
	return exts;
}
/**
 * Build a {@link JavaBlobIndex} entity from extracted blob data.
 * <p>
 * Copies the extracted fields, adds the repository, blob, commit and
 * author details, and attaches a semantic embedding when one can be
 * produced. Embedding is best effort: with no embedding service, or if
 * generation fails, the entity is returned without an embedding and the
 * failure is logged at {@code FINE} level together with its stack trace.
 * </p>
 *
 * @param data
 *            the metadata extracted from the blob
 * @param filePath
 *            the path of the blob within the commit's tree
 * @param blobOid
 *            the blob object ID as a hex string
 * @param commitOid
 *            the commit object ID as a hex string
 * @param commitAuthor
 *            the commit author's name, may be {@code null}
 * @param commitDate
 *            the author timestamp of the commit, may be {@code null}
 * @return the populated, not yet persisted entity
 */
private JavaBlobIndex toBlobIndex(BlobIndexData data, String filePath,
		String blobOid, String commitOid, String commitAuthor,
		java.time.Instant commitDate) {
	JavaBlobIndex idx = new JavaBlobIndex();
	idx.setRepositoryName(repositoryName);
	idx.setBlobObjectId(blobOid);
	idx.setCommitObjectId(commitOid);
	idx.setFileType(data.getFileType());
	idx.setFilePath(filePath);
	idx.setPackageName(data.getPackageOrNamespace());
	idx.setDeclaredTypes(data.getDeclaredTypes());
	idx.setFullyQualifiedNames(data.getFullyQualifiedNames());
	idx.setDeclaredMethods(data.getDeclaredMethods());
	idx.setDeclaredFields(data.getDeclaredFields());
	idx.setExtendsTypes(data.getExtendsTypes());
	idx.setImplementsTypes(data.getImplementsTypes());
	idx.setImportStatements(data.getImportStatements());
	idx.setSourceSnippet(data.getSourceSnippet());
	idx.setProjectName(data.getProjectName());
	idx.setSimpleClassName(data.getSimpleClassName());
	idx.setTypeKind(data.getTypeKind());
	idx.setVisibility(data.getVisibility());
	idx.setAnnotations(data.getAnnotations());
	idx.setLineCount(data.getLineCount());
	idx.setTypeDocumentation(data.getTypeDocumentation());
	idx.setMethodSignatures(data.getMethodSignatures());
	idx.setReferencedTypes(data.getReferencedTypes());
	idx.setStringLiterals(data.getStringLiterals());
	idx.setHasMainMethod(data.isHasMainMethod());
	idx.setCommitAuthor(commitAuthor);
	idx.setCommitDate(commitDate);
	// Generate semantic embedding if service is available. The explicit
	// null check replaces relying on a swallowed NullPointerException.
	if (embeddingService != null) {
		try {
			String embeddingText = EmbeddingService.buildEmbeddingText(
					data.getSimpleClassName(),
					data.getTypeDocumentation(),
					data.getMethodSignatures(),
					data.getPackageOrNamespace());
			float[] embedding = embeddingService.embed(embeddingText);
			if (embedding != null) {
				idx.setSemanticEmbedding(embedding);
				idx.setHasEmbedding(true);
			}
		} catch (Exception e) {
			// Best effort: a failed embedding must not stop indexing, but
			// keep the throwable so the cause can be diagnosed.
			LOG.log(Level.FINE,
					"Embedding generation failed for blob " + blobOid //$NON-NLS-1$
							+ ": " + e.getMessage(), //$NON-NLS-1$
					e);
		}
	}
	return idx;
}
}