// EmbeddingBackfillService.java

/*******************************************************************************
 * Copyright (c) 2026 Carsten Hammer.
 *
 * This program and the accompanying materials
 * are made available under the terms of the Eclipse Public License 2.0
 * which accompanies this distribution, and is available at
 * https://www.eclipse.org/legal/epl-2.0/
 *
 * SPDX-License-Identifier: EPL-2.0
 *
 * Contributors:
 *     Carsten Hammer
 *******************************************************************************/
package org.eclipse.jgit.storage.hibernate.search;

import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.eclipse.jgit.storage.hibernate.entity.JavaBlobIndex;
import org.hibernate.Session;
import org.hibernate.SessionFactory;

/**
 * Service for backfilling semantic embeddings on existing
 * {@link JavaBlobIndex} entries that were indexed before embedding support was
 * added.
 * <p>
 * Processes entries in configurable batches to avoid excessive memory usage.
 * Each batch reads entries with {@code hasEmbedding = false}, computes their
 * embedding via {@link EmbeddingService}, and updates them in a single
 * transaction. If anything fails while a batch is being written, that batch's
 * transaction is rolled back and the failure is propagated to the caller.
 * </p>
 * <p>
 * Entries for which {@link EmbeddingService} returns no embedding are left
 * untouched; they still match the selection query and will be picked up again
 * by a later batch. Processing stops as soon as a batch does not update any
 * entry, so such entries can never cause an endless loop.
 * </p>
 *
 * <h3>Usage</h3>
 * <pre>
 * EmbeddingService embeddingService = new EmbeddingService();
 * EmbeddingBackfillService.backfill(sessionFactory, embeddingService, 100);
 * </pre>
 *
 * <h3>Configuration</h3>
 * <ul>
 * <li>{@code JGIT_BACKFILL_ON_STARTUP} — set to {@code true} to trigger
 * backfill automatically when the session factory is created
 * (default: {@code false})</li>
 * <li>{@code JGIT_EMBEDDING_BATCH_SIZE} — batch size for backfill processing;
 * must be a positive integer, otherwise the default is used
 * (default: {@code 100})</li>
 * </ul>
 */
public class EmbeddingBackfillService {

	private static final Logger LOG = Logger
			.getLogger(EmbeddingBackfillService.class.getName());

	/** Default batch size for backfill processing. */
	public static final int DEFAULT_BATCH_SIZE = 100;

	private EmbeddingBackfillService() {
		// utility class
	}

	/**
	 * Backfill embeddings for all entries without embeddings.
	 * <p>
	 * Batches are fetched and processed until no entry without an embedding
	 * is left, or until a batch yields no update at all (which happens when
	 * the embedding service returns {@code null} for every entry of the
	 * batch).
	 * </p>
	 *
	 * @param sessionFactory
	 *            the Hibernate session factory
	 * @param embeddingService
	 *            the embedding service to generate vectors
	 * @param batchSize
	 *            number of entries to process per batch; a value that is not
	 *            positive is replaced by {@link #DEFAULT_BATCH_SIZE}
	 * @return the number of entries updated
	 */
	public static int backfill(SessionFactory sessionFactory,
			EmbeddingService embeddingService, int batchSize) {
		if (!embeddingService.isAvailable()) {
			LOG.log(Level.INFO,
					"Embedding service not available — skipping backfill"); //$NON-NLS-1$
			return 0;
		}
		// A non-positive limit cannot be used as a page size for the fetch
		// query, so fall back to the default instead of failing or doing
		// nothing.
		int effectiveBatchSize = batchSize;
		if (effectiveBatchSize <= 0) {
			LOG.log(Level.WARNING,
					"Invalid backfill batch size: {0} — using default {1}", //$NON-NLS-1$
					new Object[] { Integer.valueOf(batchSize),
							Integer.valueOf(DEFAULT_BATCH_SIZE) });
			effectiveBatchSize = DEFAULT_BATCH_SIZE;
		}
		int totalUpdated = 0;
		List<JavaBlobIndex> batch;
		while (true) {
			batch = fetchBatch(sessionFactory, effectiveBatchSize);
			if (batch.isEmpty()) {
				break;
			}
			int updated = processBatch(sessionFactory, embeddingService,
					batch);
			if (updated == 0) {
				// Nothing was flagged as embedded, so the next fetch would
				// select entries that are still unflagged again.
				LOG.log(Level.WARNING,
						"No entries updated in last backfill batch — " //$NON-NLS-1$
								+ "terminating to avoid potential infinite loop"); //$NON-NLS-1$
				break;
			}
			totalUpdated += updated;
			LOG.log(Level.INFO,
					"Backfill progress: {0} entries updated so far", //$NON-NLS-1$
					Integer.valueOf(totalUpdated));
		}
		LOG.log(Level.INFO,
				"Backfill complete: {0} entries updated", //$NON-NLS-1$
				Integer.valueOf(totalUpdated));
		return totalUpdated;
	}

	/**
	 * Backfill embeddings using the batch size configured through the
	 * {@code JGIT_EMBEDDING_BATCH_SIZE} environment variable, or
	 * {@link #DEFAULT_BATCH_SIZE} if that variable is unset or invalid.
	 *
	 * @param sessionFactory
	 *            the Hibernate session factory
	 * @param embeddingService
	 *            the embedding service to generate vectors
	 * @return the number of entries updated
	 */
	public static int backfill(SessionFactory sessionFactory,
			EmbeddingService embeddingService) {
		return backfill(sessionFactory, embeddingService,
				getBatchSizeFromEnv());
	}

	/**
	 * Check if backfill on startup is enabled.
	 *
	 * @return {@code true} if {@code JGIT_BACKFILL_ON_STARTUP} is set to
	 *         {@code true} (compared case-insensitively)
	 */
	public static boolean isBackfillOnStartupEnabled() {
		String val = System.getenv("JGIT_BACKFILL_ON_STARTUP"); //$NON-NLS-1$
		return "true".equalsIgnoreCase(val); //$NON-NLS-1$
	}

	/**
	 * Load up to {@code batchSize} entries that do not have an embedding yet.
	 * The session is closed before returning, so the returned entities are
	 * detached.
	 */
	private static List<JavaBlobIndex> fetchBatch(
			SessionFactory sessionFactory, int batchSize) {
		try (Session session = sessionFactory.openSession()) {
			return session.createQuery(
					"FROM JavaBlobIndex j WHERE j.hasEmbedding = false", //$NON-NLS-1$
					JavaBlobIndex.class)
					.setMaxResults(batchSize)
					.getResultList();
		}
	}

	/**
	 * Compute and store the embeddings of one batch in a single transaction.
	 * Entries for which no embedding could be computed are skipped. On any
	 * runtime failure the transaction is rolled back and the failure is
	 * rethrown.
	 *
	 * @return the number of entries that received an embedding
	 */
	private static int processBatch(SessionFactory sessionFactory,
			EmbeddingService embeddingService,
			List<JavaBlobIndex> entries) {
		int updated = 0;
		try (Session session = sessionFactory.openSession()) {
			session.beginTransaction();
			try {
				for (JavaBlobIndex entry : entries) {
					String embeddingText = EmbeddingService
							.buildEmbeddingText(
									entry.getSimpleClassName(),
									entry.getTypeDocumentation(),
									entry.getMethodSignatures(),
									entry.getPackageName());
					float[] embedding = embeddingService
							.embed(embeddingText);
					if (embedding != null) {
						// The entry is detached (loaded in another session),
						// so changes are applied to the merged, managed copy.
						JavaBlobIndex managed = session
								.merge(entry);
						managed.setSemanticEmbedding(embedding);
						managed.setHasEmbedding(true);
						updated++;
					}
				}
				session.getTransaction().commit();
			} catch (RuntimeException e) {
				rollbackQuietly(session, e);
				throw e;
			}
		}
		return updated;
	}

	/**
	 * Roll back the session's transaction if it is still active. A failure
	 * of the rollback itself is attached to {@code cause} as a suppressed
	 * exception so that it does not hide the original problem.
	 */
	private static void rollbackQuietly(Session session,
			RuntimeException cause) {
		try {
			if (session.getTransaction().isActive()) {
				session.getTransaction().rollback();
			}
		} catch (RuntimeException rollbackFailure) {
			cause.addSuppressed(rollbackFailure);
		}
	}

	/**
	 * Read the batch size from {@code JGIT_EMBEDDING_BATCH_SIZE}. Values that
	 * are not a positive integer are reported with a warning and replaced by
	 * {@link #DEFAULT_BATCH_SIZE}.
	 */
	private static int getBatchSizeFromEnv() {
		String val = System.getenv("JGIT_EMBEDDING_BATCH_SIZE"); //$NON-NLS-1$
		if (val == null) {
			return DEFAULT_BATCH_SIZE;
		}
		try {
			int parsed = Integer.parseInt(val.trim());
			if (parsed > 0) {
				return parsed;
			}
		} catch (NumberFormatException e) {
			// not a number — reported by the warning below
		}
		LOG.log(Level.WARNING,
				"Invalid JGIT_EMBEDDING_BATCH_SIZE value: {0} — using default {1}", //$NON-NLS-1$
				new Object[] { val,
						Integer.valueOf(DEFAULT_BATCH_SIZE) });
		return DEFAULT_BATCH_SIZE;
	}
}