// MiningCli.java
/*******************************************************************************
* Copyright (c) 2025 Carsten Hammer.
*
* This program and the accompanying materials
* are made available under the terms of the Eclipse Public License 2.0
* which accompanies this distribution, and is available at
* https://www.eclipse.org/legal/epl-2.0/
*
* SPDX-License-Identifier: EPL-2.0
*
* Contributors:
* Carsten Hammer
*******************************************************************************/
package org.sandbox.mining.core;
import java.io.IOException;
import java.io.PrintStream;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.time.Duration;
import java.time.Instant;
import java.time.ZoneOffset;
import java.time.format.DateTimeFormatter;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.eclipse.jgit.api.errors.GitAPIException;
import org.eclipse.jgit.revwalk.RevCommit;
import org.sandbox.mining.core.category.CategoryManager;
import org.sandbox.mining.core.comparison.DeltaReport;
import org.sandbox.mining.core.comparison.ErrorFeedbackCollector;
import org.sandbox.mining.core.comparison.ExternalEvaluationImporter;
import org.sandbox.mining.core.comparison.HintFileUpdater;
import org.sandbox.mining.core.comparison.MiningComparator;
import org.sandbox.mining.core.config.KnownRulesStore;
import org.sandbox.mining.core.config.MiningConfig;
import org.sandbox.mining.core.config.MiningState;
import org.sandbox.mining.core.config.MiningState.DeferredCommit;
import org.sandbox.mining.core.config.MiningState.RepoState;
import org.sandbox.mining.core.config.RepoEntry;
import org.sandbox.mining.core.enrichment.TypeContextEnricher;
import org.sandbox.mining.core.filter.CommitKeywordFilter;
import org.sandbox.jdt.triggerpattern.internal.DslValidator;
import org.sandbox.jdt.triggerpattern.llm.CommitEvaluation;
import org.sandbox.jdt.triggerpattern.llm.DslContextCollector;
import org.sandbox.jdt.triggerpattern.llm.LlmClient;
import org.sandbox.jdt.triggerpattern.llm.LlmClientFactory;
import org.sandbox.jdt.triggerpattern.llm.PromptBuilder;
import org.sandbox.jdt.triggerpattern.llm.PromptBuilder.CommitData;
import org.sandbox.jdt.triggerpattern.git.CommitWalker;
import org.sandbox.jdt.triggerpattern.git.DiffExtractor;
import org.sandbox.jdt.triggerpattern.git.RepoCloner;
import org.sandbox.mining.core.report.GithubPagesGenerator;
import org.sandbox.mining.core.report.JsonReporter;
import org.sandbox.mining.core.report.NetBeansReporter;
import org.sandbox.mining.core.report.ReportAggregator;
import org.sandbox.mining.core.report.StatisticsCollector;
/**
* CLI entry point for AI-powered commit analysis.
*
* <p>Analyzes Eclipse project commits using a configurable LLM provider and
* generates reports for TriggerPattern DSL mining.</p>
*
* <p>Usage: java -jar sandbox-mining-core.jar [options]</p>
*/
public class MiningCli {
/**
 * Output stream for informational messages.
 * Default: {@code System.out}. Set to {@code System.err} when {@code --strict-netbeans} is active,
 * keeping stdout clean for pipe-friendly NetBeans-format output.
 */
private PrintStream miningLog = System.out;
/**
 * Known rules context for LLM prompts, formatted by {@link KnownRulesStore}.
 * Populated in {@link #run(String[])} and consumed in {@link #processBatch}.
 */
private String knownRulesContext = ""; //$NON-NLS-1$
// CLI option names. Each is parsed in run(String[]); see printUsage() for user-facing help.
private static final String OPT_CONFIG = "--config"; //$NON-NLS-1$
private static final String OPT_STATE = "--state"; //$NON-NLS-1$
private static final String OPT_SANDBOX_ROOT = "--sandbox-root"; //$NON-NLS-1$
private static final String OPT_BATCH_SIZE = "--batch-size"; //$NON-NLS-1$
private static final String OPT_OUTPUT = "--output"; //$NON-NLS-1$
private static final String OPT_COMMITS_PER_REQUEST = "--commits-per-request"; //$NON-NLS-1$
private static final String OPT_MAX_FAILURE_DURATION = "--max-failure-duration"; //$NON-NLS-1$
private static final String OPT_LLM_PROVIDER = "--llm-provider"; //$NON-NLS-1$
private static final String OPT_RETRY_DEFERRED = "--retry-deferred"; //$NON-NLS-1$
private static final String OPT_RESET_LEARNED_LIMITS = "--reset-learned-limits"; //$NON-NLS-1$
private static final String OPT_COMMIT_LIST = "--commit-list"; //$NON-NLS-1$
private static final String OPT_MAX_DURATION = "--max-duration"; //$NON-NLS-1$
private static final String OPT_COMPARISON_MODE = "--comparison-mode"; //$NON-NLS-1$
private static final String OPT_COPILOT_RESULTS = "--copilot-results"; //$NON-NLS-1$
private static final String OPT_ENRICH_TYPE_CONTEXT = "--enrich-type-context"; //$NON-NLS-1$
private static final String OPT_KEYWORD_FILTER = "--keyword-filter"; //$NON-NLS-1$
private static final String OPT_OUTPUT_FORMAT = "--output-format"; //$NON-NLS-1$
private static final String OPT_STRICT_NETBEANS = "--strict-netbeans"; //$NON-NLS-1$
// Defaults used when the corresponding CLI option is not supplied.
private static final int DEFAULT_BATCH_SIZE = 500;
private static final int DEFAULT_COMMITS_PER_REQUEST = 4;
private static final int DEFAULT_MAX_FAILURE_DURATION_SECONDS = 300;
// Thread-safe formatter (DateTimeFormatter is immutable); commit author dates
// are always rendered in UTC for reproducible log output.
private static final DateTimeFormatter COMMIT_DATE_FORMAT =
        DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss").withZone(ZoneOffset.UTC); //$NON-NLS-1$
/**
 * Default upper bound for useful commits by diff size.
 * <p>Used as a fallback when the configuration does not specify
 * {@code max-diff-lines-per-commit}. The configured value from
 * {@code repos.yml} is preferred.</p>
 */
private static final int MAX_USEFUL_DIFF_LINES = 300;
/**
 * JVM entry point. Delegates to {@link #run(String[])} and exits with
 * status 1 on any failure.
 *
 * @param args CLI arguments, forwarded unchanged to {@code run}
 */
public static void main(String[] args) {
    try {
        new MiningCli().run(args);
    } catch (Exception e) {
        // getMessage() can be null (e.g. NullPointerException); fall back to
        // the exception type so the one-line summary is never "Error: null".
        String message = e.getMessage() != null ? e.getMessage() : e.getClass().getSimpleName();
        System.err.println("Error: " + message);
        e.printStackTrace();
        System.exit(1);
    }
}
/**
 * Runs the mining process with the given arguments.
 *
 * <p>Parses CLI options, loads configuration and persisted state, wires up the
 * LLM client and all collaborators, mines the configured repositories, retries
 * deferred commits, and finally writes reports, hint files, known rules and
 * the updated state file.</p>
 *
 * @param args CLI arguments
 * @throws IOException if an I/O error occurs
 * @throws GitAPIException if a Git operation fails
 * @throws IllegalArgumentException if a numeric option is out of range
 */
public void run(String[] args) throws IOException, GitAPIException {
    long startTimeMs = System.currentTimeMillis();
    // --- Option defaults ---
    Path configPath = null;
    Path statePath = null;
    Path sandboxRoot = Path.of("."); //$NON-NLS-1$
    int batchSize = DEFAULT_BATCH_SIZE;
    int commitsPerRequest = DEFAULT_COMMITS_PER_REQUEST;
    int maxFailureDurationSeconds = DEFAULT_MAX_FAILURE_DURATION_SECONDS;
    Path outputDir = Path.of("output"); //$NON-NLS-1$
    String llmProvider = null;
    boolean retryDeferred = false;
    boolean resetLearnedLimits = false;
    Path commitListPath = null;
    int maxDurationMinutes = 0;
    boolean comparisonMode = false;
    Path copilotResultsPath = null;
    boolean enrichTypeContext = false;
    Path keywordFilterPath = null;
    String outputFormat = "json"; //$NON-NLS-1$
    boolean strictNetbeans = false;
    for (int i = 0; i < args.length; i++) {
        switch (args[i]) {
        case OPT_CONFIG:
            configPath = Path.of(requireArg(args, ++i, OPT_CONFIG));
            break;
        case OPT_STATE:
            statePath = Path.of(requireArg(args, ++i, OPT_STATE));
            break;
        case OPT_SANDBOX_ROOT:
            sandboxRoot = Path.of(requireArg(args, ++i, OPT_SANDBOX_ROOT));
            break;
        case OPT_BATCH_SIZE:
            batchSize = Integer.parseInt(requireArg(args, ++i, OPT_BATCH_SIZE));
            // Validate consistently with --commits-per-request below.
            if (batchSize < 1) {
                throw new IllegalArgumentException(
                        "--batch-size must be >= 1 but was " + batchSize); //$NON-NLS-1$
            }
            break;
        case OPT_COMMITS_PER_REQUEST:
            commitsPerRequest = Integer.parseInt(requireArg(args, ++i, OPT_COMMITS_PER_REQUEST));
            if (commitsPerRequest < 1) {
                throw new IllegalArgumentException(
                        "--commits-per-request must be >= 1 but was " + commitsPerRequest); //$NON-NLS-1$
            }
            break;
        case OPT_OUTPUT:
            outputDir = Path.of(requireArg(args, ++i, OPT_OUTPUT));
            break;
        case OPT_MAX_FAILURE_DURATION:
            maxFailureDurationSeconds = Integer.parseInt(requireArg(args, ++i, OPT_MAX_FAILURE_DURATION));
            if (maxFailureDurationSeconds < 10) {
                throw new IllegalArgumentException(
                        "--max-failure-duration must be at least 10 seconds but was " //$NON-NLS-1$
                        + maxFailureDurationSeconds);
            }
            break;
        case OPT_LLM_PROVIDER:
            llmProvider = requireArg(args, ++i, OPT_LLM_PROVIDER);
            break;
        case OPT_RETRY_DEFERRED:
            retryDeferred = true;
            break;
        case OPT_RESET_LEARNED_LIMITS:
            resetLearnedLimits = true;
            break;
        case OPT_COMMIT_LIST:
            commitListPath = Path.of(requireArg(args, ++i, OPT_COMMIT_LIST));
            break;
        case OPT_MAX_DURATION:
            maxDurationMinutes = Integer.parseInt(requireArg(args, ++i, OPT_MAX_DURATION));
            // 0 keeps the "no limit" default; negative values are meaningless.
            if (maxDurationMinutes < 0) {
                throw new IllegalArgumentException(
                        "--max-duration must be >= 0 but was " + maxDurationMinutes); //$NON-NLS-1$
            }
            break;
        case OPT_COMPARISON_MODE:
            comparisonMode = true;
            break;
        case OPT_COPILOT_RESULTS:
            copilotResultsPath = Path.of(requireArg(args, ++i, OPT_COPILOT_RESULTS));
            break;
        case OPT_ENRICH_TYPE_CONTEXT:
            enrichTypeContext = true;
            break;
        case OPT_KEYWORD_FILTER:
            keywordFilterPath = Path.of(requireArg(args, ++i, OPT_KEYWORD_FILTER));
            break;
        case OPT_OUTPUT_FORMAT:
            outputFormat = requireArg(args, ++i, OPT_OUTPUT_FORMAT);
            break;
        case OPT_STRICT_NETBEANS:
            strictNetbeans = true;
            break;
        default:
            System.err.println("Unknown option: " + args[i]); //$NON-NLS-1$
            printUsage();
            return;
        }
    }
    // Default config/state locations are resolved relative to the sandbox root.
    if (configPath == null) {
        configPath = sandboxRoot.resolve(".github/refactoring-mining/repos.yml"); //$NON-NLS-1$
    }
    if (statePath == null) {
        statePath = sandboxRoot.resolve(".github/refactoring-mining/state.json"); //$NON-NLS-1$
    }
    // Use miningLog for all informational output; when --strict-netbeans is set
    // it redirects to stderr so stdout stays clean for pipe-friendly output.
    if (strictNetbeans) {
        miningLog = System.err;
    }
    miningLog.println("=== Sandbox Mining Core ==="); //$NON-NLS-1$
    miningLog.println("Config: " + configPath); //$NON-NLS-1$
    miningLog.println("State:  " + statePath); //$NON-NLS-1$
    miningLog.println("Output: " + outputDir); //$NON-NLS-1$
    MiningConfig config = MiningConfig.parse(configPath);
    MiningState state = MiningState.load(statePath);
    // Back up state before mutating it, so a crashed run can be recovered.
    MiningState.backup(statePath);
    CategoryManager categoryManager = new CategoryManager();
    DslContextCollector dslCollector = new DslContextCollector();
    String dslContext = dslCollector.collectContext(sandboxRoot);
    PromptBuilder promptBuilder = new PromptBuilder();
    DslValidator validator = new DslValidator();
    StatisticsCollector stats = new StatisticsCollector();
    ReportAggregator aggregator = new ReportAggregator();
    // Enrich prompt with type context if requested.
    // The TypeContextEnricher is initialized here but applied per-batch
    // in processBatch() where the actual diff content is available.
    TypeContextEnricher typeEnricher = enrichTypeContext ? new TypeContextEnricher() : null;
    // Collect error feedback from existing evaluations for prompt improvement.
    JsonReporter feedbackReader = new JsonReporter();
    Path existingEvals = outputDir.resolve("evaluations.json"); //$NON-NLS-1$
    List<CommitEvaluation> previousEvals = feedbackReader.loadExistingEvaluations(existingEvals);
    if (!previousEvals.isEmpty()) {
        ErrorFeedbackCollector errorCollector = new ErrorFeedbackCollector();
        errorCollector.collect(previousEvals);
        if (errorCollector.getErrorCount() > 0) {
            promptBuilder.setErrorFeedback(errorCollector.formatFeedback());
            miningLog.println("Loaded " + errorCollector.getErrorCount() //$NON-NLS-1$
                    + " error patterns from previous evaluations for prompt feedback"); //$NON-NLS-1$
        }
    }
    // Load known rules and provide them as LLM context to avoid duplicates.
    Path knownRulesPath = outputDir.resolve("known-rules.json"); //$NON-NLS-1$
    KnownRulesStore knownRules = KnownRulesStore.load(knownRulesPath);
    knownRulesContext = knownRules.formatForPrompt();
    if (!knownRulesContext.isEmpty()) {
        miningLog.println("Loaded " + knownRules.size() + " known rules as LLM context"); //$NON-NLS-1$ //$NON-NLS-2$
    }
    // Initialize keyword filter if specified.
    CommitKeywordFilter keywordFilter = null;
    if (keywordFilterPath != null) {
        keywordFilter = new CommitKeywordFilter(keywordFilterPath);
        miningLog.println("Keyword filter loaded: " + keywordFilter.getKeywords().size() + " keywords"); //$NON-NLS-1$ //$NON-NLS-2$
    }
    // Read commit list if specified — only these commits will be processed.
    Set<String> commitListHashes = null;
    if (commitListPath != null) {
        List<String> hashes = readCommitList(commitListPath);
        commitListHashes = new HashSet<>(hashes);
        miningLog.println("Commit list loaded: " + commitListHashes.size() + " commits to process"); //$NON-NLS-1$ //$NON-NLS-2$
    }
    Path workDir = Files.createTempDirectory("mining-core-"); //$NON-NLS-1$
    try (LlmClient llmClient = LlmClientFactory.createFromEnvironment(llmProvider)) {
        llmClient.setMaxFailureDuration(Duration.ofSeconds(maxFailureDurationSeconds));
        // Reset learned limits if requested or if the model changed since the
        // last run (learned limits are model-specific).
        for (RepoEntry repo : config.getRepositories()) {
            RepoState repoState = state.getRepoState(repo.getUrl());
            String currentModel = llmClient.getModel();
            if (resetLearnedLimits || (repoState.getLastModelUsed() != null
                    && !repoState.getLastModelUsed().equals(currentModel))) {
                if (repoState.getLearnedMaxDiffLines() != -1) {
                    miningLog.println("Resetting learned diff limit for " + repo.getUrl() //$NON-NLS-1$
                            + " (was " + repoState.getLearnedMaxDiffLines() + " lines)"); //$NON-NLS-1$ //$NON-NLS-2$
                    repoState.setLearnedMaxDiffLines(-1);
                }
                // Reset deferred retry counts on model change.
                for (DeferredCommit dc : repoState.getDeferredCommits()) {
                    dc.setRetryCount(0);
                }
            }
            repoState.setLastModelUsed(currentModel);
        }
        try {
            processRepositories(config, state, statePath, workDir, batchSize, commitsPerRequest,
                    llmClient, promptBuilder, dslContext, categoryManager, validator, stats, aggregator,
                    startTimeMs, maxDurationMinutes, typeEnricher, keywordFilter, commitListHashes);
            // Always process deferred commits at end of each run.
            retryDeferredCommits(state, statePath, workDir, llmClient, promptBuilder,
                    dslContext, categoryManager, validator, stats, aggregator, config, retryDeferred);
        } finally {
            // Remove cloned repositories even when processing aborts.
            deleteDirectory(workDir);
        }
        printDeferredReport(state, config);
        printRunSummary(stats, state, config, llmClient, startTimeMs);
        // Persist run metadata.
        long durationMs = System.currentTimeMillis() - startTimeMs;
        int totalDeferred = 0;
        int totalPermanentlySkipped = 0;
        for (RepoEntry repo : config.getRepositories()) {
            RepoState rs = state.getRepoState(repo.getUrl());
            totalDeferred += rs.getDeferredCommits().size();
            totalPermanentlySkipped += rs.getPermanentlySkipped().size();
        }
        String startedAt = Instant.ofEpochMilli(startTimeMs).toString();
        String completedAt = Instant.now().toString();
        stats.recordRunMetadata(startedAt, completedAt, durationMs / 1000,
                llmClient.getClass().getSimpleName(), llmClient.getModel(),
                batchSize, commitsPerRequest,
                llmClient.getDailyRequestCount(), totalDeferred, totalPermanentlySkipped);
        stats.computeTimeWindow(aggregator.getAllEvaluations());
    }
    // Generate output.
    Files.createDirectories(outputDir);
    JsonReporter jsonReporter = new JsonReporter();
    jsonReporter.writeEvaluations(aggregator.getAllEvaluations(), outputDir);
    jsonReporter.writeStatistics(stats, outputDir);
    GithubPagesGenerator pagesGenerator = new GithubPagesGenerator();
    pagesGenerator.generate(aggregator.getAllEvaluations(), stats, outputDir);
    // Write .sandbox-hint files for GREEN evaluations with VALID DSL rules.
    HintFileUpdater hintUpdater = new HintFileUpdater(validator);
    Path hintOutputDir = sandboxRoot.resolve(
            "sandbox_common_core/src/main/resources/org/sandbox/jdt/triggerpattern/internal"); //$NON-NLS-1$
    List<Path> createdHints = hintUpdater.writeHintFiles(aggregator.getAllEvaluations(), hintOutputDir);
    if (!createdHints.isEmpty()) {
        miningLog.println("Created " + createdHints.size() + " new .sandbox-hint files:"); //$NON-NLS-1$ //$NON-NLS-2$
        for (Path hint : createdHints) {
            miningLog.println("  " + hint); //$NON-NLS-1$
        }
    }
    // NetBeans format output.
    if ("netbeans".equals(outputFormat) || "both".equals(outputFormat)) { //$NON-NLS-1$ //$NON-NLS-2$
        NetBeansReporter nbReporter = new NetBeansReporter();
        nbReporter.write(aggregator.getAllEvaluations(), outputDir);
        // With --strict-netbeans, NetBeans format goes to stdout (pipe-friendly),
        // everything else goes to stderr via miningLog field.
        nbReporter.printToStream(aggregator.getAllEvaluations(), System.out);
    }
    // Comparison mode.
    if (comparisonMode && copilotResultsPath != null) {
        ExternalEvaluationImporter importer = new ExternalEvaluationImporter();
        List<CommitEvaluation> copilotResults = importer.importFromJson(copilotResultsPath);
        MiningComparator comparator = new MiningComparator();
        DeltaReport deltaReport = comparator.compare(aggregator.getAllEvaluations(), copilotResults);
        miningLog.println(deltaReport.format());
        deltaReport.writeToFiles(outputDir);
        miningLog.println("Delta report written to " + outputDir.resolve("delta-report.json")); //$NON-NLS-1$ //$NON-NLS-2$
        miningLog.println("Delta report written to " + outputDir.resolve("delta-report.md")); //$NON-NLS-1$ //$NON-NLS-2$
    }
    // Register GREEN+VALID evaluations in known-rules.json.
    // Run number is not tracked in MiningState; use globalTotalProcessed as proxy.
    int newKnownRules = knownRules.registerFromEvaluations(
            aggregator.getAllEvaluations(), state.getGlobalTotalProcessed());
    if (newKnownRules > 0) {
        knownRules.save(knownRulesPath);
        miningLog.println("Registered " + newKnownRules + " new known rules (total: " + knownRules.size() + ")"); //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$
    }
    state.save(statePath);
    miningLog.println("=== Mining complete ==="); //$NON-NLS-1$
    miningLog.println("Processed: " + stats.getTotalProcessed() + " commits"); //$NON-NLS-1$ //$NON-NLS-2$
    miningLog.println("Relevant: " + stats.getRelevant()); //$NON-NLS-1$
    miningLog.println("Output: " + outputDir.toAbsolutePath()); //$NON-NLS-1$
}
/**
 * Mines each configured repository: clones it, walks its commit history in
 * batches, groups commits into sub-batches for LLM calls, and advances epochs.
 *
 * <p>Stops early (returning without error) when the max run duration is
 * reached, the daily API quota is exhausted, or the API is unavailable;
 * persisted state allows the next run to resume where this one stopped.</p>
 */
private void processRepositories(MiningConfig config, MiningState state, Path statePath,
        Path workDir, int batchSize, int commitsPerRequest,
        LlmClient llmClient, PromptBuilder promptBuilder,
        String dslContext, CategoryManager categoryManager, DslValidator validator,
        StatisticsCollector stats, ReportAggregator aggregator,
        long startTimeMs, int maxDurationMinutes,
        TypeContextEnricher typeEnricher,
        CommitKeywordFilter keywordFilter, Set<String> commitListHashes) throws IOException, GitAPIException {
    RepoCloner cloner = new RepoCloner();
    // Dynamic batch size tracking (reduced on truncation). Held in a one-element
    // array so the reduced value carries across batches within this method.
    int[] dynamicCPR = { commitsPerRequest };
    for (RepoEntry repo : config.getRepositories()) {
        // Global stop conditions are checked once per repository...
        if (shouldStop(startTimeMs, maxDurationMinutes)) {
            miningLog.println("Max duration reached (" + maxDurationMinutes //$NON-NLS-1$
                    + " minutes). Stopping. Will resume from current position on next run."); //$NON-NLS-1$
            return;
        }
        if (!llmClient.hasRemainingQuota()) {
            miningLog.println("Daily API quota exhausted (" + llmClient.getDailyRequestCount() //$NON-NLS-1$
                    + " requests used). Stopping. Will resume from current position on next run."); //$NON-NLS-1$
            return;
        }
        if (llmClient.isApiUnavailable()) {
            logApiUnavailable(llmClient);
            return;
        }
        miningLog.println("Processing: " + repo.getUrl()); //$NON-NLS-1$
        Path repoDir = workDir.resolve(repoDirectoryName(repo.getUrl()));
        cloner.cloneRepo(repo.getUrl(), repo.getBranch(), repoDir);
        String lastCommit = state.getLastProcessedCommit(repo.getUrl());
        RepoState repoState = state.getRepoState(repo.getUrl());
        // Determine epoch-aware date range: if epochs are configured, the
        // repo's current epoch overrides the global start/end dates.
        String effectiveStartDate = config.getStartDate();
        String effectiveEndDate = config.getEndDate();
        List<MiningConfig.EpochEntry> epochs = config.getEpochs();
        if (!epochs.isEmpty()) {
            int epochIdx = repoState.getCurrentEpoch();
            if (epochIdx < epochs.size()) {
                MiningConfig.EpochEntry epoch = epochs.get(epochIdx);
                effectiveStartDate = epoch.getStart();
                effectiveEndDate = epoch.getEnd();
                miningLog.println("  Epoch " + epochIdx + ": " + epoch); //$NON-NLS-1$ //$NON-NLS-2$
            } else {
                // All epochs done for this repo — nothing left to mine here.
                miningLog.println("  All " + epochs.size() + " epochs completed for " + repo.getUrl()); //$NON-NLS-1$ //$NON-NLS-2$
                continue;
            }
        }
        try (CommitWalker walker = new CommitWalker(repoDir);
                DiffExtractor diffExtractor = new DiffExtractor(repoDir,
                        config.getMaxDiffLinesPerCommit(), repo.getPaths(),
                        config.getMaxFilesPerCommit())) {
            List<RevCommit> batch = walker.nextBatch(lastCommit, effectiveStartDate, effectiveEndDate, batchSize);
            while (!batch.isEmpty()) {
                // Remember the last commit in the unfiltered batch for walker advancement
                String lastBatchCommit = batch.get(batch.size() - 1).getName();
                // Filter batch by commit-list if specified — commits not in the
                // list are silently skipped (state still advances past them)
                List<RevCommit> effectiveBatch = batch;
                if (commitListHashes != null) {
                    int beforeSize = batch.size();
                    effectiveBatch = batch.stream()
                            .filter(c -> commitListHashes.contains(c.getName()))
                            .toList();
                    if (effectiveBatch.size() < beforeSize) {
                        miningLog.println("  Commit-list filter: " + effectiveBatch.size() //$NON-NLS-1$
                                + " of " + beforeSize + " commits matched"); //$NON-NLS-1$
                    }
                    if (effectiveBatch.isEmpty()) {
                        // Advance state past skipped commits and continue to next batch
                        state.updateLastProcessedCommit(repo.getUrl(), lastBatchCommit);
                        batch = walker.nextBatch(lastBatchCommit, effectiveStartDate, effectiveEndDate, batchSize);
                        continue;
                    }
                }
                // Group commits into sub-batches for API calls; re-check stop
                // conditions before every API call since calls are slow.
                for (int i = 0; i < effectiveBatch.size(); i += dynamicCPR[0]) {
                    if (shouldStop(startTimeMs, maxDurationMinutes)) {
                        miningLog.println("Max duration reached. Stopping batch processing."); //$NON-NLS-1$
                        return;
                    }
                    if (!llmClient.hasRemainingQuota()) {
                        miningLog.println("Daily API quota exhausted (" //$NON-NLS-1$
                                + llmClient.getDailyRequestCount()
                                + " requests used). Stopping. Will resume from current position on next run."); //$NON-NLS-1$
                        return;
                    }
                    if (llmClient.isApiUnavailable()) {
                        logApiUnavailable(llmClient);
                        return;
                    }
                    int end = Math.min(i + dynamicCPR[0], effectiveBatch.size());
                    List<RevCommit> subBatch = effectiveBatch.subList(i, end);
                    processBatch(subBatch, repo, diffExtractor, state, statePath,
                            llmClient, promptBuilder, dslContext, categoryManager,
                            validator, stats, aggregator, config.getMinDiffLinesPerCommit(),
                            config.getMaxDiffLinesPerCommit(), typeEnricher, keywordFilter);
                    // Halve the sub-batch size when the model truncated its reply,
                    // down to a minimum of one commit per request.
                    if (llmClient.wasLastResponseTruncated() && dynamicCPR[0] > 1) {
                        dynamicCPR[0] = Math.max(1, dynamicCPR[0] / 2);
                        miningLog.println("  Reducing commits-per-request to " + dynamicCPR[0] + " after truncated response"); //$NON-NLS-1$
                    }
                    if (llmClient.isApiUnavailable()) {
                        logApiUnavailable(llmClient);
                        return;
                    }
                }
                batch = walker.nextBatch(lastBatchCommit,
                        effectiveStartDate, effectiveEndDate, batchSize);
            }
            // Epoch rotation: if epochs are configured and current epoch is exhausted,
            // advance to the next epoch and reset lastProcessedCommit
            if (!epochs.isEmpty()) {
                int epochIdx = repoState.getCurrentEpoch();
                MiningConfig.EpochEntry completedEpoch = epochs.get(epochIdx);
                repoState.getCompletedEpochs().add(completedEpoch.toString());
                repoState.setCurrentEpoch(epochIdx + 1);
                repoState.setLastProcessedCommit(null);
                repoState.setStatus("EPOCH_COMPLETE"); //$NON-NLS-1$
                miningLog.println("  Epoch " + epochIdx + " complete (" + completedEpoch //$NON-NLS-1$ //$NON-NLS-2$
                        + "). Advancing to epoch " + (epochIdx + 1)); //$NON-NLS-1$
                state.save(statePath);
            }
        }
        miningLog.println("  Completed: " + repo.getUrl()); //$NON-NLS-1$
    }
}
/**
 * Evaluates one sub-batch of commits with a single LLM call.
 *
 * <p>Commits are first classified (keyword filter, empty/too-small/too-large
 * diff) — only the survivors are sent to the LLM. Three parallel lists track
 * this split: {@code isSkipped} is parallel to {@code commits}, while
 * {@code commitDataList} and {@code diffLineCounts} are parallel to each other
 * and contain only the non-skipped commits. The final loop relies on this
 * ordering invariant; do not reorder additions to these lists.</p>
 *
 * <p>On an incomplete LLM response, state advances only through the leading
 * run of skipped commits so no sent-but-unevaluated commit is lost.</p>
 */
private void processBatch(List<RevCommit> commits, RepoEntry repo,
        DiffExtractor diffExtractor, MiningState state, Path statePath,
        LlmClient llmClient, PromptBuilder promptBuilder,
        String dslContext, CategoryManager categoryManager, DslValidator validator,
        StatisticsCollector stats, ReportAggregator aggregator,
        int minDiffLines, int maxDiffLines,
        TypeContextEnricher typeEnricher,
        CommitKeywordFilter keywordFilter) throws IOException {
    // Classify commits in original order: track which are skipped vs included
    List<CommitData> commitDataList = new ArrayList<>();
    List<Boolean> isSkipped = new ArrayList<>();
    List<Integer> diffLineCounts = new ArrayList<>();
    RepoState repoState = state.getRepoState(repo.getUrl());
    // A learned (truncation-derived) limit, when present, overrides the configured one.
    int effectiveMaxDiff = repoState.getLearnedMaxDiffLines() > 0
            ? repoState.getLearnedMaxDiffLines() : maxDiffLines;
    for (RevCommit commit : commits) {
        // Skip commits that don't match keyword filter (if active)
        if (keywordFilter != null && !keywordFilter.matches(commit.getFullMessage())) {
            miningLog.println("  Skipping commit " + formatCommitInfo(commit, repo) //$NON-NLS-1$
                    + " (no keyword match)"); //$NON-NLS-1$
            isSkipped.add(Boolean.TRUE);
            continue;
        }
        String diff = diffExtractor.extractDiff(commit);
        int lineCount = diff.split("\n", -1).length; //$NON-NLS-1$
        if (diff.isBlank()) {
            // Nothing to analyze (e.g. no matching paths changed).
            isSkipped.add(Boolean.TRUE);
        } else if (lineCount < minDiffLines) {
            miningLog.println("  Skipping commit " + formatCommitInfo(commit, repo) //$NON-NLS-1$
                    + " (diff too small: " + lineCount + " lines)"); //$NON-NLS-1$ //$NON-NLS-2$
            isSkipped.add(Boolean.TRUE);
        } else if (lineCount > effectiveMaxDiff) {
            // Too large for one request: defer rather than drop, so a later
            // run (e.g. with a bigger limit) can still evaluate it.
            miningLog.println("  Deferring commit " + formatCommitInfo(commit, repo) //$NON-NLS-1$
                    + " (" + lineCount + " lines > limit " + effectiveMaxDiff + ")"); //$NON-NLS-1$ //$NON-NLS-2$
            String shortMsg = commit.getShortMessage();
            // Cap the stored message at 120 chars to keep state.json small.
            String truncatedMsg = shortMsg == null
                    ? "" //$NON-NLS-1$
                    : shortMsg.substring(0, Math.min(120, shortMsg.length()));
            repoState.addDeferredCommit(new DeferredCommit(
                    commit.getName(),
                    truncatedMsg,
                    lineCount, "DIFF_TOO_LARGE", Instant.now().toString(), 0, 3)); //$NON-NLS-1$
            isSkipped.add(Boolean.TRUE);
        } else {
            commitDataList.add(new CommitData(commit.getName(), commit.getFullMessage(), diff));
            diffLineCounts.add(lineCount);
            isSkipped.add(Boolean.FALSE);
        }
    }
    List<CommitEvaluation> evaluations = null;
    if (!commitDataList.isEmpty()) {
        // Enrich prompt with Eclipse type context from actual diffs if enabled
        if (typeEnricher != null) {
            // Clear stale type context from previous batch before enriching
            promptBuilder.setTypeContext(null);
            StringBuilder combinedDiffs = new StringBuilder();
            for (CommitData cd : commitDataList) {
                combinedDiffs.append(cd.diff()).append('\n');
            }
            String context = typeEnricher.enrichFromDiff(combinedDiffs.toString());
            if (!context.isEmpty()) {
                promptBuilder.setTypeContext(context);
            }
        }
        List<String> hashes = commitDataList.stream().map(CommitData::commitHash).toList();
        List<String> messages = commitDataList.stream().map(CommitData::commitMessage).toList();
        // knownRulesContext is set up in run(); empty means "no prior rules".
        String previousResults = knownRulesContext.isEmpty() ? null : knownRulesContext;
        String prompt = promptBuilder.buildBatchPrompt(dslContext,
                categoryManager.getCategoriesJson(), commitDataList, previousResults);
        evaluations = llmClient.evaluateBatch(prompt, hashes, messages, repo.getUrl());
        // Learn from truncation: shrink the per-repo diff limit below the
        // largest diff in the batch that overflowed the model's output.
        if (llmClient.wasLastResponseTruncated() && !diffLineCounts.isEmpty()) {
            int maxDiffInBatch = diffLineCounts.stream().mapToInt(Integer::intValue).max().orElse(0);
            if (maxDiffInBatch > 0) {
                int newLimit = (int) (maxDiffInBatch * 0.8);
                // Only tighten, never loosen, the learned limit (-1 = unset).
                if (repoState.getLearnedMaxDiffLines() == -1 || newLimit < repoState.getLearnedMaxDiffLines()) {
                    miningLog.println("  Learning: reducing max diff limit to " + newLimit //$NON-NLS-1$
                            + " lines (was " + (repoState.getLearnedMaxDiffLines() == -1 ? "default" : repoState.getLearnedMaxDiffLines()) + ")"); //$NON-NLS-1$ //$NON-NLS-2$
                    repoState.setLearnedMaxDiffLines(newLimit);
                }
            }
        }
        // Incomplete response: fewer evaluations than commits sent (or none).
        if (evaluations == null || evaluations.size() != commitDataList.size()) {
            miningLog.println("  Incomplete batch evaluation for repository " + repo.getUrl() //$NON-NLS-1$
                    + " [" + repo.getBranch() + "]" //$NON-NLS-1$ //$NON-NLS-2$
                    + "; will retry non-evaluated commits in a future run."); //$NON-NLS-1$
            miningLog.println("  Batch contained commits:"); //$NON-NLS-1$
            for (RevCommit c : commits) {
                miningLog.println("   - " + formatCommitInfo(c, repo)); //$NON-NLS-1$
            }
            if (llmClient.isApiUnavailable()) {
                logApiUnavailable(llmClient);
            }
            // Defer commits that were not evaluated (the tail beyond the
            // evaluations the model did return).
            if (evaluations != null) {
                for (int j = evaluations.size(); j < commitDataList.size(); j++) {
                    CommitData cd = commitDataList.get(j);
                    String msg = cd.commitMessage();
                    String truncatedMsg = msg == null
                            ? "" //$NON-NLS-1$
                            : msg.substring(0, Math.min(120, msg.length()));
                    repoState.addDeferredCommit(new DeferredCommit(
                            cd.commitHash(),
                            truncatedMsg,
                            diffLineCounts.get(j), "INCOMPLETE_BATCH", Instant.now().toString(), 0, 3)); //$NON-NLS-1$
                }
            }
            // Advance state only through the leading prefix of skipped commits so
            // that included commits in this batch are not permanently lost.
            for (int i = 0; i < commits.size(); i++) {
                if (!isSkipped.get(i)) {
                    break;
                }
                state.updateLastProcessedCommit(repo.getUrl(), commits.get(i).getName());
            }
            state.save(statePath);
            return;
        }
    }
    // Process all commits in original order so state always advances
    // monotonically and no commit is permanently skipped on failure.
    if (evaluations != null) {
        miningLog.println("  Evaluated batch of " + commitDataList.size() + " commits for " //$NON-NLS-1$ //$NON-NLS-2$
                + repo.getUrl() + " [" + repo.getBranch() + "]:"); //$NON-NLS-1$ //$NON-NLS-2$
    }
    // evalIdx walks commitDataList/evaluations in lock-step with the
    // non-skipped entries of commits.
    int evalIdx = 0;
    for (int i = 0; i < commits.size(); i++) {
        RevCommit commit = commits.get(i);
        if (isSkipped.get(i)) {
            state.updateLastProcessedCommit(repo.getUrl(), commit.getName());
        } else {
            CommitEvaluation evaluation = evaluations != null ? evaluations.get(evalIdx++) : null;
            if (evaluation == null) {
                miningLog.println("  Missing evaluation for commit " + formatCommitInfo(commit, repo) //$NON-NLS-1$
                        + "; stopping batch to retry remaining commits later."); //$NON-NLS-1$
                break;
            }
            // diffIdx mirrors evalIdx (post-increment above), guarded for safety.
            int diffIdx = evalIdx - 1;
            String diffInfo = diffIdx >= 0 && diffIdx < diffLineCounts.size()
                    ? " [" + diffLineCounts.get(diffIdx) + " lines]" //$NON-NLS-1$ //$NON-NLS-2$
                    : ""; //$NON-NLS-1$
            miningLog.println("   - " + formatCommitInfo(commit, repo) + diffInfo + " -> " + evaluation.trafficLight()); //$NON-NLS-1$ //$NON-NLS-2$
            handleEvaluation(evaluation, commit, repo, validator, categoryManager, stats, aggregator);
            state.updateLastProcessedCommit(repo.getUrl(), commit.getName());
            // Track per-category hit counts for category-aware mining
            if (evaluation.relevant() && evaluation.category() != null) {
                repoState.incrementCategoryHitCount(evaluation.category());
            }
            // Remove from deferred list if it was previously deferred
            repoState.removeDeferredCommit(commit.getName());
        }
    }
    // Save state after each batch for resume safety
    state.save(statePath);
}
/**
 * Validates the evaluation's DSL rule (if any), enriches the evaluation with
 * the validation result and the commit's author date, and records it in the
 * statistics collector and report aggregator.
 *
 * <p>New categories proposed by the LLM are registered with the
 * {@link CategoryManager} so later prompts can reuse them.</p>
 */
private void handleEvaluation(CommitEvaluation evaluation, RevCommit commit, RepoEntry repo,
        DslValidator validator, CategoryManager categoryManager,
        StatisticsCollector stats, ReportAggregator aggregator) {
    // null = no rule to validate; "VALID" = validated OK; otherwise the error message.
    String validationResult = null;
    if (evaluation.dslRule() != null && !evaluation.dslRule().isBlank()) {
        var validation = validator.validate(evaluation.dslRule());
        if (validation.valid()) {
            validationResult = "VALID"; //$NON-NLS-1$
        } else {
            validationResult = validation.message();
            miningLog.println("  Invalid DSL rule for " + formatCommitInfo(commit, repo) //$NON-NLS-1$
                    + ": " + validation.message()); //$NON-NLS-1$
            // Dump the offending rule only when debug output is requested via env var.
            if (Boolean.parseBoolean(System.getenv("GEMINI_DEBUG"))) { //$NON-NLS-1$
                miningLog.println("  --- DSL rule begin ---"); //$NON-NLS-1$
                miningLog.println(evaluation.dslRule());
                miningLog.println("  --- DSL rule end ---"); //$NON-NLS-1$
            }
        }
    }
    // Create evaluation with validation result and commit date (author date)
    Instant commitDate = commit.getAuthorIdent().getWhen().toInstant();
    // Positional copy of every field; only commitDate and validationResult differ.
    CommitEvaluation enriched = new CommitEvaluation(
            evaluation.commitHash(), evaluation.commitMessage(), evaluation.repoUrl(),
            evaluation.evaluatedAt(), commitDate, evaluation.relevant(), evaluation.irrelevantReason(),
            evaluation.isDuplicate(), evaluation.duplicateOf(),
            evaluation.reusability(), evaluation.codeImprovement(), evaluation.implementationEffort(),
            evaluation.trafficLight(), evaluation.category(), evaluation.isNewCategory(),
            evaluation.categoryReason(), evaluation.canImplementInCurrentDsl(),
            evaluation.dslRule(), evaluation.targetHintFile(),
            evaluation.languageChangeNeeded(), evaluation.dslRuleAfterChange(),
            evaluation.summary(), validationResult);
    if (evaluation.isNewCategory() && evaluation.category() != null) {
        categoryManager.addCategory(evaluation.category());
    }
    stats.record(enriched);
    aggregator.add(enriched);
}
/**
 * Derives a local directory name from a repository URL: strips a trailing
 * {@code .git} suffix, then keeps only the final path segment.
 *
 * @param url repository URL (e.g. {@code https://host/org/repo.git})
 * @return the last path segment without the {@code .git} suffix
 */
static String repoDirectoryName(String url) {
    String stripped = url.endsWith(".git")
            ? url.substring(0, url.length() - ".git".length())
            : url;
    int slash = stripped.lastIndexOf('/');
    return slash < 0 ? stripped : stripped.substring(slash + 1);
}
/**
 * Formats a commit for log output as: short hash, branch, author date, and the
 * commit title with embedded quotes escaped.
 */
static String formatCommitInfo(RevCommit commit, RepoEntry repo) {
	String shortHash = commit.getName().substring(0, 7);
	String when = COMMIT_DATE_FORMAT.format(commit.getAuthorIdent().getWhen().toInstant());
	String escapedTitle = commit.getShortMessage().replace("\"", "\\\""); //$NON-NLS-1$ //$NON-NLS-2$
	return shortHash + " on " + repo.getBranch() //$NON-NLS-1$
			+ " (" + when + ") \"" + escapedTitle + "\""; //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$
}
/**
 * Logs that the LLM API stayed unreachable beyond the client's failure window,
 * noting that state was saved so the run can resume later.
 */
private void logApiUnavailable(LlmClient llmClient) {
	long minutesUnreachable = llmClient.getMaxFailureDuration().toMinutes();
	miningLog.println("LLM API has been unreachable for over " //$NON-NLS-1$
			+ minutesUnreachable
			+ " minutes. Stopping to avoid wasting CI time. State saved; will resume on next run."); //$NON-NLS-1$
}
/**
 * Prints the command-line usage help to standard output, one line per option.
 */
private static void printUsage() {
	String[] usageLines = {
			"Usage: java -jar sandbox-mining-core.jar [options]", //$NON-NLS-1$
			"Options:", //$NON-NLS-1$
			" --config <path> Path to repos.yml config file", //$NON-NLS-1$
			" --state <path> Path to state.json file", //$NON-NLS-1$
			" --sandbox-root <path> Root of sandbox repository", //$NON-NLS-1$
			" --batch-size <n> Number of commits per batch (default: 500)", //$NON-NLS-1$
			" --commits-per-request <n> Commits grouped into one API call (default: 4)", //$NON-NLS-1$
			" --output <path> Output directory (default: output)", //$NON-NLS-1$
			" --max-failure-duration <s> Seconds without a successful API call before aborting (default: 300)", //$NON-NLS-1$
			" --llm-provider <name> LLM provider: gemini, openai, deepseek, qwen, llama, or mistral (default: auto-detect)", //$NON-NLS-1$
			" --retry-deferred Retry previously deferred commits", //$NON-NLS-1$
			" --reset-learned-limits Reset learned diff size limits", //$NON-NLS-1$
			" --commit-list <path> Path to file with commit hashes to process (one per line)", //$NON-NLS-1$
			" --max-duration <minutes> Maximum run duration in minutes", //$NON-NLS-1$
			" --comparison-mode Enable comparison against reference results", //$NON-NLS-1$
			" --copilot-results <path> Path to Copilot/reference evaluation results JSON", //$NON-NLS-1$
			" --enrich-type-context Add Eclipse type hierarchy context to prompts", //$NON-NLS-1$
			" --keyword-filter <path> Path to keyword filter file for commit pre-filtering", //$NON-NLS-1$
			" --output-format <format> Output format: json, netbeans, or both (default: json)", //$NON-NLS-1$
			" --strict-netbeans Only NetBeans format on stdout, info on stderr", //$NON-NLS-1$
	};
	for (String line : usageLines) {
		System.out.println(line);
	}
}
/**
 * Returns the value for a command-line option, failing fast when it is missing.
 *
 * @param args the full command-line argument array
 * @param index the index where the option's value is expected
 * @param option the option name, used only in the error message
 * @return the argument at {@code index}
 * @throws IllegalArgumentException if {@code index} is past the end of {@code args}
 */
private static String requireArg(String[] args, int index, String option) {
	if (index >= args.length) {
		throw new IllegalArgumentException("Option " + option + " requires a value"); //$NON-NLS-1$ //$NON-NLS-2$
	}
	return args[index];
}
/**
 * Recursively deletes a directory tree if it exists. Individual deletion
 * failures are ignored (best-effort cleanup); only the initial walk may throw.
 *
 * @param dir the directory to remove
 * @throws IOException if the directory tree cannot be walked
 */
private static void deleteDirectory(Path dir) throws IOException {
	if (!Files.exists(dir)) {
		return;
	}
	// Deepest entries first so directories are empty when their turn comes
	try (var paths = Files.walk(dir)) {
		paths.sorted(java.util.Comparator.reverseOrder()).forEach(path -> {
			try {
				Files.deleteIfExists(path);
			} catch (IOException ignored) {
				// best effort cleanup
			}
		});
	}
}
/**
 * Reviews the deferred commits of every configured repository: commits whose
 * retry budget is exhausted (unless {@code forceRetry}) are moved to the
 * permanently-skipped set; all others stay deferred for the next normal run.
 * State is persisted per repository after each update.
 */
private void retryDeferredCommits(MiningState state, Path statePath, Path workDir,
		LlmClient llmClient, PromptBuilder promptBuilder, String dslContext,
		CategoryManager categoryManager, DslValidator validator,
		StatisticsCollector stats, ReportAggregator aggregator,
		MiningConfig config, boolean forceRetry) throws IOException {
	for (RepoEntry repo : config.getRepositories()) {
		RepoState repoState = state.getRepoState(repo.getUrl());
		List<DeferredCommit> deferred = new ArrayList<>(repoState.getDeferredCommits());
		if (deferred.isEmpty()) {
			continue;
		}
		miningLog.println("Processing " + deferred.size() + " deferred commits for " + repo.getUrl()); //$NON-NLS-1$ //$NON-NLS-2$
		List<DeferredCommit> stillDeferred = new ArrayList<>();
		for (DeferredCommit deferredCommit : deferred) {
			boolean retriesExhausted = deferredCommit.getRetryCount() >= deferredCommit.getMaxRetries();
			if (retriesExhausted && !forceRetry) {
				repoState.moveToPermanentlySkipped(deferredCommit.getHash());
				continue;
			}
			// No actual retry can be performed here because the repository checkout and
			// diff extraction pipeline are not available in this context. Keep the commit
			// deferred with its current retryCount so it can be retried during the next
			// normal processing run when its diff is available again.
			stillDeferred.add(deferredCommit);
		}
		repoState.setDeferredCommits(stillDeferred);
		state.save(statePath);
	}
}
/**
 * Prints a summary of the completed mining run: wall-clock duration, commit
 * counts (processed / deferred / permanently skipped / relevant), API usage,
 * the model used, and any learned per-repository diff size limits.
 */
private void printRunSummary(StatisticsCollector stats, MiningState state,
		MiningConfig config, LlmClient llmClient, long startTimeMs) {
	long elapsedMs = System.currentTimeMillis() - startTimeMs;
	long minutes = elapsedMs / 60_000;
	long seconds = (elapsedMs / 1000) % 60;
	int deferredCount = 0;
	int skippedCount = 0;
	for (RepoEntry repo : config.getRepositories()) {
		RepoState repoState = state.getRepoState(repo.getUrl());
		deferredCount += repoState.getDeferredCommits().size();
		skippedCount += repoState.getPermanentlySkipped().size();
	}
	miningLog.println("=== Mining Run Summary ==="); //$NON-NLS-1$
	miningLog.println("Duration: " + minutes + "m " + seconds + "s"); //$NON-NLS-1$ //$NON-NLS-2$
	miningLog.println("Commits processed: " + stats.getTotalProcessed()); //$NON-NLS-1$
	miningLog.println("Commits deferred: " + deferredCount); //$NON-NLS-1$
	miningLog.println("Commits permanently skipped: " + skippedCount); //$NON-NLS-1$
	miningLog.println("Relevant: " + stats.getRelevant()); //$NON-NLS-1$
	miningLog.println("API calls: " + llmClient.getDailyRequestCount()); //$NON-NLS-1$
	miningLog.println("Model: " + llmClient.getModel()); //$NON-NLS-1$
	// Report any diff size limits that were learned during this run
	for (RepoEntry repo : config.getRepositories()) {
		RepoState repoState = state.getRepoState(repo.getUrl());
		if (repoState.getLearnedMaxDiffLines() > 0) {
			miningLog.println("Learned max diff: " + repoState.getLearnedMaxDiffLines() + " lines (for " + repo.getUrl() + ")"); //$NON-NLS-1$ //$NON-NLS-2$
		}
	}
}
/**
 * Lists all commits currently deferred for retry, grouped by repository.
 * The header is printed only once, before the first repository that actually
 * has deferred commits; repositories without any are skipped silently.
 */
private void printDeferredReport(MiningState state, MiningConfig config) {
	boolean headerPrinted = false;
	for (RepoEntry repo : config.getRepositories()) {
		List<DeferredCommit> deferred = state.getRepoState(repo.getUrl()).getDeferredCommits();
		if (deferred.isEmpty()) {
			continue;
		}
		if (!headerPrinted) {
			miningLog.println("Deferred commits for retry:"); //$NON-NLS-1$
			headerPrinted = true;
		}
		miningLog.println(" " + repo.getUrl() + ": " + deferred.size() + " commits"); //$NON-NLS-1$ //$NON-NLS-2$
		for (DeferredCommit dc : deferred) {
			String shortHash = dc.getHash().substring(0, Math.min(7, dc.getHash().length()));
			miningLog.println(" - " + shortHash //$NON-NLS-1$
					+ " (" + dc.getDiffLines() + " lines, " + dc.getReason() //$NON-NLS-1$ //$NON-NLS-2$
					+ ", retry " + dc.getRetryCount() + "/" + dc.getMaxRetries() + ")"); //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$
		}
	}
}
/**
 * Checks whether the mining run should stop based on elapsed time.
 *
 * @param startTimeMs the start time in milliseconds
 * @param maxDurationMin maximum duration in minutes (0 = no limit)
 * @return true if the run should stop
 */
static boolean shouldStop(long startTimeMs, int maxDurationMin) {
	if (maxDurationMin <= 0) {
		// Zero or negative means no time limit is configured
		return false;
	}
	long limitMs = Duration.ofMinutes(maxDurationMin).toMillis();
	return System.currentTimeMillis() - startTimeMs >= limitMs;
}
/**
 * Reads commit hashes from a file (one per line).
 * Blank lines and lines starting with {@code #} are ignored.
 *
 * @param commitListPath path to the commit list file
 * @return list of commit hashes
 * @throws IOException if the file cannot be read
 */
static List<String> readCommitList(Path commitListPath) throws IOException {
	List<String> hashes = new ArrayList<>();
	Files.readAllLines(commitListPath, StandardCharsets.UTF_8).stream()
			.map(String::trim)
			.filter(entry -> !entry.isEmpty())
			.filter(entry -> !entry.startsWith("#")) //$NON-NLS-1$
			.forEach(hashes::add);
	return hashes;
}
}