DiffExtractor.java

/*******************************************************************************
 * Copyright (c) 2025 Carsten Hammer.
 *
 * This program and the accompanying materials
 * are made available under the terms of the Eclipse Public License 2.0
 * which accompanies this distribution, and is available at
 * https://www.eclipse.org/legal/epl-2.0/
 *
 * SPDX-License-Identifier: EPL-2.0
 *
 * Contributors:
 *     Carsten Hammer
 *******************************************************************************/
package org.sandbox.jdt.triggerpattern.git;

import java.io.ByteArrayOutputStream;
import java.io.Closeable;
import java.io.IOException;
import java.nio.file.Path;
import java.util.List;

import org.eclipse.jgit.api.Git;
import org.eclipse.jgit.diff.DiffEntry;
import org.eclipse.jgit.diff.DiffFormatter;
import org.eclipse.jgit.lib.ObjectReader;
import org.eclipse.jgit.lib.Repository;
import org.eclipse.jgit.revwalk.RevCommit;
import org.eclipse.jgit.treewalk.AbstractTreeIterator;
import org.eclipse.jgit.treewalk.CanonicalTreeParser;
import org.eclipse.jgit.treewalk.EmptyTreeIterator;

/**
 * Extracts the diff of a commit as a string using JGit's DiffFormatter.
 *
 * <p>Limits the diff output to a configurable maximum number of lines
 * to avoid sending overly large prompts to the Gemini API.</p>
 *
 * <p>If the commit touches more files than {@code maxFilesPerCommit}, an empty
 * string is returned so that the caller treats the commit as skipped (mass-change
 * commits such as auto-formatting or renames are not useful for DSL mining).</p>
 *
 * <p>Implements {@link Closeable} to ensure the underlying Git and
 * Repository resources are properly released. Closing is idempotent.</p>
 */
public class DiffExtractor implements Closeable {

	private final Git git;
	private final Repository repository;
	private final int maxDiffLines;
	private final List<String> pathFilters;
	private final int maxFilesPerCommit;
	/** Set once {@link #close()} has released the Git resources. */
	private boolean closed;

	/**
	 * Creates a DiffExtractor for the given repository directory.
	 *
	 * @param repoDir      the local repository directory
	 * @param maxDiffLines maximum number of diff lines to include
	 * @throws IOException if the repository cannot be opened
	 */
	public DiffExtractor(Path repoDir, int maxDiffLines) throws IOException {
		this(repoDir, maxDiffLines, List.of(), Integer.MAX_VALUE);
	}

	/**
	 * Creates a DiffExtractor for the given repository directory with path filtering.
	 *
	 * @param repoDir      the local repository directory
	 * @param maxDiffLines maximum number of diff lines to include
	 * @param pathFilters  list of path prefixes to include (empty = all paths)
	 * @throws IOException if the repository cannot be opened
	 */
	public DiffExtractor(Path repoDir, int maxDiffLines, List<String> pathFilters) throws IOException {
		this(repoDir, maxDiffLines, pathFilters, Integer.MAX_VALUE);
	}

	/**
	 * Creates a DiffExtractor for the given repository directory with path filtering
	 * and a maximum file count per commit.
	 *
	 * @param repoDir           the local repository directory
	 * @param maxDiffLines      maximum number of diff lines to include; values
	 *                          below zero are treated as zero
	 * @param pathFilters       list of path prefixes to include (empty or
	 *                          {@code null} = all paths); the list is copied
	 * @param maxFilesPerCommit maximum number of changed files before the commit is
	 *                          skipped (returns empty string)
	 * @throws IOException if the repository cannot be opened
	 */
	public DiffExtractor(Path repoDir, int maxDiffLines, List<String> pathFilters,
			int maxFilesPerCommit) throws IOException {
		this.git = Git.open(repoDir.toFile());
		this.repository = git.getRepository();
		this.maxDiffLines = maxDiffLines;
		// Defensive copy: later changes to the caller's list must not alter filtering.
		this.pathFilters = pathFilters != null ? List.copyOf(pathFilters) : List.of();
		this.maxFilesPerCommit = maxFilesPerCommit;
	}

	/**
	 * Extracts the diff of a commit as a string.
	 *
	 * <p>The commit is compared against its first parent, or against the empty
	 * tree if it has no parent.</p>
	 *
	 * <p>Returns an empty string if the commit touches more files (after path
	 * filtering) than {@code maxFilesPerCommit}, so that the caller can treat it as
	 * a skipped commit (mass-change commits are not useful for DSL mining).</p>
	 *
	 * @param commit the commit to extract the diff from
	 * @return the diff as a string truncated to maxDiffLines, or an empty string if
	 *         too many files are changed
	 * @throws IOException if a Git operation fails
	 */
	public String extractDiff(RevCommit commit) throws IOException {
		ByteArrayOutputStream out = new ByteArrayOutputStream();

		try (DiffFormatter formatter = new DiffFormatter(out)) {
			formatter.setRepository(repository);
			formatter.setDetectRenames(true);

			AbstractTreeIterator parentTree = getParentTree(commit);
			AbstractTreeIterator commitTree = getTree(commit);

			List<DiffEntry> diffs = formatter.scan(parentTree, commitTree);

			// Skip commits that touch too many files (mass-changes are not useful for mining)
			long matchingFiles = diffs.stream().filter(this::matchesPathFilter).count();
			if (matchingFiles > maxFilesPerCommit) {
				return ""; //$NON-NLS-1$
			}

			for (DiffEntry entry : diffs) {
				if (matchesPathFilter(entry)) {
					formatter.format(entry);
				}
			}
		}

		String fullDiff = out.toString(java.nio.charset.StandardCharsets.UTF_8);
		return truncateToMaxLines(fullDiff);
	}

	/**
	 * Tells whether a diff entry falls under one of the configured path prefixes.
	 * Deleted files are matched by their old path, all other changes by their new path.
	 */
	private boolean matchesPathFilter(DiffEntry entry) {
		if (pathFilters.isEmpty()) {
			return true;
		}
		String path = entry.getChangeType() == DiffEntry.ChangeType.DELETE
				? entry.getOldPath()
				: entry.getNewPath();
		return pathFilters.stream().anyMatch(path::startsWith);
	}

	/**
	 * Returns a tree iterator for the first parent of the commit, or an empty
	 * tree iterator if the commit has no parent.
	 */
	private AbstractTreeIterator getParentTree(RevCommit commit) throws IOException {
		if (commit.getParentCount() == 0) {
			return new EmptyTreeIterator();
		}
		RevCommit parent = commit.getParent(0);
		return getTree(parent);
	}

	/**
	 * Returns a tree iterator positioned on the root tree of the given commit.
	 */
	private AbstractTreeIterator getTree(RevCommit commit) throws IOException {
		// A commit that has not been parsed yet (typically one obtained via
		// getParent) has no tree reference; parse it on demand instead of
		// failing with a NullPointerException.
		RevCommit parsed = commit.getTree() != null ? commit : repository.parseCommit(commit);
		try (ObjectReader reader = repository.newObjectReader()) {
			CanonicalTreeParser parser = new CanonicalTreeParser();
			parser.reset(reader, parsed.getTree().getId());
			return parser;
		}
	}

	/**
	 * Cuts the diff after {@code maxDiffLines} lines and appends a marker stating
	 * how many lines were dropped. A diff that fits the limit is returned unchanged.
	 */
	private String truncateToMaxLines(String diff) {
		int limit = Math.max(maxDiffLines, 0);
		int totalLines = countLines(diff);
		if (totalLines <= limit) {
			return diff;
		}
		// totalLines > limit guarantees at least 'limit' newline characters,
		// so indexOf never returns -1 inside this loop.
		int end = 0;
		for (int kept = 0; kept < limit; kept++) {
			end = diff.indexOf('\n', end) + 1;
		}
		StringBuilder sb = new StringBuilder(end + 48);
		sb.append(diff, 0, end);
		sb.append("\n... (truncated, ").append(totalLines - limit)
				.append(" more lines)");
		return sb.toString();
	}

	/**
	 * Counts the lines of a text. A trailing newline terminates the last line
	 * and does not start another one; the empty string has zero lines.
	 */
	private static int countLines(String text) {
		int count = 0;
		for (int i = 0; i < text.length(); i++) {
			if (text.charAt(i) == '\n') {
				count++;
			}
		}
		if (!text.isEmpty() && text.charAt(text.length() - 1) != '\n') {
			count++;
		}
		return count;
	}

	/**
	 * Releases the Git resources. Calling this method more than once has no
	 * further effect.
	 */
	@Override
	public void close() {
		if (closed) {
			return;
		}
		closed = true;
		// The Git instance was created by Git.open and therefore closes its
		// repository itself; closing the repository separately would release
		// it twice.
		git.close();
	}
}