XmlFileStrategy.java

/*******************************************************************************
 * Copyright (c) 2026 Carsten Hammer.
 *
 * This program and the accompanying materials
 * are made available under the terms of the Eclipse Public License 2.0
 * which accompanies this distribution, and is available at
 * https://www.eclipse.org/legal/epl-2.0/
 *
 * SPDX-License-Identifier: EPL-2.0
 *
 * Contributors:
 *     Carsten Hammer
 *******************************************************************************/
package org.eclipse.jgit.storage.hibernate.search.strategies;

import java.util.Collections;
import java.util.LinkedHashSet;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.eclipse.jgit.storage.hibernate.search.BlobIndexData;
import org.eclipse.jgit.storage.hibernate.search.FileTypeStrategy;

/**
 * Strategy for extracting searchable metadata from XML files.
 * <p>
 * Extraction is regex-based rather than parser-based, so no XML parser is
 * involved and the input does not have to be well-formed. As a consequence,
 * text that merely looks like a start tag or a namespace declaration (for
 * example inside a comment) is reported as well.
 * </p>
 */
public class XmlFileStrategy implements FileTypeStrategy {

	/** File type identifier reported by this strategy. */
	private static final String FILE_TYPE = "xml"; //$NON-NLS-1$

	/** Immutable set of handled extensions, built once instead of per call. */
	private static final Set<String> EXTENSIONS = Set.of(".xml", //$NON-NLS-1$
			".xsd", //$NON-NLS-1$
			".exsd", //$NON-NLS-1$
			".xsl"); //$NON-NLS-1$

	/**
	 * Matches a namespace declaration ({@code xmlns="..."} or
	 * {@code xmlns:prefix="..."}) with either double or single quotes. The
	 * URI is captured in group 1 (double-quoted) or group 2 (single-quoted).
	 * The look-behind keeps attribute names that merely end in
	 * {@code xmlns} from matching.
	 */
	private static final Pattern NAMESPACE_PATTERN = Pattern.compile(
			"(?<![\\w.:-])xmlns(?::[^\\s=\"'<>]+)?\\s*=\\s*" //$NON-NLS-1$
					+ "(?:\"([^\"]+)\"|'([^']+)')"); //$NON-NLS-1$

	/**
	 * Matches the name of a start tag or empty-element tag. The look-ahead
	 * accepts whitespace, {@code /} or {@code >} after the name so that
	 * elements without attributes ({@code <a>}, {@code <a/>}) are found too.
	 */
	private static final Pattern ELEMENT_PATTERN = Pattern
			.compile("<([a-zA-Z_][\\w.:-]*)(?=[\\s/>])"); //$NON-NLS-1$

	/** Maximum number of {@code char}s kept as source snippet. */
	private static final int MAX_SNIPPET = 65535;

	@Override
	public Set<String> supportedExtensions() {
		return EXTENSIONS;
	}

	@Override
	public Set<String> supportedFilenames() {
		return Collections.emptySet();
	}

	/**
	 * Extracts index data from XML content.
	 * <p>
	 * The result carries the file type {@code "xml"}, a snippet of at most
	 * {@link #MAX_SNIPPET} chars, the distinct namespace URIs (as fully
	 * qualified names) and the distinct element names (as declared types).
	 * Both lists are newline-separated and keep first-occurrence order.
	 * </p>
	 *
	 * @param source
	 *            the file content; {@code null} is treated as empty content
	 * @param filePath
	 *            the path of the file; not used by this strategy
	 * @return the extracted index data, never {@code null}
	 */
	@Override
	public BlobIndexData extract(String source, String filePath) {
		String text = source == null ? "" : source; //$NON-NLS-1$

		BlobIndexData data = new BlobIndexData();
		data.setFileType(FILE_TYPE);
		data.setSourceSnippet(truncate(text));
		data.setFullyQualifiedNames(
				String.join("\n", findNamespaces(text))); //$NON-NLS-1$
		data.setDeclaredTypes(String.join("\n", findElements(text))); //$NON-NLS-1$
		return data;
	}

	@Override
	public String fileType() {
		return FILE_TYPE;
	}

	/**
	 * Limits the text to {@link #MAX_SNIPPET} chars without splitting a
	 * surrogate pair at the cut position.
	 */
	private static String truncate(String text) {
		if (text.length() <= MAX_SNIPPET) {
			return text;
		}
		int end = MAX_SNIPPET;
		// Cutting between a high and a low surrogate would leave an unpaired
		// surrogate at the end of the snippet, so drop the high one as well.
		if (Character.isHighSurrogate(text.charAt(end - 1))
				&& Character.isLowSurrogate(text.charAt(end))) {
			end--;
		}
		return text.substring(0, end);
	}

	/** Collects the distinct namespace URIs in order of first occurrence. */
	private static Set<String> findNamespaces(String text) {
		Set<String> namespaces = new LinkedHashSet<>();
		Matcher matcher = NAMESPACE_PATTERN.matcher(text);
		while (matcher.find()) {
			// Exactly one of the two quote alternatives has participated.
			String doubleQuoted = matcher.group(1);
			namespaces.add(
					doubleQuoted != null ? doubleQuoted : matcher.group(2));
		}
		return namespaces;
	}

	/** Collects the distinct element names in order of first occurrence. */
	private static Set<String> findElements(String text) {
		Set<String> elements = new LinkedHashSet<>();
		Matcher matcher = ELEMENT_PATTERN.matcher(text);
		while (matcher.find()) {
			elements.add(matcher.group(1));
		}
		return elements;
	}
}