SchemaTransformationUtils.java

/*******************************************************************************
 * Copyright (c) 2021 Carsten Hammer.
 *
 * This program and the accompanying materials
 * are made available under the terms of the Eclipse Public License 2.0
 * which accompanies this distribution, and is available at
 * https://www.eclipse.org/legal/epl-2.0/
 *
 * SPDX-License-Identifier: EPL-2.0
 *
 * Contributors:
 *     Carsten Hammer
 *******************************************************************************/
package org.sandbox.jdt.internal.corext.fix.helper;

import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.xml.XMLConstants;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.stream.StreamResult;
import javax.xml.transform.stream.StreamSource;

/**
 * Utilities for transforming XML/XSD/EXSD files using XSLT and post-processing.
 */
public class SchemaTransformationUtils {
	
	/**
	 * Transform an XML file with default settings (no indentation).
	 * 
	 * @param schemaPath path to the XML file
	 * @return transformed content
	 * @throws Exception if transformation fails
	 */
	public static String transform(Path schemaPath) throws Exception {
		return transform(schemaPath, false);
	}
	
	/**
	 * Transform an XML file with configurable indentation.
	 * 
	 * @param schemaPath path to the XML file
	 * @param enableIndent whether to enable indentation (default is false for size reduction)
	 * @return transformed content
	 * @throws Exception if transformation fails
	 */
	public static String transform(Path schemaPath, boolean enableIndent) throws Exception {
		// Load the formatter.xsl file from classpath
		try (InputStream xslStream = SchemaTransformationUtils.class.getClassLoader().getResourceAsStream("resources/formatter.xsl")) {
			if (xslStream == null) {
				throw new IllegalArgumentException("Unable to find formatter.xsl in resources.");
			}

			// Initialize transformer with secure settings
			TransformerFactory factory = TransformerFactory.newInstance();
			factory.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true);
			factory.setAttribute(XMLConstants.ACCESS_EXTERNAL_DTD, "");
			factory.setAttribute(XMLConstants.ACCESS_EXTERNAL_STYLESHEET, "");
			
			// Additional security features
//			factory.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true);
//			factory.setFeature("http://xml.org/sax/features/external-general-entities", false);
//			factory.setFeature("http://xml.org/sax/features/external-parameter-entities", false);
			
			Transformer transformer = factory.newTransformer(new StreamSource(xslStream));
			
			// Set indentation to "yes" to get proper formatting that can be converted to tabs
			// The enableIndent parameter controls whether the final output has tabs or spaces
			transformer.setOutputProperty(OutputKeys.INDENT, "yes");

			// Perform transformation
			StreamSource source = new StreamSource(schemaPath.toFile());
			Path tempOutput = Files.createTempFile("formatted-schema", ".xml");
			
			try {
				StreamResult result = new StreamResult(tempOutput.toFile());
				transformer.transform(source, result);

				// Read transformed content
				String transformed = Files.readString(tempOutput, StandardCharsets.UTF_8);
				
				// Post-processing for size reduction:
				
				// 1. Collapse empty elements to self-closing tags
				transformed = collapseEmptyElements(transformed);
				
				// 2. Remove trailing whitespace from all lines
				transformed = removeTrailingWhitespace(transformed);
				
				// 3. Normalize whitespace (convert leading spaces to tabs, reduce empty lines)
				transformed = normalizeWhitespace(transformed);
				
				return transformed;
			} finally {
				// Ensure temp file is always deleted
				Files.deleteIfExists(tempOutput);
			}
		}
	}
	
	/**
	 * Collapse empty XML elements to self-closing tags.
	 * Converts: <element></element> or <element>   </element>
	 * To: <element/>
	 * 
	 * @param content the XML content
	 * @return content with empty elements collapsed
	 */
	private static String collapseEmptyElements(String content) {
		// Pattern matches: <tagname attributes></tagname> or <tagname attributes>   </tagname>
		// Captures the opening tag (without >) and ensures matching closing tag
		// Supports namespaces (e.g., ns:element)
		Pattern emptyElementPattern = Pattern.compile(
			"<([\\w:]+)((?:\\s+[^>]*?)?)>\\s*</\\1>",
			Pattern.MULTILINE
		);
		
		Matcher matcher = emptyElementPattern.matcher(content);
		StringBuffer sb = new StringBuffer();
		
		while (matcher.find()) {
			String tagName = matcher.group(1);
			String attributes = matcher.group(2);
			// Replace with self-closing tag
			String replacement = "<" + tagName + attributes + "/>";
			matcher.appendReplacement(sb, Matcher.quoteReplacement(replacement));
		}
		matcher.appendTail(sb);
		
		return sb.toString();
	}
	
	/**
	 * Remove trailing whitespace from all lines.
	 * Only removes spaces and tabs at the end of lines, not other whitespace.
	 * 
	 * @param content the XML content
	 * @return content with trailing whitespace removed from each line
	 */
	private static String removeTrailingWhitespace(String content) {
		// Remove trailing spaces/tabs from each line
		// Pattern matches spaces or tabs at the end of lines (before newline or end of string)
		// Using (?m) for MULTILINE mode to match end-of-line, not just end-of-string
		return content.replaceAll("(?m)[ \\t]+$", "");
	}
	
	/**
	 * Normalize whitespace in the transformed XML:
	 * - Reduce excessive empty lines (max 2 consecutive empty lines)
	 * - Convert leading 4-space indentation to tabs (not inside text nodes)
	 * - Preserve comments and content
	 * - Preserve original line ending style (CRLF vs LF)
	 * 
	 * @param content the XML content to normalize
	 * @return normalized content
	 */
	private static String normalizeWhitespace(String content) {
		// Reduce excessive empty lines - keep max 2 consecutive empty lines,
		// preserving the original line ending style (LF vs CRLF)
		content = content.replaceAll("(\\r?\\n){3,}", "$1$1");
		
		// Convert leading 4 spaces to tabs (only at line start, not in text content)
		// This pattern matches lines that start with spaces (after optional newline)
		Pattern leadingSpaces = Pattern.compile("^( {4})+", Pattern.MULTILINE);
		Matcher matcher = leadingSpaces.matcher(content);
		StringBuilder sb = new StringBuilder();
		
		while (matcher.find()) {
			String spaces = matcher.group();
			int numSpaces = spaces.length();
			int numTabs = numSpaces / 4;
			matcher.appendReplacement(sb, "\t".repeat(numTabs));
		}
		matcher.appendTail(sb);
		
		return sb.toString();
	}
}