From ce6f7311f75d1d23078ba2b7179a625e96b8c37d Mon Sep 17 00:00:00 2001
From: Microindole <1513979779@qq.com>
Date: Wed, 15 Oct 2025 01:29:03 +0800
Subject: [PATCH] feat: Add RLE and Shannon-Fano compression algorithms (#6779)

* feat: Add RLE and Shannon-Fano compression algorithms
* Fix: Resolve CI failures for compression algorithms
* chore: trigger CI rebuild
---
.../compression/RunLengthEncoding.java | 87 ++++++++++
.../compression/ShannonFano.java | 159 ++++++++++++++++++
.../compression/RunLengthEncodingTest.java | 90 ++++++++++
.../compression/ShannonFanoTest.java | 71 ++++++++
4 files changed, 407 insertions(+)
create mode 100644 src/main/java/com/thealgorithms/compression/RunLengthEncoding.java
create mode 100644 src/main/java/com/thealgorithms/compression/ShannonFano.java
create mode 100644 src/test/java/com/thealgorithms/compression/RunLengthEncodingTest.java
create mode 100644 src/test/java/com/thealgorithms/compression/ShannonFanoTest.java
diff --git a/src/main/java/com/thealgorithms/compression/RunLengthEncoding.java b/src/main/java/com/thealgorithms/compression/RunLengthEncoding.java
new file mode 100644
index 000000000..8d065f464
--- /dev/null
+++ b/src/main/java/com/thealgorithms/compression/RunLengthEncoding.java
@@ -0,0 +1,87 @@
+package com.thealgorithms.compression;
+
+/**
+ * An implementation of the Run-Length Encoding (RLE) algorithm.
+ *
+ * <p>Run-Length Encoding is a simple form of lossless data compression in which
+ * runs of data (sequences in which the same data value occurs in many
+ * consecutive data elements) are stored as a single data value and count,
+ * rather than as the original run.
+ *
+ * <p>This implementation provides methods for both compressing and decompressing
+ * a string. For example:
+ * <ul>
+ *   <li>Compressing "AAAABBBCCDAA" results in "4A3B2C1D2A".</li>
+ *   <li>Decompressing "4A3B2C1D2A" results in "AAAABBBCCDAA".</li>
+ * </ul>
+ *
+ * <p>Time Complexity: O(n) for compression, where n is the length of the input, and
+ * O(m) for decompression, where m is the length of the decompressed output.
+ *
+ * <p>References:
+ * <ul>
+ *   <li><a href="https://en.wikipedia.org/wiki/Run-length_encoding">Wikipedia: Run-length encoding</a></li>
+ * </ul>
+ */
+public final class RunLengthEncoding {
+
+    /** Private constructor to prevent instantiation of this utility class. */
+    private RunLengthEncoding() {
+    }
+
+    /**
+     * Compresses a string using the Run-Length Encoding algorithm. Counts are written as ASCII
+     * decimal digits, so text that itself contains '0'-'9' cannot be restored by decompress.
+     *
+     * @param text The string to be compressed; {@code null} is treated as empty.
+     * @return The compressed string, or an empty string if the input is null or empty.
+     */
+    public static String compress(String text) {
+        if (text == null || text.isEmpty()) {
+            return "";
+        }
+
+        StringBuilder compressed = new StringBuilder();
+        int count = 1;
+
+        for (int i = 0; i < text.length(); i++) {
+            // A run ends at the last character or when the next character is different
+            if (i == text.length() - 1 || text.charAt(i) != text.charAt(i + 1)) {
+                compressed.append(count).append(text.charAt(i));
+                count = 1; // Reset count for the next run
+            } else {
+                count++;
+            }
+        }
+        return compressed.toString();
+    }
+
+    /**
+     * Decompresses a string that was compressed using the Run-Length Encoding algorithm.
+     * Only ASCII '0'-'9' are read as count digits; any other character is emitted as data.
+     *
+     * @param compressedText The compressed string; {@code null} is treated as empty.
+     * @return The original, uncompressed string.
+     * @throws ArithmeticException if a run length does not fit in an {@code int}.
+     */
+    public static String decompress(String compressedText) {
+        if (compressedText == null || compressedText.isEmpty()) {
+            return "";
+        }
+
+        StringBuilder decompressed = new StringBuilder();
+        int count = 0;
+
+        for (char ch : compressedText.toCharArray()) {
+            if (ch >= '0' && ch <= '9') {
+                // Character.isDigit also accepts non-ASCII digits, for which ch - '0' is not 0-9
+                count = Math.addExact(Math.multiplyExact(count, 10), ch - '0');
+            } else {
+                // Append the character 'count' times (a character without a count is dropped)
+                decompressed.append(String.valueOf(ch).repeat(count));
+                count = 0; // Reset count for the next sequence
+            }
+        }
+        return decompressed.toString();
+    }
+}
diff --git a/src/main/java/com/thealgorithms/compression/ShannonFano.java b/src/main/java/com/thealgorithms/compression/ShannonFano.java
new file mode 100644
index 000000000..aa5d7ad91
--- /dev/null
+++ b/src/main/java/com/thealgorithms/compression/ShannonFano.java
@@ -0,0 +1,159 @@
+package com.thealgorithms.compression;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.stream.Collectors;
+
+/**
+ * An implementation of the Shannon-Fano algorithm for generating prefix codes.
+ *
+ * <p>Shannon-Fano coding is an entropy encoding technique for lossless data
+ * compression. It assigns variable-length codes to symbols based on their
+ * frequencies of occurrence. It is a precursor to Huffman coding and works by
+ * recursively partitioning a sorted list of symbols into two sub-lists with
+ * nearly equal total frequencies.
+ *
+ * <p>The algorithm works as follows:
+ * <ol>
+ *   <li>Count the frequency of each symbol in the input data.</li>
+ *   <li>Sort the symbols in descending order of their frequencies.</li>
+ *   <li>Recursively divide the list of symbols into two parts with sums of
+ *       frequencies as close as possible to each other.</li>
+ *   <li>Assign a '0' bit to the codes in the first part and a '1' bit to the codes
+ *       in the second part.</li>
+ *   <li>Repeat the process for each part until a part contains only one symbol.</li>
+ * </ol>
+ *
+ * <p>Time Complexity: O(n^2) in this implementation due to the partitioning logic,
+ * or O(n log n) if a more optimized partitioning strategy is used.
+ * Sorting takes O(n log n), where n is the number of unique symbols.
+ *
+ * <p>References:
+ * <ul>
+ *   <li><a href="https://en.wikipedia.org/wiki/Shannon%E2%80%93Fano_coding">Wikipedia: Shannon-Fano coding</a></li>
+ * </ul>
+ */
+public final class ShannonFano {
+
+    /** Private constructor to prevent instantiation of this utility class. */
+    private ShannonFano() {
+    }
+
+    /**
+     * A private static nested class holding a symbol, its frequency and its assigned code.
+     * Implements Comparable to allow sorting by descending frequency, then by character.
+     */
+    private static class Symbol implements Comparable<Symbol> {
+        final char character;
+        final int frequency;
+        String code = "";
+
+        Symbol(char character, int frequency) {
+            this.character = character;
+            this.frequency = frequency;
+        }
+
+        @Override
+        public int compareTo(Symbol other) {
+            int byFrequency = Integer.compare(other.frequency, this.frequency); // Sort descending
+            // Break ties by character so the codes never depend on HashMap iteration order
+            return byFrequency != 0 ? byFrequency : Character.compare(this.character, other.character);
+        }
+    }
+
+    /**
+     * Generates Shannon-Fano codes for the symbols in a given text.
+     *
+     * @param text The input string for which to generate codes; {@code null} is treated as empty.
+     * @return A map from each character to its Shannon-Fano code; empty for null or empty input.
+     */
+    public static Map<Character, String> generateCodes(String text) {
+        if (text == null || text.isEmpty()) {
+            return Collections.emptyMap();
+        }
+
+        Map<Character, Integer> frequencyMap = new HashMap<>();
+        for (char c : text.toCharArray()) {
+            frequencyMap.merge(c, 1, Integer::sum);
+        }
+
+        List<Symbol> symbols = new ArrayList<>();
+        for (Map.Entry<Character, Integer> entry : frequencyMap.entrySet()) {
+            symbols.add(new Symbol(entry.getKey(), entry.getValue()));
+        }
+
+        Collections.sort(symbols);
+
+        // Special case: a single unique symbol would otherwise get the empty code
+        if (symbols.size() == 1) {
+            symbols.getFirst().code = "0";
+        } else {
+            buildCodeTree(symbols, 0, symbols.size() - 1, "");
+        }
+
+        return symbols.stream().collect(Collectors.toMap(s -> s.character, s -> s.code));
+    }
+
+    /**
+     * Recursively builds the Shannon-Fano code tree by partitioning the list of symbols.
+     * Uses index-based approach to avoid sublist creation issues.
+     *
+     * @param symbols The sorted list of symbols to be processed.
+     * @param start The start index of the current partition.
+     * @param end The end index of the current partition (inclusive).
+     * @param prefix The current prefix code being built for the symbols in this partition.
+     */
+    private static void buildCodeTree(List<Symbol> symbols, int start, int end, String prefix) {
+        // The initial check in generateCodes ensures start <= end is always true here.
+        // The base case is when a partition has only one symbol.
+        if (start == end) {
+            symbols.get(start).code = prefix;
+            return;
+        }
+
+        // Find the optimal split point
+        int splitIndex = findSplitIndex(symbols, start, end);
+
+        // Recursively process left and right partitions with updated prefixes
+        buildCodeTree(symbols, start, splitIndex, prefix + "0");
+        buildCodeTree(symbols, splitIndex + 1, end, prefix + "1");
+    }
+
+    /**
+     * Finds the index that splits the range into two parts with the most balanced frequency sums.
+     * This method tries every possible split point and returns the index that minimizes the
+     * absolute difference between the two partition sums (the earliest such index on ties).
+     *
+     * @param symbols The sorted list of symbols.
+     * @param start The start index of the range.
+     * @param end The end index of the range (inclusive).
+     * @return The index of the last element in the first partition.
+     */
+    private static int findSplitIndex(List<Symbol> symbols, int start, int end) {
+        // Calculate total frequency for the entire range
+        long totalFrequency = 0;
+        for (int i = start; i <= end; i++) {
+            totalFrequency += symbols.get(i).frequency;
+        }
+
+        long leftSum = 0;
+        long minDifference = Long.MAX_VALUE;
+        int splitIndex = start;
+
+        // Try every split point; i stops before end so the right part is never empty
+        for (int i = start; i < end; i++) {
+            leftSum += symbols.get(i).frequency;
+            long rightSum = totalFrequency - leftSum;
+            long difference = Math.abs(leftSum - rightSum);
+
+            if (difference < minDifference) {
+                minDifference = difference;
+                splitIndex = i;
+            }
+        }
+        return splitIndex;
+    }
+}
diff --git a/src/test/java/com/thealgorithms/compression/RunLengthEncodingTest.java b/src/test/java/com/thealgorithms/compression/RunLengthEncodingTest.java
new file mode 100644
index 000000000..049a7fac9
--- /dev/null
+++ b/src/test/java/com/thealgorithms/compression/RunLengthEncodingTest.java
@@ -0,0 +1,90 @@
+package com.thealgorithms.compression;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+import org.junit.jupiter.api.Test;
+
+class RunLengthEncodingTest {
+
+    @Test
+    void testNullInputs() {
+        // Both directions treat a null argument like empty input and yield ""
+        String compressedNull = RunLengthEncoding.compress(null);
+        String decompressedNull = RunLengthEncoding.decompress(null);
+        assertEquals("", compressedNull);
+        assertEquals("", decompressedNull);
+    }
+
+    @Test
+    void testCompressionSimple() {
+        // Several runs of different lengths, with 'A' occurring in two separate runs
+        String actual = RunLengthEncoding.compress("AAAABBBCCDAA");
+
+        assertEquals("4A3B2C1D2A", actual);
+    }
+
+    @Test
+    void testCompressionWithNoRuns() {
+        // Every character differs from its neighbour, so each run has length one
+        String actual = RunLengthEncoding.compress("ABCDE");
+
+        assertEquals("1A1B1C1D1E", actual);
+    }
+
+    @Test
+    void testCompressionEdgeCases() {
+        String empty = RunLengthEncoding.compress("");
+        String singleCharacter = RunLengthEncoding.compress("A");
+        String tenCharacterRun = RunLengthEncoding.compress("ZZZZZZZZZZ");
+
+        // Empty input, one character, and a run whose count needs two digits
+        assertEquals("", empty);
+        assertEquals("1A", singleCharacter);
+        assertEquals("10Z", tenCharacterRun);
+    }
+
+    @Test
+    void testDecompressionSimple() {
+        // Inverse of the simple compression case
+        String actual = RunLengthEncoding.decompress("4A3B2C1D2A");
+
+        assertEquals("AAAABBBCCDAA", actual);
+    }
+
+    @Test
+    void testDecompressionWithNoRuns() {
+        // Every count is 1, so the output is just the symbols in order
+        String actual = RunLengthEncoding.decompress("1A1B1C1D1E");
+
+        assertEquals("ABCDE", actual);
+    }
+
+    @Test
+    void testDecompressionWithMultiDigitCount() {
+        // A count above 9 spans two digits and must be read as one number
+        String actual = RunLengthEncoding.decompress("12A1B3C");
+
+        assertEquals("AAAAAAAAAAAABCCC", actual);
+    }
+
+    @Test
+    void testDecompressionEdgeCases() {
+        String empty = RunLengthEncoding.decompress("");
+        String singleRun = RunLengthEncoding.decompress("1A");
+        // Empty input and a single run of length one
+        assertEquals("", empty);
+        assertEquals("A", singleRun);
+    }
+
+    @Test
+    void testSymmetry() {
+        // decompress(compress(x)) must give back x for a long mixed input and for one character
+        String longInput = "WWWWWWWWWWWWBWWWWWWWWWWWWBBBWWWWWWWWWWWWWWWWWWWWWWWWB";
+        String shortInput = "A";
+        String longRoundTrip = RunLengthEncoding.decompress(RunLengthEncoding.compress(longInput));
+        String shortRoundTrip = RunLengthEncoding.decompress(RunLengthEncoding.compress(shortInput));
+
+        assertEquals(longInput, longRoundTrip);
+        assertEquals(shortInput, shortRoundTrip);
+    }
+}
diff --git a/src/test/java/com/thealgorithms/compression/ShannonFanoTest.java b/src/test/java/com/thealgorithms/compression/ShannonFanoTest.java
new file mode 100644
index 000000000..ce34088da
--- /dev/null
+++ b/src/test/java/com/thealgorithms/compression/ShannonFanoTest.java
@@ -0,0 +1,71 @@
+package com.thealgorithms.compression;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+import java.util.Map;
+import org.junit.jupiter.api.Test;
+
+class ShannonFanoTest {
+
+    @Test
+    void testNullInput() {
+        // Test with a null string, should return an empty map
+        assertTrue(ShannonFano.generateCodes(null).isEmpty());
+    }
+
+    @Test
+    void testSimpleString() {
+        // A simple string to test basic code generation: A:3, B:2, C:1
+        String text = "AAABBC";
+        Map<Character, String> codes = ShannonFano.generateCodes(text);
+
+        assertEquals(3, codes.size());
+        assertEquals("0", codes.get('A'));
+        assertEquals("10", codes.get('B'));
+        assertEquals("11", codes.get('C'));
+    }
+
+    @Test
+    void testExampleFromStringIssue() {
+        // Example from the original issue proposal: A:15, B:7, C:6, D:6, E:5
+        // The code finds a more optimal split: {A,B} | {C,D,E} -> |22-17|=5
+        // instead of {A} | {B,C,D,E} -> |15-24|=9.
+        String text = "AAAAAAAAAAAAAAABBBBBBBCCCCCCDDDDDDEEEEE";
+        Map<Character, String> codes = ShannonFano.generateCodes(text);
+
+        assertEquals(5, codes.size());
+        assertEquals("00", codes.get('A'));
+        assertEquals("01", codes.get('B'));
+        assertEquals("10", codes.get('C'));
+        assertEquals("110", codes.get('D'));
+        assertEquals("111", codes.get('E'));
+    }
+
+    @Test
+    void testEdgeCases() {
+        // Test with an empty string
+        assertTrue(ShannonFano.generateCodes("").isEmpty());
+
+        // Test with a single character
+        Map<Character, String> singleCharCodes = ShannonFano.generateCodes("AAAAA");
+        assertEquals(1, singleCharCodes.size());
+        assertEquals("0", singleCharCodes.get('A')); // A single symbol gets code "0"
+
+        // Test with all unique characters
+        String uniqueCharsText = "ABCDEF";
+        Map<Character, String> uniqueCharCodes = ShannonFano.generateCodes(uniqueCharsText);
+        assertEquals(6, uniqueCharCodes.size());
+        // Check that every symbol received a distinct code
+        assertEquals(6, uniqueCharCodes.values().stream().distinct().count());
+    }
+
+    @Test
+    void testStringWithTwoChars() {
+        String text = "ABABAB";
+        Map<Character, String> codes = ShannonFano.generateCodes(text);
+
+        assertEquals(2, codes.size());
+        assertTrue((codes.get('A').equals("0") && codes.get('B').equals("1")) || (codes.get('A').equals("1") && codes.get('B').equals("0")));
+    }
+}