From ce6f7311f75d1d23078ba2b7179a625e96b8c37d Mon Sep 17 00:00:00 2001 From: Microindole <1513979779@qq.com> Date: Wed, 15 Oct 2025 01:29:03 +0800 Subject: [PATCH] feat: Add RLE and Shannon-Fano compression algorithms (#6779) * feat: Add RLE and Shannon-Fano compression algorithms * Fix: Resolve CI failures for compression algorithms * chore: trigger CI rebuild --- .../compression/RunLengthEncoding.java | 87 ++++++++++ .../compression/ShannonFano.java | 159 ++++++++++++++++++ .../compression/RunLengthEncodingTest.java | 90 ++++++++++ .../compression/ShannonFanoTest.java | 71 ++++++++ 4 files changed, 407 insertions(+) create mode 100644 src/main/java/com/thealgorithms/compression/RunLengthEncoding.java create mode 100644 src/main/java/com/thealgorithms/compression/ShannonFano.java create mode 100644 src/test/java/com/thealgorithms/compression/RunLengthEncodingTest.java create mode 100644 src/test/java/com/thealgorithms/compression/ShannonFanoTest.java diff --git a/src/main/java/com/thealgorithms/compression/RunLengthEncoding.java b/src/main/java/com/thealgorithms/compression/RunLengthEncoding.java new file mode 100644 index 000000000..8d065f464 --- /dev/null +++ b/src/main/java/com/thealgorithms/compression/RunLengthEncoding.java @@ -0,0 +1,87 @@ +package com.thealgorithms.compression; + +/** + * An implementation of the Run-Length Encoding (RLE) algorithm. + * + *

Run-Length Encoding is a simple form of lossless data compression in which + * runs of data (sequences in which the same data value occurs in many + * consecutive data elements) are stored as a single data value and count, + * rather than as the original run. + * + *

This implementation provides methods for both compressing and decompressing + * a string. For example: + *

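+ * compressing "AAAABBBCCDAA" yields "4A3B2C1D2A", and decompressing "4A3B2C1D2A" yields "AAAABBBCCDAA". + * +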

Time Complexity: O(n) for both compression and decompression, where n is the + * length of the input string. + * + *

References: + *

/**
 * Run-Length Encoding (RLE): a lossless scheme that stores every run of identical
 * consecutive characters as a decimal count followed by the character itself.
 *
 * <p>Example: {@code compress("AAAABBBCCDAA")} returns {@code "4A3B2C1D2A"} and
 * {@code decompress("4A3B2C1D2A")} returns {@code "AAAABBBCCDAA"}.
 *
 * <p>Format limitation: counts are written as ASCII digits without a delimiter, so text
 * that itself contains the characters {@code '0'}-{@code '9'} cannot be round-tripped
 * (for example {@code "111"} compresses to {@code "31"}, which decodes to an empty string).
 *
 * <p>Both operations make a single pass over their input.
 */
public final class RunLengthEncoding {

    /**
     * Private constructor to prevent instantiation of this utility class.
     */
    private RunLengthEncoding() {
    }

    /**
     * Compresses a string using Run-Length Encoding.
     *
     * @param text the string to compress; {@code null} is treated like an empty string
     * @return the sequence of {@code <count><char>} pairs, or an empty string when the
     *         input is {@code null} or empty
     */
    public static String compress(String text) {
        if (text == null || text.isEmpty()) {
            return "";
        }

        StringBuilder compressed = new StringBuilder();
        int runStart = 0;
        // Walk one position past the end so the final run is flushed by the same branch.
        for (int i = 1; i <= text.length(); i++) {
            if (i == text.length() || text.charAt(i) != text.charAt(runStart)) {
                compressed.append(i - runStart).append(text.charAt(runStart));
                runStart = i;
            }
        }
        return compressed.toString();
    }

    /**
     * Decompresses a string produced by {@link #compress(String)}.
     *
     * <p>Only the ASCII characters {@code '0'}-{@code '9'} are read as part of a count,
     * because those are the only digits {@code compress} ever writes. Any other character,
     * including non-ASCII Unicode digits, is treated as a symbol to repeat. A symbol that
     * is not preceded by a count is repeated zero times, and digits that are not followed
     * by a symbol are ignored.
     *
     * @param compressedText the RLE text; {@code null} is treated like an empty string
     * @return the expanded string, or an empty string when the input is {@code null} or empty
     * @throws IllegalArgumentException if a run count does not fit in an {@code int}
     */
    public static String decompress(String compressedText) {
        if (compressedText == null || compressedText.isEmpty()) {
            return "";
        }

        StringBuilder decompressed = new StringBuilder();
        int count = 0;

        for (int i = 0; i < compressedText.length(); i++) {
            char ch = compressedText.charAt(i);
            if (ch >= '0' && ch <= '9') {
                // Accumulate multi-digit counts such as "12A"; fail loudly instead of wrapping around.
                try {
                    count = Math.addExact(Math.multiplyExact(count, 10), ch - '0');
                } catch (ArithmeticException e) {
                    throw new IllegalArgumentException("Run length is too large near index " + i, e);
                }
            } else {
                decompressed.append(String.valueOf(ch).repeat(count));
                count = 0; // The next run starts with a fresh count.
            }
        }
        return decompressed.toString();
    }
}

Shannon-Fano coding is an entropy encoding technique for lossless data + * compression. It assigns variable-length codes to symbols based on their + * frequencies of occurrence. It is a precursor to Huffman coding and works by + * recursively partitioning a sorted list of symbols into two sub-lists with + * nearly equal total frequencies. + * + *

The algorithm works as follows: + *

    + *
  1. Count the frequency of each symbol in the input data.
  2. Sort the symbols in descending order of their frequencies.
  3. Recursively divide the list of symbols into two parts whose frequency sums are as close as possible to each other.
  4. Assign a '0' bit to the codes in the first part and a '1' bit to the codes in the second part.
  5. Repeat the process for each part until a part contains only one symbol.
+ * + *

Time Complexity: O(n^2) in this implementation due to the partitioning logic, + * or O(n log n) if a more optimized partitioning strategy is used. + * Sorting takes O(n log n), where n is the number of unique symbols. + * + *

References: + *

/**
 * Shannon-Fano coding: builds a variable-length prefix code for the characters of a string.
 *
 * <p>Symbols are sorted by descending frequency, the sorted list is split where the two
 * halves have the closest frequency sums, the first half receives a {@code '0'} bit and the
 * second a {@code '1'} bit, and each half is processed the same way until it holds one symbol.
 *
 * <p>Symbols with equal frequency are ordered by character value, so the generated codes
 * depend only on the character counts and not on hash-map iteration order.
 */
public final class ShannonFano {

    /**
     * Private constructor to prevent instantiation of this utility class.
     */
    private ShannonFano() {
    }

    /**
     * A character, its number of occurrences, and the code assigned to it so far.
     */
    private static final class Symbol implements Comparable<Symbol> {
        final char character;
        final int frequency;
        String code = "";

        Symbol(char character, int frequency) {
            this.character = character;
            this.frequency = frequency;
        }

        /**
         * Orders by descending frequency; ties are broken by ascending character so the
         * sort result is fully determined by the input counts.
         */
        @Override
        public int compareTo(Symbol other) {
            int byFrequency = Integer.compare(other.frequency, this.frequency);
            if (byFrequency != 0) {
                return byFrequency;
            }
            return Character.compare(this.character, other.character);
        }
    }

    /**
     * Generates Shannon-Fano codes for the characters of a given text.
     *
     * @param text the input string; {@code null} is treated like an empty string
     * @return a map from each distinct character to its code made of {@code '0'} and
     *         {@code '1'}; empty when the input is {@code null} or empty. A text with a
     *         single distinct character maps that character to {@code "0"}.
     */
    public static Map<Character, String> generateCodes(String text) {
        if (text == null || text.isEmpty()) {
            return Collections.emptyMap();
        }

        Map<Character, Integer> frequencyMap = new HashMap<>();
        for (char c : text.toCharArray()) {
            frequencyMap.merge(c, 1, Integer::sum);
        }

        List<Symbol> symbols = new ArrayList<>(frequencyMap.size());
        for (Map.Entry<Character, Integer> entry : frequencyMap.entrySet()) {
            symbols.add(new Symbol(entry.getKey(), entry.getValue()));
        }
        Collections.sort(symbols);

        if (symbols.size() == 1) {
            // A lone symbol would otherwise end up with the empty prefix as its code.
            symbols.get(0).code = "0";
        } else {
            buildCodeTree(symbols, 0, symbols.size() - 1, "");
        }

        return symbols.stream().collect(Collectors.toMap(s -> s.character, s -> s.code));
    }

    /**
     * Recursively assigns codes to {@code symbols[start..end]} by splitting the range in two.
     *
     * @param symbols the symbols, sorted by descending frequency
     * @param start   first index of the current range
     * @param end     last index of the current range (inclusive)
     * @param prefix  the bits already assigned to every symbol in this range
     */
    private static void buildCodeTree(List<Symbol> symbols, int start, int end, String prefix) {
        // Base case: a range of one symbol receives the accumulated prefix as its code.
        if (start == end) {
            symbols.get(start).code = prefix;
            return;
        }

        int splitIndex = findSplitIndex(symbols, start, end);

        buildCodeTree(symbols, start, splitIndex, prefix + "0");
        buildCodeTree(symbols, splitIndex + 1, end, prefix + "1");
    }

    /**
     * Finds the split of {@code symbols[start..end]} whose two parts have the smallest
     * absolute difference in total frequency. When several splits tie, the leftmost wins.
     *
     * @param symbols the symbols, sorted by descending frequency
     * @param start   first index of the range
     * @param end     last index of the range (inclusive)
     * @return the index of the last element of the first part; always less than {@code end}
     *         when {@code start < end}, so both parts are non-empty
     */
    private static int findSplitIndex(List<Symbol> symbols, int start, int end) {
        long totalFrequency = 0;
        for (int i = start; i <= end; i++) {
            totalFrequency += symbols.get(i).frequency;
        }

        long leftSum = 0;
        long minDifference = Long.MAX_VALUE;
        int splitIndex = start;

        // The loop stops before 'end' so the second part always keeps at least one symbol.
        for (int i = start; i < end; i++) {
            leftSum += symbols.get(i).frequency;
            long difference = Math.abs(leftSum - (totalFrequency - leftSum));
            if (difference < minDifference) {
                minDifference = difference;
                splitIndex = i;
            }
        }
        return splitIndex;
    }
}
assertEquals("", RunLengthEncoding.compress("")); + + // Test with a single character + assertEquals("1A", RunLengthEncoding.compress("A")); + + // Test with a long run of a single character + assertEquals("10Z", RunLengthEncoding.compress("ZZZZZZZZZZ")); + } + + @Test + void testDecompressionSimple() { + // Test decompression of a typical RLE string + String input = "4A3B2C1D2A"; + String expected = "AAAABBBCCDAA"; + assertEquals(expected, RunLengthEncoding.decompress(input)); + } + + @Test + void testDecompressionWithNoRuns() { + // Test decompression of a string with single characters + String input = "1A1B1C1D1E"; + String expected = "ABCDE"; + assertEquals(expected, RunLengthEncoding.decompress(input)); + } + + @Test + void testDecompressionWithMultiDigitCount() { + // Test decompression where a run count is greater than 9 + String input = "12A1B3C"; + String expected = "AAAAAAAAAAAABCCC"; + assertEquals(expected, RunLengthEncoding.decompress(input)); + } + + @Test + void testDecompressionEdgeCases() { + // Test with an empty string + assertEquals("", RunLengthEncoding.decompress("")); + + // Test with a single character run + assertEquals("A", RunLengthEncoding.decompress("1A")); + } + + @Test + void testSymmetry() { + // Test that compressing and then decompressing returns the original string + String original1 = "WWWWWWWWWWWWBWWWWWWWWWWWWBBBWWWWWWWWWWWWWWWWWWWWWWWWB"; + String compressed = RunLengthEncoding.compress(original1); + String decompressed = RunLengthEncoding.decompress(compressed); + assertEquals(original1, decompressed); + + String original2 = "A"; + assertEquals(original2, RunLengthEncoding.decompress(RunLengthEncoding.compress(original2))); + } +} diff --git a/src/test/java/com/thealgorithms/compression/ShannonFanoTest.java b/src/test/java/com/thealgorithms/compression/ShannonFanoTest.java new file mode 100644 index 000000000..ce34088da --- /dev/null +++ b/src/test/java/com/thealgorithms/compression/ShannonFanoTest.java @@ -0,0 +1,71 @@ 
+package com.thealgorithms.compression; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.util.Map; +import org.junit.jupiter.api.Test; + +class ShannonFanoTest { + + @Test + void testNullInput() { + // Test with a null string, should return an empty map + assertTrue(ShannonFano.generateCodes(null).isEmpty()); + } + + @Test + void testSimpleString() { + // A simple string to test basic code generation + String text = "AAABBC"; + Map codes = ShannonFano.generateCodes(text); + + assertEquals(3, codes.size()); + assertEquals("0", codes.get('A')); + assertEquals("10", codes.get('B')); + assertEquals("11", codes.get('C')); + } + + @Test + void testExampleFromStringIssue() { + // Example from the original issue proposal: A:15, B:7, C:6, D:6, E:5 + // The code finds a more optimal split: {A,B} | {C,D,E} -> |22-17|=5 + // instead of {A} | {B,C,D,E} -> |15-24|=9. + String text = "AAAAAAAAAAAAAAABBBBBBBCCCCCCDDDDDDEEEEE"; + Map codes = ShannonFano.generateCodes(text); + + assertEquals(5, codes.size()); + assertEquals("00", codes.get('A')); + assertEquals("01", codes.get('B')); + assertEquals("10", codes.get('C')); + assertEquals("110", codes.get('D')); + assertEquals("111", codes.get('E')); + } + + @Test + void testEdgeCases() { + // Test with an empty string + assertTrue(ShannonFano.generateCodes("").isEmpty()); + + // Test with a single character + Map singleCharCodes = ShannonFano.generateCodes("AAAAA"); + assertEquals(1, singleCharCodes.size()); + assertEquals("0", singleCharCodes.get('A')); // A single symbol gets code "0" + + // Test with all unique characters + String uniqueCharsText = "ABCDEF"; + Map uniqueCharCodes = ShannonFano.generateCodes(uniqueCharsText); + assertEquals(6, uniqueCharCodes.size()); + // Check that codes are unique and have varying lengths as expected + assertEquals(6, uniqueCharCodes.values().stream().distinct().count()); + } + + @Test + void 
testStringWithTwoChars() { + String text = "ABABAB"; + Map codes = ShannonFano.generateCodes(text); + + assertEquals(2, codes.size()); + assertTrue(codes.get('A').equals("0") && codes.get('B').equals("1") || codes.get('A').equals("1") && codes.get('B').equals("0")); + } +}