feat(compression): Add LZW and Arithmetic Coding algorithms (#6799)

* feat(compression): Add LZW and Arithmetic Coding algorithms

* test(compression): Improve test coverage for LZW and ArithmeticCoding

* style(compression): fix code style
This commit is contained in:
Indole Yi
2025-10-20 02:11:22 +08:00
committed by GitHub
parent a7f0bab021
commit 4a97258189
4 changed files with 551 additions and 0 deletions

View File

@@ -0,0 +1,157 @@
package com.thealgorithms.compression;
import java.math.BigDecimal;
import java.math.MathContext;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
 * An implementation of the Arithmetic Coding algorithm.
 *
 * <p>
 * Arithmetic coding is a form of entropy encoding used in lossless data
 * compression. It encodes an entire message into a single number, a fraction n
 * where {@code 0.0 <= n < 1.0}. Unlike Huffman coding, which assigns a specific
 * bit sequence to each symbol, arithmetic coding represents the message as a
 * sub-interval of the [0, 1) interval.
 * </p>
 *
 * <p>
 * This implementation uses BigDecimal for precision to handle the shrinking
 * intervals, making it suitable for educational purposes to demonstrate the
 * core logic.
 * </p>
 *
 * <p>
 * Time Complexity: compression performs one table lookup per input character;
 * decompression scans up to m symbol intervals per decoded character, i.e.
 * O(n*m) interval checks, where n is the length of the input and m is the
 * number of unique symbols.
 * </p>
 *
 * <p>
 * References:
 * </p>
 * <ul>
 * <li><a href="https://en.wikipedia.org/wiki/Arithmetic_coding">Wikipedia:
 * Arithmetic coding</a></li>
 * </ul>
 */
public final class ArithmeticCoding {

    /**
     * Private constructor to prevent instantiation of this utility class.
     */
    private ArithmeticCoding() {
    }

    /**
     * Compresses a string using the Arithmetic Coding algorithm.
     *
     * <p>
     * The probability table is derived from the input itself via
     * {@link #calculateProbabilities(String)}; the same table (and the input
     * length) is required to decompress the result.
     * </p>
     *
     * @param uncompressed The string to be compressed.
     * @return The compressed representation as a BigDecimal number: the lower
     *         bound of the final interval.
     * @throws IllegalArgumentException if the input string is null or empty.
     */
    public static BigDecimal compress(String uncompressed) {
        if (uncompressed == null || uncompressed.isEmpty()) {
            throw new IllegalArgumentException("Input string cannot be null or empty.");
        }

        Map<Character, Symbol> probabilityTable = calculateProbabilities(uncompressed);

        BigDecimal low = BigDecimal.ZERO;
        BigDecimal high = BigDecimal.ONE;

        for (char symbol : uncompressed.toCharArray()) {
            BigDecimal range = high.subtract(low);
            Symbol sym = probabilityTable.get(symbol);

            // 'high' must be computed before 'low' is overwritten: both are
            // relative to the lower bound of the current interval.
            high = low.add(range.multiply(sym.high()));
            low = low.add(range.multiply(sym.low()));
        }

        return low; // Any value in [low, high) identifies the message; use the lower bound
    }

    /**
     * Decompresses a BigDecimal number back into the original string.
     *
     * @param compressed       The compressed BigDecimal number.
     * @param length           The length of the original uncompressed string.
     *                         No symbols are decoded when it is zero or negative.
     * @param probabilityTable The probability table used during compression.
     * @return The original, uncompressed string.
     * @throws IllegalArgumentException if, at some position, the compressed
     *                                  value does not fall into the interval of
     *                                  any symbol of the table (for example a
     *                                  value outside [0, 1) or a table whose
     *                                  intervals leave gaps).
     */
    public static String decompress(BigDecimal compressed, int length, Map<Character, Symbol> probabilityTable) {
        StringBuilder decompressed = new StringBuilder();

        // Create a sorted list of symbols for deterministic decompression, matching the
        // order used in calculateProbabilities
        List<Map.Entry<Character, Symbol>> sortedSymbols = new ArrayList<>(probabilityTable.entrySet());
        sortedSymbols.sort(Map.Entry.comparingByKey());

        BigDecimal low = BigDecimal.ZERO;
        BigDecimal high = BigDecimal.ONE;

        for (int i = 0; i < length; i++) {
            BigDecimal range = high.subtract(low);
            boolean matched = false;

            // Find which symbol the compressed value falls into
            for (Map.Entry<Character, Symbol> entry : sortedSymbols) {
                Symbol sym = entry.getValue();

                // Calculate the actual range for this symbol in the current interval
                BigDecimal symLow = low.add(range.multiply(sym.low()));
                BigDecimal symHigh = low.add(range.multiply(sym.high()));

                // Check if the compressed value falls within this symbol's range
                if (compressed.compareTo(symLow) >= 0 && compressed.compareTo(symHigh) < 0) {
                    decompressed.append(entry.getKey());

                    // Update the interval for the next iteration
                    low = symLow;
                    high = symHigh;
                    matched = true;
                    break;
                }
            }

            // Without this check a position would be skipped silently and the
            // result would be shorter than 'length' with no indication of failure.
            if (!matched) {
                throw new IllegalArgumentException("Compressed value " + compressed + " does not fall into any symbol interval at position " + i + ".");
            }
        }

        return decompressed.toString();
    }

    /**
     * Calculates the frequency and probability range for each character in the
     * input string in a deterministic order.
     *
     * <p>
     * Characters are processed in ascending order. The upper bound of every
     * range is computed from the cumulative character count, so consecutive
     * ranges are contiguous and the upper bound of the last range is exactly 1
     * (summing individually rounded probabilities would stop slightly short of
     * 1, e.g. for three equally frequent symbols).
     * </p>
     *
     * @param text The input string. Must not be null; an empty string yields an
     *             empty table.
     * @return A map from each character to a Symbol object containing its
     *         probability range.
     */
    public static Map<Character, Symbol> calculateProbabilities(String text) {
        Map<Character, Integer> frequencies = new HashMap<>();
        for (char c : text.toCharArray()) {
            frequencies.merge(c, 1, Integer::sum);
        }

        // Sort the characters to ensure a deterministic order for the probability table
        List<Character> sortedKeys = new ArrayList<>(frequencies.keySet());
        Collections.sort(sortedKeys);

        Map<Character, Symbol> probabilityTable = new HashMap<>();
        BigDecimal total = BigDecimal.valueOf(text.length());
        BigDecimal currentLow = BigDecimal.ZERO;
        long cumulativeCount = 0;

        for (char symbol : sortedKeys) {
            cumulativeCount += frequencies.get(symbol);
            // Divide the running count (not the single frequency) so rounding
            // errors do not accumulate across symbols.
            BigDecimal high = BigDecimal.valueOf(cumulativeCount).divide(total, MathContext.DECIMAL128);
            probabilityTable.put(symbol, new Symbol(currentLow, high));
            currentLow = high;
        }

        return probabilityTable;
    }

    /**
     * Helper class to store the probability range [low, high) for a symbol.
     *
     * @param low  The inclusive lower bound of the range.
     * @param high The exclusive upper bound of the range.
     */
    public record Symbol(BigDecimal low, BigDecimal high) {
    }
}

View File

@@ -0,0 +1,136 @@
package com.thealgorithms.compression;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
 * An implementation of the Lempel-Ziv-Welch (LZW) algorithm.
 *
 * <p>
 * LZW is a universal lossless data compression algorithm created by Abraham
 * Lempel, Jacob Ziv, and Terry Welch. It works by building a dictionary of
 * strings encountered during compression and replacing occurrences of those
 * strings with a shorter code.
 * </p>
 *
 * <p>
 * This implementation handles characters with codes 0-255 and provides methods
 * for both compression and decompression.
 * </p>
 * <ul>
 * <li>Compressing "TOBEORNOTTOBEORTOBEORNOT" results in a list of integer
 * codes.</li>
 * <li>Decompressing that list of codes results back in the original
 * string.</li>
 * </ul>
 *
 * <p>
 * Time Complexity: O(n) dictionary operations for both compression and
 * decompression, where n is the length of the input.
 * </p>
 *
 * <p>
 * References:
 * </p>
 * <ul>
 * <li><a href="https://en.wikipedia.org/wiki/Lempel%E2%80%93Ziv%E2%80%93Welch">Wikipedia:
 * Lempel-Ziv-Welch</a></li>
 * </ul>
 */
public final class LZW {

    /**
     * Number of single-character entries (codes 0-255) that seed the dictionary
     * of both the compressor and the decompressor.
     */
    private static final int INITIAL_DICTIONARY_SIZE = 256;

    /**
     * Private constructor to prevent instantiation of this utility class.
     */
    private LZW() {
    }

    /**
     * Compresses a string using the LZW algorithm.
     *
     * @param uncompressed The string to be compressed. Can be null.
     * @return A list of integers representing the compressed data. Returns an empty
     *         list if the input is null or empty.
     * @throws IllegalArgumentException if the input contains a character whose
     *                                  code is above 255; such characters are not
     *                                  part of the initial dictionary and cannot
     *                                  be encoded.
     */
    public static List<Integer> compress(String uncompressed) {
        if (uncompressed == null || uncompressed.isEmpty()) {
            return new ArrayList<>();
        }

        // Initialize dictionary with single characters (codes 0-255)
        int dictSize = INITIAL_DICTIONARY_SIZE;
        Map<String, Integer> dictionary = new HashMap<>();
        for (int i = 0; i < dictSize; i++) {
            dictionary.put(String.valueOf((char) i), i);
        }

        String w = "";
        List<Integer> result = new ArrayList<>();
        for (char c : uncompressed.toCharArray()) {
            // A character outside the seeded range has no code of its own; without
            // this check the dictionary lookup below would yield null codes.
            if (c >= INITIAL_DICTIONARY_SIZE) {
                throw new IllegalArgumentException("Character out of supported range 0-255: " + (int) c);
            }

            String wc = w + c;
            if (dictionary.containsKey(wc)) {
                // If the new string is in the dictionary, extend the current string
                w = wc;
            } else {
                // Otherwise, output the code for the current string
                result.add(dictionary.get(w));
                // Add the new string to the dictionary
                dictionary.put(wc, dictSize++);
                // Start a new current string
                w = String.valueOf(c);
            }
        }

        // Output the code for the last remaining string
        result.add(dictionary.get(w));
        return result;
    }

    /**
     * Decompresses a list of integers back into a string using the LZW algorithm.
     *
     * <p>
     * The given list is only read, never modified, so immutable lists are
     * accepted.
     * </p>
     *
     * @param compressed A list of integers representing the compressed data. Can be
     *                   null.
     * @return The original, uncompressed string. Returns an empty string if the
     *         input is null or empty.
     * @throws IllegalArgumentException if the list contains a code that cannot
     *                                  occur at its position (including a first
     *                                  code outside 0-255).
     */
    public static String decompress(List<Integer> compressed) {
        if (compressed == null || compressed.isEmpty()) {
            return "";
        }

        // Initialize dictionary with single characters (codes 0-255)
        int dictSize = INITIAL_DICTIONARY_SIZE;
        Map<Integer, String> dictionary = new HashMap<>();
        for (int i = 0; i < dictSize; i++) {
            dictionary.put(i, String.valueOf((char) i));
        }

        StringBuilder result = new StringBuilder();
        String w = null; // previously decoded entry; null until the first code is read
        for (int k : compressed) {
            if (w == null) {
                // The first code can only refer to a single-character entry
                if (k < 0 || k >= INITIAL_DICTIONARY_SIZE) {
                    throw new IllegalArgumentException("Bad compressed k: " + k);
                }
                w = dictionary.get(k);
                result.append(w);
                continue;
            }

            String entry;
            if (dictionary.containsKey(k)) {
                // The code is in the dictionary
                entry = dictionary.get(k);
            } else if (k == dictSize) {
                // Special case for sequences like "ababab": the code refers to the
                // entry that is about to be created in this very step
                entry = w + w.charAt(0);
            } else {
                throw new IllegalArgumentException("Bad compressed k: " + k);
            }

            result.append(entry);

            // Add new sequence to the dictionary
            dictionary.put(dictSize++, w + entry.charAt(0));

            w = entry;
        }
        return result.toString();
    }
}

View File

@@ -0,0 +1,154 @@
package com.thealgorithms.compression;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertThrows;
import static org.junit.jupiter.api.Assertions.assertTrue;
import java.math.BigDecimal;
import java.util.HashMap;
import java.util.Map;
import org.junit.jupiter.api.Test;
/**
 * Unit tests for {@link ArithmeticCoding}.
 */
class ArithmeticCodingTest {

    /**
     * Asserts that a compressed value is a valid fraction in [0, 1).
     */
    private static void assertInUnitInterval(BigDecimal value) {
        assertNotNull(value);
        assertTrue(value.compareTo(BigDecimal.ZERO) >= 0);
        assertTrue(value.compareTo(BigDecimal.ONE) < 0);
    }

    /**
     * Asserts that decompress(compress(x)) == x when the probability table is
     * derived from x itself, and that the compressed value lies in [0, 1).
     */
    private static void assertRoundTrip(String original) {
        Map<Character, ArithmeticCoding.Symbol> probTable = ArithmeticCoding.calculateProbabilities(original);
        BigDecimal compressed = ArithmeticCoding.compress(original);

        assertInUnitInterval(compressed);

        String decompressed = ArithmeticCoding.decompress(compressed, original.length(), probTable);
        assertEquals(original, decompressed);
    }

    @Test
    void testThrowsExceptionForNullOrEmptyInput() {
        // Test that null input throws IllegalArgumentException
        assertThrows(IllegalArgumentException.class, () -> ArithmeticCoding.compress(null));

        // Test that empty string throws IllegalArgumentException
        assertThrows(IllegalArgumentException.class, () -> ArithmeticCoding.compress(""));
    }

    @Test
    void testCompressionAndDecompressionSimple() {
        assertRoundTrip("BABA");
    }

    @Test
    void testSymmetryWithComplexString() {
        assertRoundTrip("THE_QUICK_BROWN_FOX_JUMPS_OVER_THE_LAZY_DOG");
    }

    @Test
    void testSymmetryWithRepetitions() {
        assertRoundTrip("MISSISSIPPI");
    }

    @Test
    void testSingleCharacterString() {
        // Even with a single unique character, compression should work
        assertRoundTrip("AAAAA");
    }

    @Test
    void testCompressionOutputDemo() {
        // "BABA" has the table A = [0, 0.5), B = [0.5, 1); narrowing the interval
        // symbol by symbol ends at [0.625, 0.6875), whose lower bound is returned.
        String original = "BABA";
        BigDecimal compressed = ArithmeticCoding.compress(original);

        assertInUnitInterval(compressed);
        // compareTo is used because the scale of the result is not part of the contract
        assertEquals(0, new BigDecimal("0.625").compareTo(compressed));
    }

    @Test
    void testProbabilityTableCalculation() {
        // Test that probability table is calculated correctly
        String text = "AABBC";
        Map<Character, ArithmeticCoding.Symbol> probTable = ArithmeticCoding.calculateProbabilities(text);

        // Verify all characters are in the table
        assertEquals(3, probTable.size());
        assertTrue(probTable.containsKey('A'));
        assertTrue(probTable.containsKey('B'));
        assertTrue(probTable.containsKey('C'));

        // Verify probability ranges are valid
        for (ArithmeticCoding.Symbol symbol : probTable.values()) {
            assertTrue(symbol.low().compareTo(BigDecimal.ZERO) >= 0);
            assertTrue(symbol.high().compareTo(BigDecimal.ONE) <= 0);
            assertTrue(symbol.low().compareTo(symbol.high()) < 0);
        }
    }

    @Test
    void testDecompressionWithMismatchedProbabilityTable() {
        // Test decompression with a probability table that doesn't match the original
        String original = "ABCD";
        BigDecimal compressed = ArithmeticCoding.compress(original);

        // Create a different probability table (for "XYZ" instead of "ABCD")
        Map<Character, ArithmeticCoding.Symbol> wrongProbTable = ArithmeticCoding.calculateProbabilities("XYZ");

        // Decompression with the wrong table still decodes one symbol per position,
        // but the symbols come from the wrong alphabet
        String decompressed = ArithmeticCoding.decompress(compressed, original.length(), wrongProbTable);

        assertNotNull(decompressed);
        assertEquals(original.length(), decompressed.length());
        assertTrue(!original.equals(decompressed));
    }

    @Test
    void testDecompressionWithValueOutsideSymbolRanges() {
        // Create a custom probability table
        Map<Character, ArithmeticCoding.Symbol> probTable = new HashMap<>();
        probTable.put('A', new ArithmeticCoding.Symbol(new BigDecimal("0.0"), new BigDecimal("0.5")));
        probTable.put('B', new ArithmeticCoding.Symbol(new BigDecimal("0.5"), new BigDecimal("1.0")));

        // 0.25 lies in A = [0, 0.5), then in B = [0.25, 0.5), then in A = [0.25, 0.375)
        BigDecimal compressed = new BigDecimal("0.25");

        String decompressed = ArithmeticCoding.decompress(compressed, 3, probTable);

        assertNotNull(decompressed);
        assertEquals(3, decompressed.length());
        assertEquals("ABA", decompressed);
    }
}

View File

@@ -0,0 +1,104 @@
package com.thealgorithms.compression;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertThrows;
import static org.junit.jupiter.api.Assertions.assertTrue;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import org.junit.jupiter.api.Test;
/**
 * Unit tests for {@link LZW}.
 */
class LZWTest {

    /**
     * Compresses the given text, decompresses the result and asserts that the
     * original text comes back.
     */
    private static void assertRoundTrip(String text) {
        List<Integer> codes = LZW.compress(text);
        String restored = LZW.decompress(codes);
        assertEquals(text, restored);
    }

    @Test
    void testNullAndEmptyInputs() {
        // compress: null and "" both yield an empty code list
        assertTrue(LZW.compress(null).isEmpty());
        assertTrue(LZW.compress("").isEmpty());

        // decompress: null and an empty list both yield an empty string
        assertEquals("", LZW.decompress(null));
        assertEquals("", LZW.decompress(Collections.emptyList()));
    }

    @Test
    void testCompressionAndDecompressionWithSimpleString() {
        // The classic textbook example with its known code sequence
        String text = "TOBEORNOTTOBEORTOBEORNOT";
        List<Integer> expectedCodes = List.of(84, 79, 66, 69, 79, 82, 78, 79, 84, 256, 258, 260, 265, 259, 261, 263);

        List<Integer> actualCodes = LZW.compress(text);

        // The exact codes are checked first, then the way back
        assertEquals(expectedCodes, actualCodes);
        assertEquals(text, LZW.decompress(actualCodes));
    }

    @Test
    void testCompressionWithRepeatedChars() {
        // Long runs of the same character
        assertRoundTrip("AAAAABBBBBAAAAA");
    }

    @Test
    void testCompressionWithUniqueChars() {
        // No repetitions at all
        assertRoundTrip("ABCDEFG");
    }

    @Test
    void testSymmetry() {
        // A longer sentence-like input
        assertRoundTrip("THE_QUICK_BROWN_FOX_JUMPS_OVER_THE_LAZY_DOG");

        // A short input with overlapping repeated patterns
        assertRoundTrip("ababcbababa");
    }

    @Test
    void testInvalidCompressedData() {
        // 65 ('A') is valid, 999 refers to a dictionary entry that cannot exist yet
        List<Integer> codes = new ArrayList<>(List.of(65, 999));

        IllegalArgumentException thrown = assertThrows(IllegalArgumentException.class, () -> LZW.decompress(codes));

        // The message names the offending code
        assertTrue(thrown.getMessage().contains("Bad compressed k: 999"));
    }

    @Test
    void testDecompressionWithGapInDictionary() {
        // 84 ('T') is valid, 500 is far beyond the next code the dictionary could hold
        List<Integer> codes = new ArrayList<>(List.of(84, 500));

        assertThrows(IllegalArgumentException.class, () -> LZW.decompress(codes));
    }
}