Mirror of https://github.com/TheAlgorithms/Java.git, synced 2025-12-19 07:00:35 +08:00
feat(compression): Add LZ77 and LZ78 algorithms (#6910)
* feat(compression): Add LZ77 and LZ78 algorithms
* Resolve Spotbugs warning in LZ78 by using Trie structure
* fix code style
src/main/java/com/thealgorithms/compression/LZ77.java (new file, 168 lines)
@@ -0,0 +1,168 @@
package com.thealgorithms.compression;

import java.util.ArrayList;
import java.util.List;

/**
 * An implementation of the Lempel-Ziv 77 (LZ77) compression algorithm.
 * <p>
 * LZ77 is a lossless data compression algorithm that works by finding repeated
 * occurrences of data in a sliding window. It replaces subsequent occurrences
 * with references (offset, length) to the first occurrence within the window.
 * </p>
 * <p>
 * This implementation uses a simple sliding window and lookahead buffer approach.
 * Output format is a sequence of tuples (offset, length, next_character).
 * </p>
 * <p>
 * Time Complexity: O(n*W) in this naive implementation, where n is the input length
 * and W is the window size, due to the search for the longest match. More advanced
 * data structures (like suffix trees) can improve this.
 * </p>
 * <p>
 * References:
 * <ul>
 * <li><a href="https://en.wikipedia.org/wiki/LZ77_and_LZ78#LZ77">Wikipedia: LZ77</a></li>
 * </ul>
 * </p>
 */
public final class LZ77 {

    private static final int DEFAULT_WINDOW_SIZE = 4096;
    private static final int DEFAULT_LOOKAHEAD_BUFFER_SIZE = 16;
    private static final char END_OF_STREAM = '\u0000';

    private LZ77() {
    }

    /**
     * Represents a token in the LZ77 compressed output.
     * Stores the offset back into the window, the length of the match,
     * and the next character after the match (or END_OF_STREAM if at end).
     */
    public record Token(int offset, int length, char nextChar) {
    }
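    // Worked example of the token format (hand-traced sketch, not verified by tests):
    // compress("aabaa") should produce
    //   (0, 0, 'a')   no match in the empty window, emit the literal 'a'
    //   (1, 1, 'b')   copy 1 char from 1 position back ("a"), then the literal 'b'
    //   (3, 2, '\0')  copy 2 chars from 3 positions back ("aa"), then end of stream
    // and decompress on that list should rebuild "aabaa".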
    /**
     * Compresses the input text using the LZ77 algorithm.
     *
     * @param text The input string to compress. Must not be null.
     * @param windowSize The size of the sliding window (search buffer). Must be positive.
     * @param lookaheadBufferSize The size of the lookahead buffer. Must be positive.
     * @return A list of {@link Token} objects representing the compressed data.
     * @throws IllegalArgumentException if windowSize or lookaheadBufferSize are not positive.
     */
    public static List<Token> compress(String text, int windowSize, int lookaheadBufferSize) {
        if (text == null) {
            return new ArrayList<>();
        }
        if (windowSize <= 0 || lookaheadBufferSize <= 0) {
            throw new IllegalArgumentException("Window size and lookahead buffer size must be positive.");
        }

        List<Token> compressedOutput = new ArrayList<>();
        int currentPosition = 0;

        while (currentPosition < text.length()) {
            int bestMatchDistance = 0;
            int bestMatchLength = 0;

            // Define the start of the search window
            int searchBufferStart = Math.max(0, currentPosition - windowSize);
            // Define the end of the lookahead buffer (don't go past text length)
            int lookaheadEnd = Math.min(currentPosition + lookaheadBufferSize, text.length());

            // Search for the longest match in the window
            for (int i = searchBufferStart; i < currentPosition; i++) {
                int currentMatchLength = 0;

                // Check how far the match extends into the lookahead buffer
                // This allows for overlapping matches (e.g., "aaa" can match with offset 1)
                while (currentPosition + currentMatchLength < lookaheadEnd) {
                    int sourceIndex = i + currentMatchLength;

                    // Handle overlapping matches (run-length encoding within LZ77)
                    // When we've matched beyond our starting position, wrap around using modulo
                    if (sourceIndex >= currentPosition) {
                        int offset = currentPosition - i;
                        sourceIndex = i + (currentMatchLength % offset);
                    }
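                    // Illustrative trace of the wrap-around: with text "abababab" and
                    // currentPosition 2, the candidate at i = 0 has offset 2; once the
                    // match grows past position 2, sourceIndex cycles 0, 1, 0, 1, ...
                    // so the two-character window can still match the whole lookahead.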
                    if (text.charAt(sourceIndex) == text.charAt(currentPosition + currentMatchLength)) {
                        currentMatchLength++;
                    } else {
                        break;
                    }
                }

                // If this match is longer than the best found so far
                if (currentMatchLength > bestMatchLength) {
                    bestMatchLength = currentMatchLength;
                    bestMatchDistance = currentPosition - i; // Calculate offset from current position
                }
            }

            char nextChar;
            if (currentPosition + bestMatchLength < text.length()) {
                nextChar = text.charAt(currentPosition + bestMatchLength);
            } else {
                nextChar = END_OF_STREAM;
            }

            // Add the token to the output
            compressedOutput.add(new Token(bestMatchDistance, bestMatchLength, nextChar));

            // Move the current position forward
            // If we're at the end and had a match, just move by the match length
            if (nextChar == END_OF_STREAM) {
                currentPosition += bestMatchLength;
            } else {
                currentPosition += bestMatchLength + 1;
            }
        }

        return compressedOutput;
    }

    /**
     * Compresses the input text using the LZ77 algorithm with default buffer sizes.
     *
     * @param text The input string to compress. Must not be null.
     * @return A list of {@link Token} objects representing the compressed data.
     */
    public static List<Token> compress(String text) {
        return compress(text, DEFAULT_WINDOW_SIZE, DEFAULT_LOOKAHEAD_BUFFER_SIZE);
    }
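    // Usage sketch (the sample string is arbitrary; a round trip through compress
    // and decompress is expected to return it unchanged):
    //   List<LZ77.Token> tokens = LZ77.compress("abracadabra abracadabra");
    //   String restored = LZ77.decompress(tokens);
    //   // restored.equals("abracadabra abracadabra") is expected to hold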
    /**
     * Decompresses a list of LZ77 tokens back into the original string.
     *
     * @param compressedData The list of {@link Token} objects. Must not be null.
     * @return The original, uncompressed string.
     */
    public static String decompress(List<Token> compressedData) {
        if (compressedData == null) {
            return "";
        }

        StringBuilder decompressedText = new StringBuilder();

        for (Token token : compressedData) {
            // Copy matched characters from the sliding window
            if (token.length > 0) {
                int startIndex = decompressedText.length() - token.offset;

                // Handle overlapping matches (e.g., when length > offset)
                for (int i = 0; i < token.length; i++) {
                    decompressedText.append(decompressedText.charAt(startIndex + i));
                }
            }
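            // Illustration (assumed trace): with the buffer holding "a" and a token
            // (offset = 1, length = 3), the char-by-char copy appends 'a' three times,
            // producing "aaaa"; each copied character is immediately available to the
            // next iteration, which is what makes overlapping matches work.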
            // Append the next character (if not END_OF_STREAM)
            if (token.nextChar != END_OF_STREAM) {
                decompressedText.append(token.nextChar);
            }
        }

        return decompressedText.toString();
    }
}
src/main/java/com/thealgorithms/compression/LZ78.java (new file, 136 lines)
@@ -0,0 +1,136 @@
package com.thealgorithms.compression;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * An implementation of the Lempel-Ziv 78 (LZ78) compression algorithm.
 * <p>
 * LZ78 is a dictionary-based lossless data compression algorithm. It processes
 * input data sequentially, building a dictionary of phrases encountered so far.
 * It outputs pairs (dictionary_index, next_character), representing
 * the longest match found in the dictionary plus the character that follows it.
 * </p>
 * <p>
 * This implementation builds the dictionary dynamically during compression.
 * The dictionary index 0 represents the empty string (no prefix).
 * </p>
 * <p>
 * Time Complexity: O(n) on average for compression and decompression, assuming
 * efficient dictionary lookups (using a HashMap), where n is the
 * length of the input string.
 * </p>
 * <p>
 * References:
 * <ul>
 * <li><a href="https://en.wikipedia.org/wiki/LZ77_and_LZ78#LZ78">Wikipedia: LZ78</a></li>
 * </ul>
 * </p>
 */
public final class LZ78 {

    /**
     * Special character used to mark end of stream when needed.
     */
    private static final char END_OF_STREAM = '\u0000';

    /**
     * Private constructor to prevent instantiation of this utility class.
     */
    private LZ78() {
    }

    /**
     * Represents a token in the LZ78 compressed output.
     * Stores the index of the matching prefix in the dictionary and the next character.
     * Index 0 represents the empty string (no prefix).
     */
    public record Token(int index, char nextChar) {
    }
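    // Worked example of the token format (hand-traced sketch, not verified by tests):
    // compress("ababab") should produce
    //   (0, 'a')   no prefix, literal 'a'   -> dictionary entry 1 = "a"
    //   (0, 'b')   no prefix, literal 'b'   -> dictionary entry 2 = "b"
    //   (1, 'b')   prefix "a" + 'b'         -> dictionary entry 3 = "ab"
    //   (3, '\0')  prefix "ab", end of stream (no new character)
    // and decompress on that list should rebuild "ababab".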
    /**
     * A node in the dictionary trie structure.
     * Each node represents a phrase and can have child nodes for extended phrases.
     */
    private static final class TrieNode {
        Map<Character, TrieNode> children = new HashMap<>();
        int index = -1; // -1 means not assigned yet
    }
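    // Shape of the trie after compressing "ababab" (illustrative, matching the
    // hand-traced example above): root -> 'a' (index 1) -> 'b' (index 3), and
    // root -> 'b' (index 2). Each root-to-node path spells one dictionary phrase.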
    /**
     * Compresses the input text using the LZ78 algorithm.
     *
     * @param text The input string to compress. Must not be null.
     * @return A list of {@link Token} objects representing the compressed data.
     */
    public static List<Token> compress(String text) {
        if (text == null || text.isEmpty()) {
            return new ArrayList<>();
        }

        List<Token> compressedOutput = new ArrayList<>();
        TrieNode root = new TrieNode();
        int nextDictionaryIndex = 1;

        TrieNode currentNode = root;
        int lastMatchedIndex = 0;

        for (int i = 0; i < text.length(); i++) {
            char currentChar = text.charAt(i);

            if (currentNode.children.containsKey(currentChar)) {
                currentNode = currentNode.children.get(currentChar);
                lastMatchedIndex = currentNode.index;
            } else {
                // Output: (index of longest matching prefix, current character)
                compressedOutput.add(new Token(lastMatchedIndex, currentChar));

                TrieNode newNode = new TrieNode();
                newNode.index = nextDictionaryIndex++;
                currentNode.children.put(currentChar, newNode);

                currentNode = root;
                lastMatchedIndex = 0;
            }
        }

        // Handle remaining phrase at end of input
        if (currentNode != root) {
            compressedOutput.add(new Token(lastMatchedIndex, END_OF_STREAM));
        }

        return compressedOutput;
    }

    /**
     * Decompresses a list of LZ78 tokens back into the original string.
     *
     * @param compressedData The list of {@link Token} objects. Must not be null.
     * @return The original, uncompressed string.
     */
    public static String decompress(List<Token> compressedData) {
        if (compressedData == null || compressedData.isEmpty()) {
            return "";
        }

        StringBuilder decompressedText = new StringBuilder();
        Map<Integer, String> dictionary = new HashMap<>();
        int nextDictionaryIndex = 1;

        for (Token token : compressedData) {
            String prefix = (token.index == 0) ? "" : dictionary.get(token.index);

            if (token.nextChar == END_OF_STREAM) {
                decompressedText.append(prefix);
            } else {
                String currentPhrase = prefix + token.nextChar;
                decompressedText.append(currentPhrase);
                dictionary.put(nextDictionaryIndex++, currentPhrase);
            }
        }

        return decompressedText.toString();
    }
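    // Usage sketch (round trip; the sample string is arbitrary):
    //   List<LZ78.Token> tokens = LZ78.compress("to be or not to be");
    //   String restored = LZ78.decompress(tokens);
    //   // restored.equals("to be or not to be") is expected to hold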
}