Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,220 @@
package com.thealgorithms.compression;

import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
import java.util.Objects;

/**
* Implementation of the Burrows-Wheeler Transform (BWT) and its inverse.
* <p>
* BWT is a reversible data transformation algorithm that rearranges a string into runs of
* similar characters. While not a compression algorithm itself, it significantly improves
* the compressibility of data for subsequent algorithms like Move-to-Front encoding and
* Run-Length Encoding.
* </p>
*
* <p>The transform works by:
* <ol>
* <li>Generating all rotations of the input string</li>
* <li>Sorting these rotations lexicographically</li>
* <li>Taking the last column of the sorted matrix as output</li>
* <li>Recording the index of the original string in the sorted matrix</li>
* </ol>
* </p>
*
* <p><b>Important:</b> The input string should end with a unique end-of-string marker
* (typically '$') that:
* <ul>
* <li>Does not appear anywhere else in the text</li>
* <li>Is lexicographically smaller than all other characters</li>
* <li>Ensures unique rotations and enables correct inverse transformation</li>
* </ul>
* Without this marker, the inverse transform may not correctly reconstruct the original string.
* </p>
*
* <p><b>Time Complexity:</b>
* <ul>
* <li>Forward transform: O(n² log n) where n is the string length</li>
* <li>Inverse transform: O(n) using the LF-mapping technique</li>
* </ul>
* </p>
*
* <p><b>Example:</b></p>
* <pre>
* Input: "banana$"
* Output: BWTResult("annb$aa", 4)
* - "annb$aa" is the transformed string (groups similar characters)
* - 4 is the index of the original string in the sorted rotations
* </pre>
*
* @see <a href="https://en.wikipedia.org/wiki/Burrows%E2%80%93Wheeler_transform">Burrows–Wheeler transform (Wikipedia)</a>
*/
public final class BurrowsWheelerTransform {

    private BurrowsWheelerTransform() {
    }

    /**
     * A container for the result of the forward BWT.
     * <p>
     * Holds the transformed string and the index of the original string in the
     * sorted rotations matrix; both values are needed by
     * {@link #inverseTransform(String, int)}.
     * </p>
     */
    public static class BWTResult {
        /** The transformed string (last column of the sorted rotation matrix). */
        public final String transformed;

        /** The index of the original string in the sorted rotations matrix. */
        public final int originalIndex;

        /**
         * Constructs a BWTResult with the transformed string and original index.
         * No validation is performed on either argument.
         *
         * @param transformed the transformed string (L-column)
         * @param originalIndex the index of the original string in sorted rotations
         */
        public BWTResult(String transformed, int originalIndex) {
            this.transformed = transformed;
            this.originalIndex = originalIndex;
        }

        /**
         * Two results are equal when they are of the same class and carry the same
         * index and the same transformed string.
         */
        @Override
        public boolean equals(Object obj) {
            if (this == obj) {
                return true;
            }
            if (obj == null || getClass() != obj.getClass()) {
                return false;
            }
            BWTResult other = (BWTResult) obj;
            // Objects.equals: the constructor does not reject null, so equals must not dereference it.
            return originalIndex == other.originalIndex && Objects.equals(transformed, other.transformed);
        }

        @Override
        public int hashCode() {
            // Same value as before for non-null strings; a null string contributes 0 instead of throwing.
            return 31 * Objects.hashCode(transformed) + originalIndex;
        }

        @Override
        public String toString() {
            return "BWTResult[transformed=" + transformed + ", originalIndex=" + originalIndex + "]";
        }
    }

    /**
     * Performs the forward Burrows-Wheeler Transform on the input string.
     * <p>
     * All cyclic rotations of the input are generated and sorted lexicographically
     * (by {@link String#compareTo(String)}); the last character of every sorted rotation
     * forms the output, which is returned together with the row at which the original
     * string was found.
     * </p>
     *
     * <p><b>Note:</b> It is strongly recommended that the input string ends with
     * a unique end-of-string marker (e.g., '$') that is lexicographically smaller
     * than any other character in the string. Without it several rotations can be
     * identical, in which case the returned index is the row of one of the equal rotations.</p>
     *
     * <p>This implementation materialises every rotation, so it holds n strings of
     * length n in memory at once.</p>
     *
     * @param text the input string to transform; {@code null} is treated like the empty string
     * @return a {@link BWTResult} object containing the transformed string (L-column)
     *         and the index of the original string in the sorted rotations matrix;
     *         returns {@code BWTResult("", -1)} for {@code null} or empty input
     */
    public static BWTResult transform(String text) {
        if (text == null || text.isEmpty()) {
            return new BWTResult("", -1);
        }

        final int n = text.length();

        // Every rotation of text is a length-n window of text + text.
        final String doubled = text + text;
        String[] rotations = new String[n];
        for (int start = 0; start < n; start++) {
            rotations[start] = doubled.substring(start, start + n);
        }

        Arrays.sort(rotations);

        // The original text is rotation 0, so it is always present in the sorted array.
        int originalIndex = Arrays.binarySearch(rotations, text);

        StringBuilder lastColumn = new StringBuilder(n);
        for (String rotation : rotations) {
            lastColumn.append(rotation.charAt(n - 1));
        }

        return new BWTResult(lastColumn.toString(), originalIndex);
    }

    /**
     * Performs the inverse Burrows-Wheeler Transform using the LF-mapping technique.
     * <p>
     * The original string is rebuilt without reconstructing the sorted rotations matrix:
     * </p>
     * <ol>
     * <li>The first column is obtained by sorting the characters of the BWT string.</li>
     * <li>For every first-column row, the row of the matching character occurrence in the
     * last column is recorded (the k-th occurrence of a character in the first column
     * corresponds to its k-th occurrence in the last column).</li>
     * <li>Starting at {@code originalIndex}, this mapping is followed n times, emitting the
     * first-column character of each visited row.</li>
     * </ol>
     *
     * @param bwtString the transformed string (L-column) from the forward transform;
     *                  {@code null} is treated like the empty string
     * @param originalIndex the index of the original string row from the forward transform;
     *                      use -1 for empty strings
     * @return the original, untransformed string; returns the empty string if {@code bwtString}
     *         is {@code null} or empty, or if {@code originalIndex} is -1
     * @throws IllegalArgumentException if {@code originalIndex} is out of valid range (except -1)
     */
    public static String inverseTransform(String bwtString, int originalIndex) {
        if (bwtString == null || bwtString.isEmpty() || originalIndex == -1) {
            return "";
        }

        final int n = bwtString.length();
        if (originalIndex < 0 || originalIndex >= n) {
            throw new IllegalArgumentException("Original index must be between 0 and " + (n - 1) + ", got: " + originalIndex);
        }

        char[] lastColumn = bwtString.toCharArray();
        char[] firstColumn = bwtString.toCharArray();
        Arrays.sort(firstColumn);

        // Row at which each distinct character first appears in the (sorted) first column.
        Map<Character, Integer> firstOccurrence = new HashMap<>();
        for (int row = 0; row < n; row++) {
            firstOccurrence.putIfAbsent(firstColumn[row], row);
        }

        // next[i] is the last-column row holding the same character occurrence as firstColumn[i].
        int[] next = new int[n];

        // Number of occurrences of each character seen so far while scanning the last column.
        Map<Character, Integer> seen = new HashMap<>();
        for (int row = 0; row < n; row++) {
            char c = lastColumn[row];
            // merge returns the updated count; minus one gives the zero-based rank of this occurrence.
            int rank = seen.merge(c, 1, Integer::sum) - 1;
            next[firstOccurrence.get(c) + rank] = row;
        }

        // Walk the mapping from the original row, collecting one first-column character per step.
        StringBuilder originalString = new StringBuilder(n);
        int currentRow = originalIndex;
        for (int step = 0; step < n; step++) {
            originalString.append(firstColumn[currentRow]);
            currentRow = next[currentRow];
        }

        return originalString.toString();
    }
}
164 changes: 164 additions & 0 deletions src/main/java/com/thealgorithms/compression/MoveToFront.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
package com.thealgorithms.compression;

import java.util.ArrayList;
import java.util.Collection;
import java.util.LinkedList;
import java.util.List;
import java.util.stream.Collectors;

/**
* Implementation of the Move-to-Front (MTF) transform and its inverse.
* <p>
* MTF is a data transformation algorithm that encodes each symbol in the input
* as its current position in a dynamically-maintained list, then moves that symbol
* to the front of the list. This transformation is particularly effective when used
* after the Burrows-Wheeler Transform (BWT), as BWT groups similar characters together.
* </p>
*
* <p>The transform converts runs of repeated characters into sequences of small integers
* (often zeros), which are highly compressible by subsequent entropy encoding algorithms
* like Run-Length Encoding (RLE) or Huffman coding. This technique is used in the
* bzip2 compression algorithm.
* </p>
*
* <p><b>How it works:</b>
* <ol>
* <li>Maintain a list of symbols (the alphabet), initially in a fixed order</li>
* <li>For each input symbol:
* <ul>
* <li>Output its current index in the list</li>
* <li>Move that symbol to the front of the list</li>
* </ul>
* </li>
* </ol>
* This means frequently occurring symbols quickly move to the front and are encoded
* with small indices (often 0), while rare symbols remain near the back.
* </p>
*
* <p><b>Time Complexity:</b>
* <ul>
* <li>Forward transform: O(n × m) where n is input length and m is alphabet size</li>
* <li>Inverse transform: O(n × m)</li>
* </ul>
* Note: Using {@link LinkedList} for O(1) insertions and O(m) search operations.
* </p>
*
* <p><b>Example:</b></p>
* <pre>
* Input: "annb$aa"
* Alphabet: "$abn" (initial order)
* Output: [1, 3, 0, 3, 3, 3, 0]
*
* Step-by-step:
* - 'a': index 1 in [$,a,b,n] → output 1, list becomes [a,$,b,n]
* - 'n': index 3 in [a,$,b,n] → output 3, list becomes [n,a,$,b]
* - 'n': index 0 in [n,a,$,b] → output 0, list stays [n,a,$,b]
* - 'b': index 3 in [n,a,$,b] → output 3, list becomes [b,n,a,$]
* - etc.
*
* Notice how repeated 'n' characters produce zeros after the first occurrence!
* </pre>
*
* @see <a href="https://en.wikipedia.org/wiki/Move-to-front_transform">Move-to-front transform (Wikipedia)</a>
*/
public final class MoveToFront {

    private MoveToFront() {
    }

    /**
     * Performs the forward Move-to-Front transform.
     * <p>
     * Converts the input string into a list of integers, where each integer is the
     * position of the corresponding character in the alphabet list at the moment it is
     * read; the character is then moved to the front of that list.
     * </p>
     *
     * <p><b>Note:</b> All characters in the input text must exist in the provided alphabet,
     * otherwise an {@link IllegalArgumentException} is thrown. If the alphabet contains a
     * character more than once, its first position is the one that is reported.</p>
     *
     * @param text the input string to transform; if {@code null} or empty, an empty list is returned
     * @param initialAlphabet a string containing the initial ordered set of symbols
     *                        (e.g., "$abn" or the full ASCII set); must not be {@code null} or empty
     *                        when {@code text} is non-empty
     * @return a new mutable list of integers representing the transformed data, where each integer
     *         is the index of the corresponding input character in the current alphabet state
     * @throws IllegalArgumentException if {@code text} is non-empty and {@code initialAlphabet}
     *                                  is {@code null} or empty
     * @throws IllegalArgumentException if any character in {@code text} is not found in
     *                                  {@code initialAlphabet}
     */
    public static List<Integer> transform(String text, String initialAlphabet) {
        if (text == null || text.isEmpty()) {
            return new ArrayList<>();
        }
        if (initialAlphabet == null || initialAlphabet.isEmpty()) {
            throw new IllegalArgumentException("Alphabet cannot be null or empty when text is not empty.");
        }

        List<Integer> output = new ArrayList<>(text.length());
        LinkedList<Character> alphabet = toSymbolList(initialAlphabet);

        for (char c : text.toCharArray()) {
            int index = alphabet.indexOf(c);
            if (index == -1) {
                throw new IllegalArgumentException("Symbol '" + c + "' not found in the initial alphabet.");
            }

            output.add(index);
            moveToFront(alphabet, index);
        }
        return output;
    }

    /**
     * Performs the inverse Move-to-Front transform.
     * <p>
     * Reconstructs the original string from the list of indices produced by the
     * forward transform by replaying the same list updates: each index selects a
     * symbol from the current alphabet state, and that symbol is then moved to the front.
     * </p>
     *
     * <p><b>Important:</b> The {@code initialAlphabet} parameter must be identical
     * to the one used in the forward transform, including character order, or the
     * output will be incorrect.</p>
     *
     * @param indices the integers from the forward transform, in iteration order;
     *                if {@code null} or empty, returns an empty string
     * @param initialAlphabet the exact same initial alphabet string used for the forward transform;
     *                        if {@code null} or empty, returns an empty string
     * @return the original, untransformed string
     * @throws IllegalArgumentException if any index in {@code indices} is negative or
     *                                  not smaller than the alphabet size
     */
    public static String inverseTransform(Collection<Integer> indices, String initialAlphabet) {
        if (indices == null || indices.isEmpty() || initialAlphabet == null || initialAlphabet.isEmpty()) {
            return "";
        }

        StringBuilder output = new StringBuilder(indices.size());
        LinkedList<Character> alphabet = toSymbolList(initialAlphabet);

        for (int index : indices) {
            if (index < 0 || index >= alphabet.size()) {
                throw new IllegalArgumentException("Index " + index + " is out of bounds for the current alphabet of size " + alphabet.size() + ".");
            }

            // Mirrors the forward transform: emit the symbol, then promote it.
            output.append(moveToFront(alphabet, index));
        }
        return output.toString();
    }

    /** Copies the UTF-16 code units of {@code initialAlphabet}, in order, into a fresh linked list. */
    private static LinkedList<Character> toSymbolList(String initialAlphabet) {
        // Typed as LinkedList (not List) so that addFirst/getFirst resolve on every Java version.
        return initialAlphabet.chars().mapToObj(c -> (char) c).collect(Collectors.toCollection(LinkedList::new));
    }

    /** Moves the symbol at {@code index} (which must be in range) to the head of the list and returns it. */
    private static char moveToFront(LinkedList<Character> alphabet, int index) {
        if (index == 0) {
            // Already at the front; removing and re-inserting would leave the list unchanged.
            return alphabet.getFirst();
        }
        char symbol = alphabet.remove(index);
        alphabet.addFirst(symbol);
        return symbol;
    }
}
Loading