Skip to content

Commit 41aa1d9

Browse files
committed
Merge branch 'feature/string-compression' into develop
2 parents 6320ab7 + d2607b2 commit 41aa1d9

File tree

3 files changed

+57
-0
lines changed

3 files changed

+57
-0
lines changed

README.md

+1
Original file line numberDiff line numberDiff line change
@@ -1249,6 +1249,7 @@ var str = collection.stream()
12491249
12501250
**Examples**
12511251
1252+
- Bit compression, CCSP#1.2: [python](python-algorithm/algorithm/string) | Compress a gene string (nucleotides A, C, G, T) into an integer using bit manipulation, two bits per nucleotide.
12521253
- Convert string, EPI#6.1: [c++](cpp-algorithm/src/string)(`IntToString`, `StringToInt`) | Convert integer to string and vice versa.
12531254
- IP address validation, EPI#6.9: [c++](cpp-algorithm/src/string) | Validate IPv4 address that is in the form of _x.x.x.x_ where _x_ is a number between 0 and 255.
12541255
- Look and say problem, EPI#6.7: [c++](cpp-algorithm/src/string)
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
class CompressedGene:
    """A nucleotide sequence packed into a single ``int``, two bits per base.

    Each nucleotide is stored as a two-bit code (A=00, C=01, G=10, T=11).
    The packed value starts with a sentinel ``1`` bit above the data so that
    leading ``A`` (00) pairs still contribute to the integer's bit length and
    survive the round trip.

    The packed value is exposed as the ``bit_string`` attribute.
    """

    # Two-bit code assigned to each accepted nucleotide.
    _NUCLEOTIDE_TO_BITS = {'A': 0b00, 'C': 0b01, 'G': 0b10, 'T': 0b11}
    # Inverse mapping: a two-bit code is the index of its nucleotide.
    _BITS_TO_NUCLEOTIDE = 'ACGT'

    def __init__(self, gene: str) -> None:
        """Compress *gene* (case-insensitive string of A/C/G/T).

        Raises:
            ValueError: if *gene* contains any other character.
        """
        self._compress(gene)

    def __str__(self) -> str:
        """Return the decompressed sequence, for pretty printing."""
        return self.decompress()

    def _compress(self, gene: str) -> None:
        """Pack *gene* into ``self.bit_string``.

        The input is upper-cased first, so lower-case nucleotides are
        accepted. ``self.bit_string`` is assigned only after every character
        has been validated.

        Raises:
            ValueError: if a character other than A, C, G or T is found.
        """
        codes = self._NUCLEOTIDE_TO_BITS
        packed = 1  # start with sentinel
        for nucleotide in gene.upper():
            code = codes.get(nucleotide)
            if code is None:
                raise ValueError('Invalid Nucleotide:{}'.format(nucleotide))
            # Shift left two bits, then write the code into the freed bits.
            packed = (packed << 2) | code
        self.bit_string: int = packed

    def decompress(self) -> str:
        """Return the original (upper-cased) nucleotide string."""
        lookup = self._BITS_TO_NUCLEOTIDE
        packed = self.bit_string
        data_bits = packed.bit_length() - 1  # - 1 to exclude sentinel
        # The first nucleotide sits in the highest pair of bits, so walk the
        # shifts from high to low to emit the characters in original order.
        return ''.join(
            lookup[(packed >> shift) & 0b11]  # get just 2 relevant bits
            for shift in reversed(range(0, data_bits, 2))
        )
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
from sys import getsizeof
2+
3+
import pytest
4+
5+
import algorithm.utils.logging as log_utils
6+
from algorithm.string.bit_compression import CompressedGene
7+
8+
9+
@pytest.mark.benchmark(group='gene_string_compression')
def test_bit_compression(benchmark):
    """Benchmark ``CompressedGene`` construction and check the round trip.

    Logs the ``getsizeof`` of the source string and of the packed integer,
    then asserts that decompressing yields the source string again.
    """
    sequence: str = (
        'TAGGGATTAACCGTTATATATATATAGCCATGGATCGATTATATAGGGATTAACCGTTATATATATATAGCCATGGATCGATTATA'
        * 100
    )
    console = log_utils.get_console_logger(__name__, 'DEBUG')

    # The benchmark fixture is called with the constructor and its argument;
    # its return value is used here as the constructed CompressedGene.
    packed: CompressedGene = benchmark(CompressedGene, sequence)

    print('')  # presumably separates the log lines from pytest's own output
    console.debug(f'original is {getsizeof(sequence)} bytes')
    console.debug(f'compressed is {getsizeof(packed.bit_string)} bytes')

    assert packed.decompress() == sequence

0 commit comments

Comments
 (0)