Skip to content

Commit

Permalink
Add default gap penalties for stretcher
Browse files Browse the repository at this point in the history
  • Loading branch information
aziele committed Feb 27, 2025
1 parent de764b1 commit 79b8654
Show file tree
Hide file tree
Showing 3 changed files with 32 additions and 15 deletions.
6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# pairwise-sequence-alignment (psa)

![PyPI - Version](https://img.shields.io/pypi/v/pairwise-sequence-alignment)
![PyPI - Version](https://img.shields.io/pypi/v/pairwise-sequence-alignment?label=version)

This is a Python module to calculate a pairwise alignment between biological sequences (protein or nucleic acid). This module uses the [needle](https://www.ebi.ac.uk/Tools/psa/emboss_needle/), [stretcher](https://www.ebi.ac.uk/jdispatcher/psa/emboss_stretcher) and [water](https://www.ebi.ac.uk/Tools/psa/emboss_water/) tools from the EMBOSS package to calculate an optimal, global/local pairwise alignment.

Expand Down Expand Up @@ -33,8 +33,8 @@ I wrote this module for two reasons. First, the needle and water tools are faste
Pairwise sequence alignment identifies regions of similarity between two sequences, which can indicate functional, structural, or evolutionary relationships.

1. Global alignment aligns two sequences from start to finish, ideal for sequences that are similar and of comparable length.
* *needle* [Needleman-Wunsch algorithm](https://en.wikipedia.org/wiki/Needleman–Wunsch_algorithm)) calculates a full-length global alignment by maximizing similarity across both sequences through a dynamic programming approach.
* *stretcher* [documentation](https://galaxy-iuc.github.io/emboss-5.0-docs/stretcher.html) performs global alignment using a modified dynamic programming algorithm optimized for linear space efficiency.
* *needle* ([Needleman-Wunsch algorithm](https://en.wikipedia.org/wiki/Needleman–Wunsch_algorithm)) calculates a full-length global alignment by maximizing similarity across both sequences through a dynamic programming approach.
* *stretcher* ([documentation](https://galaxy-iuc.github.io/emboss-5.0-docs/stretcher.html) performs global alignment using a modified dynamic programming algorithm optimized for linear space efficiency.

2. Local alignment (*water*; [Smith-Waterman algorithm](https://en.wikipedia.org/wiki/Smith–Waterman_algorithm)) finds the region with the highest level of similarity between the two sequences. It is suitable for sequences that are not assumed to be similar over the entire length.

Expand Down
39 changes: 28 additions & 11 deletions psa.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
an optimal global and local alignment between a pair of sequences (query and
subject).
Copyright 2022 Andrzej Zielezinski ([email protected])
Copyright 2022-2025 Andrzej Zielezinski ([email protected])
https://github.com/aziele/pairwise-sequence-alignment
"""
Expand All @@ -16,14 +16,14 @@
import shutil
import random

__version__ = '1.0.1'
__version__ = '1.0.2'

# Check whether needle is on PATH and marked as executable.
assert shutil.which('needle'), "needle not found (is emboss installed?)"


class PairwiseAlignment():
"""Object representing a pairwise ailgnment.
"""Object representing a pairwise alignment.
Attributes:
qid Query sequence identifier
Expand Down Expand Up @@ -125,7 +125,7 @@ def query_coverage(self) -> float:
return (self.qend - self.qstart + 1) / self.qlen * 100

def subject_coverage(self) -> float:
"""Query coverage"""
"""Subject coverage"""
return (self.send - self.sstart + 1) / self.slen * 100

def fasta(self, wrap=70) -> str:
Expand Down Expand Up @@ -194,14 +194,14 @@ def __getitem__(self, index):
)

def align(
program: Literal['needle', 'water'],
program: Literal['needle', 'water', 'stretcher'],
moltype: Literal['prot', 'nucl'],
qseq: str,
sseq: str,
qid: str = 'query',
sid: str = 'subject',
gapopen: int = 10,
gapextend: int = 0.5,
gapopen: Optional[int] = None,
gapextend: Optional[float] = None,
matrix: Optional[str] = None
) -> PairwiseAlignment:
"""Aligns two sequences, parses the output, and returns an alignment object.
Expand All @@ -213,15 +213,30 @@ def align(
sseq Subject sequence
qid Query sequence identifier
sid Subject sequence identifier
gapopen Gap open penalty
gapextend Gap extension penalty
gapopen Gap open penalty (if None, defaults are used)
gapextend Gap extension penalty (if None, defaults are used)
matrix Name of scoring matrix
Returns:
A list of lines from EMBOSS output
A PairwiseAlignment object.
"""

# Define default penalties
default_penalties = {
'needle': {'prot': (10, 0.5), 'nucl': (10, 0.5)},
'water': {'prot': (10, 0.5), 'nucl': (10, 0.5)},
'stretcher': {'prot': (12, 2), 'nucl': (16, 4)}
}

# Assign default values if not provided
if gapopen is None:
gapopen = default_penalties[program][moltype][0]
if gapextend is None:
gapextend = default_penalties[program][moltype][1]

if not matrix:
matrix = 'EBLOSUM62' if moltype == 'prot' else 'EDNAFULL'

handle = emboss_run(
program=program,
moltype=moltype,
Expand Down Expand Up @@ -258,6 +273,7 @@ def align(
raw=aln.output
)


def emboss_run(
program: Literal['needle', 'water', 'stretcher'],
moltype: Literal['prot', 'nucl'],
Expand Down Expand Up @@ -299,6 +315,7 @@ def emboss_run(
f"-datafile {matrix}",
f"-gapopen {gapopen}",
f"-gapextend {gapextend}",
"-aformat pair",
]
process = subprocess.run(
" ".join(cmd),
Expand All @@ -311,7 +328,7 @@ def emboss_run(
return process.stdout.splitlines()


def emboss_parse(handle: Iterable[str]) -> collections.namedtuple:
def emboss_parse(handle: Iterable[str]) -> namedtuple:
"""Parses EMBOSS output.
Args:
Expand Down
2 changes: 1 addition & 1 deletion test.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ def test_water_dna(self):
self.assertEqual(len(aln.qaln), len(aln.saln))
self.assertEqual(len(aln.qaln), aln.length)

def test_water_stretcher(self):
def test_stretcher_dna(self):
aln = psa.stretcher(moltype='nucl', qseq='ATGCTAGATA', sseq='ATGCTAGTTA')
self.assertEqual(len(aln.qaln), len(aln.saln))
self.assertEqual(len(aln.qaln), aln.length)
Expand Down

0 comments on commit 79b8654

Please sign in to comment.