Skip to content

Commit

Permalink
Merge branch 'master' of github.com:mutalyzer/algebra
Browse files Browse the repository at this point in the history
  • Loading branch information
marksantcroos committed Sep 2, 2024
2 parents 3343d01 + 0799bb8 commit c715a6e
Show file tree
Hide file tree
Showing 4 changed files with 51 additions and 45 deletions.
23 changes: 23 additions & 0 deletions algebra/lcs/lcs_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,29 @@ def traversal(source, path):

return traversal(self._source, [])

def uniq_atomics(self):
    """The set of unique atomics for the whole LCS graph.

    Every item yielded by ``self.bfs_traversal()`` is unpacked so that its
    last element is ``variant``; only ``variant[0]`` is inspected.  For that
    entry the following ``(position, symbol)`` tuples are collected:

    * ``(idx, "")`` for every ``idx`` in ``range(start, end)``;
    * ``(idx, ch)`` for every such ``idx`` and every distinct symbol ``ch``
      of ``sequence``;
    * ``(end, ch)`` for every distinct symbol ``ch`` of ``sequence``.

    Returns:
        A ``set`` of ``(int, str)`` tuples.
    """
    # NOTE(review): loop nesting was reconstructed from a listing without
    # indentation (the first symbol loop needs ``idx``, the second only
    # needs ``end``) -- confirm against the repository.
    atomics = set()
    for *_, variant in self.bfs_traversal():
        edit = variant[0]
        # The distinct symbols do not depend on the position, so build the
        # set once per variant instead of once per position.
        symbols = set(edit.sequence)
        for idx in range(edit.start, edit.end):
            atomics.add((idx, ""))
            atomics.update((idx, ch) for ch in symbols)
        atomics.update((edit.end, ch) for ch in symbols)
    return atomics

def overlap(self, other):
    """The set of common unique atomics and the set of all unique
    atomics for two LCS graphs.

    Args:
        other: the graph to compare with; it must provide
            ``uniq_atomics()``.

    Returns:
        A 2-tuple ``(common, universe)``: the intersection and the union
        of ``self.uniq_atomics()`` and ``other.uniq_atomics()``.
    """
    lhs = self.uniq_atomics()
    rhs = other.uniq_atomics()
    return lhs.intersection(rhs), lhs.union(rhs)

def is_disjoint(self, other):
    """Whether two LCS graphs are disjoint.

    Returns:
        ``True`` when ``self.uniq_atomics()`` and ``other.uniq_atomics()``
        have no element in common, ``False`` otherwise.
    """
    return self.uniq_atomics().isdisjoint(other.uniq_atomics())


def trim(lhs, rhs):
"""Find the lengths of the common prefix and common suffix between
Expand Down
8 changes: 3 additions & 5 deletions algebra/relations/graph_based.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
"""Functions to compare LCS graphs."""


from itertools import product
from ..lcs import edit_distance
from .relation import Relation

Expand Down Expand Up @@ -74,8 +73,7 @@ def compare(reference, lhs, rhs):
if rhs.distance - lhs.distance == distance:
return Relation.IS_CONTAINED

for lhs_variant, rhs_variant in product(lhs.edges(), rhs.edges()):
if not lhs_variant.is_disjoint(rhs_variant):
return Relation.OVERLAP
if lhs.is_disjoint(rhs):
return Relation.DISJOINT

return Relation.DISJOINT
return Relation.OVERLAP
29 changes: 5 additions & 24 deletions algebra/relations/sequence_based.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
"""Functions to compare variants as sequences."""


from itertools import product
from .relation import Relation
from ..lcs import LCSgraph, edit_distance

Expand Down Expand Up @@ -46,14 +45,7 @@ def are_disjoint(reference, lhs, rhs):
if distance == abs(lhs_distance - rhs_distance):
return False

lhs_graph = LCSgraph.from_sequence(reference, lhs)
rhs_graph = LCSgraph.from_sequence(reference, rhs)

for lhs_variant, rhs_variant in product(lhs_graph.edges(), rhs_graph.edges()):
if not lhs_variant.is_disjoint(rhs_variant):
return False

return True
return LCSgraph.from_sequence(reference, lhs).is_disjoint(LCSgraph.from_sequence(reference, rhs))


def have_overlap(reference, lhs, rhs):
Expand All @@ -68,14 +60,7 @@ def have_overlap(reference, lhs, rhs):
if distance in (lhs_distance + rhs_distance, abs(lhs_distance - rhs_distance)):
return False

lhs_graph = LCSgraph.from_sequence(reference, lhs)
rhs_graph = LCSgraph.from_sequence(reference, rhs)

for lhs_variant, rhs_variant in product(lhs_graph.edges(), rhs_graph.edges()):
if not lhs_variant.is_disjoint(rhs_variant):
return True

return False
return not LCSgraph.from_sequence(reference, lhs).is_disjoint(LCSgraph.from_sequence(reference, rhs))


def compare(reference, lhs, rhs):
Expand Down Expand Up @@ -112,11 +97,7 @@ def compare(reference, lhs, rhs):
if rhs_distance - lhs_distance == distance:
return Relation.IS_CONTAINED

lhs_graph = LCSgraph.from_sequence(reference, lhs)
rhs_graph = LCSgraph.from_sequence(reference, rhs)

for lhs_variant, rhs_variant in product(lhs_graph.edges(), rhs_graph.edges()):
if not lhs_variant.is_disjoint(rhs_variant):
return Relation.OVERLAP
if LCSgraph.from_sequence(reference, lhs).is_disjoint(LCSgraph.from_sequence(reference, rhs)):
return Relation.DISJOINT

return Relation.DISJOINT
return Relation.OVERLAP
36 changes: 20 additions & 16 deletions scripts/benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import pstats
import sys
from itertools import combinations
from algebra.lcs import LCSgraph
from algebra import LCSgraph, Relation
from algebra.utils import fasta_sequence
from algebra.variants import parse_hgvs, to_hgvs
from algebra.relations.graph_based import compare
Expand Down Expand Up @@ -36,10 +36,20 @@ def graphs(reference, variants):

@benchmark
def pairwise(reference, variants):
# Compare every unordered pair of entries in `variants` and collect one
# result dict per pair.
# NOTE(review): this listing is a flattened diff; judging by the hunk header
# (-36,10 +36,20) the list comprehension directly below looks like the
# removed implementation and the loop after it like its replacement --
# confirm against the repository.
return [{"lhs": lhs["label"],
"rhs": rhs["label"],
"relation": compare(reference, lhs["graph"], rhs["graph"])
} for lhs, rhs in combinations(variants, 2)]
# Loop form: same comparison per pair, with an extra "overlap" field.
results = []
for lhs, rhs in combinations(variants, 2):
relation = compare(reference, lhs["graph"], rhs["graph"])
# "overlap" stays None unless the relation is OVERLAP.
overlap = None
if relation == Relation.OVERLAP:
# Sizes of the two sets returned by the graph's overlap() call.
common, universe = lhs["graph"].overlap(rhs["graph"])
overlap = len(common), len(universe)
results.append({
"lhs": lhs["label"],
"rhs": rhs["label"],
"relation": relation,
"overlap": overlap,
})
return results


@benchmark
Expand Down Expand Up @@ -73,17 +83,11 @@ def main():
relations = pairwise(reference, variants)
with open("data/benchmark_fast_relations.txt", "w", encoding="utf-8") as file:
for entry in relations:
print(entry["lhs"], entry["rhs"], entry["relation"].value, file=file)
if entry["overlap"]:
print(entry["lhs"], entry["rhs"], entry["relation"].value, *entry["overlap"], file=file)
else:
print(entry["lhs"], entry["rhs"], entry["relation"].value, file=file)


if __name__ == "__main__":
# NOTE(review): this listing is a flattened diff; judging by the hunk header
# (-73,17 +83,11) the commented-out call and the large_del() node/edge
# counting below look like removed lines, and the final main() call like the
# single added line -- confirm against the repository.
#main()
graph = large_del()
# Tally the nodes and the total number of edges stored on them.
count_nodes = 0
count_edges = 0
for node in graph.nodes():
count_nodes += 1
count_edges += len(node.edges)
#print(node)
print(count_nodes, count_edges)
print(graph.count_split)
# Script entry point.
main()

0 comments on commit c715a6e

Please sign in to comment.