Skip to content

Fix mutation ordering #3257

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 7 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
142 changes: 142 additions & 0 deletions c/tests/test_tables.c
Original file line number Diff line number Diff line change
Expand Up @@ -8905,6 +8905,7 @@ static void
test_sort_tables_errors(void)
{
int ret;
tsk_id_t ret_id;
tsk_treeseq_t ts;
tsk_table_collection_t tables;
tsk_bookmark_t pos;
Expand Down Expand Up @@ -8968,6 +8969,32 @@ test_sort_tables_errors(void)
ret = tsk_table_collection_sort(&tables, &pos, 0);
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_SORT_OFFSET_NOT_SUPPORTED);

/* Test TSK_ERR_MUTATION_PARENT_INCONSISTENT */
ret = tsk_table_collection_clear(&tables, 0);
CU_ASSERT_EQUAL_FATAL(ret, 0);
tables.sequence_length = 1.0;

ret_id = tsk_node_table_add_row(&tables.nodes, 0, 0.0, TSK_NULL, TSK_NULL, NULL, 0);
CU_ASSERT_FATAL(ret_id >= 0);
ret_id = tsk_site_table_add_row(&tables.sites, 0.0, "x", 1, NULL, 0);
CU_ASSERT_FATAL(ret_id >= 0);

ret_id
= tsk_mutation_table_add_row(&tables.mutations, 0, 0, 2, 0.0, "a", 1, NULL, 0);
CU_ASSERT_FATAL(ret_id >= 0);
ret_id
= tsk_mutation_table_add_row(&tables.mutations, 0, 0, 3, 0.0, "b", 1, NULL, 0);
CU_ASSERT_FATAL(ret_id >= 0);
ret_id
= tsk_mutation_table_add_row(&tables.mutations, 0, 0, 1, 0.0, "c", 1, NULL, 0);
CU_ASSERT_FATAL(ret_id >= 0);
ret_id
= tsk_mutation_table_add_row(&tables.mutations, 0, 0, 2, 0.0, "d", 1, NULL, 0);
CU_ASSERT_FATAL(ret_id >= 0);

ret = tsk_table_collection_sort(&tables, NULL, 0);
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_MUTATION_PARENT_INCONSISTENT);

tsk_table_collection_free(&tables);
tsk_treeseq_free(&ts);
}
Expand Down Expand Up @@ -9032,6 +9059,120 @@ test_sort_tables_mutation_times(void)
tsk_treeseq_free(&ts);
}

static void
test_sort_tables_mutations(void)
{
int ret;
tsk_table_collection_t tables;

/* Sorting hierarchy:
* 1. site
* 2. time (when known)
* 3. node_time
* 4. num_descendants: parent mutations first
* 5. node_id
* 6. mutation_id
*/

const char *sites = "0.0 A\n"
"0.5 T\n"
"0.75 G\n";

const char *mutations_unsorted =
/* Test site criterion (primary) - site 1 should come after site 0 */
"1 0 X -1 0.0\n" /* mut 0: site 1, will be sorted after site 0 mutations */
"0 0 Y -1 0.0\n" /* mut 1: site 0, will be sorted before site 1 mutations */

/* Test time criterion - within same site, earlier time first */
"0 4 B -1 2.0\n" /* mut 2: site 0, node 4 (time 1.0), time 2.0 (later time)
*/
"0 5 A -1 2.5\n" /* mut 3: site 0, node 5 (time 2.0), time 2.5 (earlier
relative) */

/* Test unknown vs known times - unknown times at site 2, fall back to node_time
sorting */
"2 4 U2 -1\n" /* mut 4: site 2, node 4 (time 1.0), unknown time - falls back
to node_time */
"2 4 U3 -1\n" /* mut 5: site 2, node 4 (time 1.0), unknown time - should use
mutation_id as tiebreaker */
"2 5 U1 -1\n" /* mut 6: site 2, node 5 (time 2.0), unknown time - falls back
to node_time */

/* Test node_time criterion - same site, same mut time, different node times */
"0 4 D -1 1.5\n" /* mut 7: site 0, node 4 (time 1.0), mut time 1.5 */
"0 5 C -1 2.5\n" /* mut 8: site 0, node 5 (time 2.0), mut time 2.5 - same
mut time */

/* Test num_descendants criterion with mutation parent-child relationships */
"0 2 P -1 0.0\n" /* mut 9: site 0, node 2, parent mutation (0 descendants
initially) */
"0 1 C1 9 0.0\n" /* mut 10: site 0, node 1, child of mut 9 (parent now has
1+ descendants) */
"0 1 C2 9 0.0\n" /* mut 11: site 0, node 1, another child of mut 9 (parent
now has 2+ descendants) */
"0 3 Q -1 0.0\n" /* mut 12: site 0, node 3, no children (0 descendants) */
"0 0 C3 10 0.0\n" /* mut 13: site 0, node 0, child of mut 10 (making mut 9 a
grandparent) */

/* Test node and mutation_id criteria for final tiebreaking */
"0 0 Z1 -1 0.0\n" /* mut 14: site 0, node 0, no parent, will test node+id
ordering */
"0 0 Z2 -1 0.0\n"; /* mut 15: site 0, node 0, no parent, later in input =
higher ID */

const char *mutations_sorted =
/* Site 0 mutations - known times first, sorted by time */
"0 5 A -1 2.5\n"
"0 5 C -1 2.5\n"
"0 4 B -1 2.0\n"
"0 4 D -1 1.5\n"
"0 2 P -1 0.0\n"
"0 1 C1 4 0.0\n"
"0 0 Y -1 0.0\n"
"0 0 C3 5 0.0\n"
"0 0 Z1 -1 0.0\n"
"0 0 Z2 -1 0.0\n"
"0 1 C2 4 0.0\n"
"0 3 Q -1 0.0\n"

/* Site 1 mutations */
"1 0 X -1 0.0\n"

/* Site 2 mutations - unknown times, sorted by node_time then other criteria */
"2 5 U1 -1\n"
"2 4 U2 -1\n"
"2 4 U3 -1\n";

ret = tsk_table_collection_init(&tables, 0);
CU_ASSERT_EQUAL_FATAL(ret, 0);
tables.sequence_length = 1.0;
parse_nodes(single_tree_ex_nodes, &tables.nodes);
parse_edges(single_tree_ex_edges, &tables.edges);

parse_sites(sites, &tables.sites);
CU_ASSERT_EQUAL_FATAL(tables.sites.num_rows, 3);

parse_mutations(mutations_unsorted, &tables.mutations);
CU_ASSERT_EQUAL_FATAL(tables.mutations.num_rows, 16);

ret = tsk_table_collection_sort(&tables, NULL, 0);
CU_ASSERT_EQUAL_FATAL(ret, 0);

tsk_table_collection_t expected;
ret = tsk_table_collection_init(&expected, 0);
CU_ASSERT_EQUAL_FATAL(ret, 0);
expected.sequence_length = 1.0;
parse_nodes(single_tree_ex_nodes, &expected.nodes);
parse_edges(single_tree_ex_edges, &expected.edges);
parse_sites(sites, &expected.sites);
parse_mutations(mutations_sorted, &expected.mutations);

CU_ASSERT_TRUE(tsk_mutation_table_equals(&tables.mutations, &expected.mutations, 0));

tsk_table_collection_free(&expected);
tsk_table_collection_free(&tables);
}

static void
test_sort_tables_canonical_errors(void)
{
Expand Down Expand Up @@ -11608,6 +11749,7 @@ main(int argc, char **argv)
{ "test_sort_tables_errors", test_sort_tables_errors },
{ "test_sort_tables_individuals", test_sort_tables_individuals },
{ "test_sort_tables_mutation_times", test_sort_tables_mutation_times },
{ "test_sort_tables_mutations", test_sort_tables_mutations },
{ "test_sort_tables_migrations", test_sort_tables_migrations },
{ "test_sort_tables_no_edge_metadata", test_sort_tables_no_edge_metadata },
{ "test_sort_tables_offsets", test_sort_tables_offsets },
Expand Down
105 changes: 18 additions & 87 deletions c/tskit/tables.c
Original file line number Diff line number Diff line change
Expand Up @@ -6756,7 +6756,8 @@ typedef struct {
typedef struct {
tsk_mutation_t mut;
int num_descendants;
} mutation_canonical_sort_t;
double node_time;
} mutation_sort_t;

typedef struct {
tsk_individual_t ind;
Expand Down Expand Up @@ -6797,39 +6798,30 @@ cmp_site(const void *a, const void *b)
static int
cmp_mutation(const void *a, const void *b)
{
const tsk_mutation_t *ia = (const tsk_mutation_t *) a;
const tsk_mutation_t *ib = (const tsk_mutation_t *) b;
/* Compare mutations by site */
int ret = (ia->site > ib->site) - (ia->site < ib->site);
/* Within a particular site sort by time if known, then ID. This ensures that
* relative ordering within a site is maintained */
if (ret == 0 && !tsk_is_unknown_time(ia->time) && !tsk_is_unknown_time(ib->time)) {
ret = (ia->time < ib->time) - (ia->time > ib->time);
}
if (ret == 0) {
ret = (ia->id > ib->id) - (ia->id < ib->id);
}
return ret;
}

static int
cmp_mutation_canonical(const void *a, const void *b)
{
const mutation_canonical_sort_t *ia = (const mutation_canonical_sort_t *) a;
const mutation_canonical_sort_t *ib = (const mutation_canonical_sort_t *) b;
const mutation_sort_t *ia = (const mutation_sort_t *) a;
const mutation_sort_t *ib = (const mutation_sort_t *) b;
/* Compare mutations by site */
int ret = (ia->mut.site > ib->mut.site) - (ia->mut.site < ib->mut.site);

/* Within a particular site sort by time if known */
if (ret == 0 && !tsk_is_unknown_time(ia->mut.time)
&& !tsk_is_unknown_time(ib->mut.time)) {
ret = (ia->mut.time < ib->mut.time) - (ia->mut.time > ib->mut.time);
}
/* Or node times when mutation times are unknown or equal */
if (ret == 0) {
ret = (ia->node_time < ib->node_time) - (ia->node_time > ib->node_time);
}
/* If node times are equal, sort by number of descendants */
if (ret == 0) {
ret = (ia->num_descendants < ib->num_descendants)
- (ia->num_descendants > ib->num_descendants);
}
/* If number of descendants are equal, sort by node */
if (ret == 0) {
ret = (ia->mut.node > ib->mut.node) - (ia->mut.node < ib->mut.node);
}
/* Final tiebreaker: ID */
if (ret == 0) {
ret = (ia->mut.id > ib->mut.id) - (ia->mut.id < ib->mut.id);
}
Expand Down Expand Up @@ -7056,77 +7048,15 @@ tsk_table_sorter_sort_sites(tsk_table_sorter_t *self)

static int
tsk_table_sorter_sort_mutations(tsk_table_sorter_t *self)
{
int ret = 0;
tsk_size_t j;
tsk_id_t ret_id, parent, mapped_parent;
tsk_mutation_table_t *mutations = &self->tables->mutations;
tsk_size_t num_mutations = mutations->num_rows;
tsk_mutation_table_t copy;
tsk_mutation_t *sorted_mutations
= tsk_malloc(num_mutations * sizeof(*sorted_mutations));
tsk_id_t *mutation_id_map = tsk_malloc(num_mutations * sizeof(*mutation_id_map));

ret = tsk_mutation_table_copy(mutations, &copy, 0);
if (ret != 0) {
goto out;
}
if (mutation_id_map == NULL || sorted_mutations == NULL) {
ret = tsk_trace_error(TSK_ERR_NO_MEMORY);
goto out;
}

for (j = 0; j < num_mutations; j++) {
tsk_mutation_table_get_row_unsafe(&copy, (tsk_id_t) j, sorted_mutations + j);
sorted_mutations[j].site = self->site_id_map[sorted_mutations[j].site];
}
ret = tsk_mutation_table_clear(mutations);
if (ret != 0) {
goto out;
}

qsort(sorted_mutations, (size_t) num_mutations, sizeof(*sorted_mutations),
cmp_mutation);

/* Make a first pass through the sorted mutations to build the ID map. */
for (j = 0; j < num_mutations; j++) {
mutation_id_map[sorted_mutations[j].id] = (tsk_id_t) j;
}

for (j = 0; j < num_mutations; j++) {
mapped_parent = TSK_NULL;
parent = sorted_mutations[j].parent;
if (parent != TSK_NULL) {
mapped_parent = mutation_id_map[parent];
}
ret_id = tsk_mutation_table_add_row(mutations, sorted_mutations[j].site,
sorted_mutations[j].node, mapped_parent, sorted_mutations[j].time,
sorted_mutations[j].derived_state, sorted_mutations[j].derived_state_length,
sorted_mutations[j].metadata, sorted_mutations[j].metadata_length);
if (ret_id < 0) {
ret = (int) ret_id;
goto out;
}
}
ret = 0;

out:
tsk_safe_free(mutation_id_map);
tsk_safe_free(sorted_mutations);
tsk_mutation_table_free(&copy);
return ret;
}

static int
tsk_table_sorter_sort_mutations_canonical(tsk_table_sorter_t *self)
{
int ret = 0;
tsk_size_t j;
tsk_id_t ret_id, parent, mapped_parent, p;
tsk_mutation_table_t *mutations = &self->tables->mutations;
tsk_node_table_t *nodes = &self->tables->nodes;
tsk_size_t num_mutations = mutations->num_rows;
tsk_mutation_table_t copy;
mutation_canonical_sort_t *sorted_mutations
mutation_sort_t *sorted_mutations
= tsk_malloc(num_mutations * sizeof(*sorted_mutations));
tsk_id_t *mutation_id_map = tsk_malloc(num_mutations * sizeof(*mutation_id_map));

Expand Down Expand Up @@ -7158,14 +7088,15 @@ tsk_table_sorter_sort_mutations_canonical(tsk_table_sorter_t *self)
for (j = 0; j < num_mutations; j++) {
tsk_mutation_table_get_row_unsafe(&copy, (tsk_id_t) j, &sorted_mutations[j].mut);
sorted_mutations[j].mut.site = self->site_id_map[sorted_mutations[j].mut.site];
sorted_mutations[j].node_time = nodes->time[sorted_mutations[j].mut.node];
}
ret = tsk_mutation_table_clear(mutations);
if (ret != 0) {
goto out;
}

qsort(sorted_mutations, (size_t) num_mutations, sizeof(*sorted_mutations),
cmp_mutation_canonical);
cmp_mutation);

/* Make a first pass through the sorted mutations to build the ID map. */
for (j = 0; j < num_mutations; j++) {
Expand Down Expand Up @@ -12245,7 +12176,7 @@ tsk_table_collection_canonicalise(tsk_table_collection_t *self, tsk_flags_t opti
if (ret != 0) {
goto out;
}
sorter.sort_mutations = tsk_table_sorter_sort_mutations_canonical;
sorter.sort_mutations = tsk_table_sorter_sort_mutations;
sorter.sort_individuals = tsk_table_sorter_sort_individuals_canonical;

nodes = tsk_malloc(self->nodes.num_rows * sizeof(*nodes));
Expand Down
7 changes: 7 additions & 0 deletions python/CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,13 @@

**Bugfixes**

- In some tables with mutations out-of-order `TableCollection.sort` did not re-order
the mutations so they formed a valid TreeSequence. `TableCollection.sort` and
`TableCollection.canonicalise` now sort mutations by site, then time (if known),
then the mutation's node's time, then number of descendant mutations
(ensuring that parent mutations occur before children), then node, then
their original order in the tables. (:user:`benjeffery`, :pr:`3257`, :issue:`3253`)

- Fix bug in ``TreeSequence.pair_coalescence_counts`` when ``span_normalise=True``
and a window breakpoint falls within an internal missing interval.
(:user:`nspope`, :pr:`3176`, :issue:`3175`)
Expand Down
3 changes: 1 addition & 2 deletions python/tests/test_extend_haplotypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -482,7 +482,7 @@ def extend_haplotypes(ts, max_iter=10):
extender = HaplotypeExtender(ts, forwards=forwards)
extender.extend_haplotypes()
tables.edges.replace_with(extender.edges)
tables.sort()
tables.sort(mutation_start=tables.mutations.num_rows)
tables.build_index()
ts = tables.tree_sequence()
if ts.num_edges == last_num_edges:
Expand All @@ -493,7 +493,6 @@ def extend_haplotypes(ts, max_iter=10):
tables = ts.dump_tables()
mutations = _slide_mutation_nodes_up(ts, mutations)
tables.mutations.replace_with(mutations)
tables.sort()
ts = tables.tree_sequence()
return ts

Expand Down
7 changes: 6 additions & 1 deletion python/tests/test_highlevel.py
Original file line number Diff line number Diff line change
Expand Up @@ -3396,7 +3396,12 @@ def test_text_record_round_trip(self, ts1):
sequence_length=ts1.sequence_length,
strict=True,
)
self.verify_approximate_equality(ts1, ts2)
tables1 = ts1.tables.copy()
# load_text performs a `sort`, which changes the order relative to
# the original tree sequence
tables1.sort()
ts1_sorted = tables1.tree_sequence()
self.verify_approximate_equality(ts1_sorted, ts2)

def test_empty_files(self):
nodes_file = io.StringIO("is_sample\ttime\n")
Expand Down
5 changes: 5 additions & 0 deletions python/tests/test_table_transforms.py
Original file line number Diff line number Diff line change
Expand Up @@ -596,6 +596,11 @@ def test_genotypes_round_trip(self, ts):
@pytest.mark.parametrize("ts", get_example_tree_sequences())
@pytest.mark.parametrize("population", [-1, None])
def test_definition(self, ts, population):
# The python implementation of split_edges performs a sort,
# which changes the order relative to the original tree sequence
tables = ts.dump_tables()
tables.sort()
ts = tables.tree_sequence()
time = 0 if ts.num_nodes == 0 else np.median(ts.tables.nodes.time)
if ts.num_migrations == 0:
ts1 = split_edges_definition(ts, time, population=population)
Expand Down
Loading
Loading