Skip to content

Commit 717e94c

Browse files
authored
Merge pull request #211 from RNAcentral/expression-atlas-fixes
Expression atlas fixes: - lookup csv is now pulled only from Ensembl, and only for relevant taxids - copying now only pulls what is needed into a cache directory, which should be significantly faster - Parsing is parallelised across experiments, so should happen faster - Unit tests for key bits of the parser
2 parents 877d8da + 55be643 commit 717e94c

File tree

21 files changed

+1646
-1056
lines changed

21 files changed

+1646
-1056
lines changed

config/databases.config

+4
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,10 @@ params {
7575
}
7676
}
7777

78+
expressionatlas {
79+
cache = '/hps/nobackup/agb/rnacentral/expression_atlas_cache'
80+
}
81+
7882
flybase {
7983
remote = 'ftp://ftp.flybase.net/releases/current/precomputed_files/genes/ncRNA*.json.gz'
8084
}
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,38 @@
1+
CREATE TEMP TABLE taxids_to_fetch (
2+
taxid bigint PRIMARY KEY
3+
);
4+
5+
\copy taxids_to_fetch FROM 'taxids_to_fetch';
6+
7+
18
COPY(
29
SELECT xref.upi || '_' || xref.taxid as urs_taxid,
310
xref.taxid as taxid,
4-
gene || '|' || external_id || '|' || gene_synonym || '|' || optional_id as external_id,
11+
split_part(gene, '.', 1) as gene,
12+
external_id,
13+
gene_synonym ,
14+
optional_id,
515
description,
616
seq_version,
717
rna_type,
818
COALESCE(seq_short, seq_long) as seq
9-
FROM rnc_accessions
19+
FROM rna
1020
JOIN xref
11-
ON xref.ac = rnc_accessions.accession
12-
13-
JOIN rna
1421
ON xref.upi = rna.upi
22+
join rnc_accessions
23+
ON xref.ac = rnc_accessions.accession
1524

1625
WHERE xref.deleted = 'N'
26+
AND xref.dbid IN (25, 31, 34, 35, 36, 47)
27+
AND (
28+
gene <> ''
29+
OR
30+
external_id <> ''
31+
OR
32+
gene_synonym <> ''
33+
OR optional_id <> ''
34+
)
35+
AND xref.taxid IN (SELECT taxid FROM taxids_to_fetch)
1736

1837

1938
) TO STDOUT CSV HEADER

rnacentral_pipeline/cli/__init__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,6 @@
2525
ensembl,
2626
europepmc,
2727
evlncrnas,
28-
expressionatlas,
2928
five_s_rrnadb,
3029
flybase,
3130
ftp_export,
@@ -71,6 +70,7 @@
7170
zfin,
7271
zwd,
7372
)
73+
from rnacentral_pipeline.databases.expressionatlas import cli as expressionatlas
7474
from rnacentral_pipeline.databases.tmrna import cli as tmrna
7575

7676

Original file line numberDiff line numberDiff line change
@@ -0,0 +1,140 @@
1+
# -*- coding: utf-8 -*-
2+
3+
"""
4+
Copyright [2009-2021] EMBL-European Bioinformatics Institute
5+
Licensed under the Apache License, Version 2.0 (the "License");
6+
you may not use this file except in compliance with the License.
7+
You may obtain a copy of the License at
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
Unless required by applicable law or agreed to in writing, software
10+
distributed under the License is distributed on an "AS IS" BASIS,
11+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
See the License for the specific language governing permissions and
13+
limitations under the License.
14+
"""
15+
16+
import logging
17+
import re
18+
from pathlib import Path
19+
20+
import click
21+
import polars as pl
22+
23+
from rnacentral_pipeline.databases.expressionatlas import configuration, parser, sdrf
24+
from rnacentral_pipeline.databases.expressionatlas.helpers import find_all_taxids
25+
from rnacentral_pipeline.writers import entry_writer
26+
27+
LOGGER = logging.getLogger(__name__)
28+
29+
30+
@click.group("expressionatlas")
31+
def cli():
32+
"""
33+
Commands for parsing expression atlas data
34+
"""
35+
36+
37+
@cli.command("parse")
38+
@click.argument("genes_mapped", type=click.File("r"))
39+
@click.argument("lookup", type=click.Path())
40+
@click.argument(
41+
"output",
42+
default=".",
43+
type=click.Path(writable=True, dir_okay=True, file_okay=False),
44+
)
45+
def process_csv(genes_mapped, lookup, output):
46+
"""
47+
Process the csv generated by linking EA data to rnc data
48+
49+
Args:
50+
gened_mapped: A concatenated ndjson of all hits for all experiments
51+
lookup: The retrieved lookup CSV file
52+
output: The directory in which the load data will be written
53+
"""
54+
entries = parser.parse(genes_mapped, lookup)
55+
56+
with entry_writer(Path(output)) as writer:
57+
try:
58+
writer.write(entries)
59+
except ValueError:
60+
print("No entries from this chunk")
61+
62+
63+
@cli.command("get-taxids")
64+
@click.argument(
65+
"directory", type=click.Path(exists=True, file_okay=False, dir_okay=True)
66+
)
67+
@click.argument("output")
68+
def get_taxids(directory, output):
69+
"""
70+
Find all condensed SDRF files in the subdirectory, and parse out all possible taxids.
71+
Write these out to a file so we can query with them later.
72+
73+
Args:
74+
directory: The path to the top-level directory within which are all Expression atlas
75+
experiments.
76+
output: A text file that will have one taxid per line, ready to be used in a future
77+
sql query
78+
"""
79+
taxids = find_all_taxids(directory)
80+
with open(output, "w") as out:
81+
for t in taxids:
82+
out.write(f"{t}\n")
83+
84+
LOGGER.info(f"Found {len(taxids)} unique taxids to search for")
85+
86+
87+
@cli.command("parse-dir")
88+
@click.argument(
89+
"directory", type=click.Path(exists=True, file_okay=False, dir_okay=True)
90+
)
91+
@click.argument("lookup", type=click.Path())
92+
@click.argument("output", type=click.Path())
93+
def parse_dir(directory, lookup, output):
94+
"""
95+
Look at the files within a directory and parse them appropriately
96+
97+
This will parse down to a urs_taxid <> experiment mapping, saved as ndjson
98+
The next parsing stage will create entries using the lookup
99+
100+
Args:
101+
directory: The path to a specific experiment that we will try to parse
102+
lookup: Path to the retrieved lookup CSV file
103+
output: Output filename for writing the ndjson hits
104+
"""
105+
directory = Path(directory)
106+
configurations = list(directory.glob("*configuration.xml"))
107+
assert len(configurations) == 1
108+
config_file = configurations[0]
109+
config = configuration.parse_config(config_file)
110+
input("wait...")
111+
if config.exp_type == "rnaseq_mrna_differential":
112+
analytics = list(directory.glob("*analytics.tsv"))
113+
assert len(analytics) == 1
114+
analytics_file = analytics[0]
115+
116+
sdrfs = list(directory.glob("*condensed-sdrf.tsv"))
117+
assert len(sdrfs) == 1
118+
sdrf = sdrfs[0]
119+
print(analytics_file)
120+
try:
121+
hits = parser.parse_differential(analytics_file, sdrf, lookup)
122+
except ValueError:
123+
LOGGER.error("Failed to parse differential experiment %s", analytics_file)
124+
hits = pl.DataFrame()
125+
elif config.exp_type == "rnaseq_mrna_baseline":
126+
## There is a transcripts tpms file which we don't want, so grab both with
127+
## pathlib glob, then filter to only get the one we want
128+
tpms = list(directory.glob(r"*-tpms.tsv"))
129+
tpms = list(filter(lambda x: re.match(".*\d+-tpms\.tsv$", str(x)), tpms))[0]
130+
131+
sdrfs = list(directory.glob("*condensed-sdrf.tsv"))
132+
assert len(sdrfs) == 1
133+
sdrf = sdrfs[0]
134+
try:
135+
hits = parser.parse_baseline(tpms, sdrf, lookup)
136+
except ValueError:
137+
hits = pl.DataFrame()
138+
LOGGER.error("failed to parse baseline experiment %s", tpms)
139+
140+
hits.write_ndjson(output)
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,122 @@
1+
# This module handles the parsing of the configuration file
2+
import os
3+
from dataclasses import dataclass, field
4+
from typing import Any, Dict, List, Optional
5+
6+
from lxml import etree
7+
8+
9+
@dataclass
10+
class Contrast:
11+
id: str
12+
name: str
13+
ref_group: str # Mapped from reference_assay_group
14+
test_group: str # Mapped from test_assay_group
15+
16+
17+
@dataclass
18+
class Contrasts:
19+
contrast: List[Contrast] = field(default_factory=list)
20+
21+
22+
@dataclass
23+
class Assay:
24+
id: str
25+
technical_replicate_id: Optional[str] = None
26+
27+
28+
@dataclass
29+
class AssayGroup:
30+
id: str
31+
label: Optional[str] = None
32+
assays: List[str] = field(default_factory=list)
33+
34+
35+
@dataclass
36+
class AssayGroups:
37+
assay_group: List[AssayGroup] = field(default_factory=list)
38+
39+
40+
@dataclass
41+
class Analytics:
42+
assay_groups: AssayGroups
43+
array_design: Optional[str] = None
44+
contrasts: Optional[Contrasts] = None
45+
46+
47+
@dataclass
48+
class Config:
49+
exp_type: str # Mapped from experimentType
50+
r_data: Optional[str] = None # Added to match the example XML
51+
analytics: List[Analytics] = field(default_factory=list)
52+
53+
54+
def parse_config(file_path):
55+
"""Parse the XML configuration file into a Config object."""
56+
try:
57+
with open(file_path, "r") as file:
58+
xml_content = file.read()
59+
60+
root = etree.fromstring(xml_content.encode("utf-8"))
61+
62+
# Create Config object - the root is 'configuration' with experimentType attribute
63+
config = Config(
64+
exp_type=root.get("experimentType"), r_data=root.get("r_data"), analytics=[]
65+
)
66+
67+
# Parse analytics
68+
for analytics_elem in root.findall("analytics"):
69+
assay_groups = AssayGroups(assay_group=[])
70+
71+
# Parse assay groups - in the example, they're directly under 'assay_groups'
72+
for assay_group_elem in analytics_elem.findall(
73+
"./assay_groups/assay_group"
74+
):
75+
assays = []
76+
77+
# Process each assay element and extract technical_replicate_id if present
78+
for assay_elem in assay_group_elem.findall("assay"):
79+
assays.append(assay_elem.text)
80+
81+
assay_group = AssayGroup(
82+
id=assay_group_elem.get("id"),
83+
label=assay_group_elem.get("label"),
84+
assays=assays,
85+
)
86+
assay_groups.assay_group.append(assay_group)
87+
88+
# The example doesn't have contrasts, but keeping the code for completeness
89+
contrasts_elem = analytics_elem.find("contrasts")
90+
contrasts = None
91+
if contrasts_elem is not None:
92+
contrasts = Contrasts(contrast=[])
93+
for contrast_elem in contrasts_elem.findall("contrast"):
94+
contrast = Contrast(
95+
id=contrast_elem.get("id"),
96+
name=contrast_elem.find("name").text,
97+
ref_group=contrast_elem.find("reference_assay_group").text
98+
if contrast_elem.find("reference_assay_group") is not None
99+
else contrast_elem.find("ref_group").text,
100+
test_group=contrast_elem.find("test_assay_group").text
101+
if contrast_elem.find("test_assay_group") is not None
102+
else contrast_elem.find("test_group").text,
103+
)
104+
contrasts.contrast.append(contrast)
105+
106+
# The example doesn't have array_design, but keeping the code for completeness
107+
array_design_elem = analytics_elem.find("arrayDesign")
108+
array_design = (
109+
array_design_elem.text if array_design_elem is not None else None
110+
)
111+
112+
analytics = Analytics(
113+
assay_groups=assay_groups,
114+
array_design=array_design,
115+
contrasts=contrasts,
116+
)
117+
config.analytics.append(analytics)
118+
119+
return config
120+
121+
except Exception as e:
122+
raise Exception(f"Error parsing configuration file: {e}")

0 commit comments

Comments
 (0)