RNAcentral
diff --git a/‎config/databases.config
+4 b/‎config/databases.config
+4
diff --git a/‎files/import-data/expressionatlas/lookup-dump-query.sql
+24-5 b/‎files/import-data/expressionatlas/lookup-dump-query.sql
+24-5
diff --git a/‎rnacentral_pipeline/cli/__init__.py
+1-1 b/‎rnacentral_pipeline/cli/__init__.py
+1-1
diff --git a/‎rnacentral_pipeline/databases/expressionatlas/cli.py
+140 b/‎rnacentral_pipeline/databases/expressionatlas/cli.py
+140
diff --git a/‎rnacentral_pipeline/databases/expressionatlas/configuration.py
+122 b/‎rnacentral_pipeline/databases/expressionatlas/configuration.py
+122
@@ -75,6 +75,10 @@ params {
       }
     }
 
+    // Settings for the Expression Atlas import.
+    // NOTE(review): `cache` is a hard-coded absolute path on one specific
+    // filesystem; presumably the directory holding the Expression Atlas
+    // experiment files - confirm against the workflow that reads it.
+    expressionatlas {
+      cache = '/hps/nobackup/agb/rnacentral/expression_atlas_cache'
+    }
+
     flybase {
       remote = 'ftp://ftp.flybase.net/releases/current/precomputed_files/genes/ncRNA*.json.gz'
     }
 
@@ -1,19 +1,38 @@
+-- Session-local table of the taxids the lookup query is restricted to.
+CREATE TEMP TABLE taxids_to_fetch (
+  taxid bigint PRIMARY KEY
+);
+
+-- psql meta-command: load the table from a file named 'taxids_to_fetch'.
+-- The path is relative, so it must exist where psql is run.
+\copy taxids_to_fetch FROM 'taxids_to_fetch';
+
+
+-- Export the lookup rows as CSV (with a header row) on STDOUT.
 COPY(
   SELECT xref.upi || '_' || xref.taxid as urs_taxid,
     xref.taxid as taxid,
-    gene || '|' || external_id || '|' || gene_synonym || '|' || optional_id  as external_id,
+    -- Keep only the part of gene before the first '.'
+    split_part(gene, '.', 1) as gene,
+    external_id,
+    gene_synonym ,
+    optional_id,
     description,
     seq_version,
     rna_type,
     COALESCE(seq_short, seq_long) as seq
-  FROM rnc_accessions
+  FROM rna
   JOIN xref
-  ON xref.ac = rnc_accessions.accession
-
-  JOIN rna
   ON xref.upi = rna.upi
+  join rnc_accessions
+  ON xref.ac = rnc_accessions.accession
 
   WHERE xref.deleted = 'N'
+  -- NOTE(review): numeric database ids; confirm which sources they denote
+  AND xref.dbid IN (25, 31, 34, 35, 36, 47)
+  -- Require at least one identifier column to be a non-empty string
+  AND (
+    gene <> ''
+    OR
+    external_id <> ''
+    OR
+    gene_synonym <> ''
+    OR optional_id <> ''
+  )
+  -- Restrict to the taxids loaded into the taxids_to_fetch temp table
+  AND xref.taxid IN (SELECT taxid FROM taxids_to_fetch)
 
 
   ) TO STDOUT CSV HEADER
@@ -25,7 +25,6 @@
     ensembl,
     europepmc,
     evlncrnas,
-    expressionatlas,
     five_s_rrnadb,
     flybase,
     ftp_export,
@@ -71,6 +70,7 @@
     zfin,
     zwd,
 )
+from rnacentral_pipeline.databases.expressionatlas import cli as expressionatlas
 from rnacentral_pipeline.databases.tmrna import cli as tmrna
 
 
 
@@ -0,0 +1,140 @@
+# -*- coding: utf-8 -*-
+
+"""
+Copyright [2009-2021] EMBL-European Bioinformatics Institute
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import logging
+import re
+from pathlib import Path
+
+import click
+import polars as pl
+
+from rnacentral_pipeline.databases.expressionatlas import configuration, parser, sdrf
+from rnacentral_pipeline.databases.expressionatlas.helpers import find_all_taxids
+from rnacentral_pipeline.writers import entry_writer
+
+LOGGER = logging.getLogger(__name__)
+
+
+@click.group("expressionatlas")
+def cli():
+    """
+    Commands for parsing expression atlas data
+    """
+    # Intentionally empty: this function only defines the click group that the
+    # commands in this module attach to through @cli.command(...).
+
+
+@cli.command("parse")
+@click.argument("genes_mapped", type=click.File("r"))
+@click.argument("lookup", type=click.Path())
+@click.argument(
+    "output",
+    default=".",
+    type=click.Path(writable=True, dir_okay=True, file_okay=False),
+)
+def process_csv(genes_mapped, lookup, output):
+    """
+    Process the csv generated by linking EA data to rnc data
+
+    Args:
+        genes_mapped: A concatenated ndjson of all hits for all experiments
+        lookup: The retrieved lookup CSV file
+        output: The directory in which the load data will be written
+    """
+    entries = parser.parse(genes_mapped, lookup)
+
+    with entry_writer(Path(output)) as writer:
+        try:
+            writer.write(entries)
+        except ValueError:
+            # Best effort: a ValueError raised while writing is treated as
+            # "this chunk produced nothing" and does not fail the command.
+            # Reported through the module logger instead of stdout.
+            LOGGER.warning("No entries from this chunk")
+
+
+@cli.command("get-taxids")
+@click.argument(
+    "directory", type=click.Path(exists=True, file_okay=False, dir_okay=True)
+)
+@click.argument("output")
+def get_taxids(directory, output):
+    """
+    Collect the taxids found by find_all_taxids under a directory and save them.
+
+    The taxids are written one per line so that they can be used in a later
+    sql query.
+
+    Args:
+        directory: The path to the top-level directory within which are all
+            Expression atlas experiments.
+        output: The text file to write, one taxid per line.
+    """
+    taxids = find_all_taxids(directory)
+    with open(output, "w") as handle:
+        handle.writelines(f"{taxid}\n" for taxid in taxids)
+
+    LOGGER.info(f"Found {len(taxids)} unique taxids to search for")
+
+
+def _find_single_file(directory, pattern):
+    """
+    Return the only file in `directory` whose name matches the glob `pattern`.
+
+    Raises click.ClickException when there is not exactly one match, so the
+    command exits with a readable error instead of relying on `assert`
+    (which is stripped when Python runs with -O).
+    """
+    matches = list(directory.glob(pattern))
+    if len(matches) != 1:
+        raise click.ClickException(
+            f"Expected exactly one file matching {pattern!r} in {directory}, "
+            f"found {len(matches)}"
+        )
+    return matches[0]
+
+
+@cli.command("parse-dir")
+@click.argument(
+    "directory", type=click.Path(exists=True, file_okay=False, dir_okay=True)
+)
+@click.argument("lookup", type=click.Path())
+@click.argument("output", type=click.Path())
+def parse_dir(directory, lookup, output):
+    """
+    Look at the files within a directory and parse them appropriately
+
+    This will parse down to a urs_taxid <> experiment mapping, saved as ndjson
+    The next parsing stage will create entries using the lookup
+
+    Experiments whose type is neither rnaseq_mrna_differential nor
+    rnaseq_mrna_baseline, and experiments whose parser raises ValueError,
+    produce an empty set of hits.
+
+    Args:
+        directory: The path to a specific experiment that we will try to parse
+        lookup: Path to the retrieved lookup CSV file
+        output: Output filename for writing the ndjson hits
+    """
+    directory = Path(directory)
+    config_file = _find_single_file(directory, "*configuration.xml")
+    config = configuration.parse_config(config_file)
+
+    if config.exp_type == "rnaseq_mrna_differential":
+        analytics_file = _find_single_file(directory, "*analytics.tsv")
+        # Named sdrf_file so the imported `sdrf` module is not shadowed
+        sdrf_file = _find_single_file(directory, "*condensed-sdrf.tsv")
+        LOGGER.info("Parsing differential experiment %s", analytics_file)
+        try:
+            hits = parser.parse_differential(analytics_file, sdrf_file, lookup)
+        except ValueError:
+            LOGGER.error("Failed to parse differential experiment %s", analytics_file)
+            hits = pl.DataFrame()
+    elif config.exp_type == "rnaseq_mrna_baseline":
+        ## There is a transcripts tpms file which we don't want, so grab both with
+        ## pathlib glob, then filter to only get the one we want
+        gene_tpms = [
+            path
+            for path in directory.glob("*-tpms.tsv")
+            if re.match(r".*\d+-tpms\.tsv$", str(path))
+        ]
+        if not gene_tpms:
+            raise click.ClickException(f"No gene-level tpms file found in {directory}")
+        tpms = gene_tpms[0]
+
+        sdrf_file = _find_single_file(directory, "*condensed-sdrf.tsv")
+        try:
+            hits = parser.parse_baseline(tpms, sdrf_file, lookup)
+        except ValueError:
+            hits = pl.DataFrame()
+            LOGGER.error("failed to parse baseline experiment %s", tpms)
+    else:
+        # Previously `hits` was left unbound here and the command crashed;
+        # follow the failure branches above and emit an empty result instead.
+        LOGGER.warning(
+            "Unsupported experiment type %r in %s, writing no hits",
+            config.exp_type,
+            directory,
+        )
+        hits = pl.DataFrame()
+
+    hits.write_ndjson(output)
@@ -0,0 +1,122 @@
+# This module handles the parsing of the configuration file
+import os
+from dataclasses import dataclass, field
+from typing import Any, Dict, List, Optional
+
+from lxml import etree
+
+
+@dataclass
+class Contrast:
+    """One <contrast> element as read by parse_config."""
+
+    id: str  # The element's "id" attribute
+    name: str  # Text of the <name> child
+    ref_group: str  # Mapped from reference_assay_group (falls back to ref_group)
+    test_group: str  # Mapped from test_assay_group (falls back to test_group)
+
+
+@dataclass
+class Contrasts:
+    """Container for the Contrast entries found under one <contrasts> element."""
+
+    contrast: List[Contrast] = field(default_factory=list)
+
+
+@dataclass
+class Assay:
+    """
+    An assay id with an optional technical replicate id.
+
+    NOTE(review): parse_config in this module never builds Assay objects; it
+    stores assays as plain strings. Confirm whether this class is used elsewhere.
+    """
+
+    id: str
+    technical_replicate_id: Optional[str] = None
+
+
+@dataclass
+class AssayGroup:
+    """One <assay_group> element as read by parse_config."""
+
+    id: str  # The element's "id" attribute
+    label: Optional[str] = None  # The element's "label" attribute, if any
+    assays: List[str] = field(default_factory=list)  # Text of each <assay> child
+
+
+@dataclass
+class AssayGroups:
+    """Container for the AssayGroup entries of one <analytics> element."""
+
+    assay_group: List[AssayGroup] = field(default_factory=list)
+
+
+@dataclass
+class Analytics:
+    """One <analytics> element as read by parse_config."""
+
+    assay_groups: AssayGroups  # Groups found at ./assay_groups/assay_group
+    array_design: Optional[str] = None  # Text of <arrayDesign>, None if absent
+    contrasts: Optional[Contrasts] = None  # None when there is no <contrasts>
+
+
+@dataclass
+class Config:
+    """Parsed form of the configuration XML's root element."""
+
+    # NOTE(review): parse_config fills exp_type from root.get(...), so it is
+    # None when the attribute is missing despite the `str` annotation.
+    exp_type: str  # Mapped from experimentType
+    r_data: Optional[str] = None  # Added to match the example XML
+    analytics: List[Analytics] = field(default_factory=list)  # One per <analytics>
+
+
+def _child_text(element, *tags):
+    """
+    Return the text of the first direct child of `element` matching one of `tags`.
+
+    Tags are tried in the order given. Raises ValueError when none is present.
+    """
+    for tag in tags:
+        child = element.find(tag)
+        if child is not None:
+            return child.text
+    raise ValueError(
+        f"<{element.tag}> is missing a required child, expected one of: "
+        + ", ".join(tags)
+    )
+
+
+def parse_config(file_path):
+    """
+    Parse the XML configuration file into a Config object.
+
+    Args:
+        file_path: Path to the configuration XML. The root element's
+            "experimentType" and "r_data" attributes and its <analytics>
+            children are read.
+
+    Returns:
+        A Config holding one Analytics entry per <analytics> element.
+
+    Raises:
+        Exception: If the file cannot be read or parsed, or a <contrast> lacks
+            a required child. The underlying error is chained as the cause.
+    """
+    try:
+        # Hand lxml the raw bytes so it decodes according to the XML
+        # declaration rather than the platform's default text encoding.
+        with open(file_path, "rb") as file:
+            root = etree.fromstring(file.read())
+
+        # Create Config object - the root is 'configuration' with experimentType attribute
+        config = Config(
+            exp_type=root.get("experimentType"), r_data=root.get("r_data"), analytics=[]
+        )
+
+        for analytics_elem in root.findall("analytics"):
+            assay_groups = AssayGroups(assay_group=[])
+            for assay_group_elem in analytics_elem.findall(
+                "./assay_groups/assay_group"
+            ):
+                assay_groups.assay_group.append(
+                    AssayGroup(
+                        id=assay_group_elem.get("id"),
+                        label=assay_group_elem.get("label"),
+                        # Only the text of each <assay> is kept; its
+                        # attributes are not read.
+                        assays=[
+                            assay_elem.text
+                            for assay_elem in assay_group_elem.findall("assay")
+                        ],
+                    )
+                )
+
+            # <contrasts> is optional; contrasts stays None when it is absent
+            contrasts = None
+            contrasts_elem = analytics_elem.find("contrasts")
+            if contrasts_elem is not None:
+                contrasts = Contrasts(
+                    contrast=[
+                        Contrast(
+                            id=contrast_elem.get("id"),
+                            name=_child_text(contrast_elem, "name"),
+                            # Prefer the long tag name, fall back to the short one
+                            ref_group=_child_text(
+                                contrast_elem, "reference_assay_group", "ref_group"
+                            ),
+                            test_group=_child_text(
+                                contrast_elem, "test_assay_group", "test_group"
+                            ),
+                        )
+                        for contrast_elem in contrasts_elem.findall("contrast")
+                    ]
+                )
+
+            # <arrayDesign> is optional as well
+            array_design_elem = analytics_elem.find("arrayDesign")
+            array_design = (
+                array_design_elem.text if array_design_elem is not None else None
+            )
+
+            config.analytics.append(
+                Analytics(
+                    assay_groups=assay_groups,
+                    array_design=array_design,
+                    contrasts=contrasts,
+                )
+            )
+
+        return config
+
+    except Exception as e:
+        # Same exception type and message as before, now with an explicit cause
+        raise Exception(f"Error parsing configuration file: {e}") from e
Original file line number	Diff line number	Diff line change
`@@ -75,6 +75,10 @@ params {`
`75`	`75`	`}`
`76`	`76`	`}`
`77`	`77`
	`78`	`+ expressionatlas {`
	`79`	`+ cache = '/hps/nobackup/agb/rnacentral/expression_atlas_cache'`
	`80`	`+ }`
	`81`	`+`
`78`	`82`	`flybase {`
`79`	`83`	`remote = 'ftp://ftp.flybase.net/releases/current/precomputed_files/genes/ncRNA*.json.gz'`
`80`	`84`	`}`