
Bugfix/genomefacotry division agnostic #144

Draft · wants to merge 7 commits into base: main
Empty file added: genome_info.json
66 changes: 39 additions & 27 deletions src/ensembl/production/metadata/api/factories/genomes.py
@@ -44,6 +44,7 @@ class GenomeInputFilters:
species: List[str] = field(default_factory=list)
antispecies: List[str] = field(default_factory=list)
dataset_status: List[str] = field(default_factory=lambda: ["Submitted"])
dataset_version: str = ''
release_id: int = 0
batch_size: int = 50
page: int = 1
@@ -54,6 +55,7 @@ class GenomeInputFilters:
columns: List = field(default_factory=lambda: [Genome.genome_uuid.label('genome_uuid'),
Genome.production_name.label('species'),
Dataset.dataset_uuid.label('dataset_uuid'),
Dataset.version.label('dataset_version'),
Dataset.status.label('dataset_status'),
DatasetSource.name.label('dataset_source'),
DatasetType.name.label('dataset_type')
@@ -65,9 +67,6 @@ class GenomeFactory:
@staticmethod
def _apply_filters(query, filters):

if filters.organism_group_type:
query = query.filter(OrganismGroup.type == filters.organism_group_type)

if filters.run_all:
filters.division = [
'EnsemblBacteria',
@@ -78,21 +77,28 @@ def _apply_filters(query, filters):
'EnsemblFungi',
]

if filters.genome_uuid:
query = query.filter(Genome.genome_uuid.in_(filters.genome_uuid))
if filters.organism_group_type or any(
[i.element.table.name in ['organism_group', 'organism_group_member'] for i in filters.columns]):
query = query.outerjoin(Organism.organism_group_members).outerjoin(OrganismGroupMember.organism_group)

if filters.dataset_uuid:
query = query.filter(Dataset.dataset_uuid.in_(filters.dataset_uuid))
if filters.organism_group_type:
query = query.filter(OrganismGroup.type == filters.organism_group_type)

if filters.division:
if filters.division and filters.organism_group_type:
ensembl_divisions = filters.division

if filters.organism_group_type == 'DIVISION':
if filters.organism_group_type == 'Division':
pattern = re.compile(r'^(ensembl)?', re.IGNORECASE)
ensembl_divisions = ['Ensembl' + pattern.sub('', d).capitalize() for d in ensembl_divisions if d]

query = query.filter(OrganismGroup.name.in_(ensembl_divisions))

if filters.genome_uuid:
query = query.filter(Genome.genome_uuid.in_(filters.genome_uuid))

if filters.dataset_uuid:
query = query.filter(Dataset.dataset_uuid.in_(filters.dataset_uuid))

if filters.species:
species = set(filters.species) - set(filters.antispecies)

@@ -106,18 +112,29 @@ def _apply_filters(query, filters):

if filters.release_id:
query = query.join(Genome.genome_releases)
query = query.filter(GenomeDataset.release_id==filters.release_id)
query = query.filter(GenomeRelease.release_id==filters.release_id)
query = query.filter(GenomeRelease.release_id == filters.release_id)

if filters.dataset_type:
query = query.filter(Genome.genome_datasets.any(DatasetType.name.in_([filters.dataset_type])))

if filters.dataset_status:
query = query.filter(Dataset.status.in_(filters.dataset_status))

if filters.dataset_version:
query = query.filter(Dataset.version == filters.dataset_version)

if filters.batch_size:
filters.page = filters.page if filters.page > 0 else 1
query = query.offset((filters.page - 1) * filters.batch_size).limit(filters.batch_size)

# check if dataset_uuid/genome_uuid are in the column list; if not, add them, since results are grouped per genome uuid and dataset uuid
if not any([i.element.table.name == 'dataset' and i.element.name == 'dataset_uuid'
for i in filters.columns]):
filters.columns.append(Dataset.dataset_uuid.label('dataset_uuid'))
if not any([i.element.table.name == 'genome' and i.element.name == 'genome_uuid'
for i in filters.columns]):
filters.columns.append(Genome.genome_uuid.label('genome_uuid'))

logger.debug(f"Filter Query {query}")
return query
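
For reference, a minimal sketch (plain Python, not part of this diff) of how the division-name normalisation above behaves when organism_group_type is 'Division':

import re

pattern = re.compile(r'^(ensembl)?', re.IGNORECASE)
divisions = ['metazoa', 'EnsemblVertebrates', 'PLANTS']
# strip any leading "ensembl" (case-insensitive), capitalise the remainder, then re-prefix with "Ensembl"
print(['Ensembl' + pattern.sub('', d).capitalize() for d in divisions if d])
# ['EnsemblMetazoa', 'EnsemblVertebrates', 'EnsemblPlants']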

@@ -126,14 +143,13 @@ def _build_query(self, filters):
.select_from(Genome) \
.join(Genome.assembly) \
.join(Genome.organism) \
.join(Organism.organism_group_members) \
.join(OrganismGroupMember.organism_group) \
.join(Genome.genome_datasets) \
.join(GenomeDataset.dataset) \
.join(Dataset.dataset_source) \
.join(Dataset.dataset_type) \
.group_by(Genome.genome_id, Dataset.dataset_id) \
.order_by(Genome.genome_uuid)
.join(Dataset.dataset_type)
#\
# .group_by(Genome.genome_id, Dataset.dataset_id) \
# .order_by(Genome.genome_uuid)

return self._apply_filters(query, filters)
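
A hedged usage sketch of the now division-agnostic factory (the URI, the dataset_version value and the returned dict keys are assumptions; get_genomes is assumed to yield one dict per genome/dataset pair, with keyword arguments matching the GenomeInputFilters fields above):

from ensembl.production.metadata.api.factories.genomes import GenomeFactory

# Hypothetical metadata DB URI. The organism_group joins are only added when an
# organism_group_type filter or an organism_group/organism_group_member column is requested.
for genome_info in GenomeFactory().get_genomes(
        metadata_db_uri="mysql://user:pass@host:3306/ensembl_genome_metadata",
        dataset_type="assembly",
        dataset_status=["Submitted"],
        dataset_version="110.1",  # new filter introduced in this PR
        batch_size=50,
        page=1,
):
    print(genome_info["genome_uuid"], genome_info["dataset_uuid"])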

@@ -154,13 +170,7 @@ def get_genomes(self, **filters: GenomeInputFilters):
if dataset_status and isinstance(dataset_status, DatasetStatus):
genome_info['dataset_status'] = dataset_status.value

if not dataset_uuid:
logger.warning(
f"No dataset uuid found for genome {genome_info} skipping this genome "
)
continue

if filters.update_dataset_status:
if filters.update_dataset_status and dataset_uuid:
_, status = DatasetFactory(filters.metadata_db_uri) \
.update_dataset_status(dataset_uuid,
filters.update_dataset_status,
@@ -174,7 +184,7 @@ def get_genomes(self, **filters: GenomeInputFilters):

else:
logger.warning(
f"Cannot update status for dataset uuid: {dataset_uuid} "
f"Cannot update status for dataset uuid: {dataset_uuid} , ensure the column with dataset_uuid declared "
f"{filters.update_dataset_status} to {status} for genome {genome_info['genome_uuid']}"
)
genome_info['updated_dataset_status'] = None
@@ -191,8 +201,8 @@ def main():
help='List of genome UUIDs to filter the query. Default is an empty list.')
parser.add_argument('--dataset_uuid', type=str, nargs='*', default=[], required=False,
help='List of dataset UUIDs to filter the query. Default is an empty list.')
parser.add_argument('--organism_group_type', type=str, default='DIVISION', required=False,
help='Organism group type to filter the query. Default is "DIVISION"')
parser.add_argument('--organism_group_type', type=str, required=False, default="Division",
help='Organism group type to filter the query, e.g. "Division"; check the organism_group table for other types')
parser.add_argument('--division', type=str, nargs='*', default=[], required=False,
help='List of organism group names to filter the query. Default is an empty list.')
parser.add_argument('--dataset_type', type=str, default="assembly", required=False,
@@ -202,10 +212,12 @@ def main():
parser.add_argument('--antispecies', type=str, nargs='*', default=[], required=False,
help='List of Species Production names to exclude from the query. Default is an empty list.')
parser.add_argument('--release_id', type=int, default=0, required=False,
help='Genome_dataset release_id to filter the query. Default is 0 (no filter).')
help='Genome_dataset release_id to filter the query for released genomes and datasets. Default is 0 (no filter).')
parser.add_argument('--dataset_status', nargs='*', default=["Submitted"],
choices=['Submitted', 'Processing', 'Processed', 'Released'], required=False,
help='List of dataset statuses to filter the query. Default is ["Submitted"].')
parser.add_argument('--dataset_version', type=str, required=False,
help='Filter the query by dataset version')
parser.add_argument('--update_dataset_status', type=str, default="", required=False,
choices=['Submitted', 'Processing', 'Processed', 'Released', ''],
help='Update the status of the selected datasets to the specified value. ')
@@ -19,12 +19,15 @@
)


def fetch_division_name(core_db_uri: str) -> str:
def fetch_division_name(core_db_uri: str, production_name: str) -> str:
"""
Fetch the division name for the given production name from the core database meta table.
"""
with DBConnection(core_db_uri).session_scope() as session:
query = session.query(Meta).filter(Meta.meta_key == 'species.division').one_or_none()
species_row = session.query(Meta.species_id).filter(Meta.meta_key == 'species.production_name',
                                                    Meta.meta_value == production_name).one_or_none()
# guard against core databases that do not contain the requested production name
if species_row is None: return None
query = session.query(Meta).filter(Meta.meta_key == 'species.division',
                                   Meta.species_id == species_row.species_id).one_or_none()
return query.meta_value if query else None
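
Illustrative call of the updated helper (the core database URI and production name are placeholders; it returns None when either meta key is absent):

core_uri = "mysql://anonymous@ensembldb.ensembl.org:3306/homo_sapiens_core_110_38"
print(fetch_division_name(core_uri, "homo_sapiens"))  # e.g. "EnsemblVertebrates"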


@@ -92,7 +95,7 @@ def process_genomes(session, args, organism_group_id: int = None):
for genome, dataset_source in query.all():
logging.info(f"Processing genome {genome.genome_uuid} for organism {genome.organism_id}")
if not (args.organism_group_type and args.organism_group_name) and args.core_server_uri:
division_name = fetch_division_name(os.path.join(args.core_server_uri, dataset_source.name))
division_name = fetch_division_name(os.path.join(args.core_server_uri, dataset_source.name), genome.production_name)
if division_name:
organism_group = session.query(OrganismGroup).filter(OrganismGroup.name == division_name).one_or_none()
if organism_group: