Skip to content

Commit

Permalink
fix: ensure consistent source meta schema typing (#175)
Browse files (browse the repository at this point in the history)
  • Loading branch information
jsstevenson authored Dec 20, 2023
1 parent 8b7c35b commit 355a587
Show file tree
Hide file tree
Showing 10 changed files with 89 additions and 71 deletions.
10 changes: 6 additions & 4 deletions src/disease/database/database.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,11 +103,13 @@ def initialize_db(self) -> None:
"""

@abc.abstractmethod
def get_source_metadata(self, src_name: Union[str, SourceName]) -> Optional[Dict]:
def get_source_metadata(
self, src_name: Union[str, SourceName]
) -> Optional[SourceMeta]:
"""Get license, versioning, data lookup, etc information for a source.
:param src_name: name of the source to get data for
:return: Dict containing metadata if lookup is successful
:return: source metadata, if lookup is successful
"""

@abc.abstractmethod
Expand Down Expand Up @@ -162,11 +164,11 @@ def get_all_records(self, record_type: RecordType) -> Generator[Dict, None, None
"""

@abc.abstractmethod
def add_source_metadata(self, src_name: SourceName, data: SourceMeta) -> None:
def add_source_metadata(self, src_name: SourceName, meta: SourceMeta) -> None:
"""Add new source metadata entry.
:param src_name: name of source
:param data: known source attributes
:param meta: known source attributes
:raise DatabaseWriteException: if write fails
"""

Expand Down
40 changes: 29 additions & 11 deletions src/disease/database/dynamodb.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,13 @@
DatabaseWriteException,
confirm_aws_db_use,
)
from disease.schemas import RecordType, RefType, SourceMeta, SourceName
from disease.schemas import (
DataLicenseAttributes,
RecordType,
RefType,
SourceMeta,
SourceName,
)

_logger = logging.getLogger()
_logger.setLevel(logging.DEBUG)
Expand Down Expand Up @@ -87,7 +93,7 @@ def __init__(self, db_url: Optional[str] = None, **db_args) -> None:

self.diseases = self.dynamodb.Table(self.disease_table)
self.batch = self.diseases.batch_writer()
self._cached_sources = {}
self._cached_sources: Dict[str, SourceMeta] = {}
atexit.register(self.close_connection)

def list_tables(self) -> List[str]:
Expand Down Expand Up @@ -200,11 +206,13 @@ def initialize_db(self) -> None:
if not self.check_schema_initialized():
self._create_diseases_table()

def get_source_metadata(self, src_name: Union[str, SourceName]) -> Optional[Dict]:
def get_source_metadata(
self, src_name: Union[str, SourceName]
) -> Optional[SourceMeta]:
"""Get license, versioning, data lookup, etc information for a source.
:param src_name: name of the source to get data for
:return: Dict containing metadata if lookup is successful
:return: source metadata, if lookup is successful
"""
if isinstance(src_name, SourceName):
src_name = src_name.value
Expand All @@ -213,13 +221,23 @@ def get_source_metadata(self, src_name: Union[str, SourceName]) -> Optional[Dict
else:
pk = f"{src_name.lower()}##source"
concept_id = f"source:{src_name.lower()}"
metadata = self.diseases.get_item(
retrieved_metadata = self.diseases.get_item(
Key={"label_and_type": pk, "concept_id": concept_id}
).get("Item")
if not metadata:
if not retrieved_metadata:
return None
self._cached_sources[src_name] = metadata
return metadata
formatted_metadata = SourceMeta(
data_license=retrieved_metadata["data_license"],
data_license_url=retrieved_metadata["data_license_url"],
version=retrieved_metadata["version"],
data_url=retrieved_metadata["data_url"],
rdp_url=retrieved_metadata["rdp_url"],
data_license_attributes=DataLicenseAttributes(
**retrieved_metadata["data_license_attributes"]
),
)
self._cached_sources[src_name] = formatted_metadata
return formatted_metadata

def get_record_by_id(
self, concept_id: str, case_sensitive: bool = True, merge: bool = False
Expand Down Expand Up @@ -351,15 +369,15 @@ def get_all_records(self, record_type: RecordType) -> Generator[Dict, None, None
if not last_evaluated_key:
break

def add_source_metadata(self, src_name: SourceName, metadata: SourceMeta) -> None:
def add_source_metadata(self, src_name: SourceName, meta: SourceMeta) -> None:
"""Add new source metadata entry.
:param src_name: name of source
:param data: known source attributes
:param meta: known source attributes
:raise DatabaseWriteException: if write fails
"""
src_name_value = src_name.value
metadata_item = metadata.model_dump()
metadata_item = meta.model_dump()
metadata_item["src_name"] = src_name_value
metadata_item["label_and_type"] = f"{str(src_name_value).lower()}##source"
metadata_item["concept_id"] = f"source:{str(src_name_value).lower()}"
Expand Down
46 changes: 27 additions & 19 deletions src/disease/database/postgresql.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,13 @@
)

from disease.database import AbstractDatabase, DatabaseException, DatabaseWriteException
from disease.schemas import RecordType, RefType, SourceMeta, SourceName
from disease.schemas import (
DataLicenseAttributes,
RecordType,
RefType,
SourceMeta,
SourceName,
)

logger = logging.getLogger()

Expand Down Expand Up @@ -57,7 +63,7 @@ def __init__(self, db_url: Optional[str] = None, **db_args) -> None:

self.conn = psycopg.connect(conninfo)
self.initialize_db()
self._cached_sources = {}
self._cached_sources: Dict[str, SourceMeta] = {}

atexit.register(self.close_connection)

Expand Down Expand Up @@ -255,11 +261,13 @@ def _create_tables(self) -> None:

_source_metadata_query = b"SELECT * FROM disease_sources WHERE name = %s;"

def get_source_metadata(self, src_name: Union[str, SourceName]) -> Optional[Dict]:
def get_source_metadata(
self, src_name: Union[str, SourceName]
) -> Optional[SourceMeta]:
"""Get license, versioning, data lookup, etc information for a source.
:param src_name: name of the source to get data for
:return: Dict containing metadata if lookup is successful
:return: source metadata, if lookup is successful
"""
if isinstance(src_name, SourceName):
src_name = src_name.value
Expand All @@ -272,18 +280,18 @@ def get_source_metadata(self, src_name: Union[str, SourceName]) -> Optional[Dict
metadata_result = cur.fetchone()
if not metadata_result:
return None
metadata = {
"data_license": metadata_result[1],
"data_license_url": metadata_result[2],
"version": metadata_result[3],
"data_url": metadata_result[4],
"rdp_url": metadata_result[5],
"data_license_attributes": {
"non_commercial": metadata_result[6],
"attribution": metadata_result[7],
"share_alike": metadata_result[8],
},
}
metadata = SourceMeta(
data_license=metadata_result[1],
data_license_url=metadata_result[2],
version=metadata_result[3],
data_url=metadata_result[4],
rdp_url=metadata_result[5],
data_license_attributes=DataLicenseAttributes(
non_commercial=metadata_result[6],
attribution=metadata_result[7],
share_alike=metadata_result[8],
),
)
self._cached_sources[src_name] = metadata
return metadata

Expand Down Expand Up @@ -513,9 +521,9 @@ def add_source_metadata(self, src_name: SourceName, meta: SourceMeta) -> None:
meta.version,
meta.data_url,
meta.rdp_url,
meta.data_license_attributes["non_commercial"],
meta.data_license_attributes["attribution"],
meta.data_license_attributes["share_alike"],
meta.data_license_attributes.non_commercial,
meta.data_license_attributes.attribution,
meta.data_license_attributes.share_alike,
],
)
self.conn.commit()
Expand Down
12 changes: 5 additions & 7 deletions src/disease/etl/mondo.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import fastobo

from disease.etl.base import Base
from disease.schemas import NamespacePrefix, SourceMeta
from disease.schemas import DataLicenseAttributes, NamespacePrefix, SourceMeta

_logger = logging.getLogger(__name__)

Expand All @@ -23,11 +23,9 @@ def _load_meta(self) -> None:
version=self._version,
data_url="https://mondo.monarchinitiative.org/pages/download/",
rdp_url="http://reusabledata.org/monarch.html",
data_license_attributes={
"non_commercial": False,
"share_alike": False,
"attribution": True,
},
data_license_attributes=DataLicenseAttributes(
non_commercial=False, share_alike=False, attribution=True
),
)
self._database.add_source_metadata(self._src_name, metadata)

Expand Down Expand Up @@ -77,7 +75,7 @@ def _get_xref_from_url(self, url: str) -> Optional[Tuple[NamespacePrefix, str]]:

@staticmethod
def _get_xref_from_xref_clause(
clause: fastobo.term.XrefClause
clause: fastobo.term.XrefClause,
) -> Optional[Tuple[NamespacePrefix, str]]:
"""Get dbXref from xref clause.
Expand Down
18 changes: 9 additions & 9 deletions src/disease/schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,14 +128,6 @@ class Disease(BaseModel):
)


class DataLicenseAttributes(BaseModel):
"""Define constraints for data license attributes."""

non_commercial: StrictBool
share_alike: StrictBool
attribution: StrictBool


class RecordType(str, Enum):
"""Record item types."""

Expand All @@ -153,6 +145,14 @@ class RefType(str, Enum):
ASSOCIATED_WITH = "associated_with"


class DataLicenseAttributes(BaseModel):
    """Strictly-typed boolean flags describing a source's data license.

    Serves as the type of ``SourceMeta.data_license_attributes``.
    """

    # NOTE(review): the meaning of each flag below is inferred from its name;
    # confirm against the project's licensing documentation.
    non_commercial: StrictBool  # presumably: use is limited to non-commercial purposes
    share_alike: StrictBool  # presumably: derived works must keep the same license
    attribution: StrictBool  # presumably: the source must be credited


class SourceMeta(BaseModel):
"""Metadata for a given source to return in response object."""

Expand All @@ -161,7 +161,7 @@ class SourceMeta(BaseModel):
version: StrictStr
data_url: Optional[StrictStr] = None
rdp_url: Optional[StrictStr] = None
data_license_attributes: Dict[StrictStr, StrictBool]
data_license_attributes: DataLicenseAttributes

model_config = ConfigDict(
json_schema_extra={
Expand Down
2 changes: 1 addition & 1 deletion src/disease/version.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
"""Disease normalizer version"""
__version__ = "0.4.0.dev2"
__version__ = "0.4.0.dev3"
8 changes: 3 additions & 5 deletions tests/unit/test_mondo.py
Original file line number Diff line number Diff line change
Expand Up @@ -304,8 +304,6 @@ def test_meta(mondo):
== "https://mondo.monarchinitiative.org/pages/download/"
)
assert response.source_meta_.rdp_url == "http://reusabledata.org/monarch.html"
assert response.source_meta_.data_license_attributes == {
"non_commercial": False,
"share_alike": False,
"attribution": True,
}
assert not response.source_meta_.data_license_attributes.non_commercial
assert not response.source_meta_.data_license_attributes.share_alike
assert response.source_meta_.data_license_attributes.attribution
8 changes: 3 additions & 5 deletions tests/unit/test_ncit.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,8 +142,6 @@ def test_meta(ncit):
response.source_meta_.data_url == "https://evs.nci.nih.gov/ftp1/NCI_Thesaurus/"
)
assert response.source_meta_.rdp_url == "http://reusabledata.org/ncit.html"
assert response.source_meta_.data_license_attributes == {
"non_commercial": False,
"share_alike": False,
"attribution": True,
}
assert not response.source_meta_.data_license_attributes.non_commercial
assert not response.source_meta_.data_license_attributes.share_alike
assert response.source_meta_.data_license_attributes.attribution
8 changes: 3 additions & 5 deletions tests/unit/test_omim.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,8 +130,6 @@ def test_meta(omim):
assert re.match(r"\d{4}-\d{2}-\d{2}", response.source_meta_.version)
assert response.source_meta_.data_url == "https://www.omim.org/downloads"
assert response.source_meta_.rdp_url == "http://reusabledata.org/omim.html"
assert response.source_meta_.data_license_attributes == {
"non_commercial": False,
"share_alike": True,
"attribution": True,
}
assert not response.source_meta_.data_license_attributes.non_commercial
assert response.source_meta_.data_license_attributes.share_alike
assert response.source_meta_.data_license_attributes.attribution
8 changes: 3 additions & 5 deletions tests/unit/test_oncotree.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,8 +109,6 @@ def test_meta(oncotree):
assert response.source_meta_.version == "2021-11-02"
assert response.source_meta_.data_url == "http://oncotree.mskcc.org/#/home?tab=api"
assert response.source_meta_.rdp_url is None
assert response.source_meta_.data_license_attributes == {
"non_commercial": False,
"share_alike": False,
"attribution": True,
}
assert not response.source_meta_.data_license_attributes.non_commercial
assert not response.source_meta_.data_license_attributes.share_alike
assert response.source_meta_.data_license_attributes.attribution

0 comments on commit 355a587

Please sign in to comment.