Add pandas tests; unify the json_path approach in cve_dset and cpe_dset

adamjanovsky · adamjanovsky · commit 8482d827f78b · 2022-11-10T09:51:43.000+01:00
diff --git a/notebooks/examples/common_criteria.ipynb b/notebooks/examples/common_criteria.ipynb
@@ -194,7 +194,7 @@
    "outputs": [],
    "source": [
     "# Automatically match CPEs and CVEs\n",
-    "_, cpe_dset, _ = dset.compute_cpe_heuristics()\n",
+    "dset.compute_cpe_heuristics()\n",
     "dset.compute_related_cves()"
    ]
   },
diff --git a/sec_certs/dataset/common_criteria.py b/sec_certs/dataset/common_criteria.py
@@ -188,6 +188,7 @@ def from_web_latest(cls) -> "CCDataset":
         return cls.from_web(config.cc_latest_snapshot, "Downloading CC Dataset", "cc_latest_dataset.json")
 
     def _set_local_paths(self):
+        super()._set_local_paths()
         for cert in self:
             cert.set_local_paths(self.reports_pdf_dir, self.targets_pdf_dir, self.reports_txt_dir, self.targets_txt_dir)
 
diff --git a/sec_certs/dataset/cpe.py b/sec_certs/dataset/cpe.py
@@ -4,9 +4,8 @@
 import tempfile
 import xml.etree.ElementTree as ET
 import zipfile
-from dataclasses import InitVar, dataclass, field
 from pathlib import Path
-from typing import Any, ClassVar, Dict, Iterator, List, Set, Tuple, Union, cast
+from typing import Any, ClassVar, Dict, Iterator, List, Optional, Set, Tuple, Union, cast
 
 import pandas as pd
 
@@ -19,29 +18,39 @@
 logger = logging.getLogger(__name__)
 
 
-@dataclass
 class CPEDataset(ComplexSerializableType):
     """
     Dataset of CPE records. Includes look-up dictionaries for fast search.
     """
 
-    was_enhanced_with_vuln_cpes: bool
-    json_path: Path
-    cpes: Dict[str, CPE]
-    vendor_to_versions: Dict[str, Set[str]] = field(
-        init=False, default_factory=dict
-    )  # Look-up dict cpe_vendor: list of viable versions
-    vendor_version_to_cpe: Dict[Tuple[str, str], Set[CPE]] = field(
-        init=False, default_factory=dict
-    )  # Look-up dict (cpe_vendor, cpe_version): List of viable cpe items
-    title_to_cpes: Dict[str, Set[CPE]] = field(
-        init=False, default_factory=dict
-    )  # Look-up dict title: List of cert items
-    vendors: Set[str] = field(init=False, default_factory=set)
-
-    init_lookup_dicts: InitVar[bool] = True
-    cpe_xml_basename: ClassVar[str] = "official-cpe-dictionary_v2.3.xml"
-    cpe_url: ClassVar[str] = "https://nvd.nist.gov/feeds/xml/cpe/dictionary/" + cpe_xml_basename + ".zip"
+    CPE_XML_BASENAME: ClassVar[str] = "official-cpe-dictionary_v2.3.xml"
+    CPE_URL: ClassVar[str] = "https://nvd.nist.gov/feeds/xml/cpe/dictionary/" + CPE_XML_BASENAME + ".zip"
+
+    def __init__(
+        self,
+        was_enhanced_with_vuln_cpes: bool,
+        cpes: Dict[str, CPE],
+        json_path: Optional[Union[str, Path]] = None,
+    ):
+        self.was_enhanced_with_vuln_cpes = was_enhanced_with_vuln_cpes
+        self.cpes = cpes
+        self._json_path = Path(json_path) if json_path else Path.cwd() / (type(self).__name__).lower()
+
+        self.vendor_to_versions: Dict[str, Set[str]] = dict()
+        self.vendor_version_to_cpe: Dict[Tuple[str, str], Set[CPE]] = dict()
+        self.title_to_cpes: Dict[str, Set[CPE]] = dict()
+        self.vendors: Set[str] = set()
+
+        self.build_lookup_dicts()
+
+    @property
+    def json_path(self) -> Path:
+        return self._json_path
+
+    @json_path.setter
+    def json_path(self, new_json_path: Union[str, Path]) -> None:
+        self._json_path = Path(new_json_path)
+        self.to_json()
 
     def __iter__(self) -> Iterator[CPE]:
         yield from self.cpes.values()
@@ -65,11 +74,7 @@ def __eq__(self, other: object) -> bool:
 
     @property
     def serialized_attributes(self) -> List[str]:
-        return ["was_enhanced_with_vuln_cpes", "json_path", "cpes"]
-
-    def __post_init__(self, init_lookup_dicts: bool):
-        if init_lookup_dicts:
-            self.build_lookup_dicts()
+        return ["was_enhanced_with_vuln_cpes", "cpes"]
 
     def build_lookup_dicts(self) -> None:
         """
@@ -94,28 +99,25 @@ def build_lookup_dicts(self) -> None:
                     self.title_to_cpes[cpe.title].add(cpe)
 
     @classmethod
-    def from_web(cls, json_path: Union[str, Path], init_lookup_dicts: bool = True) -> "CPEDataset":
+    def from_web(cls, json_path: Optional[Union[str, Path]] = None) -> "CPEDataset":
         """
         Creates CPEDataset from NIST resources published on-line
 
         :param Union[str, Path] json_path: Path to store the dataset to
-        :param bool init_lookup_dicts: If dictionaries for fast matching should be computed, defaults to True
         :return CPEDataset: The resulting dataset
         """
         with tempfile.TemporaryDirectory() as tmp_dir:
-            xml_path = Path(tmp_dir) / cls.cpe_xml_basename
-            zip_path = Path(tmp_dir) / (cls.cpe_xml_basename + ".zip")
-            helpers.download_file(cls.cpe_url, zip_path)
+            xml_path = Path(tmp_dir) / cls.CPE_XML_BASENAME
+            zip_path = Path(tmp_dir) / (cls.CPE_XML_BASENAME + ".zip")
+            helpers.download_file(cls.CPE_URL, zip_path)
 
             with zipfile.ZipFile(zip_path, "r") as zip_ref:
                 zip_ref.extractall(tmp_dir)
 
-            return cls._from_xml(xml_path, json_path, init_lookup_dicts)
+            return cls._from_xml(xml_path, json_path)
 
     @classmethod
-    def _from_xml(
-        cls, xml_path: Union[str, Path], json_path: Union[str, Path], init_lookup_dicts: bool = True
-    ) -> "CPEDataset":
+    def _from_xml(cls, xml_path: Union[str, Path], json_path: Optional[Union[str, Path]] = None) -> "CPEDataset":
         logger.info("Loading CPE dataset from XML.")
         root = ET.parse(xml_path).getroot()
         dct = {}
@@ -136,7 +138,7 @@ def _from_xml(
 
             dct[cpe_uri] = cached_cpe(cpe_uri, title)
 
-        return cls(False, Path(json_path), dct, init_lookup_dicts)
+        return cls(False, dct, json_path)
 
     @classmethod
     def from_json(cls, input_path: Union[str, Path]) -> "CPEDataset":
@@ -147,19 +149,22 @@ def from_json(cls, input_path: Union[str, Path]) -> "CPEDataset":
         :return CPEDataset: the resulting dataset.
         """
         dset = cast("CPEDataset", ComplexSerializableType.from_json(input_path))
-        dset.json_path = Path(input_path)
+        dset._json_path = Path(input_path)
         return dset
 
     @classmethod
-    def from_dict(cls, dct: Dict[str, Any], init_lookup_dicts: bool = True) -> "CPEDataset":
+    def from_dict(cls, dct: Dict[str, Any]) -> "CPEDataset":
         """
         Loads dataset from dictionary.
 
         :param Dict[str, Any] dct: Dictionary that holds the dataset
-        :param bool init_lookup_dicts: Whether look-up dicts should be computed as a part of initialization, defaults to True
         :return CPEDataset: the resulting dataset.
         """
-        return cls(dct["was_enhanced_with_vuln_cpes"], Path("../"), dct["cpes"], init_lookup_dicts)
+        return cls(
+            dct["was_enhanced_with_vuln_cpes"],
+            dct["cpes"],
+            Path("../"),
+        )
 
     def to_pandas(self) -> pd.DataFrame:
         """
diff --git a/sec_certs/dataset/cve.py b/sec_certs/dataset/cve.py
@@ -6,9 +6,8 @@
 import shutil
 import tempfile
 import zipfile
-from dataclasses import dataclass, field
 from pathlib import Path
-from typing import Dict, Final, List, Optional, Set, Union
+from typing import ClassVar, Dict, List, Optional, Set, Union
 
 import numpy as np
 import pandas as pd
@@ -18,18 +17,29 @@
 from sec_certs.config.configuration import config
 from sec_certs.sample.cpe import CPE, cached_cpe
 from sec_certs.sample.cve import CVE
-from sec_certs.serialization.json import ComplexSerializableType, CustomJSONDecoder, CustomJSONEncoder
+from sec_certs.serialization.json import ComplexSerializableType, CustomJSONDecoder
 from sec_certs.utils.parallel_processing import process_parallel
 
 logger = logging.getLogger(__name__)
 
 
-@dataclass
 class CVEDataset(ComplexSerializableType):
-    cves: Dict[str, CVE]
-    cpe_to_cve_ids_lookup: Dict[str, Set[str]] = field(init=False)
-    cve_url: Final[str] = "https://nvd.nist.gov/feeds/json/cve/1.1/nvdcve-1.1-"
-    cpe_match_feed_url: Final[str] = "https://nvd.nist.gov/feeds/json/cpematch/1.0/nvdcpematch-1.0.json.zip"
+    CVE_URL: ClassVar[str] = "https://nvd.nist.gov/feeds/json/cve/1.1/nvdcve-1.1-"
+    CPE_MATCH_FEED_URL: ClassVar[str] = "https://nvd.nist.gov/feeds/json/cpematch/1.0/nvdcpematch-1.0.json.zip"
+
+    def __init__(self, cves: Dict[str, CVE], json_path: Optional[Union[str, Path]] = None):
+        self.cves = cves
+        self._json_path = Path(json_path) if json_path else Path.cwd() / (type(self).__name__).lower()
+        self.cpe_to_cve_ids_lookup: Dict[str, Set[str]] = dict()
+
+    @property
+    def json_path(self) -> Path:
+        return self._json_path
+
+    @json_path.setter
+    def json_path(self, new_json_path: Union[str, Path]) -> None:
+        self._json_path = Path(new_json_path)
+        self.to_json()
 
     @property
     def serialized_attributes(self) -> List[str]:
@@ -89,7 +99,7 @@ def download_cves(cls, output_path_str: str, start_year: int, end_year: int):
         if not output_path.exists:
             output_path.mkdir()
 
-        urls = [cls.cve_url + str(x) + ".json.zip" for x in range(start_year, end_year + 1)]
+        urls = [cls.CVE_URL + str(x) + ".json.zip" for x in range(start_year, end_year + 1)]
 
         logger.info(f"Identified {len(urls)} CVE files to fetch from nist.gov. Downloading them into {output_path}")
         with tempfile.TemporaryDirectory() as tmp_dir:
@@ -113,7 +123,12 @@ def from_nist_json(cls, input_path: str) -> "CVEDataset":
         return cls({x.cve_id: x for x in cves})
 
     @classmethod
-    def from_web(cls, start_year: int = 2002, end_year: int = datetime.datetime.now().year):
+    def from_web(
+        cls,
+        start_year: int = 2002,
+        end_year: int = datetime.datetime.now().year,
+        json_path: Optional[Union[str, Path]] = None,
+    ):
         logger.info("Building CVE dataset from nist.gov website.")
         with tempfile.TemporaryDirectory() as tmp_dir:
             cls.download_cves(tmp_dir, start_year, end_year)
@@ -131,20 +146,13 @@ def from_web(cls, start_year: int = 2002, end_year: int = datetime.datetime.now(
             for r in results:
                 all_cves.update(r.cves)
 
-        return cls(all_cves)
-
-    def to_json(self, output_path: Optional[Union[str, Path]] = None):
-        if output_path is None:
-            raise RuntimeError(
-                f"You tried to serialize an object ({type(self)}) that does not have implicit json path. Please provide json_path."
-            )
-        with Path(output_path).open("w") as handle:
-            json.dump(self, handle, indent=4, cls=CustomJSONEncoder, ensure_ascii=False)
+        return cls(all_cves, json_path=json_path)
 
     @classmethod
     def from_json(cls, input_path: Union[str, Path]):
         with Path(input_path).open("r") as handle:
             dset = json.load(handle, cls=CustomJSONDecoder)
+        dset._json_path = input_path
         return dset
 
     def get_cve_ids_for_cpe_uri(self, cpe_uri: str) -> Optional[Set[str]]:
@@ -204,10 +212,10 @@ def parse_values_cpe(field: Dict) -> List[CPE]:
         if not input_filepath or not input_filepath.is_file():
             logger.debug("NIST mapping file not available, going to download.")
             with tempfile.TemporaryDirectory() as tmp_dir:
-                filename = Path(self.cpe_match_feed_url).name
+                filename = Path(self.CPE_MATCH_FEED_URL).name
                 download_path = Path(tmp_dir) / filename
                 unzipped_path = Path(tmp_dir) / filename.rstrip(".zip")
-                helpers.download_file(self.cpe_match_feed_url, download_path)
+                helpers.download_file(self.CPE_MATCH_FEED_URL, download_path)
 
                 with zipfile.ZipFile(download_path, "r") as zip_handle:
                     zip_handle.extractall(tmp_dir)
diff --git a/sec_certs/dataset/dataset.py b/sec_certs/dataset/dataset.py
@@ -182,7 +182,10 @@ def from_json(cls: Type[DatasetSubType], input_path: Union[str, Path]) -> Datase
         return dset
 
     def _set_local_paths(self) -> None:
-        raise NotImplementedError("Not meant to be implemented by the base class.")
+        if self.auxillary_datasets.cpe_dset:
+            self.auxillary_datasets.cpe_dset.json_path = self.cpe_dataset_path
+        if self.auxillary_datasets.cve_dset:
+            self.auxillary_datasets.cve_dset.json_path = self.cve_dataset_path
 
     # Workaround from https://peps.python.org/pep-0673/ applied.
     def _copy_dataset_contents(self: DatasetSubType, old_dset: DatasetSubType) -> None:
@@ -246,14 +249,14 @@ def _download_parallel(urls: Collection[str], paths: Collection[Path], prune_cor
                     logger.error(f"Corrupted file at: {p}")
                     p.unlink()
 
-    def _prepare_cpe_dataset(self, download_fresh_cpes: bool = False, init_lookup_dicts: bool = True) -> CPEDataset:
+    def _prepare_cpe_dataset(self, download_fresh_cpes: bool = False) -> CPEDataset:
         logger.info("Preparing CPE dataset.")
         if not self.auxillary_datasets_dir.exists():
             self.auxillary_datasets_dir.mkdir(parents=True)
 
         if not self.cpe_dataset_path.exists() or download_fresh_cpes is True:
-            cpe_dataset = CPEDataset.from_web(self.cpe_dataset_path, init_lookup_dicts)
-            cpe_dataset.to_json(str(self.cpe_dataset_path))
+            cpe_dataset = CPEDataset.from_web(self.cpe_dataset_path)
+            cpe_dataset.to_json()
         else:
             cpe_dataset = CPEDataset.from_json(str(self.cpe_dataset_path))
 
@@ -267,10 +270,10 @@ def _prepare_cve_dataset(
             self.auxillary_datasets_dir.mkdir(parents=True)
 
         if not self.cve_dataset_path.exists() or download_fresh_cves is True:
-            cve_dataset = CVEDataset.from_web()
-            cve_dataset.to_json(str(self.cve_dataset_path))
+            cve_dataset = CVEDataset.from_web(json_path=self.cve_dataset_path)
+            cve_dataset.to_json()
         else:
-            cve_dataset = CVEDataset.from_json(str(self.cve_dataset_path))
+            cve_dataset = CVEDataset.from_json(self.cve_dataset_path)
 
         cve_dataset.build_lookup_dict(use_nist_cpe_matching_dict, self.nist_cve_cpe_matching_dset_path)
         return cve_dataset
@@ -307,8 +310,8 @@ def filter_condition(cpe: CPE) -> bool:
             return True
 
         logger.info("Computing heuristics: Finding CPE matches for certificates")
-        self.auxillary_datasets.cpe_dset = self._prepare_cpe_dataset(download_fresh_cpes, init_lookup_dicts=False)
-        self.auxillary_datasets.cpe_dset.build_lookup_dicts()
+        if not self.auxillary_datasets.cpe_dset or download_fresh_cpes:
+            self.auxillary_datasets.cpe_dset = self._prepare_cpe_dataset(download_fresh_cpes)
 
         # Temporarily disabled, see: https://github.com/crocs-muni/sec-certs/issues/173
         # if not cpe_dset.was_enhanced_with_vuln_cpes:
diff --git a/sec_certs/sample/cpe.py b/sec_certs/sample/cpe.py
@@ -26,8 +26,6 @@ class CPE(PandasSerializableType, ComplexSerializableType):
         "item_name",
         "version",
         "title",
-        "start_version",
-        "end_version",
     ]
 
     def __init__(
diff --git a/tests/cc/test_cc_analysis.py b/tests/cc/test_cc_analysis.py
@@ -50,7 +50,7 @@ def cpes(cpe_single_sign_on: CPE) -> Set[CPE]:
 
 @pytest.fixture(scope="module")
 def cpe_dset(cpes: Set[CPE]) -> CPEDataset:
-    return CPEDataset(True, Path("../"), {x.uri: x for x in cpes})
+    return CPEDataset(False, {x.uri: x for x in cpes}, Path("../"))
 
 
 @pytest.fixture(scope="module")
diff --git a/tests/cc/test_cc_dataset.py b/tests/cc/test_cc_dataset.py
@@ -164,3 +164,10 @@ def test_download_csv_html_files():
         for x in dset.active_csv_tuples:
             assert x[1].exists()
             assert x[1].stat().st_size >= constants.MIN_CC_CSV_SIZE
+
+
+def test_to_pandas(toy_dataset: CCDataset):
+    df = toy_dataset.to_pandas()
+    assert df.shape == (len(toy_dataset), len(CommonCriteriaCert.pandas_columns))
+    assert df.index.name == "dgst"
+    assert set(df.columns) == (set(CommonCriteriaCert.pandas_columns).union({"year_from"})) - {"dgst"}
diff --git a/tests/cc/test_cc_maintenance_updates.py b/tests/cc/test_cc_maintenance_updates.py
@@ -0,0 +1,30 @@
+def test_methods_not_meant_to_be_implemented():
+    pass
+
+
+def test_download_artifacts():
+    pass
+
+
+def test_convert_artifacts():
+    pass
+
+
+def test_extract_data():
+    pass
+
+
+def test_to_json():
+    pass
+
+
+def test_from_json():
+    pass
+
+
+def test_to_pandas():
+    pass
+
+
+def test_from_web():
+    pass
diff --git a/tests/cc/test_cc_schemes.py b/tests/cc/test_cc_schemes.py
diff --git a/tests/fips/test_fips_dataset.py b/tests/fips/test_fips_dataset.py
diff --git a/tests/test_cpe.py b/tests/test_cpe.py
diff --git a/tests/test_cve.py b/tests/test_cve.py

Original file line number	Diff line number	Diff line change
`@@ -194,7 +194,7 @@`
`194`	`194`	`"outputs": [],`
`195`	`195`	`"source": [`
`196`	`196`	`"# Automatically match CPEs and CVEs\n",`
`197`		`- "_, cpe_dset, _ = dset.compute_cpe_heuristics()\n",`
	`197`	`+ "dset.compute_cpe_heuristics()\n",`
`198`	`198`	`"dset.compute_related_cves()"`
`199`	`199`	`]`
`200`	`200`	`},`
Original file line number	Diff line number	Diff line change
`@@ -26,8 +26,6 @@ class CPE(PandasSerializableType, ComplexSerializableType):`
`26`	`26`	`"item_name",`
`27`	`27`	`"version",`
`28`	`28`	`"title",`
`29`		`- "start_version",`
`30`		`- "end_version",`
`31`	`29`	`]`
`32`	`30`
`33`	`31`	`def __init__(`