diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000..bbc40ac31e --- /dev/null +++ b/.gitignore @@ -0,0 +1,332 @@ +.idea +build +dist +nlu.egg-info +.sass-cache +personal_notes.md +/tests/nlu_hc_tests/secrets.py +/tests/nlu_hc_tests/spark_nlp_for_healthcare.json +tmp +# Created by https://www.gitignore.io/api/sbt,java,scala,python,eclipse,intellij,intellij+all + +### Eclipse ### + +.metadata +bin/ +tmp/ +*.tmp +*.bak +*.swp +*~.nib +local.properties +.settings/ +.loadpath +.recommenders +PubMed* +*cache_pretrained* +*.crc +*.sst +_SUCCESS* +*stages* +*auxdata* +# External tool builders +.externalToolBuilders/ + +# Locally stored "Eclipse launch configurations" +*.launch + +# PyDev specific (Python IDE for Eclipse) +*.pydevproject + +# CDT-specific (C/C++ Development Tooling) +.cproject + +# Java annotation processor (APT) +.factorypath + +# PDT-specific (PHP Development Tools) +.buildpath + +# sbteclipse plugin +.target + +# Tern plugin +.tern-project + +# TeXlipse plugin +.texlipse + +# STS (Spring Tool Suite) +.springBeans + +# Code Recommenders +.recommenders/ + +# Scala IDE specific (Scala & Java development for Eclipse) +.cache-main +.scala_dependencies +.worksheet + +### Eclipse Patch ### +# Eclipse Core +.project + +# JDT-specific (Eclipse Java Development Tools) +.classpath + +### Intellij ### +# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm +# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 + +# User-specific stuff: +.idea/**/workspace.xml +.idea/**/tasks.xml +.idea/dictionaries + +# Sensitive or high-churn files: +.idea/**/dataSources/ +.idea/**/dataSources.ids +.idea/**/dataSources.xml +.idea/**/dataSources.local.xml +.idea/**/sqlDataSources.xml +.idea/**/dynamic.xml +.idea/**/uiDesigner.xml + +# Gradle: +.idea/**/gradle.xml +.idea/**/libraries + +# CMake +cmake-build-debug/ + +# Mongo Explorer plugin: +.idea/**/mongoSettings.xml + +## File-based project format: +*.iws + +## Plugin-specific files: + +# IntelliJ +/out/ + +# mpeltonen/sbt-idea plugin +.idea_modules/ + +# JIRA plugin +atlassian-ide-plugin.xml + +# Cursive Clojure plugin +.idea/replstate.xml + +# Crashlytics plugin (for Android Studio and IntelliJ) +com_crashlytics_export_strings.xml +crashlytics.properties +crashlytics-build.properties +fabric.properties + +### Intellij Patch ### +# Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721 + +*.iml +# modules.xml +# .idea/misc.xml +# *.ipr + +# Sonarlint plugin +.idea/sonarlint + +### Intellij+all ### +# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm +# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 + +# User-specific stuff: + +# Sensitive or high-churn files: + +# Gradle: + +# CMake + +# Mongo Explorer plugin: + +## File-based project format: + +## Plugin-specific files: + +# IntelliJ + +# mpeltonen/sbt-idea plugin + +# JIRA plugin + +# Cursive Clojure plugin + +# Crashlytics plugin (for Android Studio and IntelliJ) + +### Intellij+all Patch ### +# Ignores the whole idea jsl_folder +# See https://github.com/joeblau/gitignore.io/issues/186 and https://github.com/joeblau/gitignore.io/issues/360 + +.idea/ + +### Java ### +# Compiled class file +*.class + +# Log file +*.log + +# BlueJ files +*.ctxt + +# Mobile Tools for Java (J2ME) +.mtj.tmp/ + +# Package Files # +*.jar +*.war +*.ear +*.zip +*.tar.gz +*.rar + +# virtual machine crash 
logs, see http://www.java.com/en/download/help/error_hotspot.xml +hs_err_pid* + +### Python ### +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +python/lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ + +# Translations +*.mo +*.pot + +# Django stuff: +local_settings.py + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ + +### SBT ### +# Simple Build Tool +# http://www.scala-sbt.org/release/docs/Getting-Started/Directories.html#configuring-version-control + +dist/* +lib_managed/ +src_managed/ +project/boot/ +project/plugins/project/ +.history +.lib/ + +### Scala ### + +# End of https://www.gitignore.io/api/sbt,java,scala,python,eclipse,intellij,intellij+all + +### Local ### +tmp_pipeline/ +tmp_symspell/ +test-output-tmp/ +spark-warehouse/ +/python/python.iml +test_crf_pipeline/ +test_*_pipeline/ +*metastore_db* +python/src/ +python/tensorflow/bert/models/** +**/.DS_Store +**/tmp_* +docs/_site/** +docs/.sass-cache/** + +tst_shortcut_sd/ +src/*/resources/*.classes \ No newline at end of file diff --git a/README.md b/README.md index a779c6f949..b0f80454c7 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,4 @@ # johnsnowlabs -John Snow Labs NLP Library +The John Snow Labs Library gives you access to all of John Snow Labs Enterprise and Open Source products in one simple, unified package. Access 10,000+ state-of-the-art NLP and OCR models for the Finance, Legal and Medical domains. It scales easily to a Spark cluster.
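A minimal quick-start sketch (the functions below are the ones this package's `__init__.py` exports; the licensed products additionally assume a valid license):

```python
from johnsnowlabs import jsl

jsl.install()        # authorize via browser and install the licensed + open-source libraries
spark = jsl.start()  # start a Spark session with all installed jars on the classpath
```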
+See https://www.johnsnowlabs.com/ and [the docs](https://nlu.johnsnowlabs.com/docs/en/install) for more details \ No newline at end of file diff --git a/johnsnowlabs/__init__.py b/johnsnowlabs/__init__.py new file mode 100644 index 0000000000..75f5890de5 --- /dev/null +++ b/johnsnowlabs/__init__.py @@ -0,0 +1,17 @@ +from .auto_install.health_checks.report import check_health, list_remote_licenses, list_local_licenses +from .utils.sparksession_utils import start +from .auto_install.install_flow import install +# get helpers into global space +from johnsnowlabs import medical, nlp, ocr, settings, viz, finance, legal +import johnsnowlabs as jsl + +# databricks +from johnsnowlabs.auto_install.databricks.work_utils import run_in_databricks +from johnsnowlabs.nlp import * + + +def new_version_online(): + from .utils.pip_utils import get_latest_lib_version_on_pypi + # We are outdated if the current version does not match the latest on PyPI + from .auto_install.softwares import Software + return settings.raw_version_jsl_lib != get_latest_lib_version_on_pypi(Software.jsl_lib.pypi_name) diff --git a/johnsnowlabs/abstract_base/__init__.py b/johnsnowlabs/abstract_base/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/johnsnowlabs/abstract_base/base_enum.py b/johnsnowlabs/abstract_base/base_enum.py new file mode 100644 index 0000000000..927e73866c --- /dev/null +++ b/johnsnowlabs/abstract_base/base_enum.py @@ -0,0 +1,19 @@ +from enum import Enum, EnumMeta + + +class MetaEnum(EnumMeta): + + def __contains__(cls, item): + try: + cls(item) + except ValueError: + return False + return True + + def from_name(cls, name): + return cls._member_map_[name] + + +class BaseEnum(Enum, metaclass=MetaEnum): + # Enums extending this support the `in` keyword for enum values + pass
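Since the `in` check is the whole point of this metaclass, a minimal sketch of how a subclass behaves (the `Color` enum is invented for illustration):

```python
from johnsnowlabs.abstract_base.base_enum import BaseEnum


class Color(BaseEnum):
    red = 'red'
    blue = 'blue'


print('red' in Color)           # True  -> MetaEnum.__contains__ tries Color('red')
print('green' in Color)         # False -> Color('green') raises ValueError
print(Color.from_name('blue'))  # Color.blue, looked up via _member_map_
```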
diff --git a/johnsnowlabs/abstract_base/lib_resolver.py b/johnsnowlabs/abstract_base/lib_resolver.py new file mode 100644 index 0000000000..de7f84c18e --- /dev/null +++ b/johnsnowlabs/abstract_base/lib_resolver.py @@ -0,0 +1,200 @@ +import importlib +from abc import ABC +from typing import Dict, Union + +from johnsnowlabs.py_models.url_dependency import UrlDependency +from johnsnowlabs.utils.enums import * +from johnsnowlabs.py_models.lib_version import LibVersion + + +def is_spark_version_env(spark_version: str) -> bool: + import pyspark + env_spark_version = pyspark.__version__[0:-2].replace(".", "") + return spark_version in env_spark_version + + +def try_import_lib(lib: str, print_failure=False): + try: + importlib.import_module(lib) + return True + except Exception as _: + if print_failure: + print(f'Failed to import {lib}. Seems not installed.') + return False + + +class Py4JJslLibDependencyResolverABC(ABC): + """ + We define a resolver for all JAR based dependencies. + Each JslLibDependencyResolver util must implement the following : + - get_jar_urls(lib_version,spark_version_to_match,Optional[secret])->UrlDependency + - get_mvn_coordinates(lib_version,spark_version_to_match,Optional[secret])->RepoDependency + - get_python_urls(lib_version,spark_version_to_match,Optional[secret],install_type)->UrlDependency + - get_pypi_identifier(lib_version,spark_version_to_match,Optional[secret],install_type)->RepoDependency + - get_lib_version_i + + INVARIANTS : + 1. Jar/Py dependencies are never Spark-version agnostic. This is why we have to map the pyspark version to jar/py dependencies + + + """ + has_gpu_jars: bool = False + has_cpu_jars: bool = False + has_m1_jar: bool = False + has_py_install: bool = False + has_secret: bool = False + lib_version: LibVersion + product_name: ProductName + # key = Supported Spark Version, Value = Dict with key=m1/cpu/gpu and value = URL / URL formattable with secret + compatible_spark_to_jar_map: Dict[SparkVersion, + Dict[JvmHardwareTarget, UrlDependency]] + + # key = Supported Spark Version, Value = Py Locations + compatible_spark_to_py_map: Dict[SparkVersion, + Dict[PyInstallTypes, UrlDependency]] + + @classmethod + def get_dependency_url(cls, + install_type: Union[JvmHardwareTarget, PyInstallTypes], + spark_version_to_match: Optional[Union[str, LibVersion]], + secret: Optional[str] = None, + ) -> UrlDependency: + + if install_type in PyInstallTypes: + return cls.get_py_urls( + secret=secret, spark_version_to_match=spark_version_to_match, install_type=install_type) + elif install_type in JvmHardwareTarget: + return cls.get_jar_urls( + secret=secret, spark_version_to_match=spark_version_to_match, hardware_target=install_type) + else: + raise ValueError(f'Invalid Install type = {install_type}, must be JvmHardwareTarget or PyInstallTypes') + + @classmethod + def get_url_from_compat_map(cls, + compat_map: + Dict[SparkVersion, + Dict[Union[JvmHardwareTarget, PyInstallTypes], + UrlDependency]], + install_type: Union[JvmHardwareTarget, PyInstallTypes], + spark_version_to_match: Optional[Union[str, LibVersion]], + secret: Optional[str] = None, + suffix: str = '', + ) -> UrlDependency: + + spark_version_to_match = cls.resolve_to_spark_lib_version(spark_version_to_match) + matching_jsl_spark_release = None + for compatible_spark in compat_map.keys(): + compatible_spark: SparkVersion = compatible_spark + if compatible_spark.value.equals(spark_version_to_match): + matching_jsl_spark_release = compatible_spark + + if not matching_jsl_spark_release: + # TODO make a special exception type and catch it?
# TODO nicer print + raise Exception( + f'{cls.product_name.value} does not have any install candidates for' + f' pyspark=={spark_version_to_match.as_str()} \n' + f'Please install one of the following Pyspark versions: ' + f'{", ".join(v.value.as_str() for v in compat_map.keys())}') + + if cls.has_secret: + if settings.enforce_versions: + url = compat_map[matching_jsl_spark_release][install_type].url.format( + secret=secret, lib_version=cls.lib_version.as_str()) + else: + # Read Version from secret + url = compat_map[matching_jsl_spark_release][install_type].url.format( + secret=secret, lib_version=secret.split('-')[0]) + + else: + if settings.enforce_versions: + url = compat_map[matching_jsl_spark_release][install_type].url.format( + lib_version=cls.lib_version.as_str()) + else: + # read from updated settings instead of already instantiated Objects, which will not reflect setting updates + url = compat_map[matching_jsl_spark_release][install_type].url.format( + lib_version=LatestCompatibleProductVersion.from_settings(cls.product_name)) + url = url.replace(cls.lib_version.as_str(), LatestCompatibleProductVersion.from_settings(cls.product_name)) + + # NAME-VERSION-INSTALL_TYPE-for-spark-SPARK_VERSION.[jar/tar/wheel] + name = f'{cls.product_name.value}-{cls.lib_version.as_str()}' \ + f'-{install_type.name}-for-spark-{matching_jsl_spark_release.value.as_str()}.{suffix}' + return UrlDependency(url=url, + dependency_type=install_type, + spark_version=matching_jsl_spark_release, + dependency_version=cls.lib_version, + file_name=name, + product_name=cls.product_name.value) + + @classmethod + def get_jar_urls(cls, + secret: Optional[str] = None, + spark_version_to_match: Optional[Union[str, LibVersion]] = None, + hardware_target: JvmHardwareTarget = JvmHardwareTarget.cpu, + ) -> UrlDependency: + """ + Get the jar URL for a hardware target. + The jar location is formatted with secret + lib_version for licensed products and + only with lib_version for open-source products. + """ + if hardware_target == JvmHardwareTarget.cpu and not cls.has_cpu_jars: + raise Exception(f'{cls.product_name.value} has no CPU Jars!') + + if hardware_target == JvmHardwareTarget.gpu and not cls.has_gpu_jars: + raise Exception(f'{cls.product_name.value} has no GPU Jars!') + + if hardware_target == JvmHardwareTarget.m1 and not cls.has_m1_jar: + raise Exception(f'{cls.product_name.value} has no M1 Jars!') + + return cls.get_url_from_compat_map(compat_map=cls.compatible_spark_to_jar_map, + install_type=hardware_target, + spark_version_to_match=spark_version_to_match, + secret=secret, suffix='jar') + + @classmethod + def get_py_urls(cls, + secret: Optional[str] = None, + spark_version_to_match: Optional[Union[str, LibVersion]] = None, + install_type: PyInstallTypes = PyInstallTypes.wheel) -> UrlDependency: + if not cls.has_py_install: + raise Exception(f'{cls.product_name.value} has no Py Dependencies!') + return cls.get_url_from_compat_map(compat_map=cls.compatible_spark_to_py_map, + install_type=install_type, + spark_version_to_match=spark_version_to_match, + secret=secret, suffix=install_type.value) + + @classmethod + def resolve_to_spark_lib_version(cls, spark_version_to_match) -> LibVersion: + """Get the Pyspark version from the installed pyspark. + If it is not compatible across all libraries, print a warning. + If no pyspark is installed, use the latest pyspark which is compatible across all libs. + Uses LatestCompatibleProductVersion for this + + :return: LibVersionIdentifier for the specific Resolver + """ + + # 1. 
If spark_version_to_match is a string, cast it to LibVersion + if isinstance(spark_version_to_match, str): + # TODO print warning if spark_version_to_match is not compatible across all libs + # in --> Helper func since we need it in get_installed_pyspark_version_or_latest_compatible as well + return LibVersion(spark_version_to_match) + # 2. If spark_version_to_match is already a LibVersion, return it + elif isinstance(spark_version_to_match, LibVersion): + return spark_version_to_match + # 3. If spark_version_to_match is a SparkVersion enum, return its value + elif isinstance(spark_version_to_match, SparkVersion): + return spark_version_to_match.value + # 4. If spark_version_to_match is some other truthy type, raise an exception; should not happen + elif spark_version_to_match: + raise ValueError( + f'Invalid Type for spark_version_to_match, ' + f'must be either str in format A.B.C, None or LibVersion, ' + f'but type is {type(spark_version_to_match)}') + + # 5. Check if pyspark is installed + elif try_import_lib('pyspark'): + import pyspark + # TODO check product wide compatibility for pre-installed pyspark --> make helper method for that + return LibVersion(pyspark.__version__) + # 6. Return the latest compatible pyspark if no other method resolves + else: + return LatestCompatibleProductVersion.pyspark.value diff --git a/johnsnowlabs/abstract_base/pydantic_model.py b/johnsnowlabs/abstract_base/pydantic_model.py new file mode 100644 index 0000000000..97e5c6e92d --- /dev/null +++ b/johnsnowlabs/abstract_base/pydantic_model.py @@ -0,0 +1,25 @@ +from johnsnowlabs import settings + +from johnsnowlabs.abstract_base.base_enum import BaseEnum +from johnsnowlabs.utils.enums import ProductName + +from johnsnowlabs.py_models.lib_version import LibVersion +from pydantic import BaseConfig, BaseModel + +BaseConfig.json_encoders = { + LibVersion: lambda v: v.as_str(), + ProductName: lambda x: x.value, + BaseEnum: lambda x: x.value, +} + + +class WritableBaseModel(BaseModel): + + def write(self, path, *args, **kwargs): + with open(path, 'w') as json_file: + if 'indent' not in kwargs: + kwargs['indent'] = settings.json_indent + json_file.write(self.json(*args, **kwargs)) + + class Config: + arbitrary_types_allowed = True
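The json_encoders hook above is what lets these models round-trip the custom types; a minimal sketch (the `Example` model is invented for illustration):

```python
from johnsnowlabs.abstract_base.pydantic_model import WritableBaseModel
from johnsnowlabs.py_models.lib_version import LibVersion


class Example(WritableBaseModel):
    # Serialized via the LibVersion encoder registered in json_encoders
    version: LibVersion


Example(version=LibVersion('4.2.0')).write('/tmp/example.json')
```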
diff --git a/johnsnowlabs/abstract_base/software_product.py b/johnsnowlabs/abstract_base/software_product.py new file mode 100644 index 0000000000..4e500d5817 --- /dev/null +++ b/johnsnowlabs/abstract_base/software_product.py @@ -0,0 +1,192 @@ +import os +from abc import ABC +from typing import List, Set, Optional, Union +import sys +import pkg_resources +from johnsnowlabs import settings +from johnsnowlabs.auto_install.jsl_home import get_install_suite_from_jsl_home +from johnsnowlabs.py_models.lib_version import LibVersion + +from johnsnowlabs.abstract_base.lib_resolver import Py4JJslLibDependencyResolverABC +from johnsnowlabs.utils.enums import ProductName, ProductLogo, ProductSlogan, \ + SparkVersion +from johnsnowlabs.py_models.primitive import LibVersionIdentifier +from johnsnowlabs.utils.env_utils import try_import, try_import_in_venv +from johnsnowlabs.py_models.jsl_secrets import JslSecrets +from johnsnowlabs.utils.pip_utils import install_standard_pypi_lib, install_licensed_pypi_lib, get_pip_lib_version +from johnsnowlabs.utils.venv_utils import VenvWrapper + + +class AbstractSoftwareProduct(ABC): + """Only first-degree dependencies may be contained in the hard/licensed/optional dependency lists. + Higher-degree dependencies will be resolved by iterating the dependency graph. + By default the ABC implements a check_installed based on import. + + """ + name: ProductName + logo: ProductLogo + slogan: Optional[ProductSlogan] = None + hard_dependencies: Set['AbstractSoftwareProduct'] = set() + licensed_dependencies: Set['AbstractSoftwareProduct'] = set() + optional_dependencies: Set['AbstractSoftwareProduct'] = set() + py_module_name: Optional[str] = None + pypi_name: Optional[str] = None + # Only defined for JSL libs below + compatible_spark_versions: List[SparkVersion] + latest_version: Optional[LibVersion] = None + jsl_url_resolver: Optional[Py4JJslLibDependencyResolverABC] = None + licensed: bool = False + is_py4j = False + pypi_name_databricks: Optional[str] = None + + @classmethod + def get_installed_version_via_import(cls): + return False + + @classmethod + def check_installed(cls, python_exec_path: Optional[str] = sys.executable, download_folder: str = None) -> bool: + if cls.pypi_name and download_folder: + for whl in os.listdir(download_folder): + # wheel file names replace '-' with '_' + if cls.pypi_name.replace('-', '_') in whl: + return True + + if cls.py_module_name and not python_exec_path: + return try_import(cls.py_module_name) + elif python_exec_path: + return VenvWrapper.is_lib_in_py_exec(python_exec_path, cls.py_module_name, False) + + # print(f'Assuming {cls.name} is installed, no checks defined.') + return True + + @classmethod + def check_installed_correct_version(cls, python_exec_path: str = sys.executable, + download_folder: str = None) -> bool: + # Only supported for current Py Exec Path, return True otherwise + if python_exec_path != sys.executable: + return True + if download_folder: + return True + if not cls.pypi_name: + return False + if not cls.latest_version: + return True + if not cls.check_installed(python_exec_path=python_exec_path, download_folder=download_folder): + return False + try: + if pkg_resources.get_distribution(cls.pypi_name).version == cls.latest_version.as_str(): + # print(f'👌 Installed version for {cls.logo + cls.name} is correct, no changes made.') + return True + else: + # print(f'🤓 Installed version for {cls.logo + cls.name} is incorrect, ' + # f'should be {cls.latest_version.as_str()} but is {pkg_resources.get_distribution(cls.pypi_name).version} ' + # f'upgrading the package') + + return False + except Exception as err: + v = get_pip_lib_version(lib=cls.pypi_name, py_exec=python_exec_path) + if v: + return v.as_str() == cls.latest_version.as_str() + return False + + @classmethod + def get_installed_version(cls, python_exec_path: str = sys.executable, + download_folder: str = None, + prefer_pip=False, + fallback_import=False) -> Union[str, bool]: + # Only supported for the current Py Exec Path + if not prefer_pip: + try: + return pkg_resources.get_distribution(cls.pypi_name).version + except: + pass + v = get_pip_lib_version(lib=cls.pypi_name, py_exec=python_exec_path) + if v: + return v.as_str() + else: + return cls.get_installed_version_via_import() + + @classmethod + def check_dependencies(cls, python_exec_path=None) -> bool: + # print(f'Assuming {cls.name} dependencies are fine, no checks defined.') + return True + + @classmethod + def health_check(cls) -> bool: + # print(f'Assuming {cls.name} is ok, no checks defined.') + return True + + @classmethod + def install(cls, + secrets: Optional[JslSecrets] = None, + py_path=sys.executable, + upgrade=True, + re_install=False, + version: Optional[str] = None, + download_folder: Optional[str] = None, + include_dependencies: bool = True, + ) -> bool: + """ + Install the product with 
default settings. + Defaults to a PyPI install; when download_folder is set, files are fetched + via `python -m pip download -d <path>` instead. + """ + if not version and cls.latest_version: + version = cls.latest_version + if cls.pypi_name: + if cls.is_py4j and settings.enforce_versions: + # p4j lib should have jars/wheels for it in ~/.johnsnowlabs + # Try using suite whl before attempting to install from remote location + # Unless we toggle enforce_versions=False + suite = get_install_suite_from_jsl_home() + if cls.name == ProductName.hc.value and suite.hc and suite.hc.py_lib: + return install_standard_pypi_lib(f'{settings.py_dir}/{suite.hc.py_lib.file_name}', + cls.py_module_name, + python_path=py_path, upgrade=upgrade, re_install=re_install, + # version=version, + download_folder=download_folder, + include_dependencies=include_dependencies, + ) + elif cls.name == ProductName.ocr.value and suite.ocr and suite.ocr.py_lib: + return install_standard_pypi_lib(f'{settings.py_dir}/{suite.ocr.py_lib.file_name}', + cls.py_module_name, + python_path=py_path, upgrade=upgrade, re_install=re_install, + # version=version, + download_folder=download_folder, + include_dependencies=include_dependencies, ) + elif cls.name == ProductName.nlp.value and suite.nlp and suite.nlp.py_lib: + return install_standard_pypi_lib(f'{settings.py_dir}/{suite.nlp.py_lib.file_name}', + cls.py_module_name, + python_path=py_path, upgrade=upgrade, re_install=re_install, + # version=version, + download_folder=download_folder, + include_dependencies=include_dependencies, ) + if secrets and cls.licensed: + # Licensed is versioned via the secrets + # Fallback install if we could not find locally + return install_licensed_pypi_lib(secrets=secrets, + pypi_name=cls.pypi_name, + module_name=cls.py_module_name, + product=cls, + py_path=py_path, + download_folder=download_folder, + include_dependencies=include_dependencies, + ) + else: + return install_standard_pypi_lib(cls.pypi_name, cls.py_module_name, + python_path=py_path, upgrade=upgrade, re_install=re_install, + download_folder=download_folder, + version=version, + include_dependencies=include_dependencies, ) + # raise NotImplementedError(f'No install defined for {cls.file_name}') + return True + + @classmethod + def install_cli(cls) -> bool: + """ + Install the product interactively from the CLI. + """ + if cls.pypi_name: + return install_standard_pypi_lib(cls.pypi_name) + raise NotImplementedError(f'No install defined for {cls.name}') diff --git a/johnsnowlabs/auto_install/__init__.py b/johnsnowlabs/auto_install/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/johnsnowlabs/auto_install/databricks/__init__.py b/johnsnowlabs/auto_install/databricks/__init__.py new file mode 100644 index 0000000000..e69de29bb2
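To make the ABC contract concrete, a minimal, hypothetical product definition — the attribute values below are illustrative only; the real catalog lives in `johnsnowlabs/auto_install/softwares.py`:

```python
from johnsnowlabs.abstract_base.software_product import AbstractSoftwareProduct
from johnsnowlabs.utils.enums import ProductName


class MyProduct(AbstractSoftwareProduct):
    # Hypothetical product: check_installed() falls back to probing the module
    name = ProductName.nlp
    pypi_name = 'spark-nlp'
    py_module_name = 'sparknlp'


MyProduct.check_installed()  # True if `sparknlp` is importable in the target interpreter
```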
diff --git a/johnsnowlabs/auto_install/databricks/dbfs.py b/johnsnowlabs/auto_install/databricks/dbfs.py new file mode 100644 index 0000000000..99b7b13843 --- /dev/null +++ b/johnsnowlabs/auto_install/databricks/dbfs.py @@ -0,0 +1,56 @@ +# https://pypi.org/project/databricks-api/ +from johnsnowlabs import settings +from typing import Optional, Tuple, Union, List, Any +from databricks_api import DatabricksAPI +from johnsnowlabs.py_models.install_info import PyInstallInfo, JvmInstallInfo +from johnsnowlabs.utils.file_utils import path_tail + + +def dbfs_file_exists(db: DatabricksAPI, path: str): + try: + dbfs_ls(db, path) + return True + except: + return False + + +def dbfs_ls(db: DatabricksAPI, dbfs_path: str): + return db.dbfs.list(dbfs_path) + + +def dbfs_rm(db: DatabricksAPI, dbfs_path: str, recursive: bool = False, ): + return db.dbfs.delete(dbfs_path, recursive=recursive) + + +def copy_from_local_to_hdfs(db: DatabricksAPI, local_path: str, dbfs_path: str, overwrite: bool = True): + print(f'Copying {local_path} to remote cluster path {dbfs_path}') + db.dbfs.put( + path=dbfs_path, + overwrite=overwrite, + # contents=None, + src_path=local_path, + ) + + +def get_db_path(local_info: Union[JvmInstallInfo, PyInstallInfo, str]): + """Get a deterministic dbfs path for a JvmInstallInfo, a PyInstallInfo or a local file. + Always use this method to generate output file paths for dbfs. + """ + if isinstance(local_info, JvmInstallInfo): + return f'{settings.dbfs_java_dir}/{local_info.file_name}' + elif isinstance(local_info, PyInstallInfo): + # We must add the suffix or Databricks will not pick up the correct version + return f'{settings.dbfs_py_dir}/{local_info.file_name.split(".")[0]}-py2.py3-none-any.whl' + elif isinstance(local_info, str): + + if '.py' in local_info: + return f"{settings.db_py_jobs_dir}/{path_tail(local_info)}" + elif '.jar' in local_info: + return f"{settings.db_jar_jobs_dir}/{path_tail(local_info)}" + elif '.ipynb' in local_info: + return f"{settings.db_py_notebook_dir}/{path_tail(local_info)}" + else: + raise Exception(f'Invalid Job file, file name must contain either .py, .jar or .ipynb, ' + f'but got {local_info}') + else: + raise Exception(f'Invalid install type = {type(local_info)}') diff --git a/johnsnowlabs/auto_install/databricks/install_utils.py b/johnsnowlabs/auto_install/databricks/install_utils.py new file mode 100644 index 0000000000..5d78b2d29d --- /dev/null +++ b/johnsnowlabs/auto_install/databricks/install_utils.py @@ -0,0 +1,240 @@ +import time +from pprint import pprint +import re +from johnsnowlabs.auto_install.softwares import Software + +from johnsnowlabs.py_models.install_info import InstallSuite, LocalPy4JLib, LocalPyLib +from johnsnowlabs.py_models.lib_version import LibVersion +from johnsnowlabs.auto_install import jsl_home +from .dbfs import * + +# https://pypi.org/project/databricks-api/ +from ...utils.enums import DatabricksClusterStates + + +def get_db_client_for_token(host, token) -> DatabricksAPI: + # Get a client via host and token + return DatabricksAPI(host=host, token=token) + + +def get_db_client_for_password(host, email, password) -> DatabricksAPI: + # Get a client via user and password + return DatabricksAPI(host=host, user=email, password=password) + + +def create_cluster( + db: DatabricksAPI, + install_suite: InstallSuite = None, + num_workers=1, + cluster_name=settings.db_cluster_name, + node_type_id=settings.db_node_type_id, + driver_node_type_id=settings.db_driver_node_type, + spark_env_vars=None, + autotermination_minutes=60, + spark_version=settings.db_spark_version, + spark_conf=None, + auto_scale=None, + aws_attributes=None, + ssh_public_keys=None, + custom_tags=None, + cluster_log_conf=None, + enable_elastic_disk=None, + cluster_source=None, + instance_pool_id=None, + headers=None, + block_till_cluster_ready: bool = True, + +) -> str: + if not install_suite: + install_suite = jsl_home.get_install_suite_from_jsl_home() + + default_spark_conf = { + 'spark.kryoserializer.buffer.max': '2000M', + 'spark.serializer': 'org.apache.spark.serializer.KryoSerializer', + } + default_spark_env_vars = dict( + SPARK_NLP_LICENSE=install_suite.secrets.HC_LICENSE, + SPARK_OCR_LICENSE=install_suite.secrets.OCR_LICENSE, + AWS_ACCESS_KEY_ID=install_suite.secrets.AWS_ACCESS_KEY_ID, + AWS_SECRET_ACCESS_KEY=install_suite.secrets.AWS_SECRET_ACCESS_KEY, + ) + + if 
not spark_conf: + spark_conf = default_spark_conf + else: + spark_conf.update(default_spark_conf) + + if not spark_env_vars: + spark_env_vars = default_spark_env_vars + else: + spark_env_vars.update(default_spark_env_vars) + + cluster_id = db.cluster.create_cluster( + num_workers=num_workers, + autoscale=auto_scale, + cluster_name=cluster_name, + spark_version=spark_version, + spark_conf=spark_conf, + aws_attributes=aws_attributes, + node_type_id=node_type_id, + driver_node_type_id=driver_node_type_id, + ssh_public_keys=ssh_public_keys, + custom_tags=custom_tags, + cluster_log_conf=cluster_log_conf, + spark_env_vars=spark_env_vars, + autotermination_minutes=autotermination_minutes, + enable_elastic_disk=enable_elastic_disk, + cluster_source=cluster_source, + instance_pool_id=instance_pool_id, + headers=headers, + )['cluster_id'] + print(f'👌 Created cluster with id={cluster_id} on host={db.client.url}') + install_jsl_suite_to_cluster(db=db, cluster_id=cluster_id, install_suite=install_suite) + if block_till_cluster_ready: + block_till_cluster_ready_state(db, cluster_id) + + return cluster_id + + +def list_db_runtime_versions(db: DatabricksAPI): + versions = db.cluster.list_spark_versions() + # pprint(versions) + for version in versions['versions']: + print(version['key']) + print(version['name']) + # version_regex = r'[0-9].[0-9].[0-9]' + + spark_version = re.findall(r'Apache Spark [0-9].[0-9]', version['name'])[0].lstrip('Apache Spark ') + scala_version = re.findall(r'Scala [0-9].[0-9][0-9]', version['name'])[0].lstrip('Scala ') + has_gpu = len(re.findall('GPU', version['name'])) > 0 + spark_version = spark_version + '.x' + print(LibVersion(spark_version).as_str(), has_gpu, scala_version) + + +def list_clusters(db: DatabricksAPI): + clusters = db.cluster.list_clusters(headers=None) + pprint(clusters) + print(clusters) + return clusters + + +def list_cluster_lib_status(db: DatabricksAPI, cluster_id: str): + lib_statuses = db.managed_library.cluster_status(cluster_id=cluster_id) + # lib_statuses = db.managed_library.all_cluster_statuses() + pprint(lib_statuses) + return lib_statuses + + +def list_node_types(db: DatabricksAPI): + node_types = db.cluster.list_node_types(headers=None) + pprint(node_types) + + +def install_jsl_suite_to_cluster( + db: DatabricksAPI, + cluster_id: str, + install_suite: InstallSuite, + install_optional: bool = True, +): + if install_suite.nlp.get_py_path() and install_suite.nlp.get_java_path(): + install_py4j_lib_via_hdfs(db, cluster_id, install_suite.nlp) + print(f'{Software.spark_nlp.logo + Software.spark_nlp.name} Installed Spark NLP! 
✅') + if install_suite.hc.get_py_path() and install_suite.hc.get_java_path(): + install_py4j_lib_via_hdfs(db, cluster_id, install_suite.hc) + print(f'Installed {Software.spark_hc.logo + Software.spark_hc.name} Spark NLP for Healthcare ✅') + if install_suite.ocr.get_py_path() and install_suite.ocr.get_java_path(): + install_py4j_lib_via_hdfs(db, cluster_id, install_suite.ocr) + print(f'Installed {Software.spark_ocr.logo + Software.spark_ocr.name} Spark OCR ✅') + + py_deps = [Software.nlu.pypi_name, Software.sparknlp_display.pypi_name, Software.jsl_lib.pypi_name_databricks] + for dep in py_deps: + install_py_lib_via_pip(db, cluster_id, dep) + + +def block_till_cluster_ready_state(db: DatabricksAPI, cluster_id: str): + status = None + while status != DatabricksClusterStates.RUNNING: + # https://docs.databricks.com/dev-tools/api/latest/clusters.html#clusterclusterstate + status = DatabricksClusterStates(db.cluster.get_cluster(cluster_id)['state']) + print(f'Cluster-Id={cluster_id} not ready, status={status.value}') + time.sleep(10) + + print(f'👌 Cluster-Id {cluster_id} is ready!') + + +def install_py_lib_via_pip(db: DatabricksAPI, cluster_id: str, pypi_lib: str): + """ + Tell Cluster to install via public pypi + # https://docs.databricks.com/dev-tools/api/latest/libraries.html + https://docs.databricks.com/dev-tools/api/latest/libraries.html#install + :param db: + :param cluster_id: + :param lib: + :return: + """ + # By not defining repo, we will use default pip index + payload = [dict(pypi=dict(package=pypi_lib))] + db.managed_library.install_libraries(cluster_id=cluster_id, libraries=payload) + print(f'Installed {pypi_lib} ✅') + + +def install_py4j_lib_via_hdfs(db: DatabricksAPI, cluster_id: str, lib: LocalPy4JLib): + """ + 1. Copy lib files to HDFS if not present + 2. 
Tell Cluster to install + https://docs.databricks.com/dev-tools/api/latest/libraries.html#install + :param db: + :param cluster_id: + :param lib: + :return: + """ + copy_p4j_lib_to_hdfs_if_not_present(db, lib) + payload = [dict(jar=get_db_path(lib.java_lib)), + dict(whl=get_db_path(lib.py_lib), )] + db.managed_library.install_libraries(cluster_id=cluster_id, libraries=payload) + + +def copy_p4j_lib_to_hdfs_if_not_present(db: DatabricksAPI, lib: LocalPy4JLib): + if not is_lib_on_dbfs_cluster(db, lib.java_lib): + copy_lib_to_dbfs_cluster(db, lib.java_lib) + if not is_lib_on_dbfs_cluster(db, lib.py_lib): + copy_lib_to_dbfs_cluster(db, lib.py_lib) + + +def copy_py_lib_to_hdfs_if_not_present(db: DatabricksAPI, lib: LocalPyLib): + if not is_lib_on_dbfs_cluster(db, lib.py_lib): + copy_lib_to_dbfs_cluster(db, lib.py_lib) + + +def is_lib_on_dbfs_cluster(db: DatabricksAPI, local_info: Union[JvmInstallInfo, PyInstallInfo]): + dbfs_path = get_db_path(local_info) + return dbfs_file_exists(db, dbfs_path) + + +def copy_lib_to_dbfs_cluster(db: DatabricksAPI, local_info: Union[JvmInstallInfo, PyInstallInfo]): + dbfs_path = get_db_path(local_info) + if isinstance(local_info, JvmInstallInfo): + local_path = f'{settings.java_dir}/{local_info.file_name}' + elif isinstance(local_info, PyInstallInfo): + local_path = f'{settings.py_dir}/{local_info.file_name}' + else: + raise Exception(f'Invalid lib install type to copy {type(local_info)}') + return copy_from_local_to_hdfs(db, local_path=local_path, dbfs_path=dbfs_path) + + + +def wait_till_cluster_running(db: DatabricksAPI, cluster_id: str): + # https://docs.databricks.com/dev-tools/api/latest/clusters.html#clusterclusterstate + while True: + time.sleep(5) + status = DatabricksClusterStates(db.cluster.get_cluster(cluster_id)['state']) + if status == DatabricksClusterStates.RUNNING: + return True + elif status in [DatabricksClusterStates.PENDING, DatabricksClusterStates.RESIZING]: + continue + elif status in [DatabricksClusterStates.TERMINATED, DatabricksClusterStates.TERMINATING, + DatabricksClusterStates.ERROR, + DatabricksClusterStates.UNKNOWN]: + return False
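A hedged end-to-end sketch of how these helpers compose — the host and token values below are placeholders:

```python
from johnsnowlabs.auto_install.databricks.install_utils import get_db_client_for_token, create_cluster

# Placeholder host and token for illustration
db = get_db_client_for_token('https://<my-workspace>.cloud.databricks.com', 'dapi<token>')

# Creates the cluster, uploads the JSL jars/wheels from ~/.johnsnowlabs and installs them,
# then blocks until the cluster reports the RUNNING state
cluster_id = create_cluster(db, block_till_cluster_ready=True)
```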
diff --git a/johnsnowlabs/auto_install/databricks/work_utils.py b/johnsnowlabs/auto_install/databricks/work_utils.py new file mode 100644 index 0000000000..8cb6347ad7 --- /dev/null +++ b/johnsnowlabs/auto_install/databricks/work_utils.py @@ -0,0 +1,182 @@ +import inspect +import os.path +from pathlib import Path +from types import ModuleType + +from typing import Callable, Union, Optional + +from databricks_api import DatabricksAPI + +from johnsnowlabs.auto_install.databricks.dbfs import * +from johnsnowlabs.auto_install.databricks.install_utils import create_cluster +from johnsnowlabs.utils.file_utils import path_tail, str_to_file + + +def create_job_in_databricks(db: DatabricksAPI, + local_python_script: str = None, + cluster_id=None, + + name=None, + new_cluster=None, + libraries=None, + email_notifications=None, + timeout_seconds=None, + max_retries=None, + min_retry_interval_millis=None, + retry_on_timeout=None, + schedule=None, + notebook_task=None, + spark_jar_task=None, + # spark_python_task=None, + spark_submit_task=None, + max_concurrent_runs=None, + tasks=None, + headers=None, + version=None, + ): + # https://docs.databricks.com/dev-tools/api/latest/jobs.html#operation/JobsCreate + dbfs_target_path = copy_from_local_to_hdfs(db=db, local_path=local_python_script, + dbfs_path=get_db_path(local_python_script)) + if not name: + name = settings.db_job_name.format(job=get_db_path(local_python_script).split('/')[-1]) + + if not cluster_id: + raise NotImplementedError('Creating a job without an existing cluster_id is not yet implemented') + + db.jobs.create_job( + name=name, + spark_python_task=dbfs_target_path, + notebook_task=notebook_task, + spark_jar_task=spark_jar_task, + spark_submit_task=spark_submit_task, + + libraries=libraries, + + existing_cluster_id=cluster_id, + new_cluster=new_cluster, + + email_notifications=email_notifications, + timeout_seconds=timeout_seconds, + max_retries=max_retries, + min_retry_interval_millis=min_retry_interval_millis, + retry_on_timeout=retry_on_timeout, + schedule=schedule, + max_concurrent_runs=max_concurrent_runs, + tasks=tasks, + headers=headers, + version=version, + + ) + + +def run_local_py_script_as_task(db: DatabricksAPI, + task_definition: Union[str, ModuleType, Callable], + cluster_id: str = None, + run_name: str = None, + parameters: List[Any] = None): + """ + # https://docs.databricks.com/dev-tools/api/latest/examples.html#jobs-api-examples + A job consists of 1 or more tasks + :param db: + :param task_definition: + :param cluster_id: + :param run_name: + :return: + """ + + task_definition = executable_as_script(task_definition) + if not run_name: + run_name = settings.db_run_name + if not cluster_id: + cluster_id = create_cluster(db) + copy_from_local_to_hdfs(db=db, local_path=task_definition, dbfs_path=get_db_path(task_definition), ) + py_task = dict(python_file=get_db_path(task_definition), ) + if parameters: + py_task['parameters'] = parameters + + run_id = db.jobs.submit_run( + existing_cluster_id=cluster_id, + spark_python_task=py_task, + run_name=run_name, + # new_cluster=None, + # libraries=None, + # notebook_task=None, + # spark_jar_task=None, + # spark_submit_task=None, + # timeout_seconds=None, + # tasks=None, + # headers=None, + # version=None, + )['run_id'] + print(f'Started task with run_id={run_id}') + return run_id + + +def executable_as_script(py_executable: Union[str, ModuleType, Callable]): + if isinstance(py_executable, str) and os.path.exists(py_executable): + print(f'Detected Python Script for Databricks Submission') + # Py file, we can just run this + return py_executable + if isinstance(py_executable, (str, ModuleType, Callable)): + # Convert Module/Callable into a script + return executable_to_str(py_executable) + raise TypeError(f'Invalid Executable Python Task {py_executable}') + + +def executable_to_str(executable_to_convert: Union[str, ModuleType, Callable]): + # write a python code-string/module/function into a temp file and return the resulting python file + Path(settings.tmp_notebook_dir).mkdir(parents=True, exist_ok=True) + from random import randrange + if isinstance(executable_to_convert, str) and not os.path.exists(executable_to_convert): + # Executable script + file_name = f'{randrange(1333777)}tmp.py' + + else: + # Module/Callable + try: + file_name = path_tail(inspect.getfile(executable_to_convert)) + except: + # Within a Python shell, we cannot getfile(), so we have this fallback name + file_name = f'{randrange(1333777)}tmp.py' + + out_path = f'{settings.tmp_notebook_dir}/{file_name}' + + if isinstance(executable_to_convert, str): + print(f'Detected Python Code String') + return str_to_file(executable_to_convert, out_path) + + if isinstance(executable_to_convert, Callable): + print(f'Detected Python Function for Databricks Submission') + return str_to_file(inspect.getsource(executable_to_convert) + f'\n{executable_to_convert.__name__}()', out_path) + + if 
isinstance(executable_to_convert, ModuleType): + print(f'Detected Python Module for Databricks Submission') + return str_to_file(inspect.getsource(executable_to_convert), out_path) + + +def checkon_db_task(db: DatabricksAPI, run_id: str = None, ): + """ + # https://docs.databricks.com/dev-tools/api/latest/examples.html#jobs-api-examples + Check the state of a previously submitted run. + :param db: + :param run_id: + :return: + """ + # https://docs.databricks.com/dev-tools/api/2.0/jobs.html#runstate + return db.jobs.get_run(run_id=run_id) + + +def run_in_databricks( + py_script_path: Union[str, ModuleType, Callable], + databricks_cluster_id: Optional[str] = None, + databricks_token: Optional[str] = None, + databricks_host: Optional[str] = None, + run_name: str = None, + databricks_password: Optional[str] = None, + databricks_email: Optional[str] = None, +): + from johnsnowlabs.auto_install.databricks.install_utils import create_cluster, get_db_client_for_token + db_client = get_db_client_for_token(databricks_host, databricks_token) + return run_local_py_script_as_task(db_client, py_script_path, cluster_id=databricks_cluster_id, run_name=run_name) diff --git a/johnsnowlabs/auto_install/health_checks/__init__.py b/johnsnowlabs/auto_install/health_checks/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/johnsnowlabs/auto_install/health_checks/hc_test.py b/johnsnowlabs/auto_install/health_checks/hc_test.py new file mode 100644 index 0000000000..af8ef923f0 --- /dev/null +++ b/johnsnowlabs/auto_install/health_checks/hc_test.py @@ -0,0 +1,26 @@ +from johnsnowlabs import * +from pyspark.ml import Pipeline +from sparknlp import DocumentAssembler +from sparknlp.annotator import ContextSpellCheckerModel, Tokenizer + + +def run_test(): + spark = jsl.start() + documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + + tokenizer = Tokenizer() \ + .setInputCols(["document"]) \ + .setOutputCol("token") + + spellModel = ContextSpellCheckerModel \ + .pretrained('spellcheck_clinical', 'en', 'clinical/models') \ + .setInputCols("token") \ + .setOutputCol("checked") + + pipeline = Pipeline(stages=[documentAssembler, tokenizer, spellModel]) + + empty = spark.createDataFrame([[""]]).toDF("text") + # NOTE: the misspellings below are intentional; they exercise the spell checker + example = spark.createDataFrame([["Witth the hell of phisical terapy.",]]).toDF("text") + pipeline.fit(empty).transform(example).show() \ No newline at end of file
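These health-check modules double as runnable smoke tests; assuming an installed, licensed environment, one way to invoke them directly:

```python
# Assumes jsl.install() has already set up spark-nlp + healthcare with a valid license
from johnsnowlabs.auto_install.health_checks import hc_test

hc_test.run_test()  # fits the clinical spell-checker pipeline on a tiny example
```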
diff --git a/johnsnowlabs/auto_install/health_checks/nlp_test.py b/johnsnowlabs/auto_install/health_checks/nlp_test.py new file mode 100644 index 0000000000..e59e21221e --- /dev/null +++ b/johnsnowlabs/auto_install/health_checks/nlp_test.py @@ -0,0 +1,27 @@ + +from johnsnowlabs import * + + +def run_test(): + spark = jsl.start() + doc = nlp.DocumentAssembler() \ + .setInputCol('text').setOutputCol('doc') + + tok = nlp.Tokenizer() \ + .setInputCols('doc').setOutputCol('tok') + + embeddings = nlp.WordEmbeddingsModel.pretrained("glove_100d", "en") \ + .setInputCols("doc", "tok") \ + .setOutputCol("embeddings") + + ner = nlp.NerDLModel.pretrained("nerdl_fewnerd_100d") \ + .setInputCols(["doc", "tok", "embeddings"]) \ + .setOutputCol("ner") + + ner_converter = nlp.NerConverter() \ + .setInputCols(['doc', 'tok', 'ner']) \ + .setOutputCol('ner_chunk') + + text = 'Peter Parker is a nice guy and lives in New York' + spark_df = spark.createDataFrame([[text]]).toDF("text") + + p = Pipeline(stages=[doc, tok, embeddings, ner, ner_converter]) + p.fit(spark_df).transform(spark_df).show() diff --git a/johnsnowlabs/auto_install/health_checks/ocr_test.py b/johnsnowlabs/auto_install/health_checks/ocr_test.py new file mode 100644 index 0000000000..57c5517b8f --- /dev/null +++ b/johnsnowlabs/auto_install/health_checks/ocr_test.py @@ -0,0 +1,56 @@ +# Convert pdf to image +from johnsnowlabs import * + + +def run_test(): + spark = jsl.start() + pdf_to_image = ocr.PdfToImage() + pdf_to_image.setImageType(jsl.ocr.ImageType.TYPE_3BYTE_BGR) + + # Detect tables on the page using a pretrained model + # It can be fine-tuned to get more accurate results on specific document types + table_detector = ocr.ImageTableDetector.pretrained("general_model_table_detection_v2", "en", "clinical/ocr") + table_detector.setInputCol("image") + table_detector.setOutputCol("region") + + # Draw the detected table regions onto the page + draw_regions = ocr.ImageDrawRegions() + draw_regions.setInputCol("image") + draw_regions.setInputRegionsCol("region") + draw_regions.setOutputCol("image_with_regions") + draw_regions.setRectColor(jsl.ocr.Color.red) + + # Extract table regions to separate images + splitter = ocr.ImageSplitRegions() + splitter.setInputCol("image") + splitter.setInputRegionsCol("region") + splitter.setOutputCol("table_image") + splitter.setDropCols("image") + + # Detect cells on the table image + cell_detector = ocr.ImageTableCellDetector() + cell_detector.setInputCol("table_image") + cell_detector.setOutputCol("cells") + cell_detector.setAlgoType("morphops") + + # Extract text from the detected cells + table_recognition = ocr.ImageCellsToTextTable() + table_recognition.setInputCol("table_image") + table_recognition.setCellsCol('cells') + table_recognition.setMargin(3) + table_recognition.setStrip(True) + table_recognition.setOutputCol('table') + + pipeline = PipelineModel(stages=[ + pdf_to_image, + table_detector, + draw_regions, + splitter, + cell_detector, + table_recognition + ]) + + import pkg_resources + pdf_example = pkg_resources.resource_filename('sparkocr', 'resources/ocr/pdfs/tabular-pdf/data.pdf') + pdf_example_df = spark.read.format("binaryFile").load(pdf_example).cache() + pipeline.transform(pdf_example_df).show() diff --git a/johnsnowlabs/auto_install/health_checks/report.py b/johnsnowlabs/auto_install/health_checks/report.py new file mode 100644 index 0000000000..b05ce5de5c --- /dev/null +++ b/johnsnowlabs/auto_install/health_checks/report.py @@ -0,0 +1,68 @@ +from typing import Dict, Tuple +from colorama import Fore +from johnsnowlabs import settings +from johnsnowlabs.utils.my_jsl_api import get_access_key_from_browser, get_user_licenses +from johnsnowlabs.py_models.jsl_secrets import LicenseInfos +from johnsnowlabs.auto_install.softwares import Software +from johnsnowlabs.abstract_base.software_product import AbstractSoftwareProduct +from johnsnowlabs.utils.enums import ProductName + + +def check_health(check_install=True): + # Print status of installations and licenses + install_status: Dict[AbstractSoftwareProduct, bool] = {} + health_check: Dict[AbstractSoftwareProduct, bool] = {} + license_check: Dict[str, Tuple[bool, bool]] = {} + print(f'{"_" * 10}Installation check results{"_" * 10}') + for product in ProductName: + if check_install: + product = Software.for_name(product) + if not product or not product.pypi_name: + continue + install_status[product] = product.check_installed() and product.check_installed_correct_version() + if not product.check_installed(): + print(f'{product.logo + product.name} is not 
installed 🚨') + elif not product.check_installed_correct_version(): + print( + f'{product.logo + product.pypi_name + Fore.LIGHTRED_EX}=={product.get_installed_version() + Fore.RESET} ' + f'is installed but should be {product.pypi_name}=={Fore.LIGHTGREEN_EX + product.latest_version.as_str() + Fore.RESET} 🚨 To fix run:') + + if product.licensed: + print( + f'{Fore.LIGHTGREEN_EX}jsl.install(){Fore.RESET} while authorizing to upgrade {product.logo + product.pypi_name}') + else: + print( + f'{Fore.LIGHTGREEN_EX}python -m pip install {product.pypi_name}=={product.latest_version.as_str()} --upgrade{Fore.RESET}') + else: + print(f'{product.logo + product.pypi_name}=={product.get_installed_version()} ' + f'{Fore.LIGHTGREEN_EX}is correctly installed! ✅{Fore.RESET}') + + if health_check: + health_check[product] = product.health_check() + + +def list_remote_licenses(): + access_token = get_access_key_from_browser() + licenses = get_user_licenses(access_token) + + print('Your Remote licenses in https://my.johnsnowlabs.com:') + for i, lic in enumerate(licenses): + softwares = [Software.for_name(p.value) for p in lic.products] + print(f'Remote License Number {i} Has access to: {", ".join([s.logo + s.name for s in softwares])}.\n' + f'Extra info for License {i}: EndDate={lic.endDate}, Type={lic.type}, Platform={lic.platform}, Id={lic.id} ') + + print('_' * 50) + + +def list_local_licenses(): + print(f'Your licenses in {settings.license_dir}') + licenses = LicenseInfos.from_home() + if not licenses: + raise Exception('You have no local licenses') + i = 0 + for file, lic in licenses.infos.items(): + softwares = [Software.for_name(p.value) for p in lic.products] + print(f'Your local license in {i} Has access to: {", ".join([s.logo + s.name for s in softwares])}.\n' + f'Extra info for License {i}: File={file} Id={lic.id}') + i += 1 + print('_' * 50) diff --git a/johnsnowlabs/auto_install/install_flow.py b/johnsnowlabs/auto_install/install_flow.py new file mode 100644 index 0000000000..e0fab2bf6e --- /dev/null +++ b/johnsnowlabs/auto_install/install_flow.py @@ -0,0 +1,168 @@ +import os.path +import shutil +from typing import Optional, Dict + +from johnsnowlabs import settings +from johnsnowlabs.auto_install.databricks.install_utils import create_cluster, get_db_client_for_token, \ + install_jsl_suite_to_cluster +from johnsnowlabs.auto_install.jsl_home import setup_jsl_home, get_install_suite_from_jsl_home +from johnsnowlabs.auto_install.offline_install import get_printable_dependency_urls +from johnsnowlabs.auto_install.softwares import Software +from johnsnowlabs.utils.enums import ProductName, PyInstallTypes, JvmHardwareTarget +from johnsnowlabs.py_models.jsl_secrets import JslSecrets +from johnsnowlabs.auto_install.install_software import check_and_install_dependencies +import sys + + +def install( + # -- JSL-Auth Flows -- + # Browser Auth + browser_login: bool = True, + force_browser: bool = False, + # JWT Token Auth + access_token: Optional[str] = None, + # JSON file Auth + json_license_path: Optional[str] = None, + # Manual License specification Auth + med_license: Optional[str] = None, + enterprise_nlp_secret: Optional[str] = None, + ocr_secret: Optional[str] = None, + ocr_license: Optional[str] = None, + fin_license: Optional[str] = None, + leg_license: Optional[str] = None, + aws_access_key: Optional[str] = None, + aws_key_id: Optional[str] = None, + + # -- Databricks auth flows & Install Target -- + databricks_cluster_id: Optional[str] = None, + databricks_token: Optional[str] = None, + 
databricks_host: Optional[str] = None, + databricks_password: Optional[str] = None, + databricks_email: Optional[str] = None, + + # -- Install Params -- + # Install Target + python_exec_path: str = sys.executable, + venv_creation_path: Optional[str] = None, + offline_zip_dir: Optional[str] = None, + + # Download Params + offline: bool = False, + install_optional: bool = True, + install_licensed: bool = True, + only_download_jars: bool = False, + product: Optional[str] = ProductName.jsl_full.value, + include_dependencies: bool = True, + # License usage & Caching + local_license_number: int = 0, + remote_license_number: int = 0, + store_in_jsl_home: bool = True, + # Install File Types + jvm_install_type: str = JvmHardwareTarget.cpu.value, + py_install_type: str = PyInstallTypes.wheel.value, + only_refresh_credentials: bool = False, + refresh_install: bool = False, + + # -- Databricks Cluster Creation Params -- + block_till_cluster_ready=True, + num_workers=1, + cluster_name=settings.db_cluster_name, + node_type_id=settings.db_node_type_id, + driver_node_type_id=settings.db_driver_node_type, + spark_env_vars=None, + autotermination_minutes=60, + spark_version=settings.db_spark_version, + spark_conf=None, + auto_scale=None, + aws_attributes=None, + ssh_public_keys=None, + custom_tags=None, + cluster_log_conf=None, + enable_elastic_disk=None, + cluster_source=None, + instance_pool_id=None, + headers=None, + +): + if refresh_install and os.path.exists(settings.root_dir) : + shutil.rmtree(settings.root_dir) + # Input Validation + py_install_type = PyInstallTypes.from_str(py_install_type) + jvm_install_type = JvmHardwareTarget.from_str(jvm_install_type) + product = Software.for_name(product) + + # Get Credentials from Auth Flow + secrets: JslSecrets = JslSecrets.build_or_try_find_secrets(browser_login=browser_login, + force_browser=force_browser, + access_token=access_token, + local_license_number=local_license_number, + remote_license_number=remote_license_number, + secrets_file=json_license_path, + hc_license=med_license, + hc_secret=enterprise_nlp_secret, + ocr_secret=ocr_secret, + ocr_license=ocr_license, + aws_access_key=aws_access_key, + aws_key_id=aws_key_id, + return_empty_secrets_if_none_found=True, + fin_license=fin_license, + leg_license=leg_license, + store_in_jsl_home=store_in_jsl_home) + if only_refresh_credentials: + return + + if offline: + # Offline Install + get_printable_dependency_urls(secrets=secrets, + jvm_install_type=jvm_install_type, + py_install_type=py_install_type) + if not offline_zip_dir: + return + + if store_in_jsl_home and not offline: + # Cache credentials, Wheels and Jars in ~/.johnsnowlabs + setup_jsl_home( + secrets=secrets, + jvm_install_type=jvm_install_type, + py_install_type=py_install_type, + refresh_install=refresh_install) + + # Databricks Install + if databricks_host and databricks_token and not offline: + suite = get_install_suite_from_jsl_home(jvm_hardware_target=jvm_install_type) + if databricks_cluster_id: + install_jsl_suite_to_cluster( + db=get_db_client_for_token(databricks_host, databricks_token), + install_suite=suite, + cluster_id=databricks_cluster_id) + + else: + return create_cluster(db=get_db_client_for_token(databricks_host, databricks_token), + install_suite=suite, block_till_cluster_ready=block_till_cluster_ready, + num_workers=num_workers, + cluster_name=cluster_name, + node_type_id=node_type_id, + driver_node_type_id=driver_node_type_id, + spark_env_vars=spark_env_vars, + autotermination_minutes=autotermination_minutes, + 
spark_version=spark_version, + spark_conf=spark_conf, + auto_scale=auto_scale, + aws_attributes=aws_attributes, + ssh_public_keys=ssh_public_keys, + custom_tags=custom_tags, + cluster_log_conf=cluster_log_conf, + enable_elastic_disk=enable_elastic_disk, + cluster_source=cluster_source, + instance_pool_id=instance_pool_id, + headers=headers, ) + + # Local Py-Install + elif not only_download_jars: + check_and_install_dependencies(product=product, secrets=secrets, install_optional=install_optional, + install_licensed=install_licensed, + python_exec_path=python_exec_path, + py_setup_dir=venv_creation_path, + offline_zip_dir=offline_zip_dir, + include_dependencies=include_dependencies + )
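Typical entry points into this flow, sketched under the assumption of a local license JSON (the path is a placeholder):

```python
from johnsnowlabs import jsl

# Authorize via browser and install everything the license grants
jsl.install()

# Or: non-interactive install from a license file into the current interpreter
jsl.install(browser_login=False, json_license_path='/path/to/license.json')
```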
+
+    :param offline_zip_dir: if set, installers are downloaded into this folder to build an offline zip
+    :param include_dependencies: whether pip should also install transitive dependencies
+    :param product: root product of the DAG to iterate from
+    :param python_exec_path: a Python executable into which products should be installed.
+        The pip module of this executable is used to install all libs
+    :param py_setup_dir: if not None, a fresh Python venv with the target products is set up in this folder.
+        When using this parameter, the python_exec_path parameter is ignored
+        and the python_exec_path from the newly created venv is used to set up all libs
+    :param secrets: credentials to use for installing licensed libs
+    :param install_optional: install optional dependencies if True, otherwise not
+    :param install_licensed: install licensed products if the license permits it if True, otherwise not
+    """
+    import site
+    from importlib import reload
+    reload(site)
+    running_in_databricks = is_running_in_databricks()
+    offline_py_dir = None
+    license_dir = None
+    java_dir = None
+    if py_setup_dir:
+        # Use VenvWrapper to set up a new env
+        VenvWrapper.create_venv(venv_target_dir=py_setup_dir, log=False)
+        python_exec_path = VenvWrapper.glob_py_exec_from_venv(py_setup_dir)
+    if offline_zip_dir and not os.path.exists(offline_zip_dir):
+        offline_py_dir = f'{offline_zip_dir}/py_installs'
+        java_dir = f'{offline_zip_dir}/java_installs'
+        license_dir = f'{offline_zip_dir}/licenses'
+        Path(offline_py_dir).mkdir(parents=True, exist_ok=True)
+        Path(java_dir).mkdir(parents=True, exist_ok=True)
+        Path(license_dir).mkdir(parents=True, exist_ok=True)
+
+    # Iteration Variables
+    hard_nodes: Set[AbstractSoftwareProduct] = {product}
+    licensed_nodes: Set[AbstractSoftwareProduct] = set()
+    optional_nodes: Set[AbstractSoftwareProduct] = set()
+    install_results: Dict[AbstractSoftwareProduct, bool] = {}
+
+    # Boolean Checkers
+    is_spark_nlp = lambda node: node.name == ProductName.nlp.value
+    exist_install_result = lambda node: node in install_results
+    licensed_nodes_left_to_install = lambda: licensed_nodes and install_licensed
+    optional_nodes_left_to_install = lambda: optional_nodes and install_optional
+
+    while hard_nodes or licensed_nodes_left_to_install() or optional_nodes_left_to_install():
+        # Core loop, check if any vertices are left to explore.
+        # Pop from the highest-priority non-empty set: hard first, then licensed, then optional
+        v = hard_nodes.pop() if hard_nodes \
+            else licensed_nodes.pop() if licensed_nodes_left_to_install() \
+            else optional_nodes.pop() if optional_nodes_left_to_install() \
+            else None
+
+        v: AbstractSoftwareProduct
+        # Collect all children of this vertex for the next iteration
+        hard_nodes = hard_nodes | v.hard_dependencies
+        licensed_nodes = licensed_nodes | v.licensed_dependencies
+        optional_nodes = optional_nodes | v.optional_dependencies
+
+        # Check if we should install this vertex
+        if not v.pypi_name:
+            # Non-Python dependencies are not handled here yet
+            continue
+        if v.licensed and not install_licensed:
+            # Skip licensed products when licensed installs are disabled
+            continue
+        if v.check_installed(python_exec_path=python_exec_path, download_folder=offline_py_dir) \
+                and v.check_installed_correct_version() and not offline_py_dir:
+            # It's already installed and has the correct version
+            continue
+        elif is_spark_nlp(v) and not offline_py_dir:
+            # We don't install Spark NLP while iterating the DAG; it is (re-)installed last
+            continue
+        elif exist_install_result(v):
+            # We could have failed installing this vertex in a previous iteration;
+            # we only attempt an install if there is no entry for it yet
+            continue
+
+        # Attempt installing the node and store the result
+        install_results[v] = v.install(
+            secrets=secrets, py_path=python_exec_path, download_folder=offline_py_dir,
+            include_dependencies=include_dependencies,
+            # force a re-install when the installed version does not match johnsnowlabs.settings
+            re_install=not v.check_installed_correct_version()
+        )
+    if offline_zip_dir:
+        print('👷 Zipping installation files for offline install')
+        finalize_zip_folder(offline_zip_dir, java_dir, offline_py_dir, license_dir)
+        print("Done zipping")
+        return
+    else:
+        # These checks only apply to the current env; they are not defined for offline installs
+        if not get_pip_lib_version('spark-nlp', py_exec=python_exec_path).equals(
+                LatestCompatibleProductVersion.spark_nlp.value):
+            # Re-install Spark NLP in case some other library up/downgraded it while we installed it
+            install_results[Software.spark_nlp] = \
+                Software.spark_nlp.install(re_install=True,
+                                           version=LatestCompatibleProductVersion.spark_nlp.value,
+                                           py_path=python_exec_path, download_folder=offline_py_dir,
+                                           include_dependencies=include_dependencies, )
+        if not get_pip_lib_version('pyspark', py_exec=python_exec_path).equals(
+                LatestCompatibleProductVersion.pyspark.value):
+            # Re-install Pyspark in case some other library up/downgraded it while we installed it
+            install_results[Software.pyspark] = \
+                Software.pyspark.install(re_install=True,
+                                         version=LatestCompatibleProductVersion.pyspark.value,
+                                         py_path=python_exec_path, download_folder=offline_py_dir,
+                                         include_dependencies=include_dependencies,
+                                         )
+
+    # Log the results of the installation
+    if Software.jsl_full in install_results:
+        del install_results[Software.jsl_full]
+    if len(install_results) > 0:
+        print(f'Installed {len(install_results)} products:')
+        for installed_software, result in install_results.items():
+            if installed_software.check_installed(python_exec_path=python_exec_path, download_folder=offline_py_dir):
+                print(
+                    f'{installed_software.logo} {installed_software.name}=={installed_software.get_installed_version(prefer_pip=True, fallback_import=False)}'
+                    f' installed! ✅ {installed_software.slogan} ')
+            else:
+                print(f'{installed_software.logo} {installed_software.name} not installed! ❌')
+
+        print(
+            f'🔁{Fore.LIGHTRED_EX} If you are on Google Colab, please restart your Notebook for changes to take effect {Fore.RESET}🔁')
+    else:
+        print('👌 Everything is already installed, no changes made')
+
+
+def finalize_zip_folder(offline_zip_dir, new_java_dir, py_dir, new_license_dir):
+    # Copy files from the local JSL home and create the zip file
+    copy_jars_from_jsl_home_to_offline_zip_dir(new_java_dir)
+    copy_py_installs_from_jsl_home_to_offline_zip_dir(py_dir)
+    copy_licenses_from_jsl_home_to_zip_dir(new_license_dir)
+    zip_folder(offline_zip_dir)
+
+
+def copy_py_installs_from_jsl_home_to_offline_zip_dir(new_py_dir):
+    for f in os.listdir(settings.py_dir):
+        if '.gz' in f or '.whl' in f:
+            print(f'Adding {f} to zip')
+            shutil.copy(f'{settings.py_dir}/{f}', new_py_dir)
+
+
+def copy_jars_from_jsl_home_to_offline_zip_dir(new_java_dir):
+    for f in os.listdir(settings.java_dir):
+        if '.jar' in f:
+            print(f'Adding {f} to zip')
+            shutil.copy(f'{settings.java_dir}/{f}', new_java_dir)
+
+
+def copy_licenses_from_jsl_home_to_zip_dir(new_license_dir):
+    for f in os.listdir(settings.license_dir):
+        if 'info' not in f:
+            shutil.copy(f'{settings.license_dir}/{f}', new_license_dir)
+
+
+def zip_folder(offline_zip_dir):
+    shutil.make_archive('john_snow_labs_suite', 'zip', offline_zip_dir)
+    shutil.move('john_snow_labs_suite.zip', offline_zip_dir)
diff --git a/johnsnowlabs/auto_install/jsl_home.py b/johnsnowlabs/auto_install/jsl_home.py new file mode 100644 index 0000000000..afd204052f --- /dev/null +++ b/johnsnowlabs/auto_install/jsl_home.py @@ -0,0 +1,271 @@
+import shutil
+import sys
+from typing import Optional, List
+from johnsnowlabs import settings
+from johnsnowlabs.py_models import jsl_secrets
+from johnsnowlabs.py_models.install_info import JvmInstallInfo, PyInstallInfo, \
+    RootInfo, InstallSuite, LocalPy4JLib, InstallFolder
+from johnsnowlabs.py_models.jsl_secrets import JslSecrets
+from johnsnowlabs.utils.enums import JvmHardwareTarget, PyInstallTypes, ProductName, ProductLogo
+from johnsnowlabs.py_models.url_dependency import UrlDependency
+import os
+from pathlib import Path
+from johnsnowlabs.auto_install.offline_install import get_py4j_dependency_urls
+from colorama import Fore
+
+
+def jsl_home_exist():
+    return os.path.exists(settings.root_info_file)
+
+
+def is_jsl_home_outdated():
+    if jsl_home_exist():
+        return RootInfo.get_from_jsl_home().version.as_str() != settings.raw_version_jsl_lib
+    else:
+        raise Exception('JSL-Home does not exist! Cannot check if outdated')
+
+
+def download_deps_and_create_info(deps: List[UrlDependency],
+                                  lib_dir, info_file_path,
+                                  overwrite=False, ):
+    """Download a list of deps to the given lib_dir folder and create an info file at info_file_path.
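+
+    For illustration, an entry in the resulting info.json looks roughly like this
+    (file name and version values are hypothetical; the fields mirror the constructor
+    call further below):
+
+        {
+            "infos": {
+                "spark-nlp-assembly-4.2.1.jar": {
+                    "file_name": "spark-nlp-assembly-4.2.1.jar",
+                    "product": "...",
+                    "compatible_spark_version": "3.x",
+                    "install_type": "cpu",
+                    "product_version": "4.2.1"
+                }
+            }
+        }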
+
+    """
+    info, old_info = {}, {}
+    if os.path.exists(info_file_path):
+        # keep old infos; we assume they are up-to-date and compatible
+        old_info = InstallFolder.parse_file(info_file_path)
+
+    for p in deps:
+
+        # print_prefix = Software.for_name(p.product_name).logo
+        print_prefix = ProductLogo.from_name(p.product_name.name).value
+        if p.dependency_type in JvmHardwareTarget:
+            print_prefix = f'{ProductLogo.java.value}+{print_prefix} Java Library'
+            constructor = JvmInstallInfo
+        elif p.dependency_type in PyInstallTypes:
+            print_prefix = f'{ProductLogo.python.value}+{print_prefix} Python Library'
+            constructor = PyInstallInfo
+        else:
+            raise ValueError(f'Unknown Install type {p.dependency_type}')
+        if not os.path.exists(f'{lib_dir}/{p.file_name}') or overwrite:
+            try:
+                p.download_url(lib_dir, name_print_prefix=print_prefix)
+            except ValueError as _:
+                # sys.tracebacklimit = 0
+                err_msg = f"""🚨 Cannot install {ProductLogo.from_name(p.product_name.name).value}{p.product_name.value} because the provided license file or secret is outdated or invalid.
+How to fix this:
+Option 1: Run {Fore.LIGHTGREEN_EX}jsl.install(force_browser=True){Fore.RESET} to get a browser window pop-up where you can refresh your license data
+Option 2: Run {Fore.LIGHTGREEN_EX}jsl.install(secrets_file="path/to/fresh_credentials.json"){Fore.RESET} after downloading a fresh license from https://my.johnsnowlabs.com/subscriptions
+Option 3: Run {Fore.LIGHTGREEN_EX}jsl.install(refresh_install=True,force_browser=True){Fore.RESET} to refresh everything
+Option 4: Set {Fore.LIGHTGREEN_EX}jsl.settings.enforce_versions=False{Fore.RESET} and run {Fore.LIGHTGREEN_EX}jsl.install(refresh_install=True,force_browser=True){Fore.RESET} to disable this protection mechanism and try to install anyway. Not recommended, as it can have unforeseen consequences
+"""
+                print(err_msg)
+                raise Exception(err_msg)
+
+        info[p.file_name] = constructor(
+            file_name=p.file_name,
+            product=p.product_name,
+            compatible_spark_version=p.spark_version.value.as_str(),
+            install_type=p.dependency_type.value,
+            product_version=p.dependency_version.as_str())
+    if info:
+        info = InstallFolder(**{'infos': info})
+        if old_info:
+            info.infos.update(old_info.infos)
+        info.write(info_file_path, indent=4)
+
+
+def setup_jsl_home(
+        secrets: Optional[JslSecrets] = None,
+        jvm_install_type: JvmHardwareTarget = JvmHardwareTarget.cpu,
+        py_install_type: PyInstallTypes = PyInstallTypes.wheel,
+        only_jars: bool = False,
+        spark_version=None,
+        overwrite=False,
+        log=True,
+        refresh_install=False,
+) -> None:
+    """Creates the folder structure for JSL home and downloads all Py4J wheels/jars,
+    taking PySpark compatibility as well as the JVM hardware target into account:
+    ~/.johnsnowlabs/
+    ├─ licenses/
+    │  ├─ info.json
+    │  ├─ license1.json
+    │  ├─ license2.json
+    ├─ java_installs/
+    │  ├─ info.json
+    │  ├─ app1.jar
+    │  ├─ app2.jar
+    ├─ py_installs/
+    │  ├─ info.json
+    │  ├─ app1.tar.gz
+    │  ├─ app2.tar.gz
+    ├─ info.json
+    """
+
+    # Create all Paths
+    Path(settings.license_dir).mkdir(parents=True, exist_ok=True)
+    Path(settings.java_dir).mkdir(parents=True, exist_ok=True)
+    Path(settings.py_dir).mkdir(parents=True, exist_ok=True)
+    force_update = False
+    suite = None
+
+    if jsl_home_exist():
+        if secrets:
+            # Don't log, because we will ignore the license from localhost since one is provided
+            jsl_secrets.already_logged = True
+        suite = get_install_suite_from_jsl_home(create_jsl_home_if_missing=False, recursive_call=True, log=False,
+                                                browser_login=False,
+                                                jvm_hardware_target=jvm_install_type)
+        if secrets:
+            # Overwrite the suite's secrets if the user provided any
+            suite.secrets = secrets
+
+    if jsl_home_exist() and is_jsl_home_outdated() and log:
+        print(f'🤓 Looks like {settings.root_dir} is outdated, updating it')
+    if not jsl_home_exist() or is_jsl_home_outdated() or refresh_install:
+        print(f'👷 Setting up John Snow Labs home in {settings.root_dir}, this might take a few minutes.')
+        # Delete everything except license data and re-create the folders
+        shutil.rmtree(settings.java_dir)
+        shutil.rmtree(settings.py_dir)
+        Path(settings.java_dir).mkdir(parents=True, exist_ok=True)
+        Path(settings.py_dir).mkdir(parents=True, exist_ok=True)
+        force_update = True
+
+    # Get URLs for Py4J based libs
+    if force_update or (suite and suite.get_missing_products()):
+        java_deps, py_deps = get_py4j_dependency_urls(
+            secrets=secrets,
+            spark_version=spark_version,
+            jvm_install_type=jvm_install_type,
+            py_install_type=py_install_type)
+
+        # store deps to JSL home with info.json files
+        if not only_jars:
+            download_deps_and_create_info(py_deps, settings.py_dir, settings.py_info_file, overwrite)
+        download_deps_and_create_info(java_deps, settings.java_dir, settings.java_info_file, overwrite)
+
+        RootInfo(version=settings.raw_version_jsl_lib, run_from=sys.executable).write(settings.root_info_file, indent=4)
+        print(f'🙆 JSL Home setup in {settings.root_dir}')
+
+        return
+    if log:
+        print('👌 JSL-Home is up to date!')
+
+
+def get_install_suite_from_jsl_home(create_jsl_home_if_missing: bool = True,
+                                    jvm_hardware_target: JvmHardwareTarget = JvmHardwareTarget.cpu,
+                                    hc: bool = True,
+                                    ocr: bool = True,
+                                    nlp: bool = True,
+                                    only_jars: bool = False,
+                                    recursive_call=False,
+                                    # Secret Flow Params
+                                    browser_login: bool = True,
+                                    force_browser: bool = False,
+                                    access_token: Optional[str] = None,
+                                    local_license_number: int = 0,
+                                    remote_license_number: int = 0,
+                                    secrets_file: Optional[str] = None,
+                                    hc_license: Optional[str] = None,
+                                    hc_secret: Optional[str] = None,
+                                    ocr_secret: Optional[str] = None,
+                                    ocr_license: Optional[str] = None,
+                                    aws_access_key: Optional[str] = None,
+                                    aws_key_id: Optional[str] = None,
+                                    fin_license: Optional[str] = None,
+                                    leg_license: Optional[str] = None,
+                                    store_in_jsl_home: bool = True,
+                                    log: bool = True,
+                                    ) -> InstallSuite:
+    """Read all info files from JSL home if it exists;
+    if JSL home does not exist, it is set up first.
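+
+    An illustrative usage sketch (assumes a populated ~/.johnsnowlabs folder; only
+    attributes defined elsewhere in this patch, such as InstallSuite.get_missing_products()
+    and LocalPy4JLib.get_java_path(), are used):
+
+        suite = get_install_suite_from_jsl_home(browser_login=False)
+        if suite.nlp.get_java_path():
+            print('Spark NLP jar is cached locally')
+        for product in suite.get_missing_products():
+            print(f'Still missing: {product.name}')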
+    """
+
+    license_data: JslSecrets = JslSecrets.build_or_try_find_secrets(browser_login=browser_login,
+                                                                    force_browser=force_browser,
+                                                                    access_token=access_token,
+                                                                    local_license_number=local_license_number,
+                                                                    remote_license_number=remote_license_number,
+                                                                    secrets_file=secrets_file,
+                                                                    hc_license=hc_license,
+                                                                    hc_secret=hc_secret,
+                                                                    ocr_secret=ocr_secret,
+                                                                    ocr_license=ocr_license,
+                                                                    aws_access_key=aws_access_key,
+                                                                    aws_key_id=aws_key_id,
+                                                                    return_empty_secrets_if_none_found=True,
+                                                                    fin_license=fin_license,
+                                                                    leg_license=leg_license,
+                                                                    store_in_jsl_home=store_in_jsl_home,
+                                                                    )
+
+    if create_jsl_home_if_missing:
+        if not jsl_home_exist():
+            # Nothing set up yet, download at least the Spark NLP jars
+            print(f'🤓 Looks like {settings.root_dir} is missing, creating it')
+            setup_jsl_home(only_jars=only_jars, log=False)
+
+        if jsl_home_exist() and is_jsl_home_outdated():
+            # Home exists but is outdated, refresh it
+            setup_jsl_home(only_jars=only_jars, log=False)
+
+    java_folder, py_folder = None, None
+
+    if os.path.exists(settings.java_info_file):
+        java_folder = InstallFolder.java_folder_from_home()
+    if os.path.exists(settings.py_info_file):
+        py_folder = InstallFolder.py_folder_from_home()
+
+    info = RootInfo.parse_file(settings.root_info_file)
+    # Read all dependencies from the local ~/.johnsnowlabs folder
+
+    suite = InstallSuite(
+        nlp=LocalPy4JLib(
+            java_lib=java_folder.get_product_entry(ProductName.nlp, jvm_hardware_target) if java_folder else None,
+            py_lib=py_folder.get_product_entry(ProductName.nlp) if py_folder else None),
+        hc=LocalPy4JLib(
+            java_lib=java_folder.get_product_entry(ProductName.hc) if java_folder else None,
+            py_lib=py_folder.get_product_entry(ProductName.hc) if py_folder else None),
+        ocr=LocalPy4JLib(
+            java_lib=java_folder.get_product_entry(ProductName.ocr) if java_folder else None,
+            py_lib=py_folder.get_product_entry(ProductName.ocr) if py_folder else None),
+        secrets=license_data,
+        info=info
+    )
+
+    missing = suite.get_missing_products()
+    if missing and recursive_call and log:
+        print(f'🚨 Looks like some of the missing jars could not be fetched...')
+        suite.log_missing_jars(ocr, hc, nlp)
+
+    if missing and not recursive_call:
+        print(f'🤓 Looks like you are missing some jars, trying to fetch them ...')
+        setup_jsl_home(license_data,
+                       jvm_install_type=jvm_hardware_target,
+                       only_jars=only_jars, log=False)
+        # After re-setting up jsl_home, call this method again
+        return get_install_suite_from_jsl_home(
+            jvm_hardware_target=jvm_hardware_target,
+            hc=hc,
+            ocr=ocr,
+            nlp=nlp,
+            only_jars=only_jars,
+            recursive_call=True,
+            browser_login=browser_login,
+            access_token=access_token,
+            local_license_number=local_license_number,
+            remote_license_number=remote_license_number,
+            secrets_file=secrets_file,
+            hc_license=hc_license,
+            hc_secret=hc_secret,
+            ocr_secret=ocr_secret,
+            ocr_license=ocr_license,
+            aws_access_key=aws_access_key,
+            aws_key_id=aws_key_id,
+            fin_license=fin_license,
+            leg_license=leg_license,
+        )
+    return suite
diff --git a/johnsnowlabs/auto_install/lib_resolvers/__init__.py b/johnsnowlabs/auto_install/lib_resolvers/__init__.py new file mode 100644 index 0000000000..f0416f2360 --- /dev/null +++ b/johnsnowlabs/auto_install/lib_resolvers/__init__.py @@ -0,0 +1,3 @@
+from .hc_installer import HcLibResolver
+from .ocr_installer import OcrLibResolver
+from .nlp_installer import NlpLibResolver
diff --git a/johnsnowlabs/auto_install/lib_resolvers/hc_installer.py b/johnsnowlabs/auto_install/lib_resolvers/hc_installer.py new
file mode 100644 index 0000000000..adeb11f0fb --- /dev/null +++ b/johnsnowlabs/auto_install/lib_resolvers/hc_installer.py @@ -0,0 +1,49 @@ +from abc import ABCMeta +from johnsnowlabs.abstract_base.lib_resolver import Py4JJslLibDependencyResolverABC, PyInstallTypes +from johnsnowlabs.utils.enums import ProductName, SparkVersion, JvmHardwareTarget, \ + LatestCompatibleProductVersion +from johnsnowlabs.py_models.url_dependency import UrlDependency + + +class HcLibResolver(Py4JJslLibDependencyResolverABC, metaclass=ABCMeta): + has_py_install = True + has_cpu_jars = True + has_secret = True + compatible_spark_versions = [SparkVersion.spark3xx.value] + lib_version = LatestCompatibleProductVersion.healthcare.value + product_name = ProductName.hc + + compatible_spark_to_jar_map = { + SparkVersion.spark3xx: { + JvmHardwareTarget.cpu: + UrlDependency(url='https://pypi.johnsnowlabs.com/{secret}/spark-nlp-jsl-{lib_version}.jar', + dependency_type=JvmHardwareTarget.cpu, + spark_version=SparkVersion.spark3xx, + product_name=product_name, + file_name=product_name.name, + dependency_version=lib_version) + + } + } + + compatible_spark_to_py_map = { + SparkVersion.spark3xx: { + PyInstallTypes.wheel: + UrlDependency( + url='https://pypi.johnsnowlabs.com/{secret}/spark-nlp-jsl/spark_nlp_jsl-{lib_version}-py3-none-any.whl', + dependency_type=PyInstallTypes.wheel, + spark_version=SparkVersion.spark3xx, + product_name=product_name, + file_name=product_name.name, + dependency_version=lib_version), + + PyInstallTypes.tar: + UrlDependency( + url='https://pypi.johnsnowlabs.com/{secret}/spark-nlp-jsl/spark-nlp-jsl-{lib_version}.tar.gz', + dependency_type=PyInstallTypes.tar, + spark_version=SparkVersion.spark3xx, + product_name=product_name, + file_name=product_name.name, + dependency_version=lib_version), + } + } diff --git a/johnsnowlabs/auto_install/lib_resolvers/nlp_installer.py b/johnsnowlabs/auto_install/lib_resolvers/nlp_installer.py new file mode 100644 index 0000000000..740d40dae7 --- /dev/null +++ b/johnsnowlabs/auto_install/lib_resolvers/nlp_installer.py @@ -0,0 +1,69 @@ +from abc import ABCMeta +from johnsnowlabs.abstract_base.lib_resolver import Py4JJslLibDependencyResolverABC, PyInstallTypes +from johnsnowlabs.utils.enums import LatestCompatibleProductVersion, ProductName, SparkVersion, \ + JvmHardwareTarget +from johnsnowlabs.py_models.url_dependency import UrlDependency + + +class NlpLibResolver(Py4JJslLibDependencyResolverABC, metaclass=ABCMeta): + has_m1_jar = True + has_cpu_jars = True + has_py_install = True + has_gpu_jars = True + product_name = ProductName.nlp + lib_version = LatestCompatibleProductVersion.spark_nlp.value + compatible_spark_versions = [SparkVersion.spark3xx.value] + + compatible_spark_to_jar_map = { + SparkVersion.spark3xx: { + JvmHardwareTarget.gpu: + UrlDependency( + url='https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/jars/spark-nlp-gpu-assembly-{lib_version}.jar', + dependency_type=JvmHardwareTarget.gpu, + spark_version=SparkVersion.spark3xx, + product_name=product_name, + file_name=product_name.name, + dependency_version=lib_version), + + JvmHardwareTarget.m1: + UrlDependency( + url='https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/jars/spark-nlp-m1-assembly-{lib_version}.jar', + dependency_type=JvmHardwareTarget.m1, + spark_version=SparkVersion.spark3xx, + product_name=product_name, + file_name=product_name.name, + dependency_version=lib_version), + + JvmHardwareTarget.cpu: + UrlDependency( + 
url='https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/jars/spark-nlp-assembly-{lib_version}.jar', + dependency_type=JvmHardwareTarget.cpu, + spark_version=SparkVersion.spark3xx, + product_name=product_name, + file_name=product_name.name, + dependency_version=lib_version), + } + + } + + compatible_spark_to_py_map = { + SparkVersion.spark3xx: { + # TODO HARDCODE HASH!!! OR grap from enum or somwhere comfy. Maybe configs/settings file? + PyInstallTypes.wheel: UrlDependency( + url='https://files.pythonhosted.org/packages/3b/b1/9cba3f35fed9030eb0b55a04fea7e89e4eeaf340f8ebf5e9b43e95c9b338/spark_nlp-{lib_version}-py2.py3-none-any.whl', + dependency_type=PyInstallTypes.wheel, + spark_version=SparkVersion.spark3xx, + product_name=product_name, + file_name=product_name.name, + dependency_version=lib_version), + PyInstallTypes.tar: UrlDependency( + url='https://files.pythonhosted.org/packages/fd/99/9345ba8d7303ac45402ad18c71c604b7bccc1f113409f306ba46bbec4612/spark-nlp-{lib_version}.tar.gz', + dependency_type=PyInstallTypes.tar, + spark_version=SparkVersion.spark3xx, + product_name=product_name, + file_name=product_name.name, + dependency_version=lib_version), + + } + + } diff --git a/johnsnowlabs/auto_install/lib_resolvers/ocr_installer.py b/johnsnowlabs/auto_install/lib_resolvers/ocr_installer.py new file mode 100644 index 0000000000..3609928125 --- /dev/null +++ b/johnsnowlabs/auto_install/lib_resolvers/ocr_installer.py @@ -0,0 +1,52 @@ +from abc import ABCMeta +from johnsnowlabs.abstract_base.lib_resolver import Py4JJslLibDependencyResolverABC, PyInstallTypes +from johnsnowlabs.utils.enums import LatestCompatibleProductVersion, ProductName, SparkVersion, \ + JvmHardwareTarget +from johnsnowlabs.py_models.url_dependency import UrlDependency + + +class OcrLibResolver(Py4JJslLibDependencyResolverABC, metaclass=ABCMeta): + has_cpu_jars = True + has_py_install = True + has_secret = True + product_name = ProductName.ocr + compatible_spark_versions = [SparkVersion.spark32x.value, SparkVersion.spark33x.value] + lib_version = LatestCompatibleProductVersion.ocr.value + + compatible_spark_to_jar_map = { + + SparkVersion.spark3xx: { + JvmHardwareTarget.cpu: + UrlDependency( + url='https://pypi.johnsnowlabs.com/{secret}/jars/spark-ocr-assembly-{lib_version}.jar', + dependency_type=JvmHardwareTarget.cpu, + spark_version=SparkVersion.spark3xx, + product_name=product_name, + file_name=product_name.name, + dependency_version=lib_version), + }, + + + } + + compatible_spark_to_py_map = { + SparkVersion.spark3xx: { + PyInstallTypes.wheel: UrlDependency( + url='https://pypi.johnsnowlabs.com/{secret}/spark-ocr/spark_ocr-{lib_version}-py3-none-any.whl', + dependency_type=PyInstallTypes.wheel, + spark_version=SparkVersion.spark32x, + product_name=product_name, + file_name=product_name.name, + dependency_version=lib_version), + + PyInstallTypes.tar: UrlDependency( + url = 'https://pypi.johnsnowlabs.com/{secret}/spark-ocr/spark-ocr-{lib_version}.tar.gz', + dependency_type=PyInstallTypes.tar, + spark_version=SparkVersion.spark32x, + product_name=product_name, + file_name=product_name.name, + dependency_version=lib_version), + + }, + + } diff --git a/johnsnowlabs/auto_install/offline_install.py b/johnsnowlabs/auto_install/offline_install.py new file mode 100644 index 0000000000..1992d7ad47 --- /dev/null +++ b/johnsnowlabs/auto_install/offline_install.py @@ -0,0 +1,138 @@ +from typing import List, Tuple +from johnsnowlabs.utils.enums import PyInstallTypes, ProductLogo, JvmHardwareTarget +from 
johnsnowlabs.py_models.url_dependency import UrlDependency +from johnsnowlabs.py_models.jsl_secrets import JslSecrets +from johnsnowlabs.auto_install.lib_resolvers import OcrLibResolver, HcLibResolver, NlpLibResolver + + +def get_printable_dependency_urls( + secrets: JslSecrets, + jvm_install_type: JvmHardwareTarget = JvmHardwareTarget.cpu, + py_install_type: PyInstallTypes = PyInstallTypes.wheel, + spark_version=None, +) -> \ + Tuple[List[str], List[str]]: + """ + Get URL for every dependency to which the found_secrets have access to with respect to CURRENT pyspark install. + If no pyspark is installed, this fails because we need to know pyspark version to generate correct URL + :param jvm_install_type: + :param spark_version: + :param secrets: + :param py_install_type: PyInstallTypes.wheel or PyInstallTypes.tar + :return: list of pre-formatted message arrays java_dependencies, py_dependencies + """ + messages = [] + java_dependencies = [] + py_dependencies = [] + if jvm_install_type == JvmHardwareTarget.gpu: + java_dependencies.append( + f'{ProductLogo.nlp.value}{ProductLogo.java.value} Spark NLP GPU Java Jar:' + f'{NlpLibResolver.get_jar_urls(hardware_target=jvm_install_type, spark_version_to_match=spark_version).url}') + elif jvm_install_type == JvmHardwareTarget.cpu: + java_dependencies.append( + f'{ProductLogo.nlp.value}{ProductLogo.java.value} Spark NLP CPU Java Jar:' + f'{NlpLibResolver.get_jar_urls(hardware_target=jvm_install_type, spark_version_to_match=spark_version).url}') + elif jvm_install_type == JvmHardwareTarget.m1: + java_dependencies.append( + f'{ProductLogo.nlp.value}{ProductLogo.java.value} Spark NLP M1 Java Jar:' + f'{NlpLibResolver.get_jar_urls(hardware_target=jvm_install_type, spark_version_to_match=spark_version).url}') + + if py_install_type == PyInstallTypes.wheel: + py_dependencies.append( + f'{ProductLogo.nlp.value}{ProductLogo.python.value} Spark NLP for Python Wheel: ' + f'{NlpLibResolver.get_py_urls(install_type=py_install_type, spark_version_to_match=spark_version).url}') + else: + py_dependencies.append( + f'{ProductLogo.nlp.value}{ProductLogo.python.value} Spark NLP for Python Tar:' + f'{NlpLibResolver.get_py_urls(install_type=py_install_type, spark_version_to_match=spark_version).url}') + + if secrets.HC_SECRET: + java_dependencies.append( + f'{ProductLogo.hc.value}{ProductLogo.java.value} Spark NLP for Healthcare Java Jar:' + f' {HcLibResolver.get_jar_urls(secret=secrets.HC_SECRET, hardware_target=jvm_install_type, spark_version_to_match=spark_version).url}') + if py_install_type == PyInstallTypes.wheel: + py_dependencies.append( + f'{ProductLogo.hc.value}{ProductLogo.python.value} Spark NLP for Healthcare Python Wheel:' + f' {HcLibResolver.get_py_urls(secret=secrets.HC_SECRET, install_type=py_install_type, spark_version_to_match=spark_version).url}') + else: + py_dependencies.append( + f'{ProductLogo.hc.value}{ProductLogo.python.value} Spark NLP for Healthcare Python Tar:' + f' {HcLibResolver.get_py_urls(secret=secrets.HC_SECRET, install_type=py_install_type, spark_version_to_match=spark_version).url}') + + if secrets.OCR_SECRET: + java_dependencies.append( + f'{ProductLogo.ocr.value}{ProductLogo.java.value} Spark OCR Java Jar:' + f' {OcrLibResolver.get_jar_urls(secret=secrets.OCR_SECRET, hardware_target=jvm_install_type, spark_version_to_match=spark_version).url}') + if py_install_type == PyInstallTypes.wheel: + py_dependencies.append( + f'{ProductLogo.ocr.value}{ProductLogo.python.value} Spark OCR Python Wheel:' + f' 
{OcrLibResolver.get_py_urls(secret=secrets.OCR_SECRET, install_type=py_install_type, spark_version_to_match=spark_version).url}')
+        else:
+            py_dependencies.append(
+                f'{ProductLogo.ocr.value}{ProductLogo.python.value} Spark OCR Python Tar:'
+                f' {OcrLibResolver.get_py_urls(secret=secrets.OCR_SECRET, install_type=py_install_type, spark_version_to_match=spark_version).url}')
+
+    print('\n'.join(java_dependencies + py_dependencies))
+    print('Make sure all these dependencies are installed on your Spark driver and worker nodes')
+    return java_dependencies, py_dependencies
+
+
+def get_py4j_dependency_urls(
+        secrets: JslSecrets,
+        jvm_install_type: JvmHardwareTarget = JvmHardwareTarget.cpu,
+        py_install_type: PyInstallTypes = PyInstallTypes.wheel,
+        spark_version=None,
+        get_all_jvm_hardware_targets: bool = False,
+) -> \
+        Tuple[List[UrlDependency], List[UrlDependency]]:
+    """
+    Get the URL of every dependency that the found secrets grant access to, with respect to the CURRENT pyspark install.
+    If no pyspark is installed, this fails, because the pyspark version is needed to generate the correct URLs.
+    :param jvm_install_type: JVM hardware target to fetch jars for
+    :param spark_version: Spark version to match the URLs against
+    :param get_all_jvm_hardware_targets: if True, collect jars for every hardware target, not just jvm_install_type
+    :param secrets: credentials that determine which licensed dependencies are accessible
+    :param py_install_type: PyInstallTypes.wheel or PyInstallTypes.tar
+    :return: tuple of UrlDependency lists (java_dependencies, py_dependencies)
+    """
+    java_dependencies = []
+    py_dependencies = []
+    if get_all_jvm_hardware_targets:
+        # One jar per hardware target, so offline zips work on any machine
+        for hardware_target in JvmHardwareTarget:
+            java_dependencies.append(
+                NlpLibResolver.get_jar_urls(hardware_target=hardware_target, spark_version_to_match=spark_version))
+    else:
+        java_dependencies.append(
+            NlpLibResolver.get_jar_urls(hardware_target=jvm_install_type, spark_version_to_match=spark_version))
+
+    py_dependencies.append(
+        NlpLibResolver.get_py_urls(install_type=py_install_type, spark_version_to_match=spark_version))
+
+    if secrets and secrets.HC_SECRET:
+        java_dependencies.append(
+            HcLibResolver.get_jar_urls(secret=secrets.HC_SECRET, spark_version_to_match=spark_version))
+        py_dependencies.append(HcLibResolver.get_py_urls(secret=secrets.HC_SECRET, install_type=py_install_type,
+                                                         spark_version_to_match=spark_version))
+
+    if secrets and secrets.OCR_SECRET:
+        java_dependencies.append(
+            OcrLibResolver.get_jar_urls(secret=secrets.OCR_SECRET, spark_version_to_match=spark_version))
+        py_dependencies.append(OcrLibResolver.get_py_urls(secret=secrets.OCR_SECRET, install_type=py_install_type,
+                                                          spark_version_to_match=spark_version))
+
+    return java_dependencies, py_dependencies
diff --git a/johnsnowlabs/auto_install/softwares.py b/johnsnowlabs/auto_install/softwares.py new file
mode 100644 index 0000000000..e3df784b5c --- /dev/null +++ b/johnsnowlabs/auto_install/softwares.py @@ -0,0 +1,243 @@
+from typing import Union
+
+from johnsnowlabs.auto_install.lib_resolvers import OcrLibResolver, HcLibResolver, NlpLibResolver
+from johnsnowlabs.abstract_base.software_product import AbstractSoftwareProduct
+from johnsnowlabs.utils.enums import ProductName, ProductLogo, LatestCompatibleProductVersion, \
+    ProductSlogan
+from johnsnowlabs.utils.env_utils import try_import
+from johnsnowlabs.utils.venv_utils import VenvWrapper
+
+"""
+These are the nodes and edges that define the DAG used by check_and_install_dependencies() in install_software.py
+"""
+
+
+class PythonSoftware(AbstractSoftwareProduct):
+    name = ProductName.python.value
+    logo = ProductLogo.python.value
+    latest_version = LatestCompatibleProductVersion.python.value
+
+
+class JavaSoftware(AbstractSoftwareProduct):
+    name = ProductName.java.value
+    logo = ProductLogo.java.value
+    latest_version = LatestCompatibleProductVersion.java.value
+
+
+class SparkSoftware(AbstractSoftwareProduct):
+    name = ProductName.spark.value
+    logo = ProductLogo.spark.value
+    hard_dependencies = {JavaSoftware}
+    latest_version = LatestCompatibleProductVersion.spark.value
+
+
+class PysparkSoftware(AbstractSoftwareProduct):
+    # TODO needs custom install for windows! (?)
+    name = ProductName.pyspark.value
+    logo = ProductLogo.pyspark.value
+    slogan = ProductSlogan.pyspark.value
+    hard_dependencies = {PythonSoftware}
+    latest_version = LatestCompatibleProductVersion.pyspark.value
+    py_module_name = 'pyspark'
+    pypi_name = 'pyspark'
+
+    @classmethod
+    def get_installed_version_via_import(cls):
+        try:
+            import pyspark
+            return pyspark.__version__
+        except Exception:
+            return False
+
+
+class SparkNlpSoftware(AbstractSoftwareProduct):
+    name = ProductName.nlp.value
+    logo = ProductLogo.nlp.value
+    slogan = ProductSlogan.spark_nlp.value
+    hard_dependencies = {SparkSoftware, PysparkSoftware}
+    latest_version = LatestCompatibleProductVersion.spark_nlp.value
+    jsl_url_resolver = NlpLibResolver
+    py_module_name = 'sparknlp'
+    pypi_name = 'spark-nlp'
+    is_py4j = True
+
+    @classmethod
+    def get_installed_version_via_import(cls):
+        try:
+            import sparknlp
+            return sparknlp.version()
+        except Exception:
+            return False
+
+
+class SparkHcSoftware(AbstractSoftwareProduct):
+    name = ProductName.hc.value
+    logo = ProductLogo.hc.value
+    slogan = ProductSlogan.healthcare.value
+    hard_dependencies = {SparkNlpSoftware}
+    latest_version = LatestCompatibleProductVersion.healthcare.value
+    jsl_url_resolver = HcLibResolver
+    py_module_name = 'sparknlp_jsl'
+    pypi_name = 'spark-nlp-jsl'
+    licensed = True
+    is_py4j = True
+
+    @classmethod
+    def get_installed_version_via_import(cls):
+        try:
+            import sparknlp_jsl
+            return sparknlp_jsl.version()
+        except Exception:
+            return False
+
+
+class SparkOcrSoftware(AbstractSoftwareProduct):
+    name = ProductName.ocr.value
+    logo = ProductLogo.ocr.value
+    slogan = ProductSlogan.ocr.value
+    hard_dependencies = {SparkSoftware, PysparkSoftware, SparkNlpSoftware, }
+    optional_dependencies = {SparkHcSoftware}
+    latest_version = LatestCompatibleProductVersion.ocr.value
+    jsl_url_resolver = OcrLibResolver
+    py_module_name = 'sparkocr'
+    pypi_name = 'spark-ocr'
+    licensed = True
+    is_py4j = True
+
+    @classmethod
+    def get_installed_version_via_import(cls):
+        try:
+            import sparkocr
+            return sparkocr.version()
+        except Exception:
+            return False
+
+
+class NlpDisplaySoftware(AbstractSoftwareProduct):
+    name = ProductName.nlp_display.value
+    logo =
ProductLogo.nlp_display.value + slogan = ProductSlogan.nlp_display.value + hard_dependencies = {SparkSoftware} + licensed_dependencies = {SparkHcSoftware} + latest_version = LatestCompatibleProductVersion.nlp_display.value + py_module_name = 'sparknlp_display' + pypi_name = 'spark-nlp-display' + + @classmethod + def get_installed_version_via_import(cls): + try: + import sparknlp_display + return sparknlp_display.version() + except: + return False + + +class NluSoftware(AbstractSoftwareProduct): + name = ProductName.nlu.value + logo = ProductLogo.nlu.value + slogan = ProductSlogan.nlu.value + + hard_dependencies = {SparkNlpSoftware} + licensed_dependencies = {SparkHcSoftware, SparkOcrSoftware} + optional_dependencies = {NlpDisplaySoftware} # Todo streamlit,sklearn,plotly, nlp-display + latest_version = LatestCompatibleProductVersion.nlu.value + py_module_name = 'nlu' + pypi_name = 'nlu' + + @classmethod + def get_installed_version_via_import(cls): + try: + import nlu + return nlu.version() + except: + return False + + @classmethod + def health_check(cls) -> bool: + import nlu + try: + pipe = nlu.load('sentiment') + df = pipe.predict('I love peanut butter and jelly!') + for c in df.columns: + print(df[c]) + except Exception as err: + print(f'Failure testing nlu. Err = {err}') + return False + return True + + +class JohnSnowLabsSoftware(AbstractSoftwareProduct): + # Represents this Library itself + name = ProductName.jsl_lib.value + logo = ProductLogo.jsl_lib.value + slogan = ProductSlogan.jsl_lib.value + + hard_dependencies = {SparkNlpSoftware} + licensed_dependencies = {SparkHcSoftware, SparkOcrSoftware} + optional_dependencies = {NlpDisplaySoftware, NluSoftware} + latest_version = LatestCompatibleProductVersion.jsl_lib.value + py_module_name = 'johnsnowlabs' + pypi_name = 'johnsnowlabs' + pypi_name_databricks = 'johnsnowlabs_for_databricks' + +class JslFullSoftware(AbstractSoftwareProduct): + name = ProductName.jsl_full.value + logo = ProductLogo.jsl_full.value + slogan = ProductSlogan.jsl_full.value + + optional_dependencies = {NlpDisplaySoftware, NluSoftware} # Todo streamlit,sklearn,plotly? + hard_dependencies = {JohnSnowLabsSoftware, SparkNlpSoftware, PysparkSoftware} + licensed_dependencies = {SparkHcSoftware, SparkOcrSoftware} + + @classmethod + def check_installed(cls, python_exec_path=None) -> bool: + if python_exec_path: + return VenvWrapper.is_lib_in_py_exec(python_exec_path, cls.py_module_name, False) + # If python_exec_path=None, then check is for current Python py_executable + return all(try_import(dep.py_module_name) for dep in cls.licensed_dependencies) + + +class Software: + """Accessor to all classes that implement AbstractSoftwareProduct. 
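+    For example (an illustrative sketch; the ProductName members and import path used
+    here are the ones defined in this patch):
+
+        from johnsnowlabs.utils.enums import ProductName
+        Software.for_name(ProductName.nlp) is Software.spark_nlp    # True
+        Software.for_name(ProductName.ocr) is Software.spark_ocr    # True
+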
+    This also gives access to the product enums.
+    """
+    spark_nlp: AbstractSoftwareProduct = SparkNlpSoftware
+    spark_hc: AbstractSoftwareProduct = SparkHcSoftware
+    spark_ocr: AbstractSoftwareProduct = SparkOcrSoftware
+    nlu: AbstractSoftwareProduct = NluSoftware
+    sparknlp_display: AbstractSoftwareProduct = NlpDisplaySoftware
+    pyspark: AbstractSoftwareProduct = PysparkSoftware
+    python: AbstractSoftwareProduct = PythonSoftware
+    java: AbstractSoftwareProduct = JavaSoftware
+    spark: AbstractSoftwareProduct = SparkSoftware
+    jsl_lib: AbstractSoftwareProduct = JohnSnowLabsSoftware
+    jsl_full: AbstractSoftwareProduct = JslFullSoftware
+
+    @staticmethod
+    def for_name(name: Union[str, ProductName]) -> AbstractSoftwareProduct:
+        if isinstance(name, str):
+            name = ProductName.from_str(name)
+
+        if name == ProductName.nlp:
+            return Software.spark_nlp
+        elif name == ProductName.hc:
+            return Software.spark_hc
+        elif name == ProductName.ocr:
+            return Software.spark_ocr
+        elif name == ProductName.nlu:
+            return Software.nlu
+        elif name == ProductName.nlp_display:
+            return Software.sparknlp_display
+        elif name == ProductName.pyspark:
+            return Software.pyspark
+        elif name == ProductName.python:
+            return Software.python
+        elif name == ProductName.java:
+            return Software.java
+        elif name == ProductName.spark:
+            return Software.spark
+        elif name == ProductName.jsl_lib:
+            return Software.jsl_lib
+        elif name == ProductName.jsl_full:
+            return Software.jsl_full
+        return False  # alternatively: raise ValueError(f'Unknown Product {name}')
diff --git a/johnsnowlabs/finance.py b/johnsnowlabs/finance.py new file mode 100644 index 0000000000..fba4453ff5 --- /dev/null +++ b/johnsnowlabs/finance.py @@ -0,0 +1,83 @@
+
+from johnsnowlabs.abstract_base.lib_resolver import try_import_lib
+from johnsnowlabs.utils.print_messages import log_broken_lib
+
+try:
+
+    if try_import_lib('sparknlp_jsl') and try_import_lib('sparknlp'):
+        # Substitutions
+        from sparknlp_jsl.finance import FinanceBertForTokenClassification as BertForTokenClassification
+        from sparknlp_jsl.finance import FinanceNerApproach as NerApproach
+        from sparknlp_jsl.finance import FinanceNerModel as NerModel
+        from sparknlp_jsl.finance import FinanceBertForSequenceClassification as BertForSequenceClassification
+        from sparknlp_jsl.finance import FinanceClassifierDLApproach as ClassifierDLApproach
+        from sparknlp_jsl.finance import FinanceClassifierDLModel as ClassifierDLModel
+        # from sparknlp_jsl.finance import FinanceDocumentHashCoder as DocumentHashCoder
+        # from sparknlp_jsl.finance import FinanceNerQuestionGenerator as QuestionGenerator
+        from sparknlp_jsl.finance.chunk_classification.deid.document_hashcoder import FinanceDocumentHashCoder as DocumentHashCoder
+        from sparknlp_jsl.finance.seq_generation.qa_ner_generator import FinanceNerQuestionGenerator as NerQuestionGenerator
+
+        from sparknlp_jsl.finance import SentenceEntityResolverModel, \
+            ChunkMapperModel, \
+            AssertionDLModel, \
+            RelationExtractionDLModel, \
+            ZeroShotRelationExtractionModel, \
+            ChunkMapperApproach, \
+            SentenceEntityResolverApproach, \
+            AssertionDLApproach, \
+            ZeroShotNerModel
+
+        from sparknlp_jsl.annotator import MedicalDistilBertForSequenceClassification as DistilBertForSequenceClassification
+
+        # These are licensed annotators shared across all libs
+        from sparknlp_jsl.annotator import \
+            AssertionLogRegModel, \
+            DeIdentificationModel, \
+            DocumentLogRegClassifierModel, \
+            RelationExtractionModel, \
+            ChunkMergeModel, \
+            BertSentenceChunkEmbeddings, \
+            ChunkKeyPhraseExtraction, \
+            NerDisambiguatorModel, \
+            EntityChunkEmbeddings, \
+            TFGraphBuilder, \
+            ChunkConverter, \
+            ChunkFilterer, \
+            NerConverterInternal, \
+            NerChunker, \
+            AssertionFilterer, \
+            AnnotationMerger, \
+            RENerChunksFilter, \
+            ChunkSentenceSplitter, \
+            ChunkMapperFilterer, \
+            DateNormalizer, \
+            GenericClassifierModel, \
+            ReIdentification
+
+        from sparknlp_jsl.structured_deidentification import StructuredDeidentification
+        from sparknlp_jsl.annotator.resolution.resolver_merger import ResolverMerger
+
+        from sparknlp_jsl.base import FeaturesAssembler
+
+        from sparknlp_jsl.annotator import \
+            AssertionLogRegApproach, \
+            DeIdentification, \
+            DocumentLogRegClassifierApproach, \
+            RelationExtractionApproach, \
+            ChunkMergeApproach, \
+            NerDisambiguator, \
+            ContextualParserApproach, \
+            GenericClassifierApproach, \
+            Router, \
+            NerQuestionGenerator, \
+            DocumentHashCoder
+
+        from sparknlp_jsl.compatibility import Compatibility
+        from sparknlp_jsl.pretrained import InternalResourceDownloader
+        from sparknlp_jsl.eval import NerDLMetrics, NerDLEvaluation, SymSpellEvaluation, POSEvaluation, \
+            NerCrfEvaluation, NorvigSpellEvaluation
+
+except Exception:
+    if try_import_lib('sparknlp_jsl') and try_import_lib('sparknlp'):
+        log_broken_lib('Enterprise Finance')
diff --git a/johnsnowlabs/legal.py b/johnsnowlabs/legal.py new file mode 100644 index 0000000000..2eda612386 --- /dev/null +++ b/johnsnowlabs/legal.py @@ -0,0 +1,85 @@
+from johnsnowlabs.abstract_base.lib_resolver import try_import_lib
+from johnsnowlabs.utils.print_messages import log_broken_lib
+
+try:
+    if try_import_lib('sparknlp_jsl') and try_import_lib('sparknlp'):
+
+        # Substitutions
+        from sparknlp_jsl.legal import LegalBertForTokenClassification as BertForTokenClassification
+        from sparknlp_jsl.legal import LegalNerApproach as NerApproach
+        from sparknlp_jsl.legal import LegalNerModel as NerModel
+        from sparknlp_jsl.legal import LegalBertForSequenceClassification as BertForSequenceClassification
+        from sparknlp_jsl.legal import LegalClassifierDLApproach as ClassifierDLApproach
+        from sparknlp_jsl.legal import LegalClassifierDLModel as ClassifierDLModel
+        from sparknlp_jsl.annotator import MedicalDistilBertForSequenceClassification as DistilBertForSequenceClassification
+        # from sparknlp_jsl.legal import LegalDocumentHashCoder as DocumentHashCoder
+        # from sparknlp_jsl.legal import LegalNerQuestionGenerator as QuestionGenerator
+        from sparknlp_jsl.legal.chunk_classification.deid.document_hashcoder import LegalDocumentHashCoder as DocumentHashCoder
+        from sparknlp_jsl.legal.seq_generation.qa_ner_generator import LegalNerQuestionGenerator as NerQuestionGenerator
+
+        from sparknlp_jsl.finance import SentenceEntityResolverModel, \
+            ChunkMapperModel, \
+            AssertionDLModel, \
+            RelationExtractionDLModel, \
+            ZeroShotRelationExtractionModel, \
+            ChunkMapperApproach, \
+            SentenceEntityResolverApproach, \
+            AssertionDLApproach, \
+            ZeroShotNerModel
+
+        # These are licensed annotators shared across all libs
+        from sparknlp_jsl.annotator import \
+            AssertionLogRegModel, \
+            DeIdentificationModel, \
+            DocumentLogRegClassifierModel, \
+            RelationExtractionModel, \
+            ChunkMergeModel, \
+            ChunkMapperModel, \
+            BertSentenceChunkEmbeddings, \
+            ChunkKeyPhraseExtraction, \
+            NerDisambiguatorModel, \
+            EntityChunkEmbeddings, \
+            TFGraphBuilder, \
+            ChunkConverter, \
+            ChunkFilterer, \
+            NerConverterInternal, \
+            NerChunker, \
+            AssertionFilterer, \
+            AnnotationMerger, \
+            RENerChunksFilter, \
+            ChunkSentenceSplitter, \
+            ChunkMapperFilterer, \
+            DateNormalizer, \
+            GenericClassifierModel, \
+            ReIdentification
+        # DrugNormalizer, \
+        from sparknlp_jsl.structured_deidentification import StructuredDeidentification
+        from sparknlp_jsl.annotator.resolution.resolver_merger import ResolverMerger
+        from sparknlp_jsl.base import FeaturesAssembler
+
+        from sparknlp_jsl.annotator import \
+            AssertionLogRegApproach, \
+            DeIdentification, \
+            DocumentLogRegClassifierApproach, \
+            RelationExtractionApproach, \
+            ChunkMergeApproach, \
+            NerDisambiguator, \
+            ContextualParserApproach, \
+            GenericClassifierApproach, \
+            Router, \
+            NerQuestionGenerator, \
+            DocumentHashCoder
+
+        from sparknlp_jsl.compatibility import Compatibility
+        from sparknlp_jsl.pretrained import InternalResourceDownloader
+        from sparknlp_jsl.eval import NerDLMetrics, NerDLEvaluation, SymSpellEvaluation, POSEvaluation, \
+            NerCrfEvaluation, NorvigSpellEvaluation
+
+except Exception:
+    if try_import_lib('sparknlp_jsl') and try_import_lib('sparknlp'):
+        log_broken_lib('Enterprise Legal')
diff --git a/johnsnowlabs/medical.py b/johnsnowlabs/medical.py new file mode 100644 index 0000000000..45b926735b --- /dev/null +++ b/johnsnowlabs/medical.py @@ -0,0 +1,94 @@
+from johnsnowlabs.auto_install.softwares import Software
+from johnsnowlabs.utils.print_messages import log_outdated_lib, log_broken_lib
+from johnsnowlabs.abstract_base.lib_resolver import try_import_lib
+
+warning_logged = False
+
+try:
+    if try_import_lib('sparknlp_jsl') and try_import_lib('sparknlp'):
+        # Pretrained
+        from sparknlp_jsl.annotator import \
+            AssertionLogRegModel, \
+            AssertionDLModel, \
+            DeIdentificationModel, \
+            DocumentLogRegClassifierModel, \
+            RelationExtractionModel, \
+            RelationExtractionDLModel, \
+            ChunkMergeModel, \
+            SentenceEntityResolverModel, \
+            ChunkMapperModel, \
+            BertSentenceChunkEmbeddings, \
+            ChunkKeyPhraseExtraction, \
+            NerDisambiguatorModel, \
+            EntityChunkEmbeddings, \
+            ZeroShotRelationExtractionModel, \
+            TFGraphBuilder, \
+            ChunkConverter, \
+            ChunkFilterer, \
+            NerConverterInternal, \
+            NerChunker, \
+            AssertionFilterer, \
+            AnnotationMerger, \
+            RENerChunksFilter, \
+            ChunkSentenceSplitter, \
+            DrugNormalizer, \
+            ChunkMapperFilterer, \
+            DateNormalizer, \
+            GenericClassifierModel, \
+            ReIdentification, \
+            ZeroShotNerModel
+        from sparknlp_jsl.structured_deidentification import StructuredDeidentification
+
+        from sparknlp_jsl.base import FeaturesAssembler
+
+        from sparknlp_jsl.annotator import \
+            AssertionLogRegApproach, \
+            AssertionDLApproach, \
+            DeIdentification, \
+            DocumentLogRegClassifierApproach, \
+            RelationExtractionApproach, \
+            ChunkMergeApproach, \
+            SentenceEntityResolverApproach, \
+            ChunkMapperApproach, \
+            NerDisambiguator, \
+            ContextualParserApproach, \
+            GenericClassifierApproach, \
+            Router, \
+            NerQuestionGenerator, \
+            DocumentHashCoder
+
+        from sparknlp_jsl.annotator.resolution.resolver_merger import ResolverMerger
+
+        from sparknlp_jsl.annotator import MedicalNerModel as NerModel
+        from sparknlp_jsl.annotator import MedicalNerApproach as NerApproach
+        from sparknlp_jsl.annotator import MedicalBertForTokenClassifier as BertForTokenClassifier
+        from sparknlp_jsl.annotator import \
+            MedicalDistilBertForSequenceClassification as DistilBertForSequenceClassification
+        from sparknlp_jsl.annotator import MedicalBertForSequenceClassification as
BertForSequenceClassification + from sparknlp_jsl.compatibility import Compatibility + from sparknlp_jsl.pretrained import InternalResourceDownloader + from sparknlp_jsl.eval import NerDLMetrics, NerDLEvaluation, SymSpellEvaluation, POSEvaluation, \ + NerCrfEvaluation, NorvigSpellEvaluation + + # from sparknlp.base import * + # from sparknlp.annotator import * + # from sparknlp_jsl.annotator import * + # from sparknlp_jsl.base import * + + # FinanceBertForSequenceClassification,\ + # FinanceNerModel,\ + # FinanceBertForTokenClassification,\ + # LegalNerModel,\ + # LegalBertForTokenClassification,\ + # LegalBertForSequenceClassification + + from sparknlp_jsl.functions import * + from sparknlp_jsl.training import * +except: + log_broken_lib(Software.spark_hc) + +if try_import_lib('sparknlp_jsl') and try_import_lib('sparknlp'): + if not Software.spark_hc.check_installed_correct_version() and not warning_logged: + warning_logged = True + import sparknlp_jsl + log_outdated_lib(Software.spark_hc, sparknlp_jsl.version()) diff --git a/johnsnowlabs/nlp.py b/johnsnowlabs/nlp.py new file mode 100644 index 0000000000..05af5eb6a7 --- /dev/null +++ b/johnsnowlabs/nlp.py @@ -0,0 +1,33 @@ +from johnsnowlabs.abstract_base.lib_resolver import try_import_lib + +if try_import_lib('sparknlp'): + from sparknlp.base import * + from sparknlp.annotator import * + import sparknlp + from sparknlp.pretrained import ResourceDownloader + from sparknlp.training import * + from sparknlp.functions import * + from sparknlp.pretrained import PretrainedPipeline + +if try_import_lib('pyspark'): + from pyspark.sql import DataFrame + import pyspark.sql.functions as F + import pyspark.sql.types as T + import pyspark.sql as SQL + + from pyspark import ml as ML + import pyspark.ml.param.shared as _shared_pyspark_ml_param + + ML.param.shared = _shared_pyspark_ml_param + + from pyspark.sql import SparkSession + from pyspark.ml import Pipeline, PipelineModel + +if try_import_lib('warnings'): + import warnings + + warnings.filterwarnings('ignore') + +if try_import_lib('nlu'): + from nlu import load, to_nlu_pipe, autocomplete_pipeline, to_pretty_df + import nlu as nlu diff --git a/johnsnowlabs/ocr.py b/johnsnowlabs/ocr.py new file mode 100644 index 0000000000..706aa431a3 --- /dev/null +++ b/johnsnowlabs/ocr.py @@ -0,0 +1,25 @@ +from johnsnowlabs import settings +from johnsnowlabs.abstract_base.lib_resolver import try_import_lib +from johnsnowlabs.auto_install.softwares import Software +from johnsnowlabs.utils.print_messages import log_outdated_lib + + +warning_logged = False +if try_import_lib('sparkocr') and try_import_lib('sparknlp'): + from sparkocr.transformers import * + from sparkocr.enums import * + import pkg_resources + import sparkocr + from sparkocr.utils import * + from sparkocr.schemas import * + from sparkocr.metrics import * + from sparkocr.databricks import isRunningInDatabricks + + if isRunningInDatabricks(): + # Overwrites functions imported from sparkocr.utils + # but this is fine, since these are not compatible on databricks + from sparkocr.databricks import * + + if not Software.spark_ocr.check_installed_correct_version() and not warning_logged: + log_outdated_lib(Software.spark_ocr, sparkocr.version()) + warning_logged = True diff --git a/johnsnowlabs/py_models/__init__.py b/johnsnowlabs/py_models/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/johnsnowlabs/py_models/install_info.py b/johnsnowlabs/py_models/install_info.py new file mode 100644 index 0000000000..e51fe0ca93 --- /dev/null 
+++ b/johnsnowlabs/py_models/install_info.py @@ -0,0 +1,135 @@ +from typing import Optional, Tuple, Dict, Union, List + +from johnsnowlabs import settings +from johnsnowlabs.abstract_base.pydantic_model import WritableBaseModel +# from johnsnowlabs.abstract_base.software_product import AbstractSoftwareProduct +from johnsnowlabs.py_models.jsl_secrets import JslSecrets +from johnsnowlabs.utils.enums import ProductName, JvmHardwareTarget, PyInstallTypes +from johnsnowlabs.py_models.lib_version import LibVersion +import os + + +class InstallFileInfoBase(WritableBaseModel): + file_name: str + product: ProductName + compatible_spark_version: Union[str, LibVersion] + product_version: Union[str, LibVersion] + + # install_type: Optional[JvmHardwareTarget] + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.compatible_spark_version = LibVersion(self.compatible_spark_version) + self.product_version = LibVersion(self.product_version) + + +class PyInstallInfo(InstallFileInfoBase): + install_type: PyInstallTypes + + +class JvmInstallInfo(InstallFileInfoBase): + install_type: JvmHardwareTarget + + +class LocalPyLib(WritableBaseModel): + py_lib: Optional[PyInstallInfo] = None + + def get_py_path(self): + if not self.py_lib: + return False + if not os.path.exists(f'{settings.py_dir}/{self.py_lib.file_name}'): + return False + return f'{settings.py_dir}/{self.py_lib.file_name}' + + +class LocalPy4JLib(WritableBaseModel): + java_lib: Optional[JvmInstallInfo] = None + py_lib: Optional[PyInstallInfo] = None + + def get_java_path(self): + if not self.java_lib: + return False + if not os.path.exists(f'{settings.java_dir}/{self.java_lib.file_name}'): + return False + return f'{settings.java_dir}/{self.java_lib.file_name}' + + def get_py_path(self): + if not self.py_lib: + return False + if not os.path.exists(f'{settings.py_dir}/{self.py_lib.file_name}'): + return False + return f'{settings.py_dir}/{self.py_lib.file_name}' + + +class RootInfo(WritableBaseModel): + version: Union[str, LibVersion] + run_from: str + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.version = LibVersion(self.version) + + @staticmethod + def get_from_jsl_home(): + return RootInfo.parse_file(settings.root_info_file) + + +class InstallFolder(WritableBaseModel): + infos: Dict[str, Union[PyInstallInfo, JvmInstallInfo]] + + def get_product_entry(self, product: ProductName, + hardware_target: Optional[Union[PyInstallTypes, JvmHardwareTarget]] = None): + for file_name, install_info in self.infos.items(): + if install_info.product == product: + if hardware_target: + if install_info.install_type == hardware_target: + return install_info + else: + return install_info + + @staticmethod + def java_folder_from_home(): + if os.path.exists(settings.java_info_file): + return InstallFolder.parse_file(settings.java_info_file) + return False + + @staticmethod + def py_folder_from_home(): + if os.path.exists(settings.py_info_file): + return InstallFolder.parse_file(settings.py_info_file) + return False + + +class InstallSuite(WritableBaseModel): + info: RootInfo + secrets: Optional[JslSecrets] = None + # Py4J Libs + nlp: LocalPy4JLib + ocr: Optional[LocalPy4JLib] = None + hc: Optional[LocalPy4JLib] = None + # Pure Python Libs + pure_py_jsl: Optional[LocalPyLib] = None + + def get_missing_products(self, ): + missing = [] + from johnsnowlabs.auto_install.softwares import Software + if self.secrets.OCR_LICENSE: + if not self.ocr.java_lib or not self.ocr.get_java_path(): + 
missing.append(Software.spark_ocr) + if self.secrets.HC_LICENSE: + if not self.hc.java_lib or not self.hc.get_java_path(): + missing.append(Software.spark_hc) + if not self.nlp.java_lib or not self.nlp.get_java_path(): + missing.append(Software.spark_nlp) + return missing + + def log_missing_jars(self, + should_have_ocr, + should_have_hc, + should_have_nlp, ): + print(f'🚨 Looks like some of the missing jars could not be fetched...') + if not self.ocr.java_lib and self.secrets.OCR_LICENSE and should_have_ocr: + print(f'🚨 Missing Jar for OCR') + if not self.hc.java_lib and self.secrets.HC_LICENSE and should_have_hc: + print(f'🚨 Missing Jar for Medical') + if not self.nlp.java_lib and should_have_nlp: + print(f'🚨 Missing Jar for NLP') diff --git a/johnsnowlabs/py_models/jsl_secrets.py b/johnsnowlabs/py_models/jsl_secrets.py new file mode 100644 index 0000000000..a5bdbd3ce0 --- /dev/null +++ b/johnsnowlabs/py_models/jsl_secrets.py @@ -0,0 +1,625 @@ +from dataclasses import dataclass +from enum import Enum +from functools import partial +from pathlib import Path +from typing import Optional, Union, Dict, List +from abc import ABC, abstractmethod +import glob + +import requests +import json + +from johnsnowlabs.py_models.lib_version import LibVersion +from johnsnowlabs.utils.enums import ProductName + +from johnsnowlabs.abstract_base.pydantic_model import WritableBaseModel +from johnsnowlabs.py_models.primitive import LibVersionIdentifier, Secret +import os + +from johnsnowlabs.utils.file_utils import json_path_as_dict +from johnsnowlabs.utils.my_jsl_api import get_user_licenses, download_license, get_access_token, \ + get_access_key_from_browser, get_user_lib_secrets +from johnsnowlabs import settings +from pydantic import Field, validator + +secret_json_keys = ['JSL_SECRET', 'SECRET', 'SPARK_NLP_LICENSE', 'JSL_LICENSE', 'JSL_VERSION', 'PUBLIC_VERSION', + 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY', + 'SPARK_OCR_LICENSE', 'SPARK_OCR_SECRET', 'OCR_VERSION', + 'HC_SECRET', 'HC_LICENSE', 'HC_VERSION', 'OCR_SECRET', 'OCR_LICENSE', + 'JSL_LEGAL_LICENSE', + 'JSL_FINANCE_LICENSE', + ] + +already_logged = False +ocr_validation_logged = False +hc_validation_logged = False + + +class JslSecrets(WritableBaseModel): + """Representation of a JSL credentials and helper + methods for reading/storing found_secrets and managing .jslhome folder + """ + HC_SECRET: Secret = None + HC_LICENSE: Secret = None + HC_VERSION: Optional[LibVersionIdentifier] = None + OCR_SECRET: Secret = None + OCR_LICENSE: Secret = None + OCR_VERSION: Optional[LibVersionIdentifier] = None + AWS_ACCESS_KEY_ID: Secret = None + AWS_SECRET_ACCESS_KEY: Secret = None + NLP_VERSION: Optional[LibVersionIdentifier] = None + JSL_LEGAL_LICENSE: Secret = None + JSL_FINANCE_LICENSE: Secret = None + + @staticmethod + def raise_invalid_version(): + print( + f'To fix invalid license please visit https://my.johnsnowlabs.com/ and download license with the latest secrets. ' + f'This file cannot be used to install any of the licensed libraries ') + raise ValueError('Invalid secrets') + + @validator('HC_SECRET') + def hc_version_check(cls, HC_SECRET): + global hc_validation_logged + try: + if not JslSecrets.is_hc_secret_correct_version(HC_SECRET) and not hc_validation_logged: + hc_validation_logged = True + print( + f"🚨 Outdated Medical Secrets in license file. 
Version={HC_SECRET.split('-')[0]} but should be Version={settings.raw_version_medical}") + if settings.enforce_secret_on_version: + raise ValueError('Invalid HC Secret') + else: + return HC_SECRET + + else: + return HC_SECRET + except ValueError as err: + cls.raise_invalid_version() + except Exception as err: + pass + + @staticmethod + def is_ocr_secret_correct_version(ocr_secret: Optional[str]) -> bool: + return ocr_secret and ocr_secret.split('-')[0] == settings.raw_version_ocr + + @staticmethod + def is_hc_secret_correct_version(hc_secret: Optional[str]) -> bool: + return hc_secret and hc_secret.split('-')[0] == settings.raw_version_medical + + @validator('OCR_SECRET') + def ocr_version_check(cls, OCR_SECRET): + global ocr_validation_logged + try: + if not JslSecrets.is_ocr_secret_correct_version(OCR_SECRET) and not ocr_validation_logged: + ocr_validation_logged = True + print( + f"🚨 Outdated OCR Secrets in license file. Version={OCR_SECRET.split('-')[0]} but should be Version={settings.raw_version_ocr}") + if settings.enforce_secret_on_version: + raise ValueError("Invalid OCR Secret") + else: + return OCR_SECRET + else: + return OCR_SECRET + except ValueError as err: + cls.raise_invalid_version() + except Exception as err: + pass + + @staticmethod + def is_other_older_secret(first, other: Optional[str]): + """Compare one secret value to another i.e. first.secret >= other.secret . + Returns True if first is larger or equal to other + Returns False if the other is a newer secret""" + v1 = LibVersion(first.split('-')[0]) + if not other: + return False + v2 = LibVersion(other.split('-')[0]) + if v1.equals(v2): + return False + # If the other lib is older, then it must not be greater + return not v1.is_other_greater(v2) + + def equal_credentials(self, other: 'JslSecrets'): + """ + Compare this secret to another secret, returns True for equal and False otherwise. + Since library secrets are universally equal across all secrets, + we just jest the fields,AWS_SECRET_ACCESS_KEY,AWS_ACCESS_KEY_ID,OCR_LICENSE,HC_LICENSE + for equality and omit secret fields for Lib Version + :param other: another instance of JslSecrets to compare + :return: True for equal False otherwise + """ + if any([ + self.AWS_SECRET_ACCESS_KEY != other.AWS_SECRET_ACCESS_KEY, + self.AWS_ACCESS_KEY_ID != other.AWS_ACCESS_KEY_ID, + self.OCR_LICENSE != other.OCR_LICENSE, + self.HC_LICENSE != other.HC_LICENSE, + self.JSL_LEGAL_LICENSE != other.JSL_LEGAL_LICENSE, + self.JSL_FINANCE_LICENSE != other.JSL_FINANCE_LICENSE, + ]): + return False + else: + return True + + def equal_lib_secrets(self, other: 'JslSecrets'): + """ + Compare lib secrets to another secret, returns True for equal and False otherwise. 
+ Anything which is not a lib secret is referred to as a credential + for equality and omit secret fields for Lib Version + :param other: another instance of JslSecrets to compare + :return: True for equal False otherwise + """ + if any([ + self.OCR_SECRET != other.OCR_SECRET, + self.HC_SECRET != other.HC_SECRET, + ]): + return False + else: + return True + + @staticmethod + def build_or_try_find_secrets( + browser_login: bool = True, + force_browser: bool = False, + access_token: Optional[str] = None, + local_license_number: int = 0, + remote_license_number: int = 0, + secrets_file: Optional[str] = None, + hc_license: Optional[str] = None, + hc_secret: Optional[str] = None, + ocr_secret: Optional[str] = None, + ocr_license: Optional[str] = None, + aws_access_key: Optional[str] = None, + aws_key_id: Optional[str] = None, + fin_license: Optional[str] = None, + leg_license: Optional[str] = None, + return_empty_secrets_if_none_found=False, + store_in_jsl_home=True + ) -> Union['JslSecrets', bool]: + """ + Builds JslSecrets object if any found_secrets supplied or if none supplied, + tries out every default resolution method defined to find found_secrets + and build a JSlSecrets object. + at the end of flow we always check if secrets are new and store to disk if they are, unless + + :return: JslSecrets if any found_secrets found otherwise False + """ + secrets = None + try: + # we wrap this flow with try/except, so that incase we get invalid license data + # we can still try loading from JSL-Home afterwards + + if any([hc_license, hc_secret, ocr_secret, ocr_license, aws_access_key, aws_key_id]): + # Some found_secrets are supplied + secrets = JslSecrets(HC_SECRET=hc_secret, HC_LICENSE=hc_license, OCR_SECRET=ocr_secret, + OCR_LICENSE=ocr_license, + AWS_ACCESS_KEY_ID=aws_key_id, AWS_SECRET_ACCESS_KEY=aws_access_key, + JSL_LEGAL_LICENSE=leg_license, JSL_FINANCE_LICENSE=fin_license) + elif access_token: + secrets = JslSecrets.from_access_token(access_token, remote_license_number) + + # elif email and passw: + # found_secrets = JslSecrets.from_email_and_pass(email, passw,local_license_number) + + elif secrets_file: + # Load from JSON file from provided secret file + secrets = JslSecrets.from_json_file_path(secrets_file) + + if not secrets and not force_browser: + # Try auto Resolve credentials if none are supplied + secrets = JslSecrets.search_default_locations(license_number=local_license_number) + if not secrets and not force_browser: + # Search Env Vars + secrets = JslSecrets.search_env_vars() + except Exception as err: + print(f'🚨 Failure Trying to read license {err}\n', + f'Trying to use license from John Snow Labs home folder if it exists') + + if not secrets and not force_browser: + # Search Env Vars + secrets = JslSecrets.from_jsl_home(license_number=local_license_number) + + if browser_login and not secrets or force_browser: + # TODO more exception handling and pick License from UI? 
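+            # Fall back to an interactive browser login: open the My-JSL OAuth page,
+            # wait for the redirect and exchange the auth code for an access token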
+ access_token = get_access_key_from_browser() + secrets = JslSecrets.from_access_token(access_token, remote_license_number) + + if not secrets and return_empty_secrets_if_none_found: + # Return empty found_secrets object + return JslSecrets() + if secrets and store_in_jsl_home: + # We found some found_secrets + # Store them if this is the first time JSL-Creds are loaded on this machine + JslSecrets.store_in_jsl_home_if_new(secrets) + return secrets + + return False + + @staticmethod + def dict_has_jsl_secrets(secret_dict: Dict[str, str]) -> bool: + + for key in secret_json_keys: + if key in secret_dict: + return True + return False + + @staticmethod + def search_env_vars() -> Union['JslSecrets', bool]: + """ + Search env vars for valid JSL-Secret values + :return: JslSecrets if secret found, False otherwise + """ + # We define max json size, anything above this will not be checked + + hc_secret = os.environ['JSL_SECRET'] if 'JSL_SECRET' in os.environ else None + if not hc_secret: + hc_secret = os.environ['SECRET'] if 'SECRET' in os.environ else None + if not hc_secret: + hc_secret = os.environ['HC_SECRET'] if 'HC_SECRET' in os.environ else None + + hc_license = os.environ['SPARK_NLP_LICENSE'] if 'SPARK_NLP_LICENSE' in os.environ else None + if not hc_license: + hc_license = os.environ['JSL_LICENSE'] if 'JSL_LICENSE' in os.environ else None + if not hc_license: + hc_license = os.environ['HC_LICENSE'] if 'HC_LICENSE' in os.environ else None + + hc_version = os.environ['JSL_VERSION'] if 'JSL_VERSION' in os.environ else None + if not hc_version: + hc_version = os.environ['HC_VERSION'] if 'HC_VERSION' in os.environ else None + + nlp_version = os.environ['PUBLIC_VERSION'] if 'PUBLIC_VERSION' in os.environ else None + aws_access_key_id = os.environ['AWS_ACCESS_KEY_ID'] if 'AWS_ACCESS_KEY_ID' in os.environ else None + aws_access_key = os.environ['AWS_SECRET_ACCESS_KEY'] if 'AWS_SECRET_ACCESS_KEY' in os.environ else None + + ocr_license = os.environ['SPARK_OCR_LICENSE'] if 'SPARK_OCR_LICENSE' in os.environ else None + if not ocr_license: + ocr_license = os.environ['OCR_LICENSE'] if 'OCR_LICENSE' in os.environ else None + + ocr_secret = os.environ['SPARK_OCR_SECRET'] if 'SPARK_OCR_SECRET' in os.environ else None + if not ocr_secret: + ocr_secret = os.environ['OCR_SECRET'] if 'OCR_SECRET' in os.environ else None + + ocr_version = os.environ['OCR_VERSION'] if 'OCR_VERSION' in os.environ else None + + leg_license = os.environ['JSL_LEGAL_LICENSE'] if 'JSL_LEGAL_LICENSE' in os.environ else None + fin_license = os.environ['JSL_FINANCE_LICENSE'] if 'JSL_FINANCE_LICENSE' in os.environ else None + + if any([hc_secret, hc_license, hc_license, hc_version, nlp_version, aws_access_key_id, aws_access_key, + ocr_license, ocr_secret, ocr_version]): + print('👌 License detected in Environment Variables') + return JslSecrets( + HC_SECRET=hc_secret, + HC_LICENSE=hc_license, + HC_VERSION=hc_version, + OCR_SECRET=ocr_secret, + OCR_LICENSE=ocr_license, + OCR_VERSION=ocr_version, + NLP_VERSION=nlp_version, + AWS_ACCESS_KEY_ID=aws_access_key_id, + AWS_SECRET_ACCESS_KEY=aws_access_key, + JSL_LEGAL_LICENSE=leg_license, + JSL_FINANCE_LICENSE=fin_license, ) + return False + + @staticmethod + def search_default_locations(license_number=0) -> Union['JslSecrets', bool]: + """ + Search default google colab folder and current working dir for + for JSL Secret json file + :return: JslSecrets if secret found, False otherwise + """ + # We define max json size, anything above this will not be checked + max_json_file_size = 10000 + + # 
1. Check colab content folder + if os.path.exists('/content'): + j_files = glob.glob('/content/*.json') + for f_path in j_files: + try: + if os.path.getsize(f_path) > max_json_file_size: + continue + json_dict = JslSecrets.json_path_as_dict(f_path) + if JslSecrets.dict_has_jsl_secrets(json_dict): + print(f'👌 Detected license file {f_path}') # ✅ + return JslSecrets.from_json_file_path(f_path) + except: + continue + + # 2. Check current working dir + j_files = glob.glob(f'{os.getcwd()}/*.json') + for f_path in j_files: + try: + if os.path.getsize(f_path) > max_json_file_size: + continue + + json_dict = JslSecrets.json_path_as_dict(f_path) + if JslSecrets.dict_has_jsl_secrets(json_dict): + print(f'👌 Detected license file {f_path}') # ✅ + return JslSecrets.from_json_file_path(f_path) + except: + continue + # 3. Check JSL home + return JslSecrets.from_jsl_home(license_number=license_number) + + @staticmethod + def json_path_as_dict(path): + with open(path) as f: + return json.load(f) + + @staticmethod + def from_json_file_path(secrets_path): + if not os.path.exists(secrets_path): + raise FileNotFoundError(f'No file found for secrets_path={secrets_path}') + f = open(secrets_path) + creds = JslSecrets.from_json_dict(json.load(f)) + f.close() + return creds + + @staticmethod + def from_access_token(access_token, license_number=0): + licenses = get_user_licenses(access_token) + secrets = get_user_lib_secrets(access_token) + # 1. Oct Secret + if license_number >= len(licenses) or license_number < 0: + raise ValueError( + f'You have {len(licenses)} in total. Input License Number {license_number} is invalid, up to {len(licenses) - 1} accepted.') + data = download_license(licenses[license_number], access_token) + + # Fix lib secrets in license data to correct version + ocr_candidates = list( + filter(lambda x: x.version_secret == settings.raw_version_secret_ocr and x.product == ProductName.ocr, + secrets)) + hc_handidates = list( + filter(lambda x: x.version_secret == settings.raw_version_secret_medical and x.product == ProductName.hc, + secrets)) + if hc_handidates: + data['SECRET'] = hc_handidates[0].secret + data['JSL_VERSION'] = hc_handidates[0].version + if ocr_candidates: + data['SPARK_OCR_SECRET'] = ocr_candidates[0].secret + data['OCR_VERSION'] = ocr_candidates[0].version + return JslSecrets.from_json_dict(data, licenses[license_number]) + + @staticmethod + def from_email_and_pass(email, passw, license_number=0): + # TODO test and wait for PR ! 
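+        # Note: this path exchanges email/password for an access token via the
+        # My-JSL GraphQL API and then downloads the selected license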
+ access_token = get_access_token(email, passw) + licenses = get_user_licenses(access_token) + data = download_license(licenses[license_number], access_token) + secrets = JslSecrets.from_json_dict(data, licenses[license_number], ) + return secrets + + @staticmethod + def from_json_dict(secrets, secrets_metadata: Optional = None) -> 'JslSecrets': + hc_secret = secrets['JSL_SECRET'] if 'JSL_SECRET' in secrets else None + if not hc_secret: + hc_secret = secrets['SECRET'] if 'SECRET' in secrets else None + if not hc_secret: + hc_secret = secrets['HC_SECRET'] if 'HC_SECRET' in secrets else None + + hc_license = secrets['SPARK_NLP_LICENSE'] if 'SPARK_NLP_LICENSE' in secrets else None + if not hc_license: + hc_license = secrets['JSL_LICENSE'] if 'JSL_LICENSE' in secrets else None + if not hc_license: + hc_license = secrets['HC_LICENSE'] if 'HC_LICENSE' in secrets else None + + hc_version = secrets['JSL_VERSION'] if 'JSL_VERSION' in secrets else None + if not hc_version: + hc_version = secrets['HC_VERSION'] if 'HC_VERSION' in secrets else None + + nlp_version = secrets['PUBLIC_VERSION'] if 'PUBLIC_VERSION' in secrets else None + aws_access_key_id = secrets['AWS_ACCESS_KEY_ID'] if 'AWS_ACCESS_KEY_ID' in secrets else None + aws_access_key = secrets['AWS_SECRET_ACCESS_KEY'] if 'AWS_SECRET_ACCESS_KEY' in secrets else None + + ocr_license = secrets['SPARK_OCR_LICENSE'] if 'SPARK_OCR_LICENSE' in secrets else None + if not ocr_license: + ocr_license = secrets['OCR_LICENSE'] if 'OCR_LICENSE' in secrets else None + + ocr_secret = secrets['SPARK_OCR_SECRET'] if 'SPARK_OCR_SECRET' in secrets else None + if not ocr_secret: + ocr_secret = secrets['OCR_SECRET'] if 'OCR_SECRET' in secrets else None + + ocr_version = secrets['OCR_VERSION'] if 'OCR_VERSION' in secrets else None + + leg_license = secrets['JSL_LEGAL_LICENSE'] if 'JSL_LEGAL_LICENSE' in secrets else None + fin_license = secrets['JSL_FINANCE_LICENSE'] if 'JSL_FINANCE_LICENSE' in secrets else None + + return JslSecrets( + HC_SECRET=hc_secret, + HC_LICENSE=hc_license, + HC_VERSION=hc_version, + OCR_SECRET=ocr_secret, + OCR_LICENSE=ocr_license, + OCR_VERSION=ocr_version, + NLP_VERSION=nlp_version, + AWS_ACCESS_KEY_ID=aws_access_key_id, + AWS_SECRET_ACCESS_KEY=aws_access_key, + JSL_LEGAL_LICENSE=leg_license, + JSL_FINANCE_LICENSE=fin_license, + + # id=secrets_metadata['id'], + # license_type=secrets_metadata['type'], + # end_date=secrets_metadata['endDate'], + # platform=secrets_metadata['platform'], + # products=secrets_metadata['products'], + + ) + + @staticmethod + def from_jsl_home(license_number=0, log=True, raise_error=False) -> Union['JslSecrets', bool]: + global already_logged + if not os.path.exists(settings.creds_info_file): + return False + + try: + # Try/Catch incase we get validation errors from outdated files + license_infos = LicenseInfos.parse_file(settings.creds_info_file) + if log and not already_logged: + already_logged = True + print( + f'📋 Loading license number {license_number} from {settings.license_dir}/{list(license_infos.infos.keys())[license_number]}') + except: + license_infos = JslSecrets.update_outdated_lib_secrets() + if license_number >= len(license_infos.infos) or license_number < 0: + if raise_error: + raise ValueError( + f'You have {len(license_infos.infos)} different credentials in total ' + f'but specified license_number={license_number}.' 
+                    f'Please specify a number smaller than {len(license_infos.infos)}')
+            else:
+                return False
+        return license_infos.infos[list(license_infos.infos.keys())[license_number]].jsl_secrets
+
+    @staticmethod
+    def update_outdated_lib_secrets(new_secrets: Optional['JslSecrets'] = None) -> Optional['LicenseInfos']:
+        # new_secrets may be None when we only want to re-validate and re-write
+        # the stored license files without upgrading any lib secrets
+        print('Trying to fix outdated licenses')
+        hc_secrets = new_secrets.HC_SECRET if new_secrets else None
+        ocr_secret = new_secrets.OCR_SECRET if new_secrets else None
+        invalid_licenses = []
+        for license in os.listdir(settings.license_dir):
+            if license == 'info.json':
+                continue
+            secrets = JslSecrets.parse_file(f'{settings.license_dir}/{license}')
+            if secrets.HC_SECRET and hc_secrets and \
+                    JslSecrets.is_other_older_secret(hc_secrets, secrets.HC_SECRET):
+                invalid_licenses.append(f'{settings.license_dir}/{license}')
+            elif secrets.OCR_SECRET and ocr_secret \
+                    and JslSecrets.is_other_older_secret(ocr_secret, secrets.OCR_SECRET):
+                invalid_licenses.append(f'{settings.license_dir}/{license}')
+
+        for license_path in invalid_licenses:
+            print(f'Updating license file {license_path}')
+            license_dict = json_path_as_dict(license_path)
+            if license_dict['HC_LICENSE'] and hc_secrets \
+                    and JslSecrets.is_other_older_secret(hc_secrets, license_dict['HC_SECRET']):
+                print(
+                    f'🤓 Upgraded Medical Secrets to {hc_secrets.split("-")[0]} in credentials file {license_path}')
+                license_dict['HC_SECRET'] = hc_secrets
+            if license_dict['OCR_LICENSE'] and ocr_secret \
+                    and JslSecrets.is_other_older_secret(ocr_secret, license_dict['OCR_SECRET']):
+                print(
+                    f'🤓 Upgraded OCR Secrets to {ocr_secret.split("-")[0]} in credentials file {license_path}')
+                license_dict['OCR_SECRET'] = ocr_secret
+            JslSecrets(**license_dict).write(license_path)
+
+        # we need to update the info dict as well
+        info_dict = json_path_as_dict(settings.creds_info_file)
+        for license_file, license_metadata in info_dict['infos'].items():
+            license_dict = license_metadata['jsl_secrets']
+            if license_dict['HC_LICENSE'] and hc_secrets \
+                    and JslSecrets.is_other_older_secret(hc_secrets, license_dict['HC_SECRET']):
+                license_dict['HC_SECRET'] = hc_secrets
+            if license_dict['OCR_LICENSE'] and ocr_secret \
+                    and JslSecrets.is_other_older_secret(ocr_secret, license_dict['OCR_SECRET']):
+                license_dict['OCR_SECRET'] = ocr_secret
+        LicenseInfos(**info_dict).write(settings.creds_info_file)
+
+        try:
+            return LicenseInfos.from_home()
+        except:
+            print(
+                '🚨 Looks like all your credentials are outdated, please visit https://my.johnsnowlabs.com/ to get updated ones or contact John Snow Labs support')
+            raise ValueError('Outdated John Snow Labs Credentials Directory')
+
+    @staticmethod
+    def are_credentials_known(found_secrets: 'JslSecrets') -> bool:
+        # Return True if the secrets are already stored in JSL-Home, otherwise False
+        Path(settings.py_dir).mkdir(parents=True, exist_ok=True)
+        if os.path.exists(settings.creds_info_file):
+            license_infos = LicenseInfos.parse_file(settings.creds_info_file)
+        else:
+            # If the license dir did not exist yet, the secrets are certainly new
+            return False
+
+        # if any stored secrets equal found_secrets, then we already know them
+        return any(map(lambda x: found_secrets.equal_credentials(x.jsl_secrets), license_infos.infos.values()))
+
+    @staticmethod
+    def are_lib_secrets_an_upgrade(found_secrets: 'JslSecrets') -> bool:
+        # Return True if the found lib secrets are newer than the existing ones,
+        # in which case we upgrade the locally stored secrets
+        Path(settings.py_dir).mkdir(parents=True, exist_ok=True)
+        if os.path.exists(settings.creds_info_file):
+            license_infos = LicenseInfos.parse_file(settings.creds_info_file)
else: + # If license dir did not exist yet, secrets are certainly new + return False + + # if any stored secrets equal to found_secrets, then we already know them + # check OCR secrets + if found_secrets.HC_SECRET: + if any(map(lambda x: JslSecrets.is_other_older_secret(found_secrets.HC_SECRET, x.jsl_secrets.HC_SECRET), + license_infos.infos.values())): + return True + if found_secrets.OCR_SECRET: + if any(map(lambda x: JslSecrets.is_other_older_secret(found_secrets.OCR_SECRET, x.jsl_secrets.OCR_SECRET), + license_infos.infos.values())): + return True + return False + + @staticmethod + def store_in_jsl_home_if_new(secrets: 'JslSecrets') -> None: + global already_logged + # Store secrets in JSL home and update info file if secrets are new + if JslSecrets.are_lib_secrets_an_upgrade(secrets): + # Update all secret files in JSL home, since this one has an upgrade + JslSecrets.update_outdated_lib_secrets(secrets) + if JslSecrets.are_credentials_known(secrets): + return + + # Store the secret, since it's new + Path(settings.license_dir).mkdir(parents=True, exist_ok=True) + products = [] + file_name = 'license_number_{number}_for_' + if secrets.HC_LICENSE: + products.append(ProductName.hc.value) + if secrets.OCR_LICENSE: + products.append(ProductName.ocr.value) + + file_name = file_name + '_'.join(products) + f'.json' + + if os.path.exists(settings.creds_info_file): + license_infos = LicenseInfos.parse_file(settings.creds_info_file) + file_name = file_name.format(number=str(len(license_infos.infos))) + license_info = LicenseInfo(jsl_secrets=secrets, products=products, id=str(len(license_infos.infos))) + license_infos.infos[file_name] = license_info + license_infos.write(settings.creds_info_file) + out_dir = f'{settings.license_dir}/{file_name}' + secrets.write(out_dir) + print(f'📋 Stored new John Snow Labs License in {out_dir}') + else: + file_name = file_name.format(number='0') + license_info = LicenseInfo(jsl_secrets=secrets, products=products, id='0') + LicenseInfos(infos={file_name: license_info}).write(settings.creds_info_file) + out_dir = f'{settings.license_dir}/{file_name}' + secrets.write(out_dir) + print(f'📋 Stored John Snow Labs License in {out_dir}') + # We might load again JSL-Secrets from local + already_logged = True + + +class MyJslLicenseDataResponse(WritableBaseModel): + """Representation of MyJSL API Response""" + id: str + license_type: str + end_date: str + platform: Optional[str] + products: List[ProductName] + product_name: ProductName + + +class LicenseInfo(WritableBaseModel): + id: str + jsl_secrets: JslSecrets + products: List[ProductName] + + +class LicenseInfos(WritableBaseModel): + """Representation of a LicenseInfo in ~/.johnsnowlabs/licenses/info.json + Maps file_name to LicenseInfo + """ + infos: Dict[str, LicenseInfo] + + @staticmethod + def from_home() -> Optional['LicenseInfos']: + if os.path.exists(settings.creds_info_file): + return LicenseInfos.parse_file(settings.creds_info_file) + return None diff --git a/johnsnowlabs/py_models/lib_version.py b/johnsnowlabs/py_models/lib_version.py new file mode 100644 index 0000000000..94aa1ba03c --- /dev/null +++ b/johnsnowlabs/py_models/lib_version.py @@ -0,0 +1,127 @@ +from typing import Optional + + +class LibVersion: + """Representation of a library version in format A.B.C + where elements can digits or X which matches all others digits """ + + def __init__(self, + major_or_canonical_str: str, + minor: Optional[str] = None, + patch: Optional[str] = None, ): + self.major, self.minor, self.patch = None, None, None + + 
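+        # Accepts either a canonical 'A.B.C' string or the individual components;
+        # the character 'x' acts as a wildcard that matches any digit in comparisons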
+        if '.' in major_or_canonical_str:
+            splits = major_or_canonical_str.lower().split('.')
+            if not all(splits):
+                raise ValueError(
+                    'When using the canonical representation to construct a LibVersion, the format A.B.C must be used')
+            self.major = splits[0]
+            if len(splits) > 1:
+                self.minor = splits[1]
+            if len(splits) > 2:
+                self.patch = splits[2]
+        else:
+            self.major = major_or_canonical_str
+            self.minor = minor
+            self.patch = patch
+
+    def equals(self: 'LibVersion', lib2: 'LibVersion') -> bool:
+        """
+        Compare two LibVersions of format A.B.C, where each component is either digits or x.
+        x equals any other component, i.e. 3.x.x equals 3.5.1.
+        A LibVersion may also have the pattern A.B or just A.
+        :param lib2: the version to compare against
+        :return: True if the versions match, False otherwise
+        """
+        if self.major == lib2.major:
+            if self.minor == 'x' or lib2.minor == 'x':
+                return True
+            if self.minor == lib2.minor:
+                if self.patch == 'x' or lib2.patch == 'x':
+                    return True
+                if self.patch == lib2.patch:
+                    return True
+        return False
+
+    def is_other_greater(self: 'LibVersion', other: 'LibVersion') -> bool:
+        # Checks whether self < other. Components are stored as strings, so we
+        # compare them numerically; wildcard versions are not meant to be ordered.
+        if self.as_str() == other.as_str():
+            return False
+
+        # Major check
+        if int(self.major) < int(other.major):
+            return True
+        elif int(self.major) > int(other.major):
+            return False
+
+        # Minor check
+        if int(self.minor) < int(other.minor):
+            return True
+        elif int(self.minor) > int(other.minor):
+            return False
+
+        # Patch could be missing; the version that does specify a patch is the
+        # more specific, i.e. greater one, e.g. 4.2 < 4.2.0
+        if self.patch and not other.patch:
+            return False
+        if not self.patch and other.patch:
+            return True
+        if not self.patch and not other.patch:
+            return False
+
+        # The patch may carry a release-candidate marker, e.g. '1rc4'
+        def parse_patch(patch):
+            if 'rc' in patch:
+                p, rc = patch.split('rc')
+                return int(p), int(rc)
+            return int(patch), None
+
+        self_patch, self_rc = parse_patch(self.patch)
+        other_patch, other_rc = parse_patch(other.patch)
+
+        if self_patch < other_patch:
+            return True
+        if self_patch > other_patch:
+            return False
+
+        # Same patch digit, but one version may have an rc
+        if not self_rc and other_rc:
+            return True
+        if self_rc and not other_rc:
+            return False
+        # Neither has an rc
+        if not self_rc and not other_rc:
+            return False
+
+        # Both have an int rc
+        if self_rc < other_rc:
+            return True
+        if self_rc > other_rc:
+            return False
+
+        # Version is equal, not an upgrade
+        return False
+
+    def as_str(self) -> str:
+        """Return the LibVersion object as its canonical str representation"""
+        # We filter out all None values so version checks match up
+        return '.'.join(filter(lambda x: x, [self.major, self.minor, self.patch]))
diff --git a/johnsnowlabs/py_models/license_info.py b/johnsnowlabs/py_models/license_info.py
new file mode 100644
index 0000000000..e6f8e7d97d
--- /dev/null
+++ b/johnsnowlabs/py_models/license_info.py
@@ -0,0 +1,10 @@
+import secrets
+from typing import List, Optional, Dict
+
+
+from johnsnowlabs.py_models.jsl_secrets import JslSecrets
+
+from johnsnowlabs.utils.enums import ProductName
+from johnsnowlabs.abstract_base.pydantic_model import WritableBaseModel
+import hashlib
+
diff --git a/johnsnowlabs/py_models/primitive.py b/johnsnowlabs/py_models/primitive.py
new file mode 100644
index
0000000000..031cec0f0d --- /dev/null +++ b/johnsnowlabs/py_models/primitive.py @@ -0,0 +1,6 @@ +class LibVersionIdentifier(str): + """Representation of a specific library version""" + pass + + +class Secret(str): pass diff --git a/johnsnowlabs/py_models/url_dependency.py b/johnsnowlabs/py_models/url_dependency.py new file mode 100644 index 0000000000..0f1e8a5a6a --- /dev/null +++ b/johnsnowlabs/py_models/url_dependency.py @@ -0,0 +1,54 @@ +import shutil +import urllib.request +from typing import Union, Any +from urllib.request import urlopen + +import requests + +from johnsnowlabs.utils.enums import JvmHardwareTarget, PyInstallTypes, ProductName +from johnsnowlabs.abstract_base.pydantic_model import WritableBaseModel +from johnsnowlabs.utils.enums import SparkVersion +from johnsnowlabs.py_models.lib_version import LibVersion + + +class UrlDependency(WritableBaseModel): + """Representation of a URL""" + url: str + dependency_type: Union[JvmHardwareTarget, PyInstallTypes] + spark_version: SparkVersion + dependency_version: LibVersion + file_name: str + product_name: ProductName + + def __init__(self, **data: Any): + super().__init__(**data) + self.file_name = self.url.split('/')[-1] + def update_url(self, new_url): + self.url = new_url + + def validate(self): + # Try GET on the URL and see if its valid/reachable + return requests.head(self.url).status_code == 200 + + @staticmethod + def internet_on(): + try: + return True if urlopen('https://www.google.com/', timeout=10) else False + except: + return False + + def download_url(self, save_path, name_print_prefix: str = '', keep_default_file_name=True): + if not UrlDependency.internet_on(): + print(f'Warning! It looks like there is no active internet connection on this machine') + print(f'Trying to continue but might run into problems...') + + if not self.validate(): + raise ValueError(f"Trying to download Invalid URL! 
{self.url}")
+        if keep_default_file_name:
+            self.file_name = self.url.split('/')[-1]
+        save_path = save_path + '/' + self.file_name
+
+        print(f'Downloading {name_print_prefix} {self.file_name}')
+        # Download the file from `url` and save it locally under `file_name`:
+        with urllib.request.urlopen(self.url) as response, open(save_path, 'wb') as out_file:
+            shutil.copyfileobj(response, out_file)
diff --git a/johnsnowlabs/settings.py b/johnsnowlabs/settings.py
new file mode 100644
index 0000000000..89c8cd6edb
--- /dev/null
+++ b/johnsnowlabs/settings.py
@@ -0,0 +1,88 @@
+from os.path import expanduser
+import os
+
+# libs, these versions are used for auto-installs and version checks
+raw_version_jsl_lib = '4.2.0'
+raw_version_nlp = '4.2.0'
+raw_version_medical = '4.2.0'
+raw_version_secret_medical = '4.2.0'
+raw_version_secret_ocr = '4.1.0'
+
+raw_version_ocr = '4.1.0'
+raw_version_nlu = '4.0.1rc4'
+raw_version_pyspark = '3.1.2'
+raw_version_nlp_display = '4.1'
+pypi_page = 'https://pypi.org/project/johnsnowlabs'
+
+json_indent = 4
+enforce_secret_on_version = False
+enforce_versions = True
+
+
+# Local paths for the jsl home folder
+
+## Directories
+def is_running_in_databricks():
+    """Check if the currently running Python process is running in Databricks.
+    If any environment variable name contains 'DATABRICKS' this returns True, otherwise False"""
+    for k in os.environ.keys():
+        if 'DATABRICKS' in k:
+            return True
+    return False
+
+
+on_databricks = is_running_in_databricks()
+
+if on_databricks:
+    try:
+        root_dir = '/dbfs/johnsnowlabs'
+        import logging
+        # `spark` is only defined inside Databricks notebooks; silence py4j logging if available
+        logger = spark._jvm.org.apache.log4j
+        logging.getLogger("py4j.java_gateway").setLevel(logging.ERROR)
+    except:
+        pass
+else:
+    root_dir = f'{expanduser("~")}/.johnsnowlabs'
+license_dir = f'{root_dir}/licenses'
+java_dir = f'{root_dir}/java_installs'
+py_dir = f'{root_dir}/py_installs'
+
+# Info Files
+root_info_file = f'{root_dir}/info.json'
+java_info_file = f'{java_dir}/info.json'
+py_info_file = f'{py_dir}/info.json'
+creds_info_file = f'{license_dir}/info.json'
+
+# databricks paths
+dbfs_home_dir = 'dbfs:/johnsnowlabs'
+dbfs_java_dir = f'{dbfs_home_dir}/java_installs'
+dbfs_py_dir = f'{dbfs_home_dir}/py_installs'
+db_py_jobs_dir = f'{dbfs_home_dir}/py_jobs'
+db_py_notebook_dir = f'{dbfs_home_dir}/py_notebook_jobs'
+db_jar_jobs_dir = f'{dbfs_home_dir}/jar_jobs'
+
+db_cluster_name = 'John-Snow-Labs-Databricks-Auto-Cluster🚀'
+db_driver_node_type = 'i3.xlarge'
+db_node_type_id = 'i3.xlarge'
+db_spark_version = '10.5.x-scala2.12'
+
+db_job_name = 'John-Snow-Labs-Job {job} 🚀'
+db_run_name = 'John-Snow-Labs-Run 🚀'
+
+# Local Spark mode
+spark_session_name = 'John-Snow-Labs-Spark-Session 🚀'
+
+#### Testing
+success_worker_print = '$$JSL_TESTING_WORKER_SUC$$'
+testing_dir = f'{root_dir}/tmp_tests'
+tmp_notebook_dir = f'{testing_dir}/notebook_tests'
+tmp_py_script_dir = f'{testing_dir}/notebook_tests'
+tmp_markdown_dir = f'{testing_dir}/markdown_tests'
+
+workshop_git = 'https://github.com/JohnSnowLabs/spark-nlp-workshop.git'
+workshop_local_folder = f'{tmp_notebook_dir}/spark-nlp-workshop'
+workshop_cert_nb_folder = f'{workshop_local_folder}/tutorials/Certification_Trainings'
+workshop_fin_folder = f'{workshop_cert_nb_folder}/Finance'
+workshop_leg_folder = f'{workshop_cert_nb_folder}/Legal'
+workshop_med_folder = f'{workshop_cert_nb_folder}/Healthcare'
+workshop_pub_folder = f'{workshop_cert_nb_folder}/Public'
diff --git a/johnsnowlabs/utils/__init__.py b/johnsnowlabs/utils/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
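For orientation, here is a minimal usage sketch tying together two of the building blocks above: `LibVersion` wildcard matching and the `JslSecrets` auto-resolution flow. It is illustrative only and not part of the diff; it assumes the johnsnowlabs package is installed and that a license can be found in one of the default locations.

from johnsnowlabs.py_models.lib_version import LibVersion
from johnsnowlabs.py_models.jsl_secrets import JslSecrets

# 'x' acts as a wildcard when comparing versions
assert LibVersion('3.1.x').equals(LibVersion('3.1.2'))
assert not LibVersion('3.2.0').equals(LibVersion('3.1.2'))

# Resolution order when no explicit credentials are passed: secrets file /
# colab and cwd json files -> environment variables -> ~/.johnsnowlabs;
# with browser_login=True a browser OAuth flow is the last resort
secrets = JslSecrets.build_or_try_find_secrets(browser_login=False)
if secrets:
    print('Found HC license:', bool(secrets.HC_LICENSE))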
diff --git a/johnsnowlabs/utils/enums.py b/johnsnowlabs/utils/enums.py
new file mode 100644
index 0000000000..51647ead25
--- /dev/null
+++ b/johnsnowlabs/utils/enums.py
@@ -0,0 +1,228 @@
+from dataclasses import dataclass
+from typing import Optional
+
+from johnsnowlabs import settings
+
+from johnsnowlabs.abstract_base.base_enum import BaseEnum
+from johnsnowlabs.py_models.primitive import LibVersionIdentifier
+from johnsnowlabs.py_models.lib_version import LibVersion
+
+
+class DatabricksClusterStates(BaseEnum):
+    # https://docs.databricks.com/dev-tools/api/latest/clusters.html#clusterclusterstate
+    PENDING = 'PENDING'
+    RUNNING = 'RUNNING'
+    RESTARTING = 'RESTARTING'
+    RESIZING = 'RESIZING'
+    TERMINATING = 'TERMINATING'
+    TERMINATED = 'TERMINATED'
+    ERROR = 'ERROR'
+    UNKNOWN = 'UNKNOWN'
+
+
+class JvmHardwareTarget(BaseEnum):
+    gpu = 'gpu'
+    cpu = 'cpu'
+    m1 = 'm1'
+    aarch = 'aarch'
+
+    @classmethod
+    def bool_choice_to_hardware(cls, gpu: bool = False, cpu: bool = False, m1: bool = False) -> 'JvmHardwareTarget':
+        if gpu:
+            return cls.gpu
+        elif cpu:
+            return cls.cpu
+        elif m1:
+            return cls.m1
+        else:
+            return cls.cpu
+
+    @staticmethod
+    def from_str(s):
+        if s not in JvmHardwareTarget:
+            bck = "\n"
+            raise Exception(f'Invalid value for jvm_install_type: {s},'
+                            f' please specify one of:\n{bck.join([n.value for n in JvmHardwareTarget])}')
+        else:
+            return JvmHardwareTarget(s)
+
+
+class PyInstallTypes(BaseEnum):
+    tar = 'tar.gz'
+    wheel = 'whl'
+
+    @staticmethod
+    def from_str(s):
+        bck = "\n"
+        if s not in PyInstallTypes:
+            raise Exception(f'Invalid value for py_install_type: {s},'
+                            f' please specify one of:\n{bck.join([n.value for n in PyInstallTypes])}')
+        else:
+            return PyInstallTypes(s)
+
+
+class SparkVersion(BaseEnum):
+    # Broad versions
+    spark3xx = LibVersion('3.x.x')
+    spark31x = LibVersion('3.1.x')
+    spark32x = LibVersion('3.2.x')
+    spark33x = LibVersion('3.3.x')
+    spark330 = LibVersion('3.3.0')
+    spark322 = LibVersion('3.2.2')
+    spark321 = LibVersion('3.2.1')
+    spark320 = LibVersion('3.2.0')
+    spark313 = LibVersion('3.1.3')
+    spark312 = LibVersion('3.1.2')
+    spark311 = LibVersion('3.1.1')
+    spark303 = LibVersion('3.0.3')
+    spark302 = LibVersion('3.0.2')
+    spark301 = LibVersion('3.0.1')
+    spark300 = LibVersion('3.0.0')
+
+
+class LicenseType(BaseEnum):
+    trial = 'Trial'
+    research = 'Research'
+
+
+class LicensePlattform(BaseEnum):
+    none = None
+    databricks = 'Databricks'
+    floating = 'Floating'
+
+
+class ProductName(BaseEnum):
+    hc = 'Spark-Healthcare'
+    nlp = 'Spark-NLP'
+    ocr = 'Spark-OCR'
+    finance = 'Spark-Finance'
+    nlp_display = 'NLP-Display'
+    nlu = 'nlu'
+    jsl_full = 'full'
+    pyspark = 'PySpark'
+    spark = 'spark'
+    java = 'java'
+    python = 'python'
+    jsl_lib = 'John Snow Labs Python Library'
+
+    @staticmethod
+    def from_str(s):
+        bck = "\n"
+        if s not in ProductName:
+            raise Exception(f'Invalid Product to install: {s},'
+                            f' please specify one of:\n{bck.join([n.value for n in ProductName])}')
+        else:
+            return ProductName(s)
+
+    @staticmethod
+    def from_jsl_api(s: str):
+        # Note: Spark NLP, Finance NLP and Legal NLP responses from the
+        # My-JSL API are all resolved to the Healthcare product here
+        if s == 'Visual NLP':
+            return ProductName.ocr
+        if s == 'Healthcare NLP':
+            return ProductName.hc
+        if s == 'Spark NLP':
+            return ProductName.hc
+        if s == 'Finance NLP':
+            return ProductName.hc
+        if s == 'Legal NLP':
+            return ProductName.hc
+        raise
ValueError(f'Unknown product name from jsl-api {s}') + + +class ProductLogo(BaseEnum): + hc = '💊' # 🏥 🩺 💊 ❤️ ‍🩹 ‍⚕️💉 🥼 🚑 🔬 🫀 🩻 🧪 + nlp = '🚀' + ocr = '🕶' # 👁️ 🤖 🦾🦿 🥽 👀 🕶 🥽 ⚕ + finance = '🤑' # 🤑🏦💲💳💰💸💵💴💶💷 + nlp_display = '🎨' + nlu = '🤖' + jsl_full = '💯🕴' # 🕴 + java = '🫘' # 🫘 # ☕ ♨ 🥃 🥃 🧋🍹♨️🥤🫖 + python = '🐍' # 🐉 + pyspark = '🐍+⚡' + spark = '⚡ ' + databricks = '🧱' + jsl_lib = '🧪' + + +class ProductSlogan(BaseEnum): + healthcare = 'Heal the planet with NLP!' + spark_nlp = 'State of the art NLP at scale' + ocr = 'Empower your NLP with a set of eyes' + pyspark = 'The big data Engine' + nlu = '1 line of code to conquer nlp!' + jsl_full = 'The entire John Snow Labs arsenal!' + finance = 'NLP for the Finance Industry' + nlp_display = 'Visualize and Explain NLP!' + jsl_lib = 'Easy access to all of John Snow Labs Enterprise Software!' + spark = '⚡' + java = '☕' + python = '🐍' + + +@dataclass +class InstalledProductInfo: + """Representation of a JSL product install. Version is None if not installed """ + product: ProductName + version: Optional[LibVersionIdentifier] = None + + +@dataclass +class JslSuiteStatus: + """Representation and install status of all JSL products and its dependencies. + Version attribute of InstalledProductInfo is None for uninstalled products + """ + spark_nlp_info: Optional[InstalledProductInfo] = None + spark_hc_info: Optional[InstalledProductInfo] = None + spark_ocr_info: Optional[InstalledProductInfo] = None + nlu_info: Optional[InstalledProductInfo] = None + sparknlp_display_info: Optional[InstalledProductInfo] = None + pyspark_info: Optional[InstalledProductInfo] = None + + +class LatestCompatibleProductVersion(BaseEnum): + jsl_lib = LibVersion(settings.raw_version_jsl_lib) + healthcare = LibVersion(settings.raw_version_medical) + spark_nlp = LibVersion(settings.raw_version_nlp) + ocr = LibVersion(settings.raw_version_ocr) + nlu = LibVersion(settings.raw_version_nlu) + nlp_display = LibVersion(settings.raw_version_nlp_display) + pyspark = LibVersion(settings.raw_version_pyspark) + finance = LibVersion('finance') + spark = LibVersion(settings.raw_version_pyspark) + java = LibVersion('java') + python = LibVersion('python') + + @staticmethod + def from_settings(p : ProductName): + if p == ProductName.hc: + return settings.raw_version_medical + if p == ProductName.ocr: + return settings.raw_version_ocr + if p == ProductName.nlp: + return settings.raw_version_nlp + + @staticmethod + def sct_from_settings(p : ProductName): + if p == ProductName.hc: + return settings.raw_version_secret_medical + if p == ProductName.ocr: + return settings.raw_version_secret_ocr + if p == ProductName.nlp: + return settings.raw_version_nlp diff --git a/johnsnowlabs/utils/env_utils.py b/johnsnowlabs/utils/env_utils.py new file mode 100644 index 0000000000..e1bacc51d8 --- /dev/null +++ b/johnsnowlabs/utils/env_utils.py @@ -0,0 +1,39 @@ +import importlib +import site +import os +import subprocess + + +def try_import(lib): + try: + importlib.reload(site) + globals()[lib] = importlib.import_module(lib) + importlib.import_module(lib) + except Exception as _: + # print(f'Failed to import {lib}') + return False + return True + + +def try_import_in_venv(lib, py_path): + c = f'{py_path} -c "import {lib}"' + try: + result = subprocess.check_output(c, shell=True, stderr=subprocess.STDOUT) + if result == b'': + return True + print('all good!') + else: + print(f'Looks like {lib} is missing \n{result}') + return False + except: + print(f'Looks like {lib} is missing') + return False + + +def 
is_running_in_databricks(): + """ Check if the currently running Python Process is running in Databricks or not + If any Environment Variable file_name contains 'DATABRICKS' this will return True, otherwise False""" + for k in os.environ.keys(): + if 'DATABRICKS' in k: + return True + return False diff --git a/johnsnowlabs/utils/file_utils.py b/johnsnowlabs/utils/file_utils.py new file mode 100644 index 0000000000..a0c09d8f1f --- /dev/null +++ b/johnsnowlabs/utils/file_utils.py @@ -0,0 +1,29 @@ +import json +from typing import Dict, Any + + +def dump_dataclass_to_json(data_class_instance, out_file_path, overwrite_if_exist: bool = True): + with open(out_file_path, 'w') as json_file: + json.dump(data_class_instance.__dict__, json_file, indent=4) + + +def json_path_as_dict(path) -> Dict[Any, Any]: + with open(path) as f: + return json.load(f) + + +def str_to_file(str_, path): + with open(path, "w") as text_file: + text_file.write(str_) + return path + + +def file_to_str(path): + with open(path, 'r') as file: + return file.read() + + +def path_tail(str_path): return str_path.split('/')[-1] + + +def path_head(str_path): return str_path.split('/')[0] diff --git a/johnsnowlabs/utils/functional.py b/johnsnowlabs/utils/functional.py new file mode 100644 index 0000000000..01914d86be --- /dev/null +++ b/johnsnowlabs/utils/functional.py @@ -0,0 +1,17 @@ +from typing import Callable, Dict + + +def extract_callee_args_as_kwargs(func: Callable) -> Dict[str, any]: + """ + Extract all parameters a function was called with during execution time as kwarg dict. + Call this from inside a function and this returns all parameters the function was given at run time. + Handy to reduce boilerplate on funcs with many args + :param func: The function to get kwargs from. A function should pass itself as parameter when calling this + :return: value kwargs of the func at the time of calling the extract_args_as_kwargs function + """ + import inspect + # Get the names of the functions parameters + param_names = list(map(lambda x: x.name, inspect.signature(func).parameters.values())) + # We go 2 frame up, one for the dict comprehension, and one for extract_args_as_kwargs func itself + kwargs = {n: inspect.getouterframes(inspect.currentframe())[2].frame.f_locals[n] for n in param_names} + return kwargs diff --git a/johnsnowlabs/utils/modelhub_markdown.py b/johnsnowlabs/utils/modelhub_markdown.py new file mode 100644 index 0000000000..1c67fdbafd --- /dev/null +++ b/johnsnowlabs/utils/modelhub_markdown.py @@ -0,0 +1,95 @@ +import os +from pathlib import Path + +import pandas as pd + +from johnsnowlabs import settings +from johnsnowlabs.utils.file_utils import path_tail +from johnsnowlabs.utils.py_process import execute_py_script_string_as_new_proc + +Path(settings.tmp_markdown_dir).mkdir(exist_ok=True, parents=True) + + +def get_py_snippet_from_modelhub(url): + import requests + from bs4 import BeautifulSoup + # get_py_snippet_from_modelhub('https://nlp.johnsnowlabs.com/2022/09/06/finclf_augmented_esg_en.html') + html_text = requests.get(url).text + soup = BeautifulSoup(html_text, 'html.parser') + python_code = soup.find_all("div", {"class": "language-python"})[0] + return python_code.getText() + + +def modelhub_md_to_pyscript(path): + start_s = '```python' + end_s = '```' + data = [] + started = False + with open(path, 'r') as f: + for l in f: + if start_s in l: + started = True + continue + if end_s in l: + return data + if started: + data.append(l) + return ['False'] + + +def 
get_all_py_scripts_in_md_folder(markdown_folder): + scripts = {} + for p in os.listdir(markdown_folder): + # print("TESTING", p) + if '.md' not in p: + continue + script = ''.join(modelhub_md_to_pyscript(f'{markdown_folder}/{p}')) + if script == 'False': + print("Badly Formatted Markdown File!", p) + continue + scripts[p] = script + return scripts + + +def run_modelhub_md_script(md_path_or_url): + if 'http' and '//' in md_path_or_url: + return execute_py_script_string_as_new_proc(''.join(get_py_snippet_from_modelhub(md_path_or_url)), + file_name=path_tail(md_path_or_url)) + return execute_py_script_string_as_new_proc( + ''.join(modelhub_md_to_pyscript(md_path_or_url)), file_name=path_tail(md_path_or_url)) + + +def test_folder_of_modelhub_md_files(markdown_folder): + results = [] + scripts = get_all_py_scripts_in_md_folder(markdown_folder) + total = len(scripts) + i = 0 + for file, script in scripts.items(): + print('#' * 10 + f'Testing {i}/{total} {file}' + '#' * 10) + i += 1 + results.append(execute_py_script_string_as_new_proc(script, file_name=file)) + return pd.DataFrame(results) + + +def test_markdown(file_path_or_url): + """ + ref can be URL, PATH, DIR, + """ + if not os.path.exists(file_path_or_url): + # Remote handling + if 'http' and '//' in file_path_or_url: + # URL + return run_modelhub_md_script(file_path_or_url) + if os.path.isdir(file_path_or_url): + # Folder + return test_folder_of_modelhub_md_files(file_path_or_url) + + if not os.path.isfile(file_path_or_url): + raise ValueError(f"""Invalid target, must either be: + 1. Path to local Notebook + 2. Path to local Notebook folder + 3. URL to Remote Notebook (Make sure to use RAW github URL) + 4. WORKSHOP, WORKSHOP-OS, WORKSHOP-MED, WORKSHOP-LEG, WORKSHOP-FIN + """) + # PATH + return run_modelhub_md_script(file_path_or_url) diff --git a/johnsnowlabs/utils/my_jsl_api.py b/johnsnowlabs/utils/my_jsl_api.py new file mode 100644 index 0000000000..58088f07bb --- /dev/null +++ b/johnsnowlabs/utils/my_jsl_api.py @@ -0,0 +1,325 @@ +from dataclasses import dataclass +from http.server import BaseHTTPRequestHandler, HTTPServer +import json +from typing import List, Dict +from urllib.request import Request, urlopen +import os +# imports related to get access token with PKCE Oauth +from urllib import parse +import string +import base64 +import random +import hashlib +import webbrowser + +from johnsnowlabs.utils.enums import ProductName + +MYJSL_ORIGIN = os.environ.get("MYJSL_ORIGIN", "https://my.johnsnowlabs.com") + +# save_path that license should be downloaded there +LICENSE_PATH = "downloaded-license.json" +# using urllib to avoid additional package dependencies like requests + + +class LibSecretResponse: + product: ProductName + version: str + secret: str + isLatest: bool + + def __init__(self, product: str, version: str, secret: str, isLatest: bool, ): + self.product = ProductName.from_jsl_api(product) + self.version = version + self.secret = secret + self.isLatest = isLatest + self.version_secret = secret.split('-')[0] + + +class LicenseResponse: + products: List[ProductName] + id: str + type: str + endDate: bool + platform: bool + + def __init__(self, products: List[Dict[str, str]], id: str, type: str, endDate: str, platform: str): + self.products = [ProductName.from_jsl_api(p['name']) for p in products] + self.id = id + self.type = type + self.endDate = endDate + self.platform = platform + + +# def pick_compatible_secrets() + + +def is_in_colab_notebook(): + try: + from IPython import get_ipython + return "google.colab" in 
str(get_ipython()) + except: + return False + + +def http_request(url, data=None, method="POST", is_json=True, access_token=None): + if data: + if is_json: + data = json.dumps(data).encode("utf-8") + else: + data = parse.urlencode(data).encode("utf-8") + request = Request(url, data=data, method=method) + if access_token: + request.add_header("Authorization", f"Bearer {access_token}") + if is_json: + request.add_header("Content-Type", "application/json") + else: + request.add_header("Content-type", "application/x-www-form-urlencoded") + response = urlopen(request) + status_code = response.getcode() + return ( + json.loads(response.read().decode("utf-8")) + if 200 <= status_code < 300 + else None + ) + + +def get_access_token(email, password): + """get access token (expires in 12h)""" + data = http_request( + MYJSL_ORIGIN + "/graphql", + data={ + "query": """mutation($input: LoginInput!) { + getAccessToken(input: $input) { + ok {token} + error { + errors { + key + message + } + } + } + }""", + "variables": {"input": {"email": email, "password": password}}, + }, + ) + if data["data"]["getAccessToken"]["error"]: + errors = "\n".join( + [ + error["message"] + for error in data["data"]["getAccessToken"]["error"]["errors"] + ] + ) + print(f"Cannot login. error={errors}") + exit(1) + access_token = data["data"]["getAccessToken"]["ok"]["token"] + return access_token + + +def get_user_lib_secrets(access_token): + secrets_query = '''query ReleasesQuery { + releases { + product + version + secret + isLatest + } +}''' + data = http_request( + f"{MYJSL_ORIGIN}/graphql", {"query": secrets_query}, access_token=access_token + ) + if data: + if "errors" in data: + raise Exception("Invalid or Expired token.") + return [LibSecretResponse(**r) for r in data['data']['releases']] + else: + raise Exception("Something went wrong...") + + +def get_user_licenses(access_token): + licenses_query = """query LicensesQuery { + licenses(isValid: true, platforms: ["Airgap", "Floating"]) { + edges { + node { + id + type + endDate + platform { + name + type + } + products { + name + } + } + } + } +} + """ + + data = http_request( + f"{MYJSL_ORIGIN}/graphql", {"query": licenses_query}, access_token=access_token + ) + if data: + if "errors" in data: + raise Exception("Invalid or Expired token.") + return [LicenseResponse(**s["node"]) for s in data["data"]["licenses"]["edges"]] + + else: + raise Exception("Something went wrong...") + + +def download_license(license: LicenseResponse, access_token): + print("Downloading license...") + data = http_request( + "{}/attachments/{}".format(MYJSL_ORIGIN, license.id), + method="GET", + access_token=access_token, + ) + if data: + print("Licenses extracted successfully") + return data + else: + raise Exception(f"Failed fetching license.") + + +def ensure_correct_choice(licenses_count): + license_id = input() + if license_id.isnumeric(): + index = int(license_id) - 1 + if licenses_count > index: + return index + else: + print(f"Please select value between 1 and {licenses_count}") + return ensure_correct_choice(licenses_count) + else: + print(f"Please select value between 1 and {licenses_count}") + return ensure_correct_choice(licenses_count) + + +def get_user_license_choice(licenses): + print("Please select the license to use.") + for idx, license in enumerate(licenses): + products = ",".join(s["file_name"] for s in license["products"]) + if license["platform"] is None: + scope = "Airgap" + else: + scope = license["platform"]["file_name"] + type = license["platform"]["type"] + if scope == 
"Floating": + if type: + scope = scope + "," + type.capitalize() + + print( + "{}. Libraries: {}\n License Type: {}\n Expiration Date: {}\n Scope: {}".format( + idx + 1, products, license["type"], license["endDate"], scope + ) + ) + + choice = ensure_correct_choice(len(licenses)) + return licenses[choice] + + +def open_authorized_url(url, in_colab=False): + if in_colab: + from IPython.display import display, Javascript + + display( + Javascript( + """ + var a = document.createElement("a"); + a.id="auth-btn" + a.setAttribute("target", "_blank"); + a.href="{{URL}}"; + a.style="padding:15px 20px;background-color:#0298d9;border-radius:7px;color:white;text-decoration:none;" + a.innerText="Click here to Authorize on My.Johnsnowlabs.com" + document.body.appendChild(a); + document.body.style = "text-align:center;padding-top:15px;" + a.click() + """.replace( + "{{URL}}", url + ) + ) + ) + else: + print("Please confirm authorization on :", url) + webbrowser.open_new_tab(url) + + +def get_access_key_from_browser(): + in_colab = is_in_colab_notebook() + client_id = "sI4MKSmLHOX2Pg7XhM3McJS2oyKG5PHcp0BlANEW" + + class OauthRequestHandler(BaseHTTPRequestHandler): + code = None + + def response(self, msg, code): + self.send_response(code) + self.end_headers() + self.wfile.write( + f"Johnsnowlabs" + f"
" + f"{msg}" + f"".encode("utf-8") + ) + + def do_GET(self): + global access_token + url_parts = parse.urlsplit(self.path) + if url_parts.path == "/login": + params = dict(parse.parse_qsl(url_parts.query)) + OauthRequestHandler.code = params.get("code") + if OauthRequestHandler.code: + self.response("Authorization successful!", 200) + else: + self.response("Authorization failed! please try again.", 400) + + verifier = "".join( + [random.choice(string.ascii_letters + string.digits) for _ in range(64)] + ) + hashed = hashlib.sha256(verifier.encode("utf-8")).digest() + challenge = base64.urlsafe_b64encode(hashed)[:-1].decode("utf-8") + if in_colab: + port = 8000 + from google.colab.output import eval_js + + redirect_uri = eval_js("google.colab.kernel.proxyPort(8000)") + "login" + else: + port = 0 + + with HTTPServer(("", port), OauthRequestHandler) as httpd: + if port == 0: + port = httpd.server_port + redirect_uri = f"http://localhost:{port}/login" + url = "{}/oauth/authorize/?{}".format( + MYJSL_ORIGIN, + parse.urlencode( + { + "client_id": client_id, + "response_type": "code", + "code_challenge_method": "S256", + "code_challenge": challenge, + "redirect_uri": redirect_uri, + } + ), + ) + open_authorized_url(url, in_colab) + httpd.handle_request() + if in_colab: + from IPython.display import display, Javascript + + display(Javascript("document.body.removeChild(a);")) + + if OauthRequestHandler.code: + data = http_request( + f"{MYJSL_ORIGIN}/oauth/token/", + data={ + "grant_type": "authorization_code", + "client_id": client_id, + "code_verifier": verifier, + "code": OauthRequestHandler.code, + "redirect_uri": redirect_uri, + }, + is_json=False, + ) + return data["access_token"] + return None diff --git a/johnsnowlabs/utils/notebooks.py b/johnsnowlabs/utils/notebooks.py new file mode 100644 index 0000000000..703117936f --- /dev/null +++ b/johnsnowlabs/utils/notebooks.py @@ -0,0 +1,209 @@ +import os +import shutil +import urllib +from pathlib import Path +from typing import Union, List + +import nbformat +import pandas as pd +from nbconvert import PythonExporter + +from johnsnowlabs import settings +from johnsnowlabs.utils.file_utils import str_to_file, path_tail +from johnsnowlabs.utils.py_process import str_to_file, execute_py_script_as_new_proc, log_multi_run_status + +Path(settings.tmp_notebook_dir, ).mkdir(exist_ok=True, parents=True) + + +def clean_workshop_notebook(py_script_path, suc_print=settings.success_worker_print, work_dir=os.getcwd(), + model_cache_dir=None): + out_path = f'{settings.tmp_notebook_dir}/{path_tail(py_script_path)}___CLEANED.py' + prefix = f""" +import os +os.chdir('{work_dir}') +from johnsnowlabs import * +""" + if model_cache_dir: + prefix = prefix + f""" +spark = jsl.start(model_cache_folder='{model_cache_dir}') + """ + else: + prefix = prefix + f""" +spark = jsl.start() + """ + + suffix = f""" +print('{suc_print}') +""" + + # Substring matches + bad_sub_strings = [ + 'files.upload()', + # 'get_ipython', + 'pip install', + 'pip -q', + 'from google', + 'google.', + 'colab', + 'jsl.install', + '#', + 'license_keys', + 'spark_ocr.json', + 'spark_jsl.json', + 'plt.', + # 'sparkocr.start', + # 'secret', + # 'nlp_version', + # 'nlp_secret', + # 'nlp_internal', + + ] + # Full file will be matched for those in the end and the will be removed + bad_regex_full_match = [ + r'spark = sparkocr.start\(.*?\)', + r'spark = sparknlp_jsl.start\(.*?\)' + ] + + import re + # Hard matches + bad_lines = [ # '\n', + 'jsl.install()', + ] + new_file = [] + with open(py_script_path, "r") 
as f: + for l in f: + if any(s in l for s in bad_sub_strings): continue + if '/content/' in l: l = l.replace('/content/', './') + if l in bad_lines: continue + # if 'get_ipython().system' in l: + # l = l.replace('get_ipython()', 'os') + # if 'get_ipython().run_line_magic' in l: + # continue + + new_file.append(l) + new_file = prefix + ''.join(new_file) + suffix + + matches_to_clean = [re.findall(r, new_file, re.DOTALL) for r in bad_regex_full_match] + + def flatten(l): + return [item for sublist in l for item in sublist] + + for m in flatten(matches_to_clean): new_file = new_file.replace(m, '') + # print(new_file) + str_to_file(new_file, out_path) + return out_path + + +def get_all_nb_in_local_folder(p): + ## filter all files ending in .ipynb + return [f'{p}/{f}' for f in os.listdir(p) if '.ipynb' in f] + + +def convert_notebook(notebookPath): + out_path = f'{settings.tmp_notebook_dir}/{path_tail(notebookPath)}.nb_converted.py' + with open(notebookPath) as fh: + nb = nbformat.reads(fh.read(), nbformat.NO_CONVERT) + exporter = PythonExporter() + source, meta = exporter.from_notebook_node(nb) + str_to_file(source, out_path) + return out_path + + +def convert_all_notebooks(nb_folder): + # Convert a folder which contains .ipynb into .py + store_folder = f'{nb_folder}/nb_converted/' + Path(store_folder).mkdir(parents=True, exist_ok=True) + for nb_path in get_all_nb_in_local_folder(nb_folder): + save_path = store_folder + nb_path.split('/')[-1] + '.py' + convert_notebook(nb_path, save_path) + + +def test_ipynb(file_path_or_url: Union[List[str], str], use_i_py=True, model_cache_dir=None): + """ + ref can be URL, PATH, DIR, + or ref= `WORKSHOP` or `WORKSHOP-OS` or `WORKSHOP-MED` , `WORKSHOP-LEG`, `WORKSHOP-FIN` + for testing a specific sub-folder of the workshop + """ + # TODO add GIT BRANCH?!?! 
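+    # Resolution order: remote URL download -> WORKSHOP-* shortcuts (clones the
+    # workshop repo once) -> local notebook folder -> local notebook file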
+ if not os.path.exists(file_path_or_url): + # Remote handling + if 'http' and '//' in file_path_or_url: + # URL + file_name = file_path_or_url.split('/')[-1] + print(f'Downloading {file_path_or_url} to {file_name}') + # Download the file from `url` and save it locally under `file_name`: + with urllib.request.urlopen(file_path_or_url) as response, open(file_name, 'wb') as out_file: + shutil.copyfileobj(response, out_file) + file_path_or_url = file_name + + elif 'WORKSHOP' in file_path_or_url: + # Workshop handing + if not os.path.exists(settings.workshop_cert_nb_folder): + # Clone repo + cur_dir = os.getcwd() + os.chdir(f'{settings.tmp_notebook_dir}') + os.system(f'git clone {settings.workshop_git}') + os.chdir(cur_dir) + if 'WORKSHOP-FIN' == file_path_or_url: + return test_ipynb_folder(settings.workshop_fin_folder, use_i_py=use_i_py, + model_cache_dir=model_cache_dir) + if 'WORKSHOP-LEG' == file_path_or_url: + return test_ipynb_folder(settings.workshop_leg_folder, use_i_py=use_i_py, + model_cache_dir=model_cache_dir) + if 'WORKSHOP-MED' == file_path_or_url: + return test_ipynb_folder(settings.workshop_med_folder, use_i_py=use_i_py, + model_cache_dir=model_cache_dir) + if 'WORKSHOP-PUB' == file_path_or_url: + return test_ipynb_folder(settings.workshop_pub_folder, use_i_py=use_i_py, + model_cache_dir=model_cache_dir) + + return pd.concat( + [test_ipynb_folder(settings.workshop_leg_folder, use_i_py=use_i_py, model_cache_dir=model_cache_dir), + test_ipynb_folder(settings.workshop_fin_folder, use_i_py=use_i_py, model_cache_dir=model_cache_dir), + test_ipynb_folder(settings.workshop_med_folder, use_i_py=use_i_py, model_cache_dir=model_cache_dir), + test_ipynb_folder(settings.workshop_pub_folder, use_i_py=use_i_py, model_cache_dir=model_cache_dir), ]) + + if os.path.isdir(file_path_or_url): + # Folder + return test_ipynb_folder(file_path_or_url, use_i_py=use_i_py, model_cache_dir=model_cache_dir) + + if not os.path.isfile(file_path_or_url): + raise ValueError(f"""Invalid target, must either be: + 1. Path to local Notebook + 2. Path to local Notebook folder + 3. URL to Remote Notebook (Make sure to use RAW github URL) + 4. 
WORKSHOP, WORKSHOP-PUB, WORKSHOP-MED, WORKSHOP-LEG, WORKSHOP-FIN + """) + # Local notebook path + nb_converted_path = convert_notebook(file_path_or_url) + final_py_script_path = clean_workshop_notebook(py_script_path=nb_converted_path, + model_cache_dir=model_cache_dir) + + succ, proc = execute_py_script_as_new_proc(py_script_path=final_py_script_path, use_i_py=use_i_py) + return make_log(file_path_or_url, succ, proc, final_py_script_path) + + +def test_ipynb_folder(nb_folder, work_dir=os.getcwd(), log=True, model_cache_dir=None, use_i_py=True): + return pd.DataFrame( + test_list_of_ipynb(get_all_nb_in_local_folder(nb_folder), work_dir, log, model_cache_dir=model_cache_dir, + use_i_py=use_i_py)) + + +def test_list_of_ipynb(nb_paths_or_urls, work_dir=os.getcwd(), log=True, model_cache_dir=None, use_i_py=True): + df = [] + for i, nb_path in enumerate(nb_paths_or_urls): + print(f'Testing {i}/{len(nb_paths_or_urls)} {nb_path}') + df.append(test_ipynb(file_path_or_url=nb_path, model_cache_dir=model_cache_dir, use_i_py=use_i_py)) + df = pd.DataFrame(df) + if log: + log_multi_run_status(df) + return df + + +def make_log(nb_file, suc, proc, final_py_script): + return { + 'notebook': nb_file, + 'success': suc, + 'stdout': proc.stdout.decode(), + 'stderr': proc.stderr.decode(), + 'test_script': final_py_script} diff --git a/johnsnowlabs/utils/pip_utils.py b/johnsnowlabs/utils/pip_utils.py new file mode 100644 index 0000000000..7b8bdeef9c --- /dev/null +++ b/johnsnowlabs/utils/pip_utils.py @@ -0,0 +1,171 @@ +import importlib +import site +import subprocess +from importlib import reload +from johnsnowlabs.py_models.lib_version import LibVersion +from johnsnowlabs.utils.venv_utils import VenvWrapper + +reload(site) +import os +from typing import Optional + +from johnsnowlabs.utils.enums import LatestCompatibleProductVersion, PyInstallTypes +from johnsnowlabs.py_models.primitive import LibVersionIdentifier +from johnsnowlabs.py_models.jsl_secrets import JslSecrets + +import json +import sys +from urllib import request +from pkg_resources import parse_version + + +def get_all_lib_version_on_pypi(pkg_name): + url = f'https://pypi.python.org/pypi/{pkg_name}/json' + releases = json.loads(request.urlopen(url).read())['releases'] + return sorted(releases, key=parse_version, reverse=True) + + +def get_latest_lib_version_on_pypi(pkg_name): + return get_all_lib_version_on_pypi(pkg_name)[0] + + +def get_pip_lib_version(lib: str, py_exec: str = sys.executable): + # Get the version of a library according to pip + r = subprocess.run([py_exec, '-m', 'pip', 'list'], capture_output=True, text=True) + matches = list(filter(lambda x: x.split(' ')[0] == lib, r.stdout.split('\n'))) + if not matches: + return False  # raise ValueError(f'Could not find lib {lib}') + else: + return LibVersion(matches[0].split(' ')[-1]) + + +def uninstall_lib(pip_package_name, py_path=sys.executable): + cmd = f'{py_path} -m pip uninstall {pip_package_name} -y ' + os.system(cmd) + reload(site) + + +def install_standard_pypi_lib(pypi_name: str, + module_name: Optional[str] = None, + python_path: str = sys.executable, + upgrade: bool = True, + re_install: bool = False, + version: Optional[str] = None, + download_folder: Optional[str] = None, + include_dependencies: bool = True + ): + """ + Install module via pypi. 
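+ An illustrative call (package/module names below are examples, not defaults): + `install_standard_pypi_lib('spark-nlp', module_name='sparknlp')` imports sparknlp into globals on success.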
+ It runs a command of the form: + `python -m pip install [pypi_name]` + `python -m pip install [pypi_name] --upgrade` + :param re_install: add the --force-reinstall flag to the pip command + :param version: specific version to install; defaults to the latest if None + :param pypi_name: name of the PyPI package or path to a local whl + :param module_name: if defined, the module is imported into globals, making it available to the running process + :param python_path: which Python to use for installing. Defaults to the Python calling this method. + :param upgrade: use the --upgrade flag or not + :param download_folder: if set, download the package to this folder instead of installing it + :param include_dependencies: whether dependencies are installed/downloaded as well + :return: True if the install (and import check) succeeded, False otherwise + """ + if not pypi_name: + raise Exception(f'Tried to install software which has no PyPI name! Aborting.') + print(f'Installing {pypi_name} to {python_path}') + if version and hasattr(version, 'as_str'): + version = version.as_str() + c = f'{python_path} -m pip install {pypi_name}' + if version: + c = c + f'=={version} ' + else: + c = c + ' ' + + if upgrade: + c = c + '--upgrade ' + if re_install: + c = c + ' --force-reinstall' + + if download_folder: + if version: + c = f'{python_path} -m pip download {pypi_name}=={version} -d {download_folder}' + else: + c = f'{python_path} -m pip download {pypi_name} -d {download_folder}' + + if not include_dependencies: + c = c + ' --no-deps' + + print(f'Running: {c}') + os.system(c) + if module_name and not download_folder: + try: + # See if the install worked by importing the module + reload(site) + globals()[module_name] = importlib.import_module(module_name) + except ImportError as err: + print(f'Failure installing {pypi_name}: {err}') + return False + return True + + +def install_licensed_pypi_lib(secrets: JslSecrets, + pypi_name, + module_name, + product: 'AbstractSoftwareProduct', + spark_version: LibVersionIdentifier = LatestCompatibleProductVersion.pyspark.value, + upgrade=True, + py_path: str = sys.executable, + download_folder: Optional[str] = None, + include_dependencies: bool = True + + ): + """ Install a licensed John Snow Labs PyPI package (e.g. Spark-NLP for Healthcare or Spark-OCR) into the target python executable. + This just requires the secret of the library. 
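+ Illustrative call (assumes a populated JslSecrets object; Software.spark_hc names the healthcare product): + `install_licensed_pypi_lib(secrets, 'spark-nlp-jsl', 'sparknlp_jsl', Software.spark_hc)`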
+ """ + get_deps = True + missmatch = False + if 'spark-nlp-jsl' in pypi_name or 'internal_with_finleg' in pypi_name: + if not secrets.HC_SECRET: + return False + module_name = 'sparknlp_jsl' + secret = secrets.HC_SECRET + # get_deps = True + elif 'ocr' in pypi_name: + if not secrets.OCR_SECRET: + return False + secret = secrets.OCR_SECRET + module_name = 'sparkocr' + # get_deps = True + + else: + raise ValueError(f'Invalid install licensed install target ={pypi_name}') + + try: + url = product.jsl_url_resolver.get_py_urls(secret=secret, + spark_version_to_match=spark_version, + install_type=PyInstallTypes.wheel).url + cmd = f'{py_path} -m pip install {url}' + + # Install lib + if upgrade: + cmd = cmd + ' --force-reinstall' + # cmd = f'{sys.executable} -m pip install {pypi_name}=={lib_version} --extra-index-url https://pypi.johnsnowlabs.com/{secret}' + + if download_folder: + cmd = f'{py_path} -m pip download {pypi_name} -d {download_folder}' + + if not include_dependencies: + cmd = cmd + ' --no-deps' + + print(f'Running "{cmd.replace(secret, "[LIB_SECRET]")}"') + os.system(cmd) + + # Check if Install succeeded + if py_path == sys.executable: + # Check for python executable that is currently running + reload(site) + globals()[module_name] = importlib.import_module(module_name) + else: + # Check for python executable which is on this machine but not the same as the running one + return VenvWrapper.is_lib_in_py_exec(py_path, module_name, False) + + except Exception as err: + print('Failure to install', err) + return False + return True diff --git a/johnsnowlabs/utils/print_messages.py b/johnsnowlabs/utils/print_messages.py new file mode 100644 index 0000000000..6d078b05a4 --- /dev/null +++ b/johnsnowlabs/utils/print_messages.py @@ -0,0 +1,20 @@ +from typing import Union + +from colorama import Fore +from johnsnowlabs.auto_install.softwares import AbstractSoftwareProduct + + +def log_outdated_lib(product: AbstractSoftwareProduct, installed_version): + print(Fore.LIGHTRED_EX + + f'🚨 Your {product.name} is outdated, installed=={installed_version} but latest version=={product.latest_version.as_str()}') + + print(f'You can run {Fore.LIGHTGREEN_EX} jsl.install() {Fore.RESET}to update {product.name}') + + +def log_broken_lib(product: Union[AbstractSoftwareProduct, str]): + if hasattr(product, 'name'): + product = product.name + print(Fore.LIGHTRED_EX + + f'🚨 {product} installation seems broken{Fore.RESET}, there was an exception while importing it. It will not be available on the jsl.xx module') + print( + f'You can run {Fore.LIGHTGREEN_EX} jsl.install(refresh_install=True, force_browser=True) {Fore.RESET} to re-install latest version. 
') diff --git a/johnsnowlabs/utils/py_process.py b/johnsnowlabs/utils/py_process.py new file mode 100644 index 0000000000..41fe0ee063 --- /dev/null +++ b/johnsnowlabs/utils/py_process.py @@ -0,0 +1,116 @@ +import sys +from pathlib import Path +from typing import List, Callable +import subprocess +import colorama +from johnsnowlabs import settings +from johnsnowlabs.utils.file_utils import str_to_file +import pandas as pd +Path(settings.tmp_markdown_dir).mkdir(exist_ok=True, parents=True) + +def run_cmd_and_check_succ(args: List[str], log=True, suc_print=settings.success_worker_print, + return_pipes=False): + print(f'👷 Executing {colorama.Fore.LIGHTGREEN_EX}{args}{colorama.Fore.RESET}') + r = subprocess.run(args, capture_output=True) + was_suc = process_was_suc(r, suc_print=suc_print) + if was_suc: + print(f'{colorama.Fore.LIGHTGREEN_EX}✅ Success running {args}{colorama.Fore.RESET}') + else: + print(f'{colorama.Fore.LIGHTRED_EX}❌ Failure running {args}{colorama.Fore.RESET}') + if log: + log_process(r) + if return_pipes: + return was_suc, r + return was_suc + + +def process_was_suc(result: subprocess.CompletedProcess, suc_print=settings.success_worker_print) -> bool: + return suc_print in result.stdout.decode() + + +def log_process(result: subprocess.CompletedProcess): + print("______________STDOUT:") + print(result.stdout.decode()) + print("______________STDERR:") + print(result.stderr.decode()) + + +# def execute_slave_test(py_cmd): +# prefix = 'from johnsnowlabs import * \n' +# postfix = f"\neval_class('{py_cmd}') \n" +# script_file_name = 'test_script.py' +# script = inspect.getsource(eval_class) +# script = f'{prefix}{script}{postfix}' +# print(script) +# str_to_file(script, script_file_name) +# return run_cmd_and_check_succ(['python', script_file_name]) +# + +def execute_function_as_new_proc(function: Callable, suc_print=settings.success_worker_print): + pass + + +def execute_py_script_string_as_new_proc(py_script, + suc_print=settings.success_worker_print, + py_exec_path=sys.executable, + log=True, + file_name=None,  # Optional metadata + use_i_py=False, + ): + if file_name: + out_path = f'{settings.tmp_markdown_dir}/{file_name}_MD_TEST.py' + else: + out_path = 'tmp.py' + + prefix = """ +from johnsnowlabs import * +spark = jsl.start() +""" + + suffix = f""" +print('{suc_print}') + +""" + + str_to_file(prefix + py_script + suffix, out_path) + suc, proc = execute_py_script_as_new_proc(out_path, suc_print=suc_print, py_exec_path=py_exec_path, log=log, use_i_py=use_i_py) + return make_modelhub_snippet_log(file_name, suc, proc) + + +def execute_py_script_as_new_proc(py_script_path: str, + suc_print=settings.success_worker_print, + py_exec_path=sys.executable, + log=True, + use_i_py=True, + ): + # requires ipython installed + if use_i_py: + cmd_args = [py_exec_path, '-m', 'IPython', py_script_path] + else: + cmd_args = [py_exec_path, py_script_path] + return run_cmd_and_check_succ(cmd_args, log=log, suc_print=suc_print, return_pipes=True) + + +def log_multi_run_status(run_df): + print('#' * 10 + "RUN RESULTS" + "#" * 10) + for idx, row in run_df[run_df.success == False].iterrows(): + print(f'Result for Notebook {row.notebook} {"#" * 25}') + print(row.stdout) + + +def make_modelhub_snippet_log(md_file, suc, proc): + return { + 'md_file': md_file, + 'success': suc, + 'stdout': proc.stdout.decode(), + 'stderr': proc.stderr.decode(), } + + +# def test_list_of_py_script_path(py_sc) +def test_list_of_py_script_strings(py_script_paths, use_i_py=False): + total = len(py_script_paths) + df = [] + for i, p in enumerate(py_script_paths): + 
print(f'Testing {i}/{total}') + df.append(execute_py_script_string_as_new_proc(p, file_name=f'{i}_TEST.py', use_i_py=use_i_py)) + return pd.DataFrame(df) diff --git a/johnsnowlabs/utils/sparksession_utils.py b/johnsnowlabs/utils/sparksession_utils.py new file mode 100644 index 0000000000..9933b09077 --- /dev/null +++ b/johnsnowlabs/utils/sparksession_utils.py @@ -0,0 +1,190 @@ +import time +import os +from typing import Optional, List, Dict + +from johnsnowlabs import settings +from johnsnowlabs.auto_install.jsl_home import get_install_suite_from_jsl_home +from johnsnowlabs.auto_install.softwares import Software +from johnsnowlabs.utils.enums import JvmHardwareTarget +from johnsnowlabs.py_models.install_info import InstallSuite + + +def authenticate_enviroment_HC(suite: InstallSuite): + """Set secret environment variables for the Spark Context""" + if suite.secrets.HC_LICENSE: + os.environ['SPARK_NLP_LICENSE'] = suite.secrets.HC_LICENSE + os.environ['AWS_ACCESS_KEY_ID'] = suite.secrets.AWS_ACCESS_KEY_ID + os.environ['AWS_SECRET_ACCESS_KEY'] = suite.secrets.AWS_SECRET_ACCESS_KEY + + +def authenticate_enviroment_OCR(suite: InstallSuite): + """Set secret environment variables for the Spark Context""" + if suite.secrets.OCR_LICENSE: + os.environ['SPARK_NLP_LICENSE'] = suite.secrets.OCR_LICENSE + os.environ['AWS_ACCESS_KEY_ID'] = suite.secrets.AWS_ACCESS_KEY_ID + os.environ['AWS_SECRET_ACCESS_KEY'] = suite.secrets.AWS_SECRET_ACCESS_KEY + + +def authenticate_enviroment_HC_and_OCR(suite: InstallSuite): + """Set secret environment variables for the Spark Context""" + authenticate_enviroment_HC(suite) + authenticate_enviroment_OCR(suite) + + +def retry(fun, max_tries=10): + for i in range(max_tries): + try: + time.sleep(0.3) + fun() + break + except Exception: + continue + + +def start( + # -- JSL-Auth Flows -- + # Browser Auth + browser_login: bool = False, + # JWT Token Auth + access_token: Optional[str] = None, + # JSON file Auth + json_license_path: Optional[str] = None, + # AWS Auth + aws_access_key: Optional[str] = None, + aws_key_id: Optional[str] = None, + # Manual License specification Auth + enterprise_nlp_secret: Optional[str] = None, + ocr_secret: Optional[str] = None, + hc_license: Optional[str] = None, + ocr_license: Optional[str] = None, + fin_license: Optional[str] = None, + leg_license: Optional[str] = None, + # License usage & Caching + remote_license_number: int = 0, + local_license_number: int = 0, + store_in_jsl_home: bool = True, + + # -- Spark Session Configs -- + spark_conf: Optional[Dict[str, str]] = None, + master_url: str = 'local[*]', + jar_paths: List[str] = None, + exclude_nlp: bool = False, + exclude_healthcare: bool = False, + exclude_ocr: bool = False, + hardware_target: str = JvmHardwareTarget.cpu.value, + model_cache_folder: str = None, + +) -> 'pyspark.sql.SparkSession': + from pyspark.sql import SparkSession + + already_launched = False + if '_instantiatedSession' in dir(SparkSession) and SparkSession._instantiatedSession is not None: + print('Spark Session already created, some configs may not take.') + already_launched = True + if settings.on_databricks: + print("Looks like you are on Databricks. 
A SparkSession is launched by Databricks automatically, so jsl.start() will not create a new one.") + + from johnsnowlabs.auto_install.lib_resolvers import OcrLibResolver, HcLibResolver, NlpLibResolver + launched_products: List[str] = [] + hardware_target = JvmHardwareTarget.from_str(hardware_target) + + # Get all local jar paths, downloading them if missing + suite = get_install_suite_from_jsl_home(only_jars=True, jvm_hardware_target=hardware_target, + force_browser=browser_login, + browser_login=browser_login, + access_token=access_token, + local_license_number=local_license_number, + remote_license_number=remote_license_number, + secrets_file=json_license_path, + hc_license=hc_license, + hc_secret=enterprise_nlp_secret, + ocr_secret=ocr_secret, + ocr_license=ocr_license, + aws_access_key=aws_access_key, + aws_key_id=aws_key_id, + fin_license=fin_license, + leg_license=leg_license, + store_in_jsl_home=store_in_jsl_home) + + # Collect all local jar paths we have access to for the SparkSession + jars = [] + if not exclude_nlp and Software.spark_nlp.check_installed(None) \ + and suite.nlp.get_java_path(): + jars.append(suite.nlp.get_java_path()) + import sparknlp + launched_products.append(f'{Software.spark_nlp.logo}{Software.spark_nlp.name}=={sparknlp.version()}') + + if suite.secrets: + if suite.hc and not exclude_healthcare and Software.spark_hc.check_installed(None) and suite.hc.get_java_path(): + jars.append(suite.hc.get_java_path()) + authenticate_enviroment_HC(suite) + import sparknlp_jsl + launched_products.append(f'{Software.spark_hc.logo}{Software.spark_hc.name}=={sparknlp_jsl.version()}') + + if suite.ocr and not exclude_ocr and Software.spark_ocr.check_installed(None) and suite.ocr.get_java_path(): + jars.append(suite.ocr.get_java_path()) + authenticate_enviroment_OCR(suite) + import sparkocr + launched_products.append(f'{Software.spark_ocr.logo}{Software.spark_ocr.name}=={sparkocr.version()}') + import pyspark + launched_products.append(f'running on {Software.spark.logo}{Software.pyspark.name}=={pyspark.version.__version__}') + + builder = SparkSession.builder \ + .appName(f'{settings.spark_session_name} with Jars for: {", ".join(launched_products)}') \ + .master(master_url) + + if jar_paths: + # Add user specified Jars + jars += jar_paths + default_conf = {"spark.driver.memory": "16G", + "spark.serializer": "org.apache.spark.serializer.KryoSerializer", + "spark.kryoserializer.buffer.max": "2000M", + 'spark.driver.maxResultSize': '2000M', + 'spark.jars': ','.join(jars), } + + if suite.ocr and suite.ocr.get_java_path(): + # is_spark_version_env('32') + default_conf["spark.sql.optimizer.expression.nestedPruning.enabled"] = "false" + default_conf["spark.sql.optimizer.nestedSchemaPruning.enabled"] = "false" + default_conf["spark.sql.legacy.allowUntypedScalaUDF"] = "true" + default_conf["spark.sql.repl.eagerEval.enabled"] = "true" + + for k, v in default_conf.items(): + builder.config(str(k), str(v)) + + if model_cache_folder: + if not spark_conf: + spark_conf = {} + spark_conf['spark.jsl.settings.pretrained.cache_folder'] = model_cache_folder + + if spark_conf: + for k, v in spark_conf.items(): + builder.config(str(k), str(v)) + spark = builder.getOrCreate() + + if suite.hc and exist_in_jvm('com.johnsnowlabs.util.start.registerListenerAndStartRefresh'): + spark._jvm.com.johnsnowlabs.util.start.registerListenerAndStartRefresh() + if suite.ocr and exist_in_jvm('com.johnsnowlabs.util.OcrStart.registerListenerAndStartRefresh'): + retry(spark._jvm.com.johnsnowlabs.util.OcrStart.registerListenerAndStartRefresh) + + from 
colorama import Fore + if not already_launched: + print( + f'👌 Launched {Fore.LIGHTGREEN_EX + hardware_target.value}-Optimized JVM{Fore.RESET} SparkSession with Jars for: {", ".join(launched_products)}') + + return spark + + +def exist_in_jvm(java_class): + from pyspark import SparkContext + from pyspark.ml.util import _jvm + from py4j.java_gateway import UserHelpAutoCompletion + java_obj = _jvm() + for name in java_class.split("."): + # Bug in Py4J: even if the class path does not exist, the JVM responds with proto.SUCCESS_PACKAGE + # instead of raising an exception + java_obj = getattr(java_obj, name) + + if UserHelpAutoCompletion.KEY in dir(java_obj): + return False + return True diff --git a/johnsnowlabs/utils/venv_utils.py b/johnsnowlabs/utils/venv_utils.py new file mode 100644 index 0000000000..cc36a4d03f --- /dev/null +++ b/johnsnowlabs/utils/venv_utils.py @@ -0,0 +1,105 @@ +import subprocess +import os +import glob +from dataclasses import dataclass + + +def process_was_suc(result: subprocess.CompletedProcess) -> bool: + if result.stderr: + return False + return True + + +def log_process(result: subprocess.CompletedProcess): + print("______________STDOUT:") + print(result.stdout.decode()) + print("______________STDERR:") + print(result.stderr.decode()) + +# +# @dataclass +# class VenvState: +# installed_libs: list[str] +# py_version: str + + +class VenvWrapper: + """ + Utils to install into a Python executable which is not the currently running one. + I.e. whenever you want to install into a local Python executable != sys.executable, use this. + """ + + @staticmethod + def create_venv(venv_target_dir, ensure_pip=True, log=False): + # Create a venv using the current sys.executable + import venv + venv.create(venv_target_dir) + if ensure_pip: + VenvWrapper.install_pip_if_missing(venv_target_dir) + return True + + + @staticmethod + def glob_py_exec_from_venv(venv_dir, raise_except=True): + py_exec_path = glob.glob(f'{venv_dir}/bin/*python*') + if py_exec_path: + return py_exec_path[0] + if raise_except: + raise Exception(f"Could not find a Python executable in venv dir = {venv_dir}. " + f"Please specify the correct path manually") + else: + return False + + @staticmethod + def is_pip_in_venv(venv_py_exec_path, log=False): + r = subprocess.run([venv_py_exec_path, '-m', 'pip'], capture_output=True) + if log: + log_process(r) + return process_was_suc(r) + + @staticmethod + def install_pip_in_venv(venv_py_exec_path, log=False): + if not os.path.exists('get-pip.py'): + pip_url = 'https://bootstrap.pypa.io/get-pip.py' + os.system(f'
wget {pip_url}') + + r = subprocess.run([venv_py_exec_path, 'get-pip.py'], capture_output=True) + if log: + log_process(r) + return process_was_suc(r) + + @staticmethod + def install_pip_if_missing(venv_target_dir: str): + venv_py_exec_path = VenvWrapper.glob_py_exec_from_venv(venv_target_dir, raise_except=True) + if not VenvWrapper.is_pip_in_venv(venv_py_exec_path): + if not VenvWrapper.install_pip_in_venv(venv_py_exec_path): + raise Exception( + f'Could not find or setup pip in venv at {venv_target_dir} using python executable {venv_py_exec_path}') + + @staticmethod + def install_to_venv(venv_target_dir, pypi_name, log=False): + venv_py_exec_path = VenvWrapper.glob_py_exec_from_venv(venv_target_dir, raise_except=True) + r = subprocess.run([venv_py_exec_path, '-m', 'pip', 'install', pypi_name], capture_output=True) + if log: + log_process(r) + return process_was_suc(r) + + @staticmethod + def uninstall_from_venv(venv_target_dir, pypi_name, log=False): + venv_py_exec_path = VenvWrapper.glob_py_exec_from_venv(venv_target_dir, raise_except=True) + r = subprocess.run([venv_py_exec_path, '-m', 'pip', 'uninstall', pypi_name, '-y'], capture_output=True) + if log: + log_process(r) + return process_was_suc(r) + + @staticmethod + def is_lib_in_venv(venv_target_dir, module_name, log=False): + venv_py_exec_path = VenvWrapper.glob_py_exec_from_venv(venv_target_dir, raise_except=True) + return VenvWrapper.is_lib_in_py_exec(venv_py_exec_path, module_name, log) + + @staticmethod + def is_lib_in_py_exec(venv_py_exec_path, module_name, log=False): + r = subprocess.run([venv_py_exec_path, '-c', f'import {module_name}'], capture_output=True) + if log: + log_process(r) + return process_was_suc(r) diff --git a/johnsnowlabs/viz.py b/johnsnowlabs/viz.py new file mode 100644 index 0000000000..3dfa73f402 --- /dev/null +++ b/johnsnowlabs/viz.py @@ -0,0 +1,10 @@ +from johnsnowlabs.abstract_base.lib_resolver import try_import_lib + +if try_import_lib('sparknlp_display', True): + from sparknlp_display import DependencyParserVisualizer + from sparknlp_display import NerVisualizer + from sparknlp_display import EntityResolverVisualizer + from sparknlp_display import RelationExtractionVisualizer + from sparknlp_display import AssertionVisualizer +else: + pass diff --git a/setup.py b/setup.py new file mode 100644 index 0000000000..350dc556cd --- /dev/null +++ b/setup.py @@ -0,0 +1,50 @@ +from setuptools import setup, find_packages +from codecs import open +from os import path +import johnsnowlabs.settings +here = path.abspath(path.dirname(__file__)) + +with open(path.join(here, 'README.md'), encoding='utf-8') as f: + long_description = f.read() + +REQUIRED_PKGS = [ + f'pyspark=={johnsnowlabs.settings.raw_version_pyspark}', + f'spark-nlp=={johnsnowlabs.settings.raw_version_nlp}', + f'nlu=={johnsnowlabs.settings.raw_version_nlu}', + f'spark-nlp-display=={johnsnowlabs.settings.raw_version_nlp_display}', + 'numpy', + 'dataclasses', + 'requests', + 'databricks-api', + 'pydantic', + 'colorama' +] + +setup( + version=johnsnowlabs.settings.raw_version_jsl_lib, + # name='johnsnowlabs_for_databricks', + name='johnsnowlabs', + description='The John Snow Labs Library gives you access to all of John Snow Labs Enterprise And Open Source products in an easy and simple manner. Access 10000+ state-of-the-art NLP and OCR models for ' + 'Finance, Legal and Medical domains. 
Easily scalable to Spark Cluster ', + long_description=long_description, + install_requires=REQUIRED_PKGS, + long_description_content_type='text/markdown', + url='https://www.johnsnowlabs.com/', + author='John Snow Labs', + author_email='christian@johnsnowlabs.com', + classifiers=[ + 'Development Status :: 5 - Production/Stable', + 'Intended Audience :: Developers', + 'Topic :: Software Development :: Build Tools', + 'License :: OSI Approved :: Apache Software License', + # Specify the Python versions you support here. In particular, ensure + # that you indicate whether you support Python 2, Python 3 or both. + 'Programming Language :: Python :: 3', + 'Programming Language :: Python :: 3.4', + 'Programming Language :: Python :: 3.5', + 'Programming Language :: Python :: 3.6', + ], + keywords='Spark NLP OCR Finance Legal Medical John Snow Labs ', + packages=find_packages(exclude=['test*', 'tmp*']), # exclude=['test'] + include_package_data=True +) diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/auto_install.py b/tests/auto_install.py new file mode 100644 index 0000000000..620788b610 --- /dev/null +++ b/tests/auto_install.py @@ -0,0 +1,166 @@ +import unittest +import os +from johnsnowlabs.auto_install.jsl_home import get_install_suite_from_jsl_home +from johnsnowlabs.utils.print_messages import log_outdated_lib +from johnsnowlabs import * +import sys +import tests.utils.secrets as sct +from johnsnowlabs.auto_install.databricks.install_utils import * +from johnsnowlabs.utils.venv_utils import VenvWrapper + + +class AutoInstallTestCase(unittest.TestCase): + venv_creation_dir = '/home/ckl/old_home/ckl/Documents/freelance/johnsnowlabs_lib/tmp/venv/tmp_test_venv' + zip_dir = '/home/ckl/Documents/freelance/jsl/johnsnowlabs/tmp/offline' + + def test_quick_bad(self): + jsl.settings.enforce_versions = False + jsl.install(enterprise_nlp_secret=sct.enterprise_nlp_sct) + + def test_install_to_databricks_creating_new_cluster(self): + cluster_id = jsl.install(json_license_path=sct.db_lic, databricks_host=sct.ckl_host, + databricks_token=sct.ckl_token) + + def test_install_to_databricks_existing_cluster(self): + # TODO WIP + cluster_id = '1006-022913-lb94q2m0' + jsl.install(json_license_path=sct.db_lic, databricks_host=sct.ckl_host, databricks_cluster_id=cluster_id) + db = get_db_client_for_token(sct.ckl_host, sct.ckl_token) + # install_py_lib_via_pip(db, cluster_id, 'nlu') + + def test_install_to_current_env_browser_pop_up(self): + jsl.install(force_browser=True, local_license_number=0) + import sparknlp + import sparknlp_jsl + import sparkocr + import nlu + import sparknlp_display + + def test_install_to_current_env(self): + settings.enforce_versions = False + jsl.install(json_license_path=sct.old_lic, refresh_install=True) + # import sparknlp + import sparknlp_jsl + import sparkocr + import nlu + import sparknlp_display + + def test_install_to_different_python_env(self): + # Install to env which is not the one we are currently running + os.system(f'rm -r {self.venv_creation_dir} ') + f = '/home/ckl/old_home/ckl/Documents/freelance/johnsnowlabs_lib/tmp/licenses/ocr_40.json' + VenvWrapper.create_venv(self.venv_creation_dir) + py_path = VenvWrapper.glob_py_exec_from_venv(self.venv_creation_dir) + jsl.install(json_license_path=f, python_exec_path=py_path) + self.assertTrue(VenvWrapper.is_lib_in_venv(self.venv_creation_dir, 'sparknlp')) + self.assertTrue(VenvWrapper.is_lib_in_venv(self.venv_creation_dir, 'sparkocr')) + 
self.assertTrue(VenvWrapper.is_lib_in_venv(self.venv_creation_dir, 'sparknlp_display')) + self.assertTrue(VenvWrapper.is_lib_in_venv(self.venv_creation_dir, 'nlu')) + self.assertTrue(VenvWrapper.is_lib_in_venv(self.venv_creation_dir, 'internal_with_finleg'))  # ---> sparknlp_jsl + self.assertTrue(VenvWrapper.is_lib_in_venv(self.venv_creation_dir, 'jsl_tmp'))  # --> johnsnowlabs + os.system(f'rm -r {self.venv_creation_dir} ') + + def test_create_fresh_venv_and_install_to_it(self): + # let jsl-lib create a fresh venv for us + os.system(f'rm -r {self.venv_creation_dir} ') + f = '/home/ckl/old_home/ckl/Documents/freelance/johnsnowlabs_lib/tmp/licenses/ocr_40.json' + jsl.install(json_license_path=f, venv_creation_path=self.venv_creation_dir) + self.assertTrue(VenvWrapper.is_lib_in_venv(self.venv_creation_dir, 'sparknlp')) + self.assertTrue(VenvWrapper.is_lib_in_venv(self.venv_creation_dir, 'sparkocr')) + self.assertTrue(VenvWrapper.is_lib_in_venv(self.venv_creation_dir, 'sparknlp_display')) + self.assertTrue(VenvWrapper.is_lib_in_venv(self.venv_creation_dir, 'nlu')) + self.assertTrue(VenvWrapper.is_lib_in_venv(self.venv_creation_dir, 'sparknlp_jsl')) + self.assertTrue(VenvWrapper.is_lib_in_venv(self.venv_creation_dir, 'johnsnowlabs')) + os.system(f'rm -r {self.venv_creation_dir} ') + + def test_list_license_status(self): + jsl.check_health(check_licenses=True) + jsl.list_remote_licenses() + jsl.list_local_licenses() + + def test_outdated_message(self): + from johnsnowlabs.auto_install.softwares import Software + log_outdated_lib(Software.spark_ocr, '79') + + def test_offline_install_print(self): + jsl.install(offline=True) + + def test_offline_install_zip(self): + os.system(f'rm -r {self.zip_dir} ') + jsl.install(offline=True, offline_zip_dir=self.zip_dir, install_optional=True, include_dependencies=True) + + def test_browser_install(self): + jsl.install(force_browser=True, local_license_number=2) + # jsl.install(local_license_number=2, force_browser=True) + + def test_upgrade_licensed_lib_via_secret_only(self): + new_secret = '' + jsl.install(ocr_secret=new_secret) + + def test_json_license_install(self): + jsl.install(json_license_path=sct.latest_lic) + import sparknlp + import sparknlp_jsl + # import sparkocr + import nlu + import sparknlp_display + # jsl.install(json_license_path=old_lic) + + def test_json_license_install_outdated(self): + jsl.settings.enforce_versions = False + jsl.install(json_license_path=sct.old_lic) + import sparknlp + import sparknlp_jsl + # import sparkocr + import nlu + import sparknlp_display + # jsl.install(json_license_path=old_lic) + + def test_create_and_install_cluster(self): + install_suite = get_install_suite_from_jsl_home() + print(install_suite) + + def test_uninstall_all(self): + # os.system(old_lic'{sys.py_executable} -py_executable pip uninstall spark-nlp -y') + # os.system(old_lic'{sys.py_executable} -py_executable pip uninstall spark-nlp-display -y') + # os.system(old_lic'{sys.py_executable} -py_executable pip uninstall nlu -y') + + os.system(f'{sys.executable} -m pip uninstall spark-nlp-jsl -y') + os.system(f'{sys.executable} -m pip uninstall spark-nlp-jsl -y') + os.system(f'{sys.executable} -m pip uninstall spark-ocr -y') + # os.system(old_lic'{sys.py_executable} -py_executable pip uninstall jsl_tmp -y') + os.system(f'{sys.executable} -m pip uninstall spark-nlp-internal -y') + + @classmethod + def tearDownClass(cls): + pass + # print("TEARING DOWN") + # os.system(old_lic'rm -r 
{cls.venv_creation_dir} ') + # # os.system(old_lic'rm -r {cls.zip_dir} ') + # + # os.system(old_lic'{sys.py_executable} -py_executable pip uninstall spark-nlp-jsl -y') + # os.system(old_lic'{sys.py_executable} -py_executable pip uninstall spark-nlp-jsl -y') + # os.system(old_lic'{sys.py_executable} -py_executable pip uninstall spark-ocr -y') + # os.system(old_lic'{sys.py_executable} -py_executable pip uninstall jsl_tmp -y') + # os.system(old_lic'{sys.py_executable} -py_executable pip uninstall internal_with_finleg -y') + # + # os.system(old_lic'{sys.py_executable} -py_executable pip uninstall spark-nlp -y') + # os.system(old_lic'{sys.py_executable} -py_executable p ip uninstall spark-nlp-display -y') + # os.system(old_lic'{sys.py_executable} -py_executable pip uninstall nlu -y') + # os.system(old_lic'{sys.py_executable} -py_executable pip uninstall pyspark -y') + + def test_refresh_credentials(self): + # Use this to upgrade all secrets on every license file, if a newer one is available + jsl.install(json_license_path=sct.latest_lic, only_refresh_credentials=True) + + def test_refresh_install(self): + # Use this to force a fresh install of all libraries + jsl.install(json_license_path=sct.latest_lic, refresh_install=True) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/databricks_tests.py b/tests/databricks_tests.py new file mode 100644 index 0000000000..2e38efa339 --- /dev/null +++ b/tests/databricks_tests.py @@ -0,0 +1,154 @@ +from pprint import pprint +import unittest + +import tests.utils.secrets as sct +import johnsnowlabs +from johnsnowlabs.auto_install.databricks.install_utils import * +from johnsnowlabs import * +from johnsnowlabs.auto_install.jsl_home import get_install_suite_from_jsl_home +from johnsnowlabs.utils.enums import JvmHardwareTarget +from johnsnowlabs.utils.file_utils import str_to_file, path_tail + +cluster_id = '0926-040523-d8k13d4f' + + +# Test Function +def test_submit_func(): print("Test Function") + + +def nlu_func(): + import nlu + medical_text = """A 28-year-old female with a history of gestational + diabetes presented with a one-week history of polyuria , + polydipsia , poor appetite , and vomiting .""" + df = nlu.load('en.med_ner.diseases').predict(medical_text) + for c in df.columns: print(df[c]) + + +# TODO DATABRICKS SELF INSTALL!! 
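+# jsl.run_in_databricks accepts several payload types, as exercised in the test case below: +# a Python function, an imported module, a path to a .py script, or a Python source string. +# Illustrative sketch with placeholder credentials: +#   jsl.run_in_databricks(nlu_func, databricks_cluster_id='<cluster-id>', +#                         databricks_host='<host>', databricks_token='<token>', run_name='demo')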
+class DatabricksTestCase(unittest.TestCase): + def test_list_db_infos(self): + db_client = get_db_client_for_token(sct.ckl_host, sct.ckl_token) + list_db_runtime_versions(db_client) + list_node_types(db_client) + list_clusters(db_client) + + def test_get_db_lib_infos(self): + # WIP + cluster_id = '1006-022913-lb94q2m0' + db_client = get_db_client_for_token(sct.ckl_host, sct.ckl_token) + list_cluster_lib_status(db_client, cluster_id) + pprint(db_client.cluster.get_cluster(cluster_id)) + + def test_create_fresh_cluster(self): + jsl.install(json_license_path=sct.db_lic, databricks_host=sct.ckl_host, databricks_token=sct.ckl_token) + + def test_create_fresh_cluster_and_run_task(self): + py_script = '/home/ckl/old_home/ckl/Documents/freelance/johnsnowlabs_lib/johnsnowlabs/health_checks/hc.py' + johnsnowlabs.databricks_submit(databricks_host=sct.ckl_host, databricks_token=sct.ckl_token, + py_script_path=py_script) + + def test_install_to_databricks(self): + db_client = get_db_client_for_token(sct.ckl_host, sct.ckl_token) + cluster_id = jsl.install(json_license_path=sct.db_lic, databricks_host=sct.ckl_host, + databricks_token=sct.ckl_token) + + def test_hdfs_basic_methods(self): + db_client = get_db_client_for_token(sct.ckl_host, sct.ckl_token) + src_p = '/home/ckl/old_home/ckl/Documents/freelance/johnsnowlabs_lib/setup.py' + target_p = '/johnsnowlabs/testf' + copy_from_local_to_hdfs(db_client, local_path=src_p, dbfs_path=target_p) + copy_from_local_to_hdfs(db_client, local_path=src_p, dbfs_path=target_p + '2') + copy_from_local_to_hdfs(db_client, local_path=src_p, dbfs_path=target_p + '3') + pprint(dbfs_ls(db_client, '/johnsnowlabs')) + dbfs_rm(db_client, target_p) + pprint(dbfs_ls(db_client, '/johnsnowlabs')) + + def test_submit_task_to_databricks(self): + # Make cluster + # cluster_id = jsl.install(json_license_path=sct.db_lic, databricks_host=sct.ckl_host, + # databricks_token=sct.ckl_token) + cluster_id = '1006-050402-4nsqdu8h' + # Test modules + import johnsnowlabs.auto_install.health_checks.hc_test as hc_test + import johnsnowlabs.auto_install.health_checks.ocr_test as ocr_test + import johnsnowlabs.auto_install.health_checks.nlp_test as nlp_test + jsl.run_in_databricks(nlp_test, + databricks_cluster_id=cluster_id, + databricks_host=sct.ckl_host, + databricks_token=sct.ckl_token, + run_name='nlp_test') + jsl.run_in_databricks(ocr_test, + databricks_cluster_id=cluster_id, + databricks_host=sct.ckl_host, + databricks_token=sct.ckl_token, + run_name='ocr_test') + jsl.run_in_databricks(hc_test, + databricks_cluster_id=cluster_id, + databricks_host=sct.ckl_host, + databricks_token=sct.ckl_token, + run_name='hc_test') + + jsl.run_in_databricks(test_submit_func, + databricks_cluster_id=cluster_id, + databricks_host=sct.ckl_host, + databricks_token=sct.ckl_token, + run_name='Function test') + + # Test script + py_script = '/home/ckl/Documents/freelance/jsl/johnsnowlabs/johnsnowlabs/auto_install/health_checks/hc_test.py' + jsl.run_in_databricks(py_script, + databricks_cluster_id=cluster_id, + databricks_host=sct.ckl_host, + databricks_token=sct.ckl_token, + run_name='Script test') + + # Test String + script = """ +import nlu +print(nlu.load('sentiment').predict('That was easy!')) + """ + jsl.run_in_databricks(script, + databricks_cluster_id=cluster_id, + databricks_host=sct.ckl_host, + databricks_token=sct.ckl_token, + run_name='Python Code String Example') + + jsl.run_in_databricks("print('noice')", + 
databricks_cluster_id=cluster_id, + databricks_host=sct.ckl_host, + databricks_token=sct.ckl_token, + run_name='Code String test 2') + + jsl.run_in_databricks(nlu_func, + databricks_cluster_id=cluster_id, + databricks_host=sct.ckl_host, + databricks_token=sct.ckl_token, + run_name='Function test') + + + def test_wait_for_job_finish(self): + # WIP + db_client = get_db_client_for_token(sct.ckl_host, sct.ckl_token) + r = checkon_db_task(db_client, run_id='32458') + print(r) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/file_models.py b/tests/file_models.py new file mode 100644 index 0000000000..ad6eb7a270 --- /dev/null +++ b/tests/file_models.py @@ -0,0 +1,30 @@ +from johnsnowlabs import * +import unittest + +from johnsnowlabs import settings + +from johnsnowlabs.py_models.install_info import InstallFolder + + +class MyTestCase(unittest.TestCase): + def test_something(self): + p = '/home/ckl/.johnsnowlabs/java_installs/info2.json' + p = '/home/ckl/.johnsnowlabs/java_installs/info3.json' + info = InstallFolder.parse_file(p) + print(info) + print('_' * 20) + j = info.json(indent=4) + print(j) + + p = '/home/ckl/.johnsnowlabs/java_installs/info9.json' + info.write(p, indent=4) + + def test_parse_install_folder(self): + f = '/home/ckl/old_home/ckl/Documents/freelance/johnsnowlabs_lib/tests/utils/spark_nlp_ocr_hc.json' + # jsl.install() + java_info = InstallFolder.parse_file(settings.java_info_file) + print(java_info) + py_info = InstallFolder.parse_file(settings.py_info_file) + +if __name__ == '__main__': + unittest.main() diff --git a/tests/jsl_dependency_resolver.py b/tests/jsl_dependency_resolver.py new file mode 100644 index 0000000000..61fec627b6 --- /dev/null +++ b/tests/jsl_dependency_resolver.py @@ -0,0 +1,65 @@ +import unittest +from johnsnowlabs.utils.enums import * +from johnsnowlabs.auto_install.lib_resolvers import * +import tests.utils.secrets as sct + + +class LibDependencyResolutionCase(unittest.TestCase): + def test_list_ocr_lib_resolve(self): + for spark_version in OcrLibResolver.compatible_spark_versions: + for install_type in [PyInstallTypes.tar, PyInstallTypes.wheel, JvmHardwareTarget.cpu]: + print(f'Testing Spark version ={spark_version.as_str()} and install type = {install_type}') + dep = OcrLibResolver.get_dependency_url(secret=sct.OCR_SECRET, + spark_version_to_match=spark_version, + install_type=install_type) + print(dep) + self.assertTrue(dep.validate()) + + dep = OcrLibResolver.get_dependency_url(secret=sct.OCR_SECRET, + spark_version_to_match=None, + install_type=install_type) + print(dep) + self.assertTrue(dep.validate()) + + + def test_list_hc_lib_resolve(self): + for spark_version in HcLibResolver.compatible_spark_versions: + for install_type in [PyInstallTypes.tar, PyInstallTypes.wheel, JvmHardwareTarget.cpu]: + print(f'Testing Spark version ={spark_version.as_str()} and install type = {install_type}') + dep = HcLibResolver.get_dependency_url(secret=sct.JSL_SECRET, + spark_version_to_match=spark_version, + install_type=install_type) + print(dep) + self.assertTrue(dep.validate()) + + dep = HcLibResolver.get_dependency_url(secret=sct.JSL_SECRET, 
+ spark_version_to_match=None, + install_type=install_type) + print(dep) + self.assertTrue(dep.validate()) + + + def test_list_nlp_lib_resolve(self): + for spark_version in NlpLibResolver.compatible_spark_versions: + for install_type in [PyInstallTypes.tar, PyInstallTypes.wheel, + JvmHardwareTarget.cpu, JvmHardwareTarget.gpu, JvmHardwareTarget.m1]: + print(f'Testing Spark version ={spark_version.as_str()} and install type = {install_type}') + dep = NlpLibResolver.get_dependency_url( + spark_version_to_match=spark_version, + install_type=install_type) + print(dep) + self.assertTrue(dep.validate()) + + dep = NlpLibResolver.get_dependency_url( + spark_version_to_match=None, + install_type=install_type) + print(dep) + self.assertTrue(dep.validate()) + + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/markdown_tests.py b/tests/markdown_tests.py new file mode 100644 index 0000000000..bd62f2aaaa --- /dev/null +++ b/tests/markdown_tests.py @@ -0,0 +1,38 @@ +import os +import unittest + +from johnsnowlabs.utils.file_utils import file_to_str +from johnsnowlabs.utils.modelhub_markdown import test_markdown +from johnsnowlabs.utils.notebooks import test_ipynb +from johnsnowlabs import start +from johnsnowlabs import * + + +# lxml +class MarkdownTestTestCase(unittest.TestCase): + def test_remote_markdown(self): + md_to_test = [ + 'https://nlp.johnsnowlabs.com/2022/09/02/legner_roberta_zeroshot_en.html', + 'https://nlp.johnsnowlabs.com/2022/08/31/legpipe_deid_en.html', + 'https://nlp.johnsnowlabs.com/2022/10/02/legner_bert_large_courts_de.html', + ] + for md in md_to_test: + print(test_markdown(md)) + + def test_local_markdown(self): + url = 'https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/docs/_posts/josejuanmartinez/2022-08-30-legmulticlf_edgar_en.md' + os.system(f'wget {url}') + print(test_markdown('2022-08-30-legmulticlf_edgar_en.md')) + + def test_folder_of_markdown(self): + urls = [ + 'https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/docs/_posts/josejuanmartinez/2022-02-14-clinical_deidentification_es.md', + 'https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/docs/_posts/josejuanmartinez/2022-02-15-ner_deid_generic_augmented_es.md', + 'https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/docs/_posts/josejuanmartinez/2022-08-30-legmulticlf_edgar_en.md', ] + for u in urls: + os.system(f'wget {u}') + test_markdown(os.getcwd()) + +if __name__ == '__main__': + unittest.main() diff --git a/tests/notebook_tests.py b/tests/notebook_tests.py new file mode 100644 index 0000000000..5f419508fe --- /dev/null +++ b/tests/notebook_tests.py @@ -0,0 +1,80 @@ +import unittest + +from johnsnowlabs.utils.file_utils import file_to_str +from johnsnowlabs.utils.notebooks import test_ipynb +from johnsnowlabs import start +from johnsnowlabs import * + +# lxml +class WorkshopNotebookTestCase(unittest.TestCase): + def test_remote_notebook(self): + ocr_5 = 'https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/tutorials/Certification_Trainings/Healthcare/5.Spark_OCR.ipynb' + ocr_51 = 'https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/tutorials/Certification_Trainings/Healthcare/5.1.Spark_OCR_Multi_Modals.ipynb' + ocr_52 = 'https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/tutorials/Certification_Trainings/Healthcare/5.2.Spark_OCR_Deidentification.ipynb' + nb_to_test = [ + ocr_5, + ocr_51, + ocr_52, + ] + for n in nb_to_test: + 
test_ipynb(n) + + def test_download_workshop_repo_and_run(self): + res = test_ipynb('WORKSHOP-FIN') + res.to_csv('WORKSHOP-FIN.csv', index=False) + + def test_folder_of_notebooks(self): + folder = '/home/ckl/old_home/ckl/Documents/freelance/johnsnowlabs_lib/tmp/nb_tests/latest_workshop/johnsnowlabs v1.0/Finance' + res = test_ipynb(folder) + res.to_csv('WORKSHOP-FIN.csv', index=False) + + def test_local_notebook(self): + pass + + def test_parse(self): + import ast + f = '/home/ckl/.johnsnowlabs/tmp_tests/notebook_tests/5.2.Spark_OCR_Deidentification.ipynb.nb_converted.py' + from ast import Expr + a = ast.parse(file_to_str(f)) + + bad_regex = [ + ] + + def test_nerModel(self): + jsl.start() + from sparknlp_jsl.finance import FinanceNerModel + m = FinanceNerModel.pretrained("finner_deid", "en", 'finance/models') + print("FOUND", m) + + def test_quick(self): + s = """ + +spark = sparkocr.start(secret=SPARK_OCR_SECRET, + nlp_version=PUBLIC_VERSION, + nlp_secret=SECRET, + nlp_internal=JSL_VERSION + ) + """ + + r = r'sparkocr.start\(.*\)' + import re + print(re.findall(r, s, re.DOTALL)) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/spark_session.py b/tests/spark_session.py new file mode 100644 index 0000000000..e90f06b818 --- /dev/null +++ b/tests/spark_session.py @@ -0,0 +1,208 @@ +from johnsnowlabs import * +import unittest +import pkg_resources + +# finance.ClassifierDLApproach() +class ImportTestCase(unittest.TestCase): + def test_sparknlp_session(self): + jsl.start() + d = nlp.DocumentAssembler().setInputCol('text').setOutputCol('doc') + t = nlp.Tokenizer().setInputCols('doc').setOutputCol('tok') + c = nlp.DeBertaForTokenClassification().setInputCols(['tok', 'doc']).setOutputCol('class') + p = Pipeline(stages=[d, t]) + p = nlu.to_nlu_pipe(p) + print(p.predict("Hello World")) + + def test_sparknlp_gpu_session(self): + jsl.start(hardware_target='gpu') + d = nlp.DocumentAssembler().setInputCol('text').setOutputCol('doc') + t = nlp.Tokenizer().setInputCols('doc').setOutputCol('tok') + c = nlp.DeBertaForTokenClassification().setInputCols(['tok', 'doc']).setOutputCol('class') + p = Pipeline(stages=[d, t]) + p = nlu.to_nlu_pipe(p) + print(p.predict("Hello from John Snow Labs")) + + def test_sparknlp_m1_session(self): + import os + jsl.start(hardware_target='m1') + d = nlp.DocumentAssembler().setInputCol('text').setOutputCol('doc') + t = nlp.Tokenizer().setInputCols('doc').setOutputCol('tok') + c = nlp.DeBertaForTokenClassification().pretrained().setInputCols(['tok', 'doc']).setOutputCol('class') + nlp.UniversalSentenceEncoder.pretrained() + p = Pipeline(stages=[d, t]) + p = nlu.to_nlu_pipe(p) + print(p.predict("Hello from John Snow Labs")) + + def test_healthcare_session(self): + jsl.start() + d = nlp.DocumentAssembler().setInputCol('text').setOutputCol('doc') + t = nlp.Tokenizer().setInputCols('doc').setOutputCol('tok') + c = medical.BertForTokenClassifier().pretrained().setInputCols(['tok', 'doc']).setOutputCol('class') + p = Pipeline(stages=[d, t, c]) + p = nlu.to_nlu_pipe(p) + print(p.predict("Hello from John Snow Labs")) + + def test_ocr_session(self): + # Convert pdf to image + p = '/home/ckl/old_home/ckl/Documents/freelance/johnsnowlabs_lib/tmp/licenses/4_1_LATEST_OCR_HC_BCK.json' + spark = jsl.start(json_license_path=p) + + pdf_to_image = ocr.PdfToImage() + pdf_to_image.setImageType(jsl.ocr.ImageType.TYPE_3BYTE_BGR) + + # Detect tables on the page using a pretrained model. + # It can be fine-tuned for more accurate results on more specific documents. + table_detector 
= ocr.ImageTableDetector.pretrained("general_model_table_detection_v2", "en", "clinical/ocr") + table_detector.setInputCol("image") + table_detector.setOutputCol("region") + + # Draw the detected table regions onto the page + draw_regions = ocr.ImageDrawRegions() + draw_regions.setInputCol("image") + draw_regions.setInputRegionsCol("region") + draw_regions.setOutputCol("image_with_regions") + draw_regions.setRectColor(jsl.ocr.Color.red) + + # Extract table regions to separate images + splitter = ocr.ImageSplitRegions() + splitter.setInputCol("image") + splitter.setInputRegionsCol("region") + splitter.setOutputCol("table_image") + splitter.setDropCols("image") + + # Detect cells on the table image + cell_detector = ocr.ImageTableCellDetector() + cell_detector.setInputCol("table_image") + cell_detector.setOutputCol("cells") + cell_detector.setAlgoType("morphops") + + # Extract text from the detected cells + table_recognition = ocr.ImageCellsToTextTable() + table_recognition.setInputCol("table_image") + table_recognition.setCellsCol('cells') + table_recognition.setMargin(3) + table_recognition.setStrip(True) + table_recognition.setOutputCol('table') + + pipeline = PipelineModel(stages=[ + pdf_to_image, + table_detector, + draw_regions, + splitter, + cell_detector, + table_recognition + ]) + + import pkg_resources + pdf_example = pkg_resources.resource_filename('sparkocr', 'resources/ocr/pdfs/tabular-pdf/data.pdf') + pdf_example_df = spark.read.format("binaryFile").load(pdf_example).cache() + pipeline.transform(pdf_example_df).show() + + def test_legal_session(self): + jsl.start() + + LightPipeline(self.get_legal_pipe()).fullAnnotate("Shwrm") + + def test_finance_session(self): + jsl.start() + LightPipeline(self.get_finance_pipe()).fullAnnotate("unit") + + @staticmethod + def get_finance_pipe() -> PipelineModel: + documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("ner_chunk") + + embeddings = UniversalSentenceEncoder.pretrained("tfhub_use", "en") \ + .setInputCols("ner_chunk") \ + .setOutputCol("sentence_embeddings") + + resolver = finance.SentenceEntityResolverModel.pretrained("finel_tickers2names", "en", "finance/models") \ + .setInputCols(["ner_chunk", "sentence_embeddings"]) \ + .setOutputCol("name") \ + .setDistanceFunction("EUCLIDEAN") + + return PipelineModel( + stages=[ + documentAssembler, + embeddings, + resolver]) + + @staticmethod + def get_legal_pipe() -> PipelineModel: + z = legal.ZeroShotRelationExtractionModel.pretrained("finre_zero_shot", "en", "finance/models") + documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("ner_chunk") + + embeddings = UniversalSentenceEncoder.pretrained("tfhub_use", "en") \ + .setInputCols("ner_chunk") \ + .setOutputCol("sentence_embeddings") + + resolver = legal.SentenceEntityResolverModel.pretrained("legel_crunchbase_companynames", "en", "legal/models") \ + .setInputCols(["ner_chunk", "sentence_embeddings"]) \ + .setOutputCol("name") \ + .setDistanceFunction("EUCLIDEAN") + + return PipelineModel( + stages=[ + documentAssembler, + embeddings, + resolver]) + + @staticmethod + def get_cross_lib_pipe() -> PipelineModel: + # Returns a pipe with one annotator per lib + # TODO add some fancy OCR DL models? 
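+        # Sketch of the flow assembled below (one annotator per JSL library): + #   binary DOCX -> DocToText -> DocumentAssembler -> Tokenizer + #   -> medical & open-source token classifiers, finance & legal sequence classifiers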
+ doc2text = ocr.DocToText().setInputCol("content").setOutputCol("text") + d = nlp.DocumentAssembler().setInputCol('text').setOutputCol('doc') + t = nlp.Tokenizer().setInputCols('doc').setOutputCol('tok') + # One classifier per NLP lib + + c1 = medical.BertForTokenClassifier().pretrained() \ + .setInputCols(['tok', 'doc']) \ + .setOutputCol('medical') + + c2 = nlp.DeBertaForTokenClassification() \ + .setInputCols(['tok', 'doc']) \ + .setOutputCol('open_source') + + c3 = finance.BertForSequenceClassification \ + .pretrained("finclf_augmented_esg", "en", "finance/models") \ + .setInputCols(['tok', 'doc']) \ + .setOutputCol("finance") + + c4 = legal.BertForSequenceClassification \ + .pretrained("legclf_bert_judgements_agent", "en", "legal/models") \ + .setInputCols(['tok', 'doc']) \ + .setOutputCol("legal") + + return Pipeline(stages=[doc2text, d, t, c1, c2, c3, c4]) + + def test_simple_cross_lib(self): + spark = jsl.start() + doc_example = pkg_resources.resource_filename('sparkocr', 'resources/ocr/docs/doc2.docx') + df = spark.read.format("binaryFile").load(doc_example).cache() + self.get_cross_lib_pipe().fit(df).transform(df).show() + + def test_simple_cross_lib_gpu(self): + spark = jsl.start(hardware_target='gpu') + doc_example = pkg_resources.resource_filename('sparkocr', 'resources/ocr/docs/doc2.docx') + df = spark.read.format("binaryFile").load(doc_example).cache() + self.get_cross_lib_pipe().fit(df).transform(df).show() + + def test_cross_engine_session(self): + import itertools + # Test every combination of jars with CPU jars + for c in range(4): + p = itertools.combinations(['nlp-cpu', 'ocr', 'hc'], c) + for pp in p: print(pp) + + # Test every combination of jars with GPU jars + for c in range(4): + p = itertools.combinations(['nlp-gpu', 'ocr', 'hc'], c) + for pp in p: print(pp) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/venv_wrapper_tests.py b/tests/venv_wrapper_tests.py new file mode 100644 index 0000000000..e7befafd31 --- /dev/null +++ b/tests/venv_wrapper_tests.py @@ -0,0 +1,49 @@ +import unittest + +from johnsnowlabs import JslSecrets +from johnsnowlabs.utils.venv_utils import * + + +class VenvTests(unittest.TestCase): + test_dir = '/home/ckl/old_home/ckl/Documents/freelance/johnsnowlabs_lib/tmp/venv/tmp_test_venv' + + # venv_wrapper = VenvWrapper(self.venv_creation_dir, ) + def test_venv_wrapper(self): + """ + Test all core features of the VenvWrapper: install, uninstall and module-presence checks + """ + # Lib should not be found after having uninstalled it + VenvWrapper.create_venv(self.test_dir) + VenvWrapper.uninstall_from_venv(self.test_dir, 'pyspark') + self.assertTrue(VenvWrapper.is_lib_in_venv(self.test_dir, 'pyspark') is False) + + # After installing, the lib should be found + self.assertTrue(VenvWrapper.install_to_venv(self.test_dir, 'pyspark')) + self.assertTrue(VenvWrapper.is_lib_in_venv(self.test_dir, 'pyspark')) + + # Uninstall and check it is missing + VenvWrapper.uninstall_from_venv(self.test_dir, 'pyspark') + self.assertTrue(VenvWrapper.is_lib_in_venv(self.test_dir, 'pyspark') is False) + os.system(f'rm -r {self.test_dir} ') + + def test_install_jsl_suite_to_venv(self): + f = '/home/ckl/old_home/ckl/Documents/freelance/johnsnowlabs_lib/tmp/licenses/ocr_40.json' + secrets: JslSecrets = JslSecrets.build_or_try_find_secrets(secrets_file=f) + + print(secrets) + # TODO + pass + + # def test_jsl_suite_status(self): + # # TODO + # pass + + 
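+    # Typical VenvWrapper lifecycle exercised in test_venv_wrapper above (illustrative): + #   VenvWrapper.create_venv(venv_dir) + #   VenvWrapper.install_to_venv(venv_dir, 'pyspark') + #   assert VenvWrapper.is_lib_in_venv(venv_dir, 'pyspark') + #   VenvWrapper.uninstall_from_venv(venv_dir, 'pyspark')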
@classmethod + def tearDownClass(cls): + print("TEARING DOWN") + os.system(f'rm -r {cls.test_dir} ') + + +if __name__ == '__main__': + unittest.main()