
Commit 4311508

Merge pull request #28 from DataFog/feature/v3.2.1

v3.2.1

2 parents ffe5f8d + cf36dd6

778 files changed: +274838 −107 lines


.env (+4)

@@ -0,0 +1,4 @@
+APPLICATIONINSIGHTS_CONNECTION_STRING="InstrumentationKey=00bea047-1836-46fa-9652-26d43d63a3fa;IngestionEndpoint=https://eastus-8.in.applicationinsights.azure.com/;LiveEndpoint=https://eastus.livediagnostics.monitor.azure.com/;ApplicationId=959cc365-c112-491b-af69-b196d0943ca4"
+
+
+# note this is an Azure specific implementation of the OpenTelemetry distro. for more information please see https://github.com/Azure/azure-sdk-for-python/tree/main/sdk/monitor/azure-monitor-opentelemetry
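This connection string is read at import time by the telemetry bootstrap added to `datafog/main.py` later in this commit. A minimal sketch of that consumption path, assuming python-dotenv and azure-monitor-opentelemetry are installed:

```
import os

from azure.monitor.opentelemetry import configure_azure_monitor
from dotenv import load_dotenv
from opentelemetry import trace

load_dotenv()  # reads APPLICATIONINSIGHTS_CONNECTION_STRING from .env
connection_string = os.getenv("APPLICATIONINSIGHTS_CONNECTION_STRING")
if connection_string:
    # route OpenTelemetry traces to Azure Application Insights
    configure_azure_monitor(connection_string=connection_string)
tracer = trace.get_tracer(__name__)
```

Committing a live instrumentation key is risky; note that this same commit also adds `.env` to `.gitignore`.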

.github/workflows/dev-cicd-tests.yml (+8 −2)

@@ -21,7 +21,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: ["3.10", "3.11"]
+        python-version: ["3.10"]
     steps:
       - uses: actions/setup-python@v4
         with:
@@ -33,10 +33,16 @@ jobs:
           tox -- --cov datafog --cov-report xml --cov-report term
       - name: Submit to codecov
         uses: codecov/codecov-action@v3
-        if: ${{ matrix.python-version == '3.11' }}
+        if: ${{ matrix.python-version == '3.10' }}

       - name: Upload coverage reports to Codecov
         uses: codecov/[email protected]
         env:
           token: ${{ secrets.CODECOV_TOKEN }}
           slug: DataFog/datafog-python
+
+      - name: Run script
+        env:
+          APPLICATIONINSIGHTS_CONNECTION_STRING: ${{ secrets.APPLICATIONINSIGHTS_CONNECTION_STRING }}
+        run: |
+          python datafog/telemetry/open_telemetry.py
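The new "Run script" step exercises the telemetry bootstrap in CI, with the connection string injected from repository secrets rather than the committed .env. The script itself, datafog/telemetry/open_telemetry.py, is not shown in this excerpt; a hypothetical sketch of what such a smoke test might look like (file contents, tracer name, and span name are all assumptions):

```
# Hypothetical smoke-test script; the real datafog/telemetry/open_telemetry.py
# is not shown in this diff excerpt.
import os

from azure.monitor.opentelemetry import configure_azure_monitor
from opentelemetry import trace

configure_azure_monitor(
    connection_string=os.environ["APPLICATIONINSIGHTS_CONNECTION_STRING"]
)
tracer = trace.get_tracer("datafog.ci")  # assumed tracer name
with tracer.start_as_current_span("ci-smoke-test"):  # assumed span name
    print("telemetry export path reachable")
```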

.github/workflows/feature-ci-cd.yml (+48)

@@ -0,0 +1,48 @@
+name: feature-cicd-tests
+
+on:
+  push:
+    branches:
+      - feature/*
+  pull_request:
+    branches:
+      - feature/*
+
+jobs:
+  lint:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Check out repo
+        uses: actions/checkout@v3
+      - name: Run pre-commit
+        uses: pre-commit/[email protected]
+
+  build:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ["3.10"]
+    steps:
+      - uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python-version }}
+      - uses: actions/checkout@v3
+      - name: Test with tox
+        run: |
+          pip install tox
+          tox -- --cov datafog --cov-report xml --cov-report term
+      - name: Submit to codecov
+        uses: codecov/codecov-action@v3
+        if: ${{ matrix.python-version == '3.10' }}
+
+      - name: Upload coverage reports to Codecov
+        uses: codecov/[email protected]
+        env:
+          token: ${{ secrets.CODECOV_TOKEN }}
+          slug: DataFog/datafog-python
+
+      - name: Run script
+        env:
+          APPLICATIONINSIGHTS_CONNECTION_STRING: ${{ secrets.APPLICATIONINSIGHTS_CONNECTION_STRING }}
+        run: |
+          python datafog/telemetry/open_telemetry.py

.gitignore (+2)

@@ -13,6 +13,7 @@ build/
 /src/datafog/pii_tools/__pycache__/
 /tests/__pycache__/
 /tests/scratch.py
+/tests/.datafog_env/
 node_modules/
 datafog_debug.log
 sotu_2023.txt
@@ -23,4 +24,5 @@ datafog-python/datafog/processing/text_processing/__pycache__/
 datafog-python/datafog/services/__pycache__/
 datafog-python/datafog/processing/__pycache__/
 datafog-python/datafog/__pycache__/
+.env

README.md (+4 −5)

@@ -39,7 +39,6 @@ DataFog can be installed via pip:
 pip install datafog
 ```

-
 ## Getting Started

 The DataFog library provides functionality for text and image processing, including PII (Personally Identifiable Information) annotation and OCR (Optical Character Recognition) capabilities.
@@ -54,8 +53,7 @@ pip install datafog

 ### Usage

-The [Getting Started notebook](/datafog-python/examples/getting_started.ipynb) features a standalone Colab notebook.
-
+The [Getting Started notebook](/datafog-python/examples/getting_started.ipynb) features a standalone Colab notebook.

 #### Text PII Annotation

@@ -75,7 +73,9 @@ with open(os.path.join(folder_path, text_files[0]), 'r') as file:

 display(Markdown(clinical_note))
 ```
+
 which looks like this:
+
 ```

 **Date:** April 10, 2024
@@ -124,7 +124,6 @@ loop = asyncio.get_event_loop()
 results = loop.run_until_complete(run_text_pipeline_demo())
 ```

-
 Note: The DataFog library uses asynchronous programming, so make sure to use the `async`/`await` syntax when calling the appropriate methods.

 #### OCR PII Annotation
@@ -146,7 +145,7 @@ loop.run_until_complete(run_ocr_pipeline_demo())

 ```

-You'll notice that we use async functions liberally throughout the SDK - given the nature of the functions we're providing and the extension of DataFog into API/other formats, this allows the functions to be more easily adapted for those uses.
+You'll notice that we use async functions liberally throughout the SDK - given the nature of the functions we're providing and the extension of DataFog into API/other formats, this allows the functions to be more easily adapted for those uses.

 ## Contributing

datafog/__about__.py (+1 −1)

@@ -1 +1 @@
-__version__ = "3.2.0"
+__version__ = "3.2.1"

datafog/__init__.py (+4 −3)

@@ -1,3 +1,4 @@
+from .__about__ import __version__
 from .config import OperationType
 from .main import DataFog, OCRPIIAnnotator, TextPIIAnnotator
 from .processing.image_processing.donut_processor import DonutProcessor
@@ -7,8 +8,7 @@
 from .services.image_service import ImageService
 from .services.spark_service import SparkService
 from .services.text_service import TextService
-
-# from .__about__ import __version__
+from .telemetry import Telemetry

 __all__ = [
     "DonutProcessor",
@@ -22,5 +22,6 @@
     "SpacyPIIAnnotator",
     "ImageDownloader",
     "PytesseractProcessor",
-    # "__version__",
+    "__version__",
+    "Telemetry",
 ]
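With the `__about__` import and the exports un-commented, the package version and the new `Telemetry` class are importable from the package root. A quick check, assuming datafog 3.2.1 is installed:

```
from datafog import Telemetry, __version__

assert __version__ == "3.2.1"
print(Telemetry)  # re-exported from datafog.telemetry
```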

datafog/config.py (−2)

@@ -1,7 +1,5 @@
 from enum import Enum

-from pydantic import BaseModel
-

 class OperationType(str, Enum):
     ANNOTATE_PII = "annotate_pii"
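The dropped `pydantic` import was unused, so nothing else changes. Since `OperationType` subclasses `str`, its members are strings and compare equal to their raw values, which keeps membership checks like `OperationType.ANNOTATE_PII in self.operations` working:

```
from datafog import OperationType

# str-backed Enum: members are strings and compare equal to their values
assert OperationType.ANNOTATE_PII == "annotate_pii"
assert isinstance(OperationType.ANNOTATE_PII, str)
```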

datafog/main.py (+69 −13)

@@ -13,7 +13,34 @@
 from .services.image_service import ImageService
 from .services.spark_service import SparkService
 from .services.text_service import TextService
-
+from opentelemetry import trace
+from opentelemetry.sdk.trace import TracerProvider
+from opentelemetry.sdk.trace.export import BatchSpanProcessor
+import os
+from opentelemetry import trace
+from opentelemetry.sdk.trace import TracerProvider
+from opentelemetry.sdk.trace.export import BatchSpanProcessor
+from azure.monitor.opentelemetry.exporter import AzureMonitorTraceExporter
+from azure.monitor.opentelemetry import configure_azure_monitor
+import platform
+from opentelemetry.trace import Status, StatusCode
+
+# Use environment variable if available, otherwise fall back to hardcoded value
+from opentelemetry import trace
+from opentelemetry.sdk.trace import TracerProvider
+from opentelemetry.sdk.trace.export import BatchSpanProcessor
+from logging import INFO, getLogger
+from dotenv import load_dotenv
+import logging
+
+load_dotenv()  # Load environment variables from .env file
+APPLICATIONINSIGHTS_CONNECTION_STRING = os.getenv("APPLICATIONINSIGHTS_CONNECTION_STRING")
+configure_azure_monitor(connection_string=APPLICATIONINSIGHTS_CONNECTION_STRING)
+trace.set_tracer_provider(TracerProvider())
+exporter = AzureMonitorTraceExporter(connection_string=APPLICATIONINSIGHTS_CONNECTION_STRING)
+trace.get_tracer_provider().add_span_processor(BatchSpanProcessor(exporter))
+logger = logging.getLogger("datafog_logger")
+logger.setLevel(INFO)


 class DataFog:
     def __init__(
@@ -27,23 +54,52 @@ def __init__(
         self.text_service = text_service
         self.spark_service: SparkService = spark_service
         self.operations: List[OperationType] = operations
+        self.logger = logging.getLogger(__name__)
+        self.logger.info("Initializing DataFog class with the following services and operations:")
+        self.logger.info(f"Image Service: {type(image_service)}")
+        self.logger.info(f"Text Service: {type(text_service)}")
+        self.logger.info(f"Spark Service: {type(spark_service) if spark_service else 'None'}")
+        self.logger.info(f"Operations: {operations}")
+        self.tracer = trace.get_tracer(__name__)

     async def run_ocr_pipeline(self, image_urls: List[str]):
         """Run the OCR pipeline asynchronously."""
-        extracted_text = await self.image_service.ocr_extract(image_urls)
-        if OperationType.ANNOTATE_PII in self.operations:
-            annotated_text = await self.text_service.batch_annotate_texts(
-                extracted_text
-            )
-            return annotated_text
-        return extracted_text
-
+        with self.tracer.start_as_current_span("run_ocr_pipeline") as span:
+            try:
+                extracted_text = await self.image_service.ocr_extract(image_urls)
+                self.logger.info(f"OCR extraction completed for {len(image_urls)} images.")
+                self.logger.debug(f"Total length of extracted text: {sum(len(text) for text in extracted_text)}")
+
+                if OperationType.ANNOTATE_PII in self.operations:
+                    annotated_text = await self.text_service.batch_annotate_texts(extracted_text)
+                    self.logger.info(f"Text annotation completed with {len(annotated_text)} annotations.")
+                    return annotated_text
+
+                return extracted_text
+            except Exception as e:
+                self.logger.error(f"Error in run_ocr_pipeline: {str(e)}")
+                span.set_status(Status(StatusCode.ERROR, str(e)))
+                raise
     async def run_text_pipeline(self, texts: List[str]):
         """Run the text pipeline asynchronously."""
-        if OperationType.ANNOTATE_PII in self.operations:
-            annotated_text = await self.text_service.batch_annotate_texts(texts)
-            return annotated_text
-        return texts
+        with self.tracer.start_as_current_span("run_text_pipeline") as span:
+            try:
+                self.logger.info(f"Starting text pipeline with {len(texts)} texts.")
+                if OperationType.ANNOTATE_PII in self.operations:
+                    annotated_text = await self.text_service.batch_annotate_texts(texts)
+                    self.logger.info(f"Text annotation completed with {len(annotated_text)} annotations.")
+                    return annotated_text
+
+                self.logger.info("No annotation operation found; returning original texts.")
+                return texts
+            except Exception as e:
+                self.logger.error(f"Error in run_text_pipeline: {str(e)}")
+                span.set_status(Status(StatusCode.ERROR, str(e)))
+                raise
+    def _add_attributes(self, span, attributes: dict):
+        """Add multiple attributes to a span."""
+        for key, value in attributes.items():
+            span.set_attribute(key, value)


 class OCRPIIAnnotator:
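Both pipeline methods are now instrumented: each call opens an OpenTelemetry span, logs progress, and records errors on the span before re-raising. (As committed, the import block repeats the opentelemetry imports three times, and the Azure Monitor exporter is configured at module import, so importing `datafog.main` without `APPLICATIONINSIGHTS_CONNECTION_STRING` set may fail at import time; both are worth cleaning up.) A minimal usage sketch, assuming `DataFog()` can be constructed with its default services and that a connection string is available in the environment:

```
import asyncio

from datafog import DataFog

async def main():
    # __init__ logs the configured services; each pipeline call opens a span
    df = DataFog()
    results = await df.run_text_pipeline(
        ["John Doe visited Berlin on April 10, 2024."]
    )
    print(results)

asyncio.run(main())
```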

datafog/processing/__init__.py (+3 −1)

@@ -1,5 +1,7 @@
 from .image_processing.donut_processor import DonutProcessor
 from .image_processing.image_downloader import ImageDownloader
 from .image_processing.pytesseract_processor import PytesseractProcessor
-from .spark_processing.pyspark_udfs import broadcast_pii_annotator_udf, pii_annotator
+
+# from .spark_processing.pyspark_udfs import broadcast_pii_annotator_udf, pii_annotator
+from .spark_processing import get_pyspark_udfs
 from .text_processing.spacy_pii_annotator import SpacyPIIAnnotator
datafog/processing/spark_processing/__init__.py (+7 −1)

@@ -1 +1,7 @@
-from .pyspark_udfs import broadcast_pii_annotator_udf, pii_annotator
+# from .pyspark_udfs import broadcast_pii_annotator_udf, pii_annotator
+
+
+def get_pyspark_udfs():
+    from .pyspark_udfs import broadcast_pii_annotator_udf, pii_annotator
+
+    return broadcast_pii_annotator_udf, pii_annotator
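Replacing the eager re-export with `get_pyspark_udfs()` defers the pyspark import until the UDFs are actually requested, so `import datafog.processing` no longer requires pyspark to be installed. Usage sketch:

```
from datafog.processing.spark_processing import get_pyspark_udfs

# pyspark is resolved here, not at package import time
broadcast_pii_annotator_udf, pii_annotator = get_pyspark_udfs()
```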

datafog/processing/spark_processing/pyspark_udfs.py (+24 −4)

@@ -1,8 +1,9 @@
+import importlib
+import subprocess
+import sys
+
 import requests
 import spacy
-from pyspark.sql import SparkSession
-from pyspark.sql.functions import udf
-from pyspark.sql.types import ArrayType, StringType, StructField, StructType

 PII_ANNOTATION_LABELS = ["DATE_TIME", "LOC", "NRP", "ORG", "PER"]
 MAXIMAL_STRING_SIZE = 1000000
@@ -14,6 +15,11 @@ def pii_annotator(text: str, broadcasted_nlp) -> list[list[str]]:
     Returns:
         list[list[str]]: Values as arrays in order defined in the PII_ANNOTATION_LABELS.
     """
+    ensure_installed("pyspark")
+    from pyspark.sql import SparkSession
+    from pyspark.sql.functions import udf
+    from pyspark.sql.types import ArrayType, StringType, StructField, StructType
+
     if text:
         if len(text) > MAXIMAL_STRING_SIZE:
             # Cut the strings for required sizes
@@ -35,13 +41,27 @@ def pii_annotator(text: str, broadcasted_nlp) -> list[list[str]]:


 def broadcast_pii_annotator_udf(
-    spark_session: SparkSession, spacy_model: str = "en_spacy_pii_fast"
+    spark_session=None, spacy_model: str = "en_spacy_pii_fast"
 ):
     """Broadcast PII annotator across Spark cluster and create UDF"""
+    ensure_installed("pyspark")
+    from pyspark.sql import SparkSession
+    from pyspark.sql.functions import udf
+    from pyspark.sql.types import ArrayType, StringType, StructField, StructType
+
+    if not spark_session:
+        spark_session = SparkSession.builder.getOrCreate()
     broadcasted_nlp = spark_session.sparkContext.broadcast(spacy.load(spacy_model))

     pii_annotation_udf = udf(
         lambda text: pii_annotator(text, broadcasted_nlp),
         ArrayType(ArrayType(StringType())),
     )
     return pii_annotation_udf
+
+
+def ensure_installed(self, package_name):
+    try:
+        importlib.import_module(package_name)
+    except ImportError:
+        subprocess.check_call([sys.executable, "-m", "pip", "install", package_name])
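One bug to note: `ensure_installed` is a module-level function but is declared with a stray `self` parameter, while both call sites invoke it as `ensure_installed("pyspark")`; the string binds to `self` and `package_name` is left unbound, so the call raises a `TypeError`. A corrected sketch of the same install-on-demand pattern:

```
import importlib
import subprocess
import sys


def ensure_installed(package_name: str) -> None:
    """Import package_name, pip-installing it into the current interpreter if missing."""
    try:
        importlib.import_module(package_name)
    except ImportError:
        subprocess.check_call([sys.executable, "-m", "pip", "install", package_name])
```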
