Skip to content

Commit 01612cb

Browse files
committed
add files
1 parent f6868c4 commit 01612cb

File tree

7 files changed

+315
-0
lines changed

7 files changed

+315
-0
lines changed

Readme.md

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
# Timelark Data Pipeline
2+
3+
This very basic data pipeline built in Python is part of the Timelark project. It reads unstructured text from text files, extracts named entities using spaCy, queries the Aleph API to enrich these entities, and saves the enriched data to an SQLite database. From here it can be visualized.
4+
5+
## Table of Contents
6+
7+
- [Prerequisites](#prerequisites)
8+
- [Installation](#installation)
9+
- [Configuration](#configuration)
10+
- [Running the Pipeline](#running-the-pipeline)
11+
12+
## Prerequisites
13+
14+
- Python 3.x
15+
- [spaCy](https://spacy.io/) and spaCy model (e.g., en_core_web_lg)
16+
- [Dataset](https://dataset.readthedocs.io/en/latest/index.html) (sqlite wrapper)
17+
- Aleph API access and API key (for example [OCCRP's Aleph](https://aleph.occrp.org/))
18+
- [Confection](https://github.com/explosion/confection) (for configuration management)
19+
20+
## Installation
21+
22+
Clone this repository:
23+
24+
```bash
25+
git clone https://github.com/jlstro/timelark-pipeline.git
26+
cd timelark-pipeline
27+
```
28+
29+
Create a virtual environment and install the required Python packages:
30+
31+
```bash
32+
python3 -m venv venv
33+
source venv/bin/activate
34+
# On Windows: venv\Scripts\activate
35+
python3 -m pip install spacy confection dataset
36+
```
37+
38+
Download and install the spaCy model (e.g., "en_core_web_lg"):
39+
40+
``` bash
41+
python3 -m spacy download en_core_web_lg
42+
```
43+
44+
## Configuration
45+
46+
1. Create a configuration file named `config.cfg` in the root directory of the repository. Define the paths to your database, text files, and other configuration values as needed. Refer to the [confection documentation](https://github.com/explosion/confection) for more information on writing the configuration.
47+
48+
Example `config.cfg`:
49+
50+
```ini
51+
[paths]
52+
db = "./db/data.db"
53+
files = "./text_files"
54+
55+
[aleph]
56+
host = "https://aleph.occrp.org"
57+
collections = 25, 55, 90
58+
```
59+
60+
Make sure you set your Aleph API key as an environment variable named ALEPH_API_KEY.
61+
62+
## Running the Pipeline
63+
64+
Run the main script to start the pipeline:
65+
66+
```bash
67+
python3 main.py
68+
```
69+
70+
The pipeline will read text files from the specified directory, extract entities, enrich them using the API, and save the enriched data to the SQLite database.

alephutil.py

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
import json
import logging
from urllib.parse import quote

import requests
4+
5+
# Set up logging for the module
6+
logger = logging.getLogger(__name__)
7+
8+
9+
def build_aleph_url(host, collection_ids, schema, limit, query):
    """Build an Aleph entity-search URL.

    :param host: Base URL of the Aleph instance (a trailing slash is tolerated).
    :param collection_ids: Iterable of collection IDs to restrict the search to.
    :param schema: FollowTheMoney schema name to filter on (e.g. 'Person').
    :param limit: Maximum number of results to request.
    :param query: Free-text search term; percent-encoded here so names with
        spaces or special characters still produce a valid URL.
    :return: The fully assembled query URL as a string.
    """
    # Strip a trailing slash so a host like "https://aleph.occrp.org/"
    # does not produce a double slash in the path.
    base_url = f"{host.rstrip('/')}/api/2/entities"

    # One percent-encoded "filter:collection_id" parameter per collection.
    collection_filters = "&".join(
        f"filter%3Acollection_id={collection_id}" for collection_id in collection_ids
    )

    return (
        f"{base_url}?{collection_filters}"
        f"&filter%3Aschemata={schema}&limit={limit}&q={quote(str(query))}"
    )
18+
19+
def query_aleph(entity, config):
    """
    Queries the Aleph API to retrieve information based on an entity type and name.

    :param entity: A dict carrying the entity's 'name' and FtM 'schema'
    :param config: Configuration dictionary containing API key, Aleph host, and collection IDs
    :return: Decoded JSON response as a dict, or None when the request fails
    """
    host = config['host']
    api_key = config['api_key']
    collections = config['collections']
    schema = entity['schema']
    name = entity['name']

    headers = {
        "Authorization": api_key,
        "Content-Type": 'application/json',
        'Accept-Encoding': 'gzip, deflate, br',
    }
    url = build_aleph_url(host, collections, schema, 50, name)

    try:
        # Bounded timeout so a stalled Aleph instance cannot hang the pipeline.
        response = requests.get(url, headers=headers, timeout=30)
    except requests.RequestException as e:
        logger.error(f"API call failed: {url} ({e})")
        return None

    if response.status_code == 200:  # Check if the response is successful
        logger.info(f'Aleph API call successful: {entity}')
        return response.json()

    logger.error(f"API call failed: {url}")
    return None
44+
45+
def parse_schema(schema):
    """Map a spaCy entity label to the matching FollowTheMoney schema name.

    Labels without a dedicated mapping fall back to the generic 'LegalEntity'.
    """
    label_to_schema = {
        'ORG': 'Company',
        'PERSON': 'Person',
        'EVENT': 'Event',
    }
    return label_to_schema.get(schema, 'LegalEntity')
54+
55+
def enrich_entities(entity, config):
    """
    Enriches entity information using the Aleph API data.

    :param entity: a single spaCy-extracted entity dict with 'text' and 'category' keys
    :param config: Configuration dictionary containing API key, Aleph host, and collection IDs
    :return: Enriched entity dict holding 'name', 'schema' and any Aleph properties
    """
    schema = parse_schema(entity['category'])
    name = entity['text']
    enriched_entity = {'name': name, 'schema': schema}

    try:
        data = query_aleph(enriched_entity, config)
        if data:
            for result in data['results']:
                for key, values in result['properties'].items():
                    existing = enriched_entity.get(key)
                    if existing is None or existing == values:
                        enriched_entity[key] = values
                    else:
                        # Merge conflicting values into one flat list. The
                        # previous code wrapped the stored value in a list on
                        # every conflict, so a key merged from two results
                        # ended up as nested lists; normalise exactly once.
                        merged = existing if isinstance(existing, list) else [existing]
                        enriched_entity[key] = merged + values
        else:
            logger.warning(f"No data received from Aleph API for entity: {entity['text']}")
    except Exception as e:
        logger.error(f"Error during API query for entity {entity['text']}: {e}")

    return enriched_entity

config.cfg

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
[paths]
2+
examples = "./example_data.jsonl"
3+
db = "./db/timelark.db"
4+
files = "./text"
5+
6+
[aleph]
7+
host = "https://aleph.occrp.org/"
8+
collections = 6102, 4558, 119, 845

dbmanager.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
import dataset
2+
import logging
3+
import uuid
4+
5+
# Set up logging for the module
6+
logger = logging.getLogger(__name__)
7+
8+
9+
def save_to_db(data, db_path):
    """Persist enriched entities into per-schema SQLite tables.

    :param data: iterable of entity dicts; each should carry a 'schema' key,
        which selects the target table (e.g. "Person" -> "Person_entities").
        Entities without a schema are skipped with a warning.
    :param db_path: filesystem path of the SQLite database file.
    """
    try:
        db = dataset.connect(f"sqlite:///{db_path}")

        for entity in data:
            schema = entity.get("schema")
            if schema:
                table_name = f"{schema}_entities"
                primary_key = str(uuid.uuid4())  # Generate a UUID as the primary key

                # NOTE(review): dataset's Table supports len(), so an empty or
                # brand-new table is falsy and falls through to create_table()
                # with "uuid" as primary id — confirm this reliance on Table
                # truthiness is intentional.
                table = db.get_table(table_name) or db.create_table(table_name, primary_id="uuid")
                entity["uuid"] = primary_key  # Add the UUID to the entity dictionary
                # NOTE(review): the UUID is freshly generated each call, so this
                # upsert can never match an existing row — reruns of the same
                # input will accumulate duplicate rows.
                table.upsert(entity, keys=["uuid"])  # Use the UUID as the primary key
                logger.info(f"Inserted '{schema}' into the database")
            else:
                logger.warning(f"Entity has no schema specified, skipping insertion: {entity}")
    except Exception as e:
        # Best-effort persistence: failures are logged, never raised to the caller.
        logger.error(f"Error while saving to the database: {e}")

extractor.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
import spacy
2+
import logging
3+
4+
# Set up logging for the module
5+
logger = logging.getLogger(__name__)
6+
7+
# Load the large English spaCy pipeline once at module import; extract_entities()
# reuses this shared model on every call (en_core_web_md or _sm also work,
# trading accuracy for download size and memory).
nlp = spacy.load("en_core_web_lg")  # or en_core_web_md or sm
9+
10+
def extract_entities(text):
    """
    Extracts entities from the given text and filters for specific categories.

    Args:
        text (str): Input text to extract entities from.

    Returns:
        list: List of extracted entities, each a dict with 'text' and
            'category' (the spaCy label) keys.

    Raises:
        RuntimeError: If the spaCy pipeline fails on the given text.
    """
    try:
        logger.info("Extracting entities from text")
        doc = nlp(text)
        # Only these label types are useful to the downstream Aleph enrichment.
        target_labels = {"LOC", "PERSON", "ORG", "GPE", "DATE", "EVENT"}
        entities = [
            {"text": ent.text, "category": ent.label_}
            for ent in doc.ents
            if ent.label_ in target_labels
        ]
        logger.info(f"Extracted {len(entities)} entities from text")
        return entities
    except Exception as e:
        logger.error(f"Error extracting entities: {e}")
        # Chain the original exception so the root cause is not lost.
        raise RuntimeError(f"Error extracting entities: {e}") from e

file_reader.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
import logging

# Set up logging for the module
logger = logging.getLogger(__name__)

def read_text_from_files(file_path):
    """
    Reads text from a file.

    Args:
        file_path (str): Path to the text file.

    Returns:
        str: Contents of the text file, decoded as UTF-8.

    Raises:
        RuntimeError: If the file cannot be opened, read, or decoded.
    """
    try:
        logger.info("Reading text from file: %s", file_path)
        with open(file_path, "r", encoding="utf-8") as file:
            return file.read()
    except (OSError, UnicodeDecodeError) as e:
        logger.error("Error reading file %s: %s", file_path, e)
        # Chain the original exception so the underlying OS error is preserved.
        raise RuntimeError(f"Error reading file {file_path}: {e}") from e

main.py

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
import os
import logging
from glob import glob

from confection import registry, Config

from file_reader import read_text_from_files
from extractor import extract_entities
from alephutil import enrich_entities
from dbmanager import save_to_db

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)

# Load configuration from disk
config_file = Config().from_disk("./config.cfg")
config = registry.resolve(config_file)

# Access configuration values
db_path = config['paths']['db']
files_path = config['paths']['files']
aleph_host = config['aleph']['host']
# BUG FIX: the original wrote list[...] (a subscription, which builds a
# typing GenericAlias, not a list), so the collection IDs were unusable.
# Calling list(...) actually materialises them.
collections = list(config['aleph']['collections'])

# Read API key from environment variable
api_key = os.environ.get("ALEPH_API_KEY")

# Bundle the Aleph settings for easier handling downstream
aleph_config = {"host": aleph_host, "api_key": api_key, "collections": collections}

# Check for None and log warnings if needed
missing_values = []
if db_path is None:
    missing_values.append("db_path")
if files_path is None:
    missing_values.append("files_path")
if aleph_host is None:
    missing_values.append("aleph_host")
if api_key is None:
    missing_values.append("api_key")

if missing_values:
    logging.warning("The following configuration values are missing or None: %s", ", ".join(missing_values))


if __name__ == "__main__":
    logging.info("Starting the data pipeline")

    # Use glob to get a list of all text files in the specified folder;
    # os.path.join avoids hand-building the path with string concatenation.
    text_files = glob(os.path.join(files_path, "*.txt"))

    logging.info(f"Found {len(text_files)} text files to process")

    for txt in text_files:
        logging.info(f"Processing text file: {txt}")

        # Read text from the file
        text = read_text_from_files(txt)

        # Extract entities from the text
        entities = extract_entities(text)

        enriched_entities = []
        if entities:
            # Enrich entities using Aleph API
            for entity in entities:
                enriched_entity = enrich_entities(entity, aleph_config)
                enriched_entities.append(enriched_entity)

        # Save enriched entities to the database (an empty list is passed
        # through unchanged, matching the original flow).
        save_to_db(enriched_entities, db_path)

        logging.info(f"Processed and saved data from {txt}")

    logging.info("Data pipeline completed")

0 commit comments

Comments
 (0)