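"""Verify that test data has landed in MinIO/S3.

Polls the configured bucket until at least one object with the given prefix
appears, then downloads each .json/.jsonl file, parses it as newline-delimited
JSON, and validates the Kafka metadata envelope and payload of every record.

Configuration comes from environment variables: MINIO_ENDPOINT,
MINIO_ACCESS_KEY, MINIO_SECRET_KEY, S3_BUCKET, S3_PREFIX.
"""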
import boto3
import os
import sys
import time
import json

def main():
    # Connection and target settings, overridable via environment variables
    minio_endpoint = os.getenv("MINIO_ENDPOINT", "minio:9000")
    access_key = os.getenv("MINIO_ACCESS_KEY", "minioadmin")
    secret_key = os.getenv("MINIO_SECRET_KEY", "minioadmin")
    bucket_name = os.getenv("S3_BUCKET", "test-bucket")
    prefix = os.getenv("S3_PREFIX", "test_data")

    print(f"Connecting to MinIO at {minio_endpoint}")

    # Create S3 client for MinIO
    s3_client = boto3.client(
        's3',
        endpoint_url=f'http://{minio_endpoint}',
        aws_access_key_id=access_key,
        aws_secret_access_key=secret_key,
        region_name='us-east-1'
    )
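    # Note: MinIO deployments are usually addressed path-style. If object
    # listings fail because boto3 builds virtual-hosted-style bucket URLs,
    # path-style addressing can be forced via botocore's Config, e.g.
    # boto3.client('s3', ..., config=Config(s3={'addressing_style': 'path'}))
    # with `from botocore.client import Config`. The default client above may
    # already work unchanged.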

    expected_count = 1  # Expecting at least 1 file
    max_attempts = 20
    found_files = []

    print(f"Checking S3 bucket '{bucket_name}' with prefix '{prefix}' for files...")

    # Retry logic with polling: keep listing the bucket until files appear
    for attempt in range(max_attempts):
        found_files = []

        try:
            # List objects in the bucket with the prefix
            # (list_objects_v2 returns at most 1000 keys per call, which is
            # plenty for this test)
            response = s3_client.list_objects_v2(
                Bucket=bucket_name,
                Prefix=prefix
            )

            if 'Contents' in response:
                for obj in response['Contents']:
                    key = obj['Key']
                    # Skip directory markers
                    if not key.endswith('/'):
                        found_files.append(key)
                        print(f"Found file: {key} (size: {obj['Size']} bytes)")

        except Exception as e:
            print(f"Error listing objects: {e}")
        if len(found_files) >= expected_count:
            print(f"\nSuccess: Found {len(found_files)} file(s) in S3")

            # Verify the file(s) contain valid data
            try:
                all_records = []

                # Read all files and collect records
                for file_key in found_files:
                    print(f"\nReading file: {file_key}")

                    obj_response = s3_client.get_object(Bucket=bucket_name, Key=file_key)
                    content = obj_response['Body'].read().decode('utf-8')

                    print(f"File size: {len(content)} bytes")
                    print(f"File content (first 1000 chars):\n{content[:1000]}")
                    print("---")

                    # Files are expected to be newline-delimited JSON: one record per line
                    if file_key.endswith('.json') or file_key.endswith('.jsonl'):
                        lines = content.strip().split('\n')
                        print(f"File contains {len(lines)} line(s)")

                        for idx, line in enumerate(lines):
                            if line.strip():
                                print(f"Parsing line {idx}: {line[:100]}...")
                                try:
                                    data = json.loads(line)
                                    all_records.append(data)
                                    print(f"✓ Successfully parsed record: {data}")
                                except json.JSONDecodeError as je:
                                    print(f"ERROR: Invalid JSON on line {idx}: {je}")
                                    print(f"Line content: {line}")
                                    sys.exit(1)
                    else:
                        print(f"WARNING: Skipping non-JSON file: {file_key}")

                print(f"\nTotal records found: {len(all_records)}")

                # Verify we have the expected number of records
                expected_message_count = 3
                if len(all_records) < expected_message_count:
                    print(f"ERROR: Expected {expected_message_count} records, found {len(all_records)}")
                    sys.exit(1)

                # Verify each record has the expected structure and values.
                # Each record should carry the Kafka metadata envelope
                # (_key, _timestamp, _value), with the original payload under _value.
                expected_metadata_fields = {'_key', '_timestamp', '_value'}
                expected_value_fields = {'id', 'name', 'value', 'timestamp'}
                found_ids = set()

                for i, record in enumerate(all_records):
                    print(f"\nValidating record {i}: {record}")

                    # Check for Kafka metadata fields
                    actual_fields = set(record.keys())
                    if not expected_metadata_fields.issubset(actual_fields):
                        missing = expected_metadata_fields - actual_fields
                        print(f"ERROR: Record {i} missing metadata fields: {missing}")
                        sys.exit(1)

                    # Extract the actual message value
                    message_value = record['_value']
                    if not isinstance(message_value, dict):
                        print(f"ERROR: Record {i} _value is not a dict: {type(message_value)}")
                        sys.exit(1)

                    # Check for required fields in _value
                    actual_value_fields = set(message_value.keys())
                    if not expected_value_fields.issubset(actual_value_fields):
                        missing = expected_value_fields - actual_value_fields
                        print(f"ERROR: Record {i} _value missing fields: {missing}")
                        sys.exit(1)

                    # Verify id is an integer
                    if not isinstance(message_value['id'], int):
                        print(f"ERROR: Record {i} has invalid id type: {type(message_value['id'])}")
                        sys.exit(1)

                    found_ids.add(message_value['id'])

                    # Verify _key matches expected pattern
                    expected_key = f"key_{message_value['id']}"
                    if record['_key'] != expected_key:
                        print(f"ERROR: Record {i} has incorrect _key. Expected '{expected_key}', got '{record['_key']}'")
                        sys.exit(1)

                    # Verify name matches expected pattern
                    expected_name = f"test_item_{message_value['id']}"
                    if message_value['name'] != expected_name:
                        print(f"ERROR: Record {i} has incorrect name. Expected '{expected_name}', got '{message_value['name']}'")
                        sys.exit(1)

                    # Verify value matches expected pattern
                    expected_value = f"test_value_{message_value['id']}"
                    if message_value['value'] != expected_value:
                        print(f"ERROR: Record {i} has incorrect value. Expected '{expected_value}', got '{message_value['value']}'")
                        sys.exit(1)

                    print(f"✓ Record {i} validated: _key={record['_key']}, id={message_value['id']}, name={message_value['name']}, value={message_value['value']}")

                # Verify we got all expected IDs (0, 1, 2)
                expected_ids = set(range(expected_message_count))
                if found_ids != expected_ids:
                    print(f"ERROR: Missing IDs. Expected {expected_ids}, found {found_ids}")
                    sys.exit(1)

                print(f"\n✓ All {len(all_records)} records validated successfully")
                print(f"✓ All expected IDs present: {sorted(found_ids)}")

            except Exception as e:
                print(f"ERROR: Failed to verify file content: {e}")
                import traceback
                traceback.print_exc()
                sys.exit(1)

            # Everything checked out; exit successfully
            sys.exit(0)

        print(f"Attempt {attempt + 1}/{max_attempts}: Found {len(found_files)} file(s), waiting...")
        time.sleep(2)

    print(f"\nFAILED: Only found {len(found_files)} file(s) after {max_attempts} attempts")
    sys.exit(1)

if __name__ == "__main__":
    main()