S3: Add tests for PyArrow with native S3 filesystem (#7508)

* PyArrow native S3 filesystem

* add sse-s3 tests

* update

* minor

* ENABLE_SSE_S3

* Update test_pyarrow_native_s3.py

* clean up

* refactoring

* Update test_pyarrow_native_s3.py
Chris Lu
2025-11-19 13:49:22 -08:00
committed by GitHub
parent ca84a8a713
commit 8be9e258fc
7 changed files with 1008 additions and 5 deletions


@@ -13,6 +13,7 @@ SECRET_KEY ?= some_secret_key1
VOLUME_MAX_SIZE_MB ?= 50
VOLUME_MAX_COUNT ?= 100
BUCKET_NAME ?= test-parquet-bucket
ENABLE_SSE_S3 ?= false
# Python configuration
PYTHON ?= python3
@@ -29,7 +30,7 @@ GREEN := \033[0;32m
YELLOW := \033[1;33m
NC := \033[0m # No Color
.PHONY: all build-weed check-binary check-python ci-test clean debug-logs debug-status help manual-start manual-stop setup-python start-seaweedfs start-seaweedfs-ci stop-seaweedfs stop-seaweedfs-safe test test-implicit-dir test-implicit-dir-with-server test-quick test-with-server
.PHONY: all build-weed check-binary check-python ci-test clean debug-logs debug-status help manual-start manual-stop setup-python start-seaweedfs start-seaweedfs-ci stop-seaweedfs stop-seaweedfs-safe test test-implicit-dir test-implicit-dir-with-server test-native-s3 test-native-s3-with-server test-native-s3-with-sse test-quick test-sse-s3-compat test-with-server
all: test
@@ -48,6 +49,10 @@ help:
@echo " test-quick - Run quick tests with small files only (sets TEST_QUICK=1)"
@echo " test-implicit-dir - Test implicit directory fix for s3fs compatibility"
@echo " test-implicit-dir-with-server - Test implicit directory fix with server management"
@echo " test-native-s3 - Test PyArrow's native S3 filesystem (assumes server running)"
@echo " test-native-s3-with-server - Test PyArrow's native S3 filesystem with server management"
@echo " test-native-s3-with-sse - Test PyArrow's native S3 with SSE-S3 encryption enabled"
@echo " test-sse-s3-compat - Comprehensive SSE-S3 compatibility test (multipart uploads)"
@echo " setup-python - Setup Python virtual environment and install dependencies"
@echo " check-python - Check if Python and required packages are available"
@echo " start-seaweedfs - Start SeaweedFS server for testing"
@@ -66,6 +71,7 @@ help:
@echo " MASTER_PORT=$(MASTER_PORT)"
@echo " BUCKET_NAME=$(BUCKET_NAME)"
@echo " VOLUME_MAX_SIZE_MB=$(VOLUME_MAX_SIZE_MB)"
@echo " ENABLE_SSE_S3=$(ENABLE_SSE_S3)"
@echo " PYTHON=$(PYTHON)"
check-binary:
@@ -131,7 +137,13 @@ start-seaweedfs-ci: check-binary
# Start filer server with embedded S3
@echo "Starting filer server with embedded S3..."
@printf '{"identities":[{"name":"%s","credentials":[{"accessKey":"%s","secretKey":"%s"}],"actions":["Admin","Read","Write"]}]}' "$(ACCESS_KEY)" "$(ACCESS_KEY)" "$(SECRET_KEY)" > /tmp/seaweedfs-parquet-s3.json
@if [ "$(ENABLE_SSE_S3)" = "true" ]; then \
echo " SSE-S3 encryption: ENABLED"; \
printf '{"identities":[{"name":"%s","credentials":[{"accessKey":"%s","secretKey":"%s"}],"actions":["Admin","Read","Write"]}],"buckets":[{"name":"$(BUCKET_NAME)","encryption":{"sseS3":{"enabled":true}}}]}' "$(ACCESS_KEY)" "$(ACCESS_KEY)" "$(SECRET_KEY)" > /tmp/seaweedfs-parquet-s3.json; \
else \
echo " SSE-S3 encryption: DISABLED"; \
printf '{"identities":[{"name":"%s","credentials":[{"accessKey":"%s","secretKey":"%s"}],"actions":["Admin","Read","Write"]}]}' "$(ACCESS_KEY)" "$(ACCESS_KEY)" "$(SECRET_KEY)" > /tmp/seaweedfs-parquet-s3.json; \
fi
@AWS_ACCESS_KEY_ID=$(ACCESS_KEY) AWS_SECRET_ACCESS_KEY=$(SECRET_KEY) nohup $(SEAWEEDFS_BINARY) filer -port=$(FILER_PORT) -port.grpc=$$(( $(FILER_PORT) + 10000 )) -master=127.0.0.1:$(MASTER_PORT) -dataCenter=defaultDataCenter -ip=127.0.0.1 -s3 -s3.port=$(S3_PORT) -s3.config=/tmp/seaweedfs-parquet-s3.json > /tmp/seaweedfs-parquet-filer.log 2>&1 &
@sleep 5
@@ -274,7 +286,6 @@ test-with-server: build-weed setup-python
BUCKET_NAME=$(BUCKET_NAME) \
$(VENV_DIR)/bin/$(PYTHON) $(PYTHON_TEST_SCRIPT) || exit 1; \
echo "✅ All tests completed successfully"; \
$(MAKE) -C $(TEST_DIR) stop-seaweedfs-safe || true; \
else \
echo "❌ Failed to start SeaweedFS cluster"; \
echo "=== Server startup logs ==="; \
@@ -329,7 +340,6 @@ test-implicit-dir-with-server: build-weed setup-python
BUCKET_NAME=test-implicit-dir \
$(VENV_DIR)/bin/$(PYTHON) test_implicit_directory_fix.py || exit 1; \
echo "✅ All tests completed successfully"; \
$(MAKE) -C $(TEST_DIR) stop-seaweedfs-safe || true; \
else \
echo "❌ Failed to start SeaweedFS cluster"; \
echo "=== Server startup logs ==="; \
@@ -360,6 +370,80 @@ manual-start: start-seaweedfs
manual-stop: stop-seaweedfs clean
# Test PyArrow's native S3 filesystem
test-native-s3: setup-python
@echo "$(YELLOW)Running PyArrow native S3 filesystem tests...$(NC)"
@echo "$(YELLOW)Assuming SeaweedFS is already running on localhost:$(S3_PORT)$(NC)"
@S3_ENDPOINT_URL=http://localhost:$(S3_PORT) \
S3_ACCESS_KEY=$(ACCESS_KEY) \
S3_SECRET_KEY=$(SECRET_KEY) \
BUCKET_NAME=$(BUCKET_NAME) \
$(VENV_DIR)/bin/$(PYTHON) test_pyarrow_native_s3.py
# Test PyArrow's native S3 filesystem with automatic server management
test-native-s3-with-server: build-weed setup-python
@echo "🚀 Starting PyArrow native S3 filesystem tests with automated server management..."
@echo "Starting SeaweedFS cluster..."
@if $(MAKE) start-seaweedfs-ci > weed-test.log 2>&1; then \
echo "✅ SeaweedFS cluster started successfully"; \
echo "Running PyArrow native S3 filesystem tests..."; \
trap '$(MAKE) -C $(TEST_DIR) stop-seaweedfs-safe || true' EXIT; \
S3_ENDPOINT_URL=http://localhost:$(S3_PORT) \
S3_ACCESS_KEY=$(ACCESS_KEY) \
S3_SECRET_KEY=$(SECRET_KEY) \
BUCKET_NAME=$(BUCKET_NAME) \
$(VENV_DIR)/bin/$(PYTHON) test_pyarrow_native_s3.py || exit 1; \
echo "✅ All tests completed successfully"; \
else \
echo "❌ Failed to start SeaweedFS cluster"; \
echo "=== Server startup logs ==="; \
tail -100 weed-test.log 2>/dev/null || echo "No startup log available"; \
exit 1; \
fi
# Test PyArrow's native S3 filesystem compatibility with an SSE-S3-enabled backend
# (For encryption-specific validation, use test-sse-s3-compat)
test-native-s3-with-sse: build-weed setup-python
@echo "🚀 Testing PyArrow native S3 compatibility with SSE-S3 enabled backend..."
@echo "Starting SeaweedFS cluster with SSE-S3 enabled..."
@if $(MAKE) start-seaweedfs-ci ENABLE_SSE_S3=true > weed-test-sse.log 2>&1; then \
echo "✅ SeaweedFS cluster started successfully with SSE-S3"; \
echo "Running PyArrow native S3 filesystem tests with SSE-S3..."; \
trap '$(MAKE) -C $(TEST_DIR) stop-seaweedfs-safe || true' EXIT; \
S3_ENDPOINT_URL=http://localhost:$(S3_PORT) \
S3_ACCESS_KEY=$(ACCESS_KEY) \
S3_SECRET_KEY=$(SECRET_KEY) \
BUCKET_NAME=$(BUCKET_NAME) \
$(VENV_DIR)/bin/$(PYTHON) test_pyarrow_native_s3.py || exit 1; \
echo "✅ All SSE-S3 tests completed successfully"; \
else \
echo "❌ Failed to start SeaweedFS cluster with SSE-S3"; \
echo "=== Server startup logs ==="; \
tail -100 weed-test-sse.log 2>/dev/null || echo "No startup log available"; \
exit 1; \
fi
# Comprehensive SSE-S3 compatibility test
test-sse-s3-compat: build-weed setup-python
@echo "🚀 Starting comprehensive SSE-S3 compatibility tests..."
@echo "Starting SeaweedFS cluster with SSE-S3 enabled..."
@if $(MAKE) start-seaweedfs-ci ENABLE_SSE_S3=true > weed-test-sse-compat.log 2>&1; then \
echo "✅ SeaweedFS cluster started successfully with SSE-S3"; \
echo "Running comprehensive SSE-S3 compatibility tests..."; \
trap '$(MAKE) -C $(TEST_DIR) stop-seaweedfs-safe || true' EXIT; \
S3_ENDPOINT_URL=http://localhost:$(S3_PORT) \
S3_ACCESS_KEY=$(ACCESS_KEY) \
S3_SECRET_KEY=$(SECRET_KEY) \
BUCKET_NAME=$(BUCKET_NAME) \
$(VENV_DIR)/bin/$(PYTHON) test_sse_s3_compatibility.py || exit 1; \
echo "✅ All SSE-S3 compatibility tests completed successfully"; \
else \
echo "❌ Failed to start SeaweedFS cluster with SSE-S3"; \
echo "=== Server startup logs ==="; \
tail -100 weed-test-sse-compat.log 2>/dev/null || echo "No startup log available"; \
exit 1; \
fi
# CI/CD targets
ci-test: test-with-server


@@ -10,6 +10,22 @@ SeaweedFS implements implicit directory detection to improve compatibility with
## Quick Start
### Running the Example Script
```bash
# Start SeaweedFS server
make start-seaweedfs-ci
# Run the example script
python3 example_pyarrow_native.py
# Or with uv (if available)
uv run example_pyarrow_native.py
# Stop the server when done
make stop-seaweedfs-safe
```
### Running Tests
```bash
@@ -25,12 +41,20 @@ make test-quick
# Run implicit directory fix tests
make test-implicit-dir-with-server
# Run PyArrow native S3 filesystem tests
make test-native-s3-with-server
# Run SSE-S3 encryption tests
make test-sse-s3-compat
# Clean up
make clean
```
### Using PyArrow with SeaweedFS
#### Option 1: Using s3fs (recommended for compatibility)
```python
import pyarrow as pa
import pyarrow.parquet as pq
@@ -55,13 +79,55 @@ table = pq.read_table('bucket/dataset', filesystem=fs) # ✅
dataset = pq.ParquetDataset('bucket/dataset', filesystem=fs) # ✅
```
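The hunk above elides the s3fs configuration itself. As a rough sketch (endpoint and credentials are placeholders, not values taken from this commit), a typical setup against a local SeaweedFS S3 endpoint looks like:

```python
import s3fs

# Placeholder endpoint and credentials; adjust to your deployment.
fs = s3fs.S3FileSystem(
    key='your_access_key',
    secret='your_secret_key',
    client_kwargs={'endpoint_url': 'http://localhost:8333'},
)
```

With `fs` configured this way, the `pq.read_table(...)` and `pq.ParquetDataset(...)` calls shown above work unchanged. Note the difference from Option 2 below: s3fs takes a full URL via `client_kwargs`, while PyArrow's native `S3FileSystem` splits it into `scheme` and `endpoint_override`, which is why the test scripts derive both parts from `S3_ENDPOINT_URL`.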
#### Option 2: Using PyArrow's native S3 filesystem (pure PyArrow)
```python
import pyarrow as pa
import pyarrow.parquet as pq
import pyarrow.dataset as pads
import pyarrow.fs as pafs
# Configure PyArrow's native S3 filesystem
s3 = pafs.S3FileSystem(
access_key='your_access_key',
secret_key='your_secret_key',
endpoint_override='localhost:8333',
scheme='http',
allow_bucket_creation=True,
allow_bucket_deletion=True
)
# Write dataset
table = pa.table({'id': [1, 2, 3], 'value': ['a', 'b', 'c']})
pads.write_dataset(table, 'bucket/dataset', filesystem=s3)
# Read dataset (all methods work!)
table = pq.read_table('bucket/dataset', filesystem=s3) # ✅
dataset = pq.ParquetDataset('bucket/dataset', filesystem=s3) # ✅
dataset = pads.dataset('bucket/dataset', filesystem=s3) # ✅
```
## Test Files
### Main Test Suite
- **`s3_parquet_test.py`** - Comprehensive PyArrow test suite
- Tests 2 write methods × 5 read methods × 2 dataset sizes = 20 combinations
- Uses s3fs library for S3 operations
- All tests pass with the implicit directory fix ✅
### PyArrow Native S3 Tests
- **`test_pyarrow_native_s3.py`** - PyArrow's native S3 filesystem tests
- Tests PyArrow's built-in S3FileSystem (pyarrow.fs.S3FileSystem)
- Pure PyArrow solution without s3fs dependency
- Tests 3 read methods × 2 dataset sizes = 6 scenarios
- All tests pass ✅
- **`test_sse_s3_compatibility.py`** - SSE-S3 encryption compatibility tests
- Tests PyArrow native S3 with SSE-S3 server-side encryption
- Tests 5 different file sizes (10 to 500,000 rows)
- Verifies multipart upload encryption works correctly
- All tests pass ✅
### Implicit Directory Tests
- **`test_implicit_directory_fix.py`** - Specific tests for the implicit directory fix
- Tests HEAD request behavior (see the sketch after this list)
@@ -69,6 +135,12 @@ dataset = pq.ParquetDataset('bucket/dataset', filesystem=fs) # ✅
- Tests PyArrow dataset reading
- All 6 tests pass ✅
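Here, an "implicit directory" is a key prefix that has objects beneath it but no physical directory object of its own (see the cleanup note in `test_pyarrow_native_s3.py`). The snippet below is only a rough illustration of that idea using boto3, with placeholder endpoint, credentials, and prefix; it is not the assertion logic of `test_implicit_directory_fix.py`:

```python
import boto3

# Placeholder configuration; adjust to your deployment.
s3 = boto3.client(
    's3',
    endpoint_url='http://localhost:8333',
    aws_access_key_id='some_access_key1',
    aws_secret_access_key='some_secret_key1',
    region_name='us-east-1',
)

# A dataset path written by PyArrow is just a prefix: there is no object named
# 'dataset/' itself, but objects exist underneath it.
listing = s3.list_objects_v2(Bucket='test-parquet-bucket', Prefix='dataset/', MaxKeys=1)
if listing.get('KeyCount', 0) > 0:
    print("'dataset/' behaves as an implicit directory")
```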
### Examples
- **`example_pyarrow_native.py`** - Simple standalone example
- Demonstrates PyArrow's native S3 filesystem usage
- Can be run with `uv run` or regular Python
- Minimal dependencies (pyarrow, boto3)
### Configuration
- **`Makefile`** - Build and test automation
- **`requirements.txt`** - Python dependencies (pyarrow, s3fs, boto3)
@@ -128,6 +200,9 @@ make test # Run full tests (assumes server is already running)
make test-with-server # Run full PyArrow test suite with server (small + large files)
make test-quick # Run quick tests with small files only (assumes server is running)
make test-implicit-dir-with-server # Run implicit directory tests with server
make test-native-s3 # Run PyArrow native S3 tests (assumes server is running)
make test-native-s3-with-server # Run PyArrow native S3 tests with server management
make test-sse-s3-compat # Run comprehensive SSE-S3 encryption compatibility tests
# Server Management
make start-seaweedfs-ci # Start SeaweedFS in background (CI mode)
@@ -146,10 +221,20 @@ The tests are automatically run in GitHub Actions on every push/PR that affects
**Test Matrix**:
- Python versions: 3.9, 3.11, 3.12
- PyArrow integration tests: 20 test combinations
- PyArrow integration tests (s3fs): 20 test combinations
- PyArrow native S3 tests: 6 test scenarios ✅ **NEW**
- SSE-S3 encryption tests: 5 file sizes ✅ **NEW**
- Implicit directory fix tests: 6 test scenarios
- Go unit tests: 17 test cases
**Test Steps** (run for each Python version):
1. Build SeaweedFS
2. Run PyArrow Parquet integration tests (`make test-with-server`)
3. Run implicit directory fix tests (`make test-implicit-dir-with-server`)
4. Run PyArrow native S3 filesystem tests (`make test-native-s3-with-server`) ✅ **NEW**
5. Run SSE-S3 encryption compatibility tests (`make test-sse-s3-compat`) ✅ **NEW**
6. Run Go unit tests for implicit directory handling
**Triggers**:
- Push/PR to master (when `weed/s3api/**` or `weed/filer/**` changes)
- Manual trigger via GitHub UI (workflow_dispatch)


@@ -0,0 +1,134 @@
#!/usr/bin/env python3
# /// script
# dependencies = [
# "pyarrow>=22",
# "boto3>=1.28.0",
# ]
# ///
"""
Simple example of using PyArrow's native S3 filesystem with SeaweedFS.
This is a minimal example demonstrating how to write and read Parquet files
using PyArrow's built-in S3FileSystem without any additional dependencies
like s3fs.
Usage:
# Set environment variables
export S3_ENDPOINT_URL=localhost:8333
export S3_ACCESS_KEY=some_access_key1
export S3_SECRET_KEY=some_secret_key1
export BUCKET_NAME=test-parquet-bucket
# Run the script
python3 example_pyarrow_native.py
# Or run with uv (if available)
uv run example_pyarrow_native.py
"""
import os
import secrets
import pyarrow as pa
import pyarrow.dataset as pads
import pyarrow.fs as pafs
import pyarrow.parquet as pq
from parquet_test_utils import create_sample_table
# Configuration
BUCKET_NAME = os.getenv("BUCKET_NAME", "test-parquet-bucket")
S3_ENDPOINT_URL = os.getenv("S3_ENDPOINT_URL", "localhost:8333")
S3_ACCESS_KEY = os.getenv("S3_ACCESS_KEY", "some_access_key1")
S3_SECRET_KEY = os.getenv("S3_SECRET_KEY", "some_secret_key1")
# Determine scheme from endpoint
if S3_ENDPOINT_URL.startswith("http://"):
scheme = "http"
endpoint = S3_ENDPOINT_URL[7:]
elif S3_ENDPOINT_URL.startswith("https://"):
scheme = "https"
endpoint = S3_ENDPOINT_URL[8:]
else:
scheme = "http" # Default to http for localhost
endpoint = S3_ENDPOINT_URL
print(f"Connecting to S3 endpoint: {scheme}://{endpoint}")
# Initialize PyArrow's NATIVE S3 filesystem
s3 = pafs.S3FileSystem(
access_key=S3_ACCESS_KEY,
secret_key=S3_SECRET_KEY,
endpoint_override=endpoint,
scheme=scheme,
allow_bucket_creation=True,
allow_bucket_deletion=True,
)
print("✓ Connected to S3 endpoint")
# Create bucket if needed (using boto3)
try:
import boto3
from botocore.exceptions import ClientError
s3_client = boto3.client(
's3',
endpoint_url=f"{scheme}://{endpoint}",
aws_access_key_id=S3_ACCESS_KEY,
aws_secret_access_key=S3_SECRET_KEY,
region_name='us-east-1',
)
try:
s3_client.head_bucket(Bucket=BUCKET_NAME)
print(f"✓ Bucket exists: {BUCKET_NAME}")
except ClientError as e:
if e.response['Error']['Code'] == '404':
print(f"Creating bucket: {BUCKET_NAME}")
s3_client.create_bucket(Bucket=BUCKET_NAME)
print(f"✓ Bucket created: {BUCKET_NAME}")
else:
raise
except ImportError:
print("Warning: boto3 not available, assuming bucket exists")
# Generate a unique filename
filename = f"{BUCKET_NAME}/dataset-{secrets.token_hex(8)}/test.parquet"
print(f"\nWriting Parquet dataset to: {filename}")
# Write dataset
table = create_sample_table(200_000)
pads.write_dataset(
table,
filename,
filesystem=s3,
format="parquet",
)
print(f"✓ Wrote {table.num_rows:,} rows")
# Read with pq.read_table
print("\nReading with pq.read_table...")
table_read = pq.read_table(filename, filesystem=s3)
print(f"✓ Read {table_read.num_rows:,} rows")
# Read with pq.ParquetDataset
print("\nReading with pq.ParquetDataset...")
dataset = pq.ParquetDataset(filename, filesystem=s3)
table_dataset = dataset.read()
print(f"✓ Read {table_dataset.num_rows:,} rows")
# Read with pads.dataset
print("\nReading with pads.dataset...")
dataset_pads = pads.dataset(filename, filesystem=s3)
table_pads = dataset_pads.to_table()
print(f"✓ Read {table_pads.num_rows:,} rows")
print("\n✅ All operations completed successfully!")
print(f"\nFile written to: {filename}")
print("You can verify the file using the SeaweedFS S3 API or weed shell")


@@ -0,0 +1,41 @@
"""
Shared utility functions for PyArrow Parquet tests.
This module provides common test utilities used across multiple test scripts
to avoid code duplication and ensure consistency.
"""
import pyarrow as pa
def create_sample_table(num_rows: int = 5) -> pa.Table:
"""Create a sample PyArrow table for testing.
Args:
num_rows: Number of rows to generate (default: 5)
Returns:
PyArrow Table with test data containing:
- id: int64 sequential IDs (0 to num_rows-1)
- name: string user names (user_0, user_1, ...)
- value: float64 values (id * 1.5)
- flag: bool alternating True/False based on even/odd id
Example:
>>> table = create_sample_table(3)
>>> print(table)
pyarrow.Table
id: int64
name: string
value: double
flag: bool
"""
return pa.table(
{
"id": pa.array(range(num_rows), type=pa.int64()),
"name": pa.array([f"user_{i}" for i in range(num_rows)], type=pa.string()),
"value": pa.array([float(i) * 1.5 for i in range(num_rows)], type=pa.float64()),
"flag": pa.array([i % 2 == 0 for i in range(num_rows)], type=pa.bool_()),
}
)


@@ -0,0 +1,383 @@
#!/usr/bin/env python3
"""
Test script for PyArrow's NATIVE S3 filesystem with SeaweedFS.
This test uses PyArrow's built-in S3FileSystem (pyarrow.fs.S3FileSystem)
instead of s3fs, providing a pure PyArrow solution for reading and writing
Parquet files to S3-compatible storage.
Requirements:
- pyarrow>=10.0.0
Environment Variables:
S3_ENDPOINT_URL: S3 endpoint (default: localhost:8333)
S3_ACCESS_KEY: S3 access key (default: some_access_key1)
S3_SECRET_KEY: S3 secret key (default: some_secret_key1)
BUCKET_NAME: S3 bucket name (default: test-parquet-bucket)
TEST_QUICK: Run only small/quick tests (default: 0, set to 1 for quick mode)
Usage:
# Run with default environment variables
python3 test_pyarrow_native_s3.py
# Run with custom environment variables
S3_ENDPOINT_URL=localhost:8333 \
S3_ACCESS_KEY=mykey \
S3_SECRET_KEY=mysecret \
BUCKET_NAME=mybucket \
python3 test_pyarrow_native_s3.py
"""
import os
import secrets
import sys
import logging
from typing import Optional
import pyarrow as pa
import pyarrow.dataset as pads
import pyarrow.fs as pafs
import pyarrow.parquet as pq
try:
import boto3
from botocore.exceptions import ClientError
HAS_BOTO3 = True
except ImportError:
HAS_BOTO3 = False
from parquet_test_utils import create_sample_table
logging.basicConfig(level=logging.INFO, format="%(message)s")
# Configuration from environment variables with defaults
S3_ENDPOINT_URL = os.environ.get("S3_ENDPOINT_URL", "localhost:8333")
S3_ACCESS_KEY = os.environ.get("S3_ACCESS_KEY", "some_access_key1")
S3_SECRET_KEY = os.environ.get("S3_SECRET_KEY", "some_secret_key1")
BUCKET_NAME = os.getenv("BUCKET_NAME", "test-parquet-bucket")
TEST_QUICK = os.getenv("TEST_QUICK", "0") == "1"
# Create randomized test directory
TEST_RUN_ID = secrets.token_hex(8)
TEST_DIR = f"parquet-native-tests/{TEST_RUN_ID}"
# Test file sizes
TEST_SIZES = {
"small": 5,
"large": 200_000, # This will create multiple row groups
}
# Filter to only small tests if quick mode is enabled
if TEST_QUICK:
TEST_SIZES = {"small": TEST_SIZES["small"]}
logging.info("Quick test mode enabled - running only small tests")
def init_s3_filesystem() -> tuple[Optional[pafs.S3FileSystem], str, str]:
"""Initialize PyArrow's native S3 filesystem.
Returns:
tuple: (S3FileSystem instance, scheme, endpoint)
"""
try:
logging.info("Initializing PyArrow S3FileSystem...")
logging.info(f" Endpoint: {S3_ENDPOINT_URL}")
logging.info(f" Bucket: {BUCKET_NAME}")
# Determine scheme from endpoint
if S3_ENDPOINT_URL.startswith("http://"):
scheme = "http"
endpoint = S3_ENDPOINT_URL[7:] # Remove http://
elif S3_ENDPOINT_URL.startswith("https://"):
scheme = "https"
endpoint = S3_ENDPOINT_URL[8:] # Remove https://
else:
# Default to http for localhost
scheme = "http"
endpoint = S3_ENDPOINT_URL
# Enable bucket creation and deletion for testing
s3 = pafs.S3FileSystem(
access_key=S3_ACCESS_KEY,
secret_key=S3_SECRET_KEY,
endpoint_override=endpoint,
scheme=scheme,
allow_bucket_creation=True,
allow_bucket_deletion=True,
)
logging.info("✓ PyArrow S3FileSystem initialized successfully\n")
return s3, scheme, endpoint
except Exception:
logging.exception("✗ Failed to initialize PyArrow S3FileSystem")
return None, "", ""
def ensure_bucket_exists_boto3(scheme: str, endpoint: str) -> bool:
"""Ensure the test bucket exists using boto3."""
if not HAS_BOTO3:
logging.error("boto3 is required for bucket creation")
return False
try:
# Create boto3 client
endpoint_url = f"{scheme}://{endpoint}"
s3_client = boto3.client(
's3',
endpoint_url=endpoint_url,
aws_access_key_id=S3_ACCESS_KEY,
aws_secret_access_key=S3_SECRET_KEY,
region_name='us-east-1',
)
# Check if bucket exists
try:
s3_client.head_bucket(Bucket=BUCKET_NAME)
logging.info(f"✓ Bucket exists: {BUCKET_NAME}")
return True
except ClientError as e:
error_code = e.response['Error']['Code']
if error_code == '404':
# Bucket doesn't exist, create it
logging.info(f"Creating bucket: {BUCKET_NAME}")
s3_client.create_bucket(Bucket=BUCKET_NAME)
logging.info(f"✓ Bucket created: {BUCKET_NAME}")
return True
else:
raise
except Exception:
logging.exception("✗ Failed to create/check bucket")
return False
def ensure_bucket_exists(s3: pafs.S3FileSystem) -> bool:
"""Ensure the test bucket exists using PyArrow's native S3FileSystem."""
try:
# Check if bucket exists by trying to list it
try:
file_info = s3.get_file_info(BUCKET_NAME)
if file_info.type == pafs.FileType.Directory:
logging.info(f"✓ Bucket exists: {BUCKET_NAME}")
return True
except OSError as e:
# OSError typically means bucket not found or network/permission issues
error_msg = str(e).lower()
if "not found" in error_msg or "does not exist" in error_msg or "nosuchbucket" in error_msg:
logging.debug(f"Bucket '{BUCKET_NAME}' not found, will attempt creation: {e}")
else:
# Log other OSErrors (network, auth, etc.) for debugging
logging.debug(f"Error checking bucket '{BUCKET_NAME}', will attempt creation anyway: {type(e).__name__}: {e}")
except Exception as e:
# Catch any other unexpected exceptions and log them
logging.debug(f"Unexpected error checking bucket '{BUCKET_NAME}', will attempt creation: {type(e).__name__}: {e}")
# Try to create the bucket
logging.info(f"Creating bucket: {BUCKET_NAME}")
s3.create_dir(BUCKET_NAME)
logging.info(f"✓ Bucket created: {BUCKET_NAME}")
return True
except Exception:
logging.exception(f"✗ Failed to create/check bucket '{BUCKET_NAME}' with PyArrow")
return False
def test_write_and_read(s3: pafs.S3FileSystem, test_name: str, num_rows: int) -> tuple[bool, str]:
"""Test writing and reading a Parquet dataset using PyArrow's native S3 filesystem."""
try:
table = create_sample_table(num_rows)
# Write using pads.write_dataset
filename = f"{BUCKET_NAME}/{TEST_DIR}/{test_name}/data.parquet"
logging.info(f" Writing {num_rows:,} rows to {filename}...")
pads.write_dataset(
table,
filename,
filesystem=s3,
format="parquet",
)
logging.info(" ✓ Write completed")
# Test Method 1: Read with pq.read_table
logging.info(" Reading with pq.read_table...")
table_read = pq.read_table(filename, filesystem=s3)
if table_read.num_rows != num_rows:
return False, f"pq.read_table: Row count mismatch (expected {num_rows}, got {table_read.num_rows})"
# Check schema first
if not table_read.schema.equals(table.schema):
return False, f"pq.read_table: Schema mismatch (expected {table.schema}, got {table_read.schema})"
# Sort both tables by 'id' column before comparison to handle potential row order differences
table_sorted = table.sort_by([('id', 'ascending')])
table_read_sorted = table_read.sort_by([('id', 'ascending')])
if not table_read_sorted.equals(table_sorted):
# Provide more detailed error information
error_details = []
for col_name in table.column_names:
col_original = table_sorted.column(col_name)
col_read = table_read_sorted.column(col_name)
if not col_original.equals(col_read):
error_details.append(f"column '{col_name}' differs")
return False, f"pq.read_table: Table contents mismatch ({', '.join(error_details)})"
logging.info(f" ✓ pq.read_table: {table_read.num_rows:,} rows")
# Test Method 2: Read with pq.ParquetDataset
logging.info(" Reading with pq.ParquetDataset...")
dataset = pq.ParquetDataset(filename, filesystem=s3)
table_dataset = dataset.read()
if table_dataset.num_rows != num_rows:
return False, f"pq.ParquetDataset: Row count mismatch (expected {num_rows}, got {table_dataset.num_rows})"
# Sort before comparison
table_dataset_sorted = table_dataset.sort_by([('id', 'ascending')])
if not table_dataset_sorted.equals(table_sorted):
error_details = []
for col_name in table.column_names:
col_original = table_sorted.column(col_name)
col_read = table_dataset_sorted.column(col_name)
if not col_original.equals(col_read):
error_details.append(f"column '{col_name}' differs")
return False, f"pq.ParquetDataset: Table contents mismatch ({', '.join(error_details)})"
logging.info(f" ✓ pq.ParquetDataset: {table_dataset.num_rows:,} rows")
# Test Method 3: Read with pads.dataset
logging.info(" Reading with pads.dataset...")
dataset_pads = pads.dataset(filename, filesystem=s3)
table_pads = dataset_pads.to_table()
if table_pads.num_rows != num_rows:
return False, f"pads.dataset: Row count mismatch (expected {num_rows}, got {table_pads.num_rows})"
# Sort before comparison
table_pads_sorted = table_pads.sort_by([('id', 'ascending')])
if not table_pads_sorted.equals(table_sorted):
error_details = []
for col_name in table.column_names:
col_original = table_sorted.column(col_name)
col_read = table_pads_sorted.column(col_name)
if not col_original.equals(col_read):
error_details.append(f"column '{col_name}' differs")
return False, f"pads.dataset: Table contents mismatch ({', '.join(error_details)})"
logging.info(f" ✓ pads.dataset: {table_pads.num_rows:,} rows")
return True, "All read methods passed"
except Exception as exc:
logging.exception(" ✗ Test failed")
return False, f"{type(exc).__name__}: {exc}"
def cleanup_test_files(s3: pafs.S3FileSystem) -> None:
"""Clean up test files from S3.
Note: We cannot use s3.delete_dir() directly because SeaweedFS uses implicit
directories (path prefixes without physical directory objects). PyArrow's
delete_dir() attempts to delete the directory marker itself, which fails with
"INTERNAL_FAILURE" on SeaweedFS. Instead, we list and delete files individually,
letting implicit directories disappear automatically.
"""
try:
test_path = f"{BUCKET_NAME}/{TEST_DIR}"
logging.info(f"Cleaning up test directory: {test_path}")
# List and delete files individually to handle implicit directories
try:
file_selector = pafs.FileSelector(test_path, recursive=True)
files = s3.get_file_info(file_selector)
# Delete files first (not directories)
for file_info in files:
if file_info.type == pafs.FileType.File:
s3.delete_file(file_info.path)
logging.debug(f" Deleted file: {file_info.path}")
logging.info("✓ Test directory cleaned up")
except OSError as e:
# Handle the case where the path doesn't exist or is inaccessible
if "does not exist" in str(e).lower() or "not found" in str(e).lower():
logging.info("✓ Test directory already clean or doesn't exist")
else:
raise
except Exception:
logging.exception("Failed to cleanup test directory")
def main():
"""Run all tests with PyArrow's native S3 filesystem."""
print("=" * 80)
print("PyArrow Native S3 Filesystem Tests for SeaweedFS")
print("Testing Parquet Files with Multiple Row Groups")
if TEST_QUICK:
print("*** QUICK TEST MODE - Small files only ***")
print("=" * 80 + "\n")
print("Configuration:")
print(f" S3 Endpoint: {S3_ENDPOINT_URL}")
print(f" Access Key: {S3_ACCESS_KEY}")
print(f" Bucket: {BUCKET_NAME}")
print(f" Test Directory: {TEST_DIR}")
print(f" Quick Mode: {'Yes (small files only)' if TEST_QUICK else 'No (all file sizes)'}")
print(f" PyArrow Version: {pa.__version__}")
print()
# Initialize S3 filesystem
s3, scheme, endpoint = init_s3_filesystem()
if s3 is None:
print("Cannot proceed without S3 connection")
return 1
# Ensure bucket exists - try PyArrow first, fall back to boto3
bucket_created = ensure_bucket_exists(s3)
if not bucket_created:
logging.info("Trying to create bucket with boto3...")
bucket_created = ensure_bucket_exists_boto3(scheme, endpoint)
if not bucket_created:
print("Cannot proceed without bucket")
return 1
results = []
# Test all file sizes
for size_name, num_rows in TEST_SIZES.items():
print(f"\n{'='*80}")
print(f"Testing with {size_name} files ({num_rows:,} rows)")
print(f"{'='*80}\n")
test_name = f"{size_name}_test"
success, message = test_write_and_read(s3, test_name, num_rows)
results.append((test_name, success, message))
status = "✓ PASS" if success else "✗ FAIL"
print(f"\n{status}: {message}\n")
# Summary
print("\n" + "=" * 80)
print("SUMMARY")
print("=" * 80)
passed = sum(1 for _, success, _ in results if success)
total = len(results)
print(f"\nTotal: {passed}/{total} passed\n")
for test_name, success, message in results:
status = "" if success else ""
print(f" {status} {test_name}: {message}")
print("\n" + "=" * 80)
if passed == total:
print("✓ ALL TESTS PASSED!")
else:
print(f"{total - passed} test(s) failed")
print("=" * 80 + "\n")
# Cleanup
cleanup_test_files(s3)
return 0 if passed == total else 1
if __name__ == "__main__":
sys.exit(main())


@@ -0,0 +1,254 @@
#!/usr/bin/env python3
"""
Test script for SSE-S3 compatibility with PyArrow native S3 filesystem.
This test specifically targets the SSE-S3 multipart upload bug where
SeaweedFS panics with "bad IV length" when reading multipart uploads
that were encrypted with bucket-default SSE-S3.
Requirements:
- pyarrow>=10.0.0
- boto3>=1.28.0
Environment Variables:
S3_ENDPOINT_URL: S3 endpoint (default: localhost:8333)
S3_ACCESS_KEY: S3 access key (default: some_access_key1)
S3_SECRET_KEY: S3 secret key (default: some_secret_key1)
BUCKET_NAME: S3 bucket name (default: test-parquet-bucket)
Usage:
# Start SeaweedFS with SSE-S3 enabled
make start-seaweedfs-ci ENABLE_SSE_S3=true
# Run the test
python3 test_sse_s3_compatibility.py
"""
import os
import secrets
import sys
import logging
from typing import Optional
import pyarrow as pa
import pyarrow.dataset as pads
import pyarrow.fs as pafs
import pyarrow.parquet as pq
try:
import boto3
from botocore.exceptions import ClientError
HAS_BOTO3 = True
except ImportError:
HAS_BOTO3 = False
logging.exception("boto3 is required for this test")
sys.exit(1)
from parquet_test_utils import create_sample_table
logging.basicConfig(level=logging.INFO, format="%(message)s")
# Configuration
S3_ENDPOINT_URL = os.environ.get("S3_ENDPOINT_URL", "localhost:8333")
S3_ACCESS_KEY = os.environ.get("S3_ACCESS_KEY", "some_access_key1")
S3_SECRET_KEY = os.environ.get("S3_SECRET_KEY", "some_secret_key1")
BUCKET_NAME = os.getenv("BUCKET_NAME", "test-parquet-bucket")
TEST_RUN_ID = secrets.token_hex(8)
TEST_DIR = f"sse-s3-tests/{TEST_RUN_ID}"
# Test sizes designed to trigger multipart uploads
# PyArrow typically uses 5MB chunks, so these sizes should trigger multipart
TEST_SIZES = {
"tiny": 10, # Single part
"small": 1_000, # Single part
"medium": 50_000, # Single part (~1.5MB)
"large": 200_000, # Multiple parts (~6MB)
"very_large": 500_000, # Multiple parts (~15MB)
}
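# Context note: S3 multipart parts must be at least 5 MB (except the final part), so
# with PyArrow's ~5 MB upload chunks only the "large" (~6MB) and "very_large" (~15MB)
# datasets above are expected to exercise the multipart upload path.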
def init_s3_filesystem() -> tuple[Optional[pafs.S3FileSystem], str, str]:
"""Initialize PyArrow's native S3 filesystem."""
try:
logging.info("Initializing PyArrow S3FileSystem...")
# Determine scheme from endpoint
if S3_ENDPOINT_URL.startswith("http://"):
scheme = "http"
endpoint = S3_ENDPOINT_URL[7:]
elif S3_ENDPOINT_URL.startswith("https://"):
scheme = "https"
endpoint = S3_ENDPOINT_URL[8:]
else:
scheme = "http"
endpoint = S3_ENDPOINT_URL
s3 = pafs.S3FileSystem(
access_key=S3_ACCESS_KEY,
secret_key=S3_SECRET_KEY,
endpoint_override=endpoint,
scheme=scheme,
allow_bucket_creation=True,
allow_bucket_deletion=True,
)
logging.info("✓ PyArrow S3FileSystem initialized\n")
return s3, scheme, endpoint
except Exception:
logging.exception("✗ Failed to initialize PyArrow S3FileSystem")
return None, "", ""
def ensure_bucket_exists(scheme: str, endpoint: str) -> bool:
"""Ensure the test bucket exists using boto3."""
try:
endpoint_url = f"{scheme}://{endpoint}"
s3_client = boto3.client(
's3',
endpoint_url=endpoint_url,
aws_access_key_id=S3_ACCESS_KEY,
aws_secret_access_key=S3_SECRET_KEY,
region_name='us-east-1',
)
try:
s3_client.head_bucket(Bucket=BUCKET_NAME)
logging.info(f"✓ Bucket exists: {BUCKET_NAME}")
except ClientError as e:
error_code = e.response['Error']['Code']
if error_code == '404':
logging.info(f"Creating bucket: {BUCKET_NAME}")
s3_client.create_bucket(Bucket=BUCKET_NAME)
logging.info(f"✓ Bucket created: {BUCKET_NAME}")
else:
logging.exception("✗ Failed to access bucket")
return False
# Note: SeaweedFS doesn't support GetBucketEncryption API
# so we can't verify if SSE-S3 is enabled via API
# We assume it's configured correctly in the s3.json config file
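# For reference, the Makefile's start-seaweedfs-ci target (ENABLE_SSE_S3=true) writes
# an s3.json whose bucket entry looks roughly like:
#   {"name": "<bucket>", "encryption": {"sseS3": {"enabled": true}}}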
logging.info("✓ Assuming SSE-S3 is configured in s3.json")
return True
except Exception:
logging.exception("✗ Failed to check bucket")
return False
def test_write_read_with_sse(
s3: pafs.S3FileSystem,
test_name: str,
num_rows: int
) -> tuple[bool, str, int]:
"""Test writing and reading with SSE-S3 encryption."""
try:
table = create_sample_table(num_rows)
filename = f"{BUCKET_NAME}/{TEST_DIR}/{test_name}/data.parquet"
logging.info(f" Writing {num_rows:,} rows...")
pads.write_dataset(
table,
filename,
filesystem=s3,
format="parquet",
)
logging.info(" Reading back...")
table_read = pq.read_table(filename, filesystem=s3)
if table_read.num_rows != num_rows:
return False, f"Row count mismatch: {table_read.num_rows} != {num_rows}", 0
return True, "Success", table_read.num_rows
except Exception as e:
error_msg = f"{type(e).__name__}: {e!s}"
logging.exception(" ✗ Failed")
return False, error_msg, 0
def main():
"""Run SSE-S3 compatibility tests."""
print("=" * 80)
print("SSE-S3 Compatibility Tests for PyArrow Native S3")
print("Testing Multipart Upload Encryption")
print("=" * 80 + "\n")
print("Configuration:")
print(f" S3 Endpoint: {S3_ENDPOINT_URL}")
print(f" Bucket: {BUCKET_NAME}")
print(f" Test Directory: {TEST_DIR}")
print(f" PyArrow Version: {pa.__version__}")
print()
# Initialize
s3, scheme, endpoint = init_s3_filesystem()
if s3 is None:
print("Cannot proceed without S3 connection")
return 1
# Check bucket and SSE-S3
if not ensure_bucket_exists(scheme, endpoint):
print("\n⚠ WARNING: Failed to access or create the test bucket!")
print("This test requires a reachable bucket with SSE-S3 enabled.")
print("Please ensure SeaweedFS is running with: make start-seaweedfs-ci ENABLE_SSE_S3=true")
return 1
print()
results = []
# Test all sizes
for size_name, num_rows in TEST_SIZES.items():
print(f"\n{'='*80}")
print(f"Testing {size_name} dataset ({num_rows:,} rows)")
print(f"{'='*80}")
success, message, rows_read = test_write_read_with_sse(
s3, size_name, num_rows
)
results.append((size_name, num_rows, success, message, rows_read))
if success:
print(f" ✓ SUCCESS: Read {rows_read:,} rows")
else:
print(f" ✗ FAILED: {message}")
# Summary
print("\n" + "=" * 80)
print("SUMMARY")
print("=" * 80)
passed = sum(1 for _, _, success, _, _ in results if success)
total = len(results)
print(f"\nTotal: {passed}/{total} tests passed\n")
print(f"{'Size':<15} {'Rows':>10} {'Status':<10} {'Rows Read':>10} {'Message':<40}")
print("-" * 90)
for size_name, num_rows, success, message, rows_read in results:
status = "✓ PASS" if success else "✗ FAIL"
rows_str = f"{rows_read:,}" if success else "N/A"
print(f"{size_name:<15} {num_rows:>10,} {status:<10} {rows_str:>10} {message[:40]}")
print("\n" + "=" * 80)
if passed == total:
print("✓ ALL TESTS PASSED WITH SSE-S3!")
print("\nThis means:")
print(" - SSE-S3 encryption is working correctly")
print(" - PyArrow native S3 filesystem is compatible")
print(" - Multipart uploads are handled properly")
else:
print(f"{total - passed} test(s) failed")
print("\nPossible issues:")
print(" - SSE-S3 multipart upload bug with empty IV")
print(" - Encryption/decryption mismatch")
print(" - File corruption during upload")
print("=" * 80 + "\n")
return 0 if passed == total else 1
if __name__ == "__main__":
sys.exit(main())