diff --git a/.gitignore b/.gitignore index 027a56e59..b330bbd96 100644 --- a/.gitignore +++ b/.gitignore @@ -112,3 +112,6 @@ test/s3/retention/weed-server.pid test/s3/retention/weed-test.log /test/s3/versioning/test-volume-data test/s3/versioning/weed-test.log +/docker/admin_integration/data +docker/agent_pub_record +docker/admin_integration/weed-local diff --git a/DESIGN.md b/DESIGN.md new file mode 100644 index 000000000..d164467c3 --- /dev/null +++ b/DESIGN.md @@ -0,0 +1,413 @@ +# SeaweedFS Task Distribution System Design + +## Overview + +This document describes the design of a distributed task management system for SeaweedFS that handles Erasure Coding (EC) and vacuum operations through a scalable admin server and worker process architecture. + +## System Architecture + +### High-Level Components + +``` +┌─────────────────┐ ┌──────────────────┐ ┌─────────────────┐ +│ Master │◄──►│ Admin Server │◄──►│ Workers │ +│ │ │ │ │ │ +│ - Volume Info │ │ - Task Discovery │ │ - Task Exec │ +│ - Shard Status │ │ - Task Assign │ │ - Progress │ +│ - Heartbeats │ │ - Progress Track │ │ - Error Report │ +└─────────────────┘ └──────────────────┘ └─────────────────┘ + │ │ │ + │ │ │ + ▼ ▼ ▼ +┌─────────────────┐ ┌──────────────────┐ ┌─────────────────┐ +│ Volume Servers │ │ Volume Monitor │ │ Task Execution │ +│ │ │ │ │ │ +│ - Store Volumes │ │ - Health Check │ │ - EC Convert │ +│ - EC Shards │ │ - Usage Stats │ │ - Vacuum Clean │ +│ - Report Status │ │ - State Sync │ │ - Status Report │ +└─────────────────┘ └──────────────────┘ └─────────────────┘ +``` + +## 1. Admin Server Design + +### 1.1 Core Responsibilities + +- **Task Discovery**: Scan volumes to identify EC and vacuum candidates +- **Worker Management**: Track available workers and their capabilities +- **Task Assignment**: Match tasks to optimal workers +- **Progress Tracking**: Monitor in-progress tasks for capacity planning +- **State Reconciliation**: Sync with master server for volume state updates + +### 1.2 Task Discovery Engine + +```go +type TaskDiscoveryEngine struct { + masterClient MasterClient + volumeScanner VolumeScanner + taskDetectors map[TaskType]TaskDetector + scanInterval time.Duration +} + +type VolumeCandidate struct { + VolumeID uint32 + Server string + Collection string + TaskType TaskType + Priority TaskPriority + Reason string + DetectedAt time.Time + Parameters map[string]interface{} +} +``` + +**EC Detection Logic**: +- Find volumes >= 95% full and idle for > 1 hour +- Exclude volumes already in EC format +- Exclude volumes with ongoing operations +- Prioritize by collection and age + +**Vacuum Detection Logic**: +- Find volumes with garbage ratio > 30% +- Exclude read-only volumes +- Exclude volumes with recent vacuum operations +- Prioritize by garbage percentage + +### 1.3 Worker Registry & Management + +```go +type WorkerRegistry struct { + workers map[string]*Worker + capabilities map[TaskType][]*Worker + lastHeartbeat map[string]time.Time + taskAssignment map[string]*Task + mutex sync.RWMutex +} + +type Worker struct { + ID string + Address string + Capabilities []TaskType + MaxConcurrent int + CurrentLoad int + Status WorkerStatus + LastSeen time.Time + Performance WorkerMetrics +} +``` + +### 1.4 Task Assignment Algorithm + +```go +type TaskScheduler struct { + registry *WorkerRegistry + taskQueue *PriorityQueue + inProgressTasks map[string]*InProgressTask + volumeReservations map[uint32]*VolumeReservation +} + +// Worker Selection Criteria: +// 1. Has required capability (EC or Vacuum) +// 2. 
Available capacity (CurrentLoad < MaxConcurrent) +// 3. Best performance history for task type +// 4. Lowest current load +// 5. Geographically close to volume server (optional) +``` + +## 2. Worker Process Design + +### 2.1 Worker Architecture + +```go +type MaintenanceWorker struct { + id string + config *WorkerConfig + adminClient AdminClient + taskExecutors map[TaskType]TaskExecutor + currentTasks map[string]*RunningTask + registry *TaskRegistry + heartbeatTicker *time.Ticker + requestTicker *time.Ticker +} +``` + +### 2.2 Task Execution Framework + +```go +type TaskExecutor interface { + Execute(ctx context.Context, task *Task) error + EstimateTime(task *Task) time.Duration + ValidateResources(task *Task) error + GetProgress() float64 + Cancel() error +} + +type ErasureCodingExecutor struct { + volumeClient VolumeServerClient + progress float64 + cancelled bool +} + +type VacuumExecutor struct { + volumeClient VolumeServerClient + progress float64 + cancelled bool +} +``` + +### 2.3 Worker Capabilities & Registration + +```go +type WorkerCapabilities struct { + SupportedTasks []TaskType + MaxConcurrent int + ResourceLimits ResourceLimits + PreferredServers []string // Affinity for specific volume servers +} + +type ResourceLimits struct { + MaxMemoryMB int64 + MaxDiskSpaceMB int64 + MaxNetworkMbps int64 + MaxCPUPercent float64 +} +``` + +## 3. Task Lifecycle Management + +### 3.1 Task States + +```go +type TaskState string + +const ( + TaskStatePending TaskState = "pending" + TaskStateAssigned TaskState = "assigned" + TaskStateInProgress TaskState = "in_progress" + TaskStateCompleted TaskState = "completed" + TaskStateFailed TaskState = "failed" + TaskStateCancelled TaskState = "cancelled" + TaskStateStuck TaskState = "stuck" // Taking too long + TaskStateDuplicate TaskState = "duplicate" // Detected duplicate +) +``` + +### 3.2 Progress Tracking & Monitoring + +```go +type InProgressTask struct { + Task *Task + WorkerID string + StartedAt time.Time + LastUpdate time.Time + Progress float64 + EstimatedEnd time.Time + VolumeReserved bool // Reserved for capacity planning +} + +type TaskMonitor struct { + inProgressTasks map[string]*InProgressTask + timeoutChecker *time.Ticker + stuckDetector *time.Ticker + duplicateChecker *time.Ticker +} +``` + +## 4. Volume Capacity Reconciliation + +### 4.1 Volume State Tracking + +```go +type VolumeStateManager struct { + masterClient MasterClient + inProgressTasks map[uint32]*InProgressTask // VolumeID -> Task + committedChanges map[uint32]*VolumeChange // Changes not yet in master + reconcileInterval time.Duration +} + +type VolumeChange struct { + VolumeID uint32 + ChangeType ChangeType // "ec_encoding", "vacuum_completed" + OldCapacity int64 + NewCapacity int64 + TaskID string + CompletedAt time.Time + ReportedToMaster bool +} +``` + +### 4.2 Shard Assignment Integration + +When the master needs to assign shards, it must consider: +1. **Current volume state** from its own records +2. **In-progress capacity changes** from admin server +3. 
**Committed but unreported changes** from admin server + +```go +type CapacityOracle struct { + adminServer AdminServerClient + masterState *MasterVolumeState + updateFreq time.Duration +} + +func (o *CapacityOracle) GetAdjustedCapacity(volumeID uint32) int64 { + baseCapacity := o.masterState.GetCapacity(volumeID) + + // Adjust for in-progress tasks + if task := o.adminServer.GetInProgressTask(volumeID); task != nil { + switch task.Type { + case TaskTypeErasureCoding: + // EC reduces effective capacity + return baseCapacity / 2 // Simplified + case TaskTypeVacuum: + // Vacuum may increase available space + return baseCapacity + int64(float64(baseCapacity) * 0.3) + } + } + + // Adjust for completed but unreported changes + if change := o.adminServer.GetPendingChange(volumeID); change != nil { + return change.NewCapacity + } + + return baseCapacity +} +``` + +## 5. Error Handling & Recovery + +### 5.1 Worker Failure Scenarios + +```go +type FailureHandler struct { + taskRescheduler *TaskRescheduler + workerMonitor *WorkerMonitor + alertManager *AlertManager +} + +// Failure Scenarios: +// 1. Worker becomes unresponsive (heartbeat timeout) +// 2. Task execution fails (reported by worker) +// 3. Task gets stuck (progress timeout) +// 4. Duplicate task detection +// 5. Resource exhaustion +``` + +### 5.2 Recovery Strategies + +**Worker Timeout Recovery**: +- Mark worker as inactive after 3 missed heartbeats +- Reschedule all assigned tasks to other workers +- Cleanup any partial state + +**Task Stuck Recovery**: +- Detect tasks with no progress for > 2x estimated time +- Cancel stuck task and mark volume for cleanup +- Reschedule if retry count < max_retries + +**Duplicate Task Prevention**: +```go +type DuplicateDetector struct { + activeFingerprints map[string]bool // VolumeID+TaskType + recentCompleted *LRUCache // Recently completed tasks +} + +func (d *DuplicateDetector) IsTaskDuplicate(task *Task) bool { + fingerprint := fmt.Sprintf("%d-%s", task.VolumeID, task.Type) + return d.activeFingerprints[fingerprint] || + d.recentCompleted.Contains(fingerprint) +} +``` + +## 6. Simulation & Testing Framework + +### 6.1 Failure Simulation + +```go +type TaskSimulator struct { + scenarios map[string]SimulationScenario +} + +type SimulationScenario struct { + Name string + WorkerCount int + VolumeCount int + FailurePatterns []FailurePattern + Duration time.Duration +} + +type FailurePattern struct { + Type FailureType // "worker_timeout", "task_stuck", "duplicate" + Probability float64 // 0.0 to 1.0 + Timing TimingSpec // When during task execution + Duration time.Duration +} +``` + +### 6.2 Test Scenarios + +**Scenario 1: Worker Timeout During EC** +- Start EC task on 30GB volume +- Kill worker at 50% progress +- Verify task reassignment +- Verify no duplicate EC operations + +**Scenario 2: Stuck Vacuum Task** +- Start vacuum on high-garbage volume +- Simulate worker hanging at 75% progress +- Verify timeout detection and cleanup +- Verify volume state consistency + +**Scenario 3: Duplicate Task Prevention** +- Submit same EC task from multiple sources +- Verify only one task executes +- Verify proper conflict resolution + +**Scenario 4: Master-Admin State Divergence** +- Create in-progress EC task +- Simulate master restart +- Verify state reconciliation +- Verify shard assignment accounts for in-progress work + +## 7. 
Performance & Scalability + +### 7.1 Metrics & Monitoring + +```go +type SystemMetrics struct { + TasksPerSecond float64 + WorkerUtilization float64 + AverageTaskTime time.Duration + FailureRate float64 + QueueDepth int + VolumeStatesSync bool +} +``` + +### 7.2 Scalability Considerations + +- **Horizontal Worker Scaling**: Add workers without admin server changes +- **Admin Server HA**: Master-slave admin servers for fault tolerance +- **Task Partitioning**: Partition tasks by collection or datacenter +- **Batch Operations**: Group similar tasks for efficiency + +## 8. Implementation Plan + +### Phase 1: Core Infrastructure +1. Admin server basic framework +2. Worker registration and heartbeat +3. Simple task assignment +4. Basic progress tracking + +### Phase 2: Advanced Features +1. Volume state reconciliation +2. Sophisticated worker selection +3. Failure detection and recovery +4. Duplicate prevention + +### Phase 3: Optimization & Monitoring +1. Performance metrics +2. Load balancing algorithms +3. Capacity planning integration +4. Comprehensive monitoring + +This design provides a robust, scalable foundation for distributed task management in SeaweedFS while maintaining consistency with the existing architecture patterns. \ No newline at end of file diff --git a/docker/Makefile b/docker/Makefile index 777357758..c6f6a50ae 100644 --- a/docker/Makefile +++ b/docker/Makefile @@ -8,7 +8,7 @@ cgo ?= 0 binary: export SWCOMMIT=$(shell git rev-parse --short HEAD) export SWLDFLAGS="-X github.com/seaweedfs/seaweedfs/weed/util/version.COMMIT=$(SWCOMMIT)" - cd ../weed && CGO_ENABLED=$(cgo) GOOS=linux go build $(options) -tags "$(tags)" -ldflags "-s -w -extldflags -static $(SWLDFLAGS)" && mv weed ../docker/ + cd ../weed && CGO_ENABLED=$(cgo) GOOS=linux go build $(options) -tags "$(tags)" -ldflags "-s -w -extldflags -static $(SWLDFLAGS)" -o weed_binary && mv weed_binary ../docker/weed cd ../other/mq_client_example/agent_pub_record && CGO_ENABLED=$(cgo) GOOS=linux go build && mv agent_pub_record ../../../docker/ cd ../other/mq_client_example/agent_sub_record && CGO_ENABLED=$(cgo) GOOS=linux go build && mv agent_sub_record ../../../docker/ diff --git a/docker/admin_integration/Dockerfile.local b/docker/admin_integration/Dockerfile.local new file mode 100644 index 000000000..9795b6ea3 --- /dev/null +++ b/docker/admin_integration/Dockerfile.local @@ -0,0 +1,18 @@ +FROM alpine:latest + +# Install required packages +RUN apk add --no-cache \ + ca-certificates \ + fuse \ + curl \ + jq + +# Copy our locally built binary +COPY weed-local /usr/bin/weed +RUN chmod +x /usr/bin/weed + +# Create working directory +WORKDIR /data + +# Default command +ENTRYPOINT ["/usr/bin/weed"] \ No newline at end of file diff --git a/docker/admin_integration/EC-TESTING-README.md b/docker/admin_integration/EC-TESTING-README.md new file mode 100644 index 000000000..57e0a5985 --- /dev/null +++ b/docker/admin_integration/EC-TESTING-README.md @@ -0,0 +1,438 @@ +# SeaweedFS EC Worker Testing Environment + +This Docker Compose setup provides a comprehensive testing environment for SeaweedFS Erasure Coding (EC) workers using **official SeaweedFS commands**. 
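+For reference, the "official commands" in question are the stock `weed` subcommands; the worker invocation below is copied from the docker-compose file later in this PR (a sketch only, flags as defined there):
+
+```bash
+# Worker as launched by docker-compose-ec-test.yml (the image entrypoint is the weed binary)
+weed -v=2 worker -admin=admin:23646 -capabilities=erasure_coding,vacuum -maxConcurrent=2
+```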
+ +## 📂 Directory Structure + +The testing environment is located in `docker/admin_integration/` and includes: + +``` +docker/admin_integration/ +├── Makefile # Main management interface +├── docker-compose-ec-test.yml # Docker compose configuration +├── EC-TESTING-README.md # This documentation +└── run-ec-test.sh # Quick start script +``` + +## 🏗️ Architecture + +The testing environment uses **official SeaweedFS commands** and includes: + +- **1 Master Server** (port 9333) - Coordinates the cluster with 50MB volume size limit +- **6 Volume Servers** (ports 8080-8085) - Distributed across 2 data centers and 3 racks for diversity +- **1 Filer** (port 8888) - Provides file system interface +- **1 Admin Server** (port 23646) - Detects volumes needing EC and manages workers using official `admin` command +- **3 EC Workers** - Execute erasure coding tasks using official `worker` command with task-specific working directories +- **1 Load Generator** - Continuously writes and deletes files using SeaweedFS shell commands +- **1 Monitor** - Tracks cluster health and EC progress using shell scripts + +## ✨ New Features + +### **Task-Specific Working Directories** +Each worker now creates dedicated subdirectories for different task types: +- `/work/erasure_coding/` - For EC encoding tasks +- `/work/vacuum/` - For vacuum cleanup tasks +- `/work/balance/` - For volume balancing tasks + +This provides: +- **Organization**: Each task type gets isolated working space +- **Debugging**: Easy to find files/logs related to specific task types +- **Cleanup**: Can clean up task-specific artifacts easily +- **Concurrent Safety**: Different task types won't interfere with each other's files + +## 🚀 Quick Start + +### Prerequisites + +- Docker and Docker Compose installed +- GNU Make installed +- At least 4GB RAM available for containers +- Ports 8080-8085, 8888, 9333, 23646 available + +### Start the Environment + +```bash +# Navigate to the admin integration directory +cd docker/admin_integration/ + +# Show available commands +make help + +# Start the complete testing environment +make start +``` + +The `make start` command will: +1. Start all services using official SeaweedFS images +2. Configure workers with task-specific working directories +3. Wait for services to be ready +4. 
Display monitoring URLs and run health checks + +### Alternative Commands + +```bash +# Quick start aliases +make up # Same as 'make start' + +# Development mode (higher load for faster testing) +make dev-start + +# Build images without starting +make build +``` + +## 📋 Available Make Targets + +Run `make help` to see all available targets: + +### **🚀 Main Operations** +- `make start` - Start the complete EC testing environment +- `make stop` - Stop all services +- `make restart` - Restart all services +- `make clean` - Complete cleanup (containers, volumes, images) + +### **📊 Monitoring & Status** +- `make health` - Check health of all services +- `make status` - Show status of all containers +- `make urls` - Display all monitoring URLs +- `make monitor` - Open monitor dashboard in browser +- `make monitor-status` - Show monitor status via API +- `make volume-status` - Show volume status from master +- `make admin-status` - Show admin server status +- `make cluster-status` - Show complete cluster status + +### **📋 Logs Management** +- `make logs` - Show logs from all services +- `make logs-admin` - Show admin server logs +- `make logs-workers` - Show all worker logs +- `make logs-worker1/2/3` - Show specific worker logs +- `make logs-load` - Show load generator logs +- `make logs-monitor` - Show monitor logs +- `make backup-logs` - Backup all logs to files + +### **⚖️ Scaling & Testing** +- `make scale-workers WORKERS=5` - Scale workers to 5 instances +- `make scale-load RATE=25` - Increase load generation rate +- `make test-ec` - Run focused EC test scenario + +### **🔧 Development & Debug** +- `make shell-admin` - Open shell in admin container +- `make shell-worker1` - Open shell in worker container +- `make debug` - Show debug information +- `make troubleshoot` - Run troubleshooting checks + +## 📊 Monitoring URLs + +| Service | URL | Description | +|---------|-----|-------------| +| Master UI | http://localhost:9333 | Cluster status and topology | +| Filer | http://localhost:8888 | File operations | +| Admin Server | http://localhost:23646/ | Task management | +| Monitor | http://localhost:9999/status | Complete cluster monitoring | +| Volume Servers | http://localhost:8080-8085/status | Individual volume server stats | + +Quick access: `make urls` or `make monitor` + +## 🔄 How EC Testing Works + +### 1. Continuous Load Generation +- **Write Rate**: 10 files/second (1-5MB each) +- **Delete Rate**: 2 files/second +- **Target**: Fill volumes to 50MB limit quickly + +### 2. Volume Detection +- Admin server scans master every 30 seconds +- Identifies volumes >40MB (80% of 50MB limit) +- Queues EC tasks for eligible volumes + +### 3. EC Worker Assignment +- **Worker 1**: EC specialist (max 2 concurrent tasks) +- **Worker 2**: EC + Vacuum hybrid (max 2 concurrent tasks) +- **Worker 3**: EC + Vacuum hybrid (max 1 concurrent task) + +### 4. Comprehensive EC Process +Each EC task follows 6 phases: +1. **Copy Volume Data** (5-15%) - Stream .dat/.idx files locally +2. **Mark Read-Only** (20-25%) - Ensure data consistency +3. **Local Encoding** (30-60%) - Create 14 shards (10+4 Reed-Solomon) +4. **Calculate Placement** (65-70%) - Smart rack-aware distribution +5. **Distribute Shards** (75-90%) - Upload to optimal servers +6. **Verify & Cleanup** (95-100%) - Validate and clean temporary files + +### 5. 
Real-Time Monitoring +- Volume analysis and EC candidate detection +- Worker health and task progress +- No data loss verification +- Performance metrics + +## 📋 Key Features Tested + +### ✅ EC Implementation Features +- [x] Local volume data copying with progress tracking +- [x] Local Reed-Solomon encoding (10+4 shards) +- [x] Intelligent shard placement with rack awareness +- [x] Load balancing across available servers +- [x] Backup server selection for redundancy +- [x] Detailed step-by-step progress tracking +- [x] Comprehensive error handling and recovery + +### ✅ Infrastructure Features +- [x] Multi-datacenter topology (dc1, dc2) +- [x] Rack diversity (rack1, rack2, rack3) +- [x] Volume size limits (50MB) +- [x] Worker capability matching +- [x] Health monitoring and alerting +- [x] Continuous workload simulation + +## 🛠️ Common Usage Patterns + +### Basic Testing Workflow +```bash +# Start environment +make start + +# Watch progress +make monitor-status + +# Check for EC candidates +make volume-status + +# View worker activity +make logs-workers + +# Stop when done +make stop +``` + +### High-Load Testing +```bash +# Start with higher load +make dev-start + +# Scale up workers and load +make scale-workers WORKERS=5 +make scale-load RATE=50 + +# Monitor intensive EC activity +make logs-admin +``` + +### Debugging Issues +```bash +# Check port conflicts and system state +make troubleshoot + +# View specific service logs +make logs-admin +make logs-worker1 + +# Get shell access for debugging +make shell-admin +make shell-worker1 + +# Check detailed status +make debug +``` + +### Development Iteration +```bash +# Quick restart after code changes +make restart + +# Rebuild and restart +make clean +make start + +# Monitor specific components +make logs-monitor +``` + +## 📈 Expected Results + +### Successful EC Testing Shows: +1. **Volume Growth**: Steady increase in volume sizes toward 50MB limit +2. **EC Detection**: Admin server identifies volumes >40MB for EC +3. **Task Assignment**: Workers receive and execute EC tasks +4. **Shard Distribution**: 14 shards distributed across 6 volume servers +5. **No Data Loss**: All files remain accessible during and after EC +6. 
**Performance**: EC tasks complete within estimated timeframes + +### Sample Monitor Output: +```bash +# Check current status +make monitor-status + +# Output example: +{ + "monitor": { + "uptime": "15m30s", + "master_addr": "master:9333", + "admin_addr": "admin:9900" + }, + "stats": { + "VolumeCount": 12, + "ECTasksDetected": 3, + "WorkersActive": 3 + } +} +``` + +## 🔧 Configuration + +### Environment Variables + +You can customize the environment by setting variables: + +```bash +# High load testing +WRITE_RATE=25 DELETE_RATE=5 make start + +# Extended test duration +TEST_DURATION=7200 make start # 2 hours +``` + +### Scaling Examples + +```bash +# Scale workers +make scale-workers WORKERS=6 + +# Increase load generation +make scale-load RATE=30 + +# Combined scaling +make scale-workers WORKERS=4 +make scale-load RATE=40 +``` + +## 🧹 Cleanup Options + +```bash +# Stop services only +make stop + +# Remove containers but keep volumes +make down + +# Remove data volumes only +make clean-volumes + +# Remove built images only +make clean-images + +# Complete cleanup (everything) +make clean +``` + +## 🐛 Troubleshooting + +### Quick Diagnostics +```bash +# Run complete troubleshooting +make troubleshoot + +# Check specific components +make health +make debug +make status +``` + +### Common Issues + +**Services not starting:** +```bash +# Check port availability +make troubleshoot + +# View startup logs +make logs-master +make logs-admin +``` + +**No EC tasks being created:** +```bash +# Check volume status +make volume-status + +# Increase load to fill volumes faster +make scale-load RATE=30 + +# Check admin detection +make logs-admin +``` + +**Workers not responding:** +```bash +# Check worker registration +make admin-status + +# View worker logs +make logs-workers + +# Restart workers +make restart +``` + +### Performance Tuning + +**For faster testing:** +```bash +make dev-start # Higher default load +make scale-load RATE=50 # Very high load +``` + +**For stress testing:** +```bash +make scale-workers WORKERS=8 +make scale-load RATE=100 +``` + +## 📚 Technical Details + +### Network Architecture +- Custom bridge network (172.20.0.0/16) +- Service discovery via container names +- Health checks for all services + +### Storage Layout +- Each volume server: max 100 volumes +- Data centers: dc1, dc2 +- Racks: rack1, rack2, rack3 +- Volume limit: 50MB per volume + +### EC Algorithm +- Reed-Solomon RS(10,4) +- 10 data shards + 4 parity shards +- Rack-aware distribution +- Backup server redundancy + +### Make Integration +- Color-coded output for better readability +- Comprehensive help system (`make help`) +- Parallel execution support +- Error handling and cleanup +- Cross-platform compatibility + +## 🎯 Quick Reference + +```bash +# Essential commands +make help # Show all available targets +make start # Start complete environment +make health # Check all services +make monitor # Open dashboard +make logs-admin # View admin activity +make clean # Complete cleanup + +# Monitoring +make volume-status # Check for EC candidates +make admin-status # Check task queue +make monitor-status # Full cluster status + +# Scaling & Testing +make test-ec # Run focused EC test +make scale-load RATE=X # Increase load +make troubleshoot # Diagnose issues +``` + +This environment provides a realistic testing scenario for SeaweedFS EC workers with actual data operations, comprehensive monitoring, and easy management through Make targets. 
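+For quick spot checks outside the Make targets, the master's `/vol/status` endpoint can also be queried directly. The sketch below assumes the same JSON layout that `check_volumes.sh` in this PR navigates (volumes nested under `Volumes.DataCenters`) and flags volumes above the ~40MB threshold (80% of the 50MB limit) that the admin server treats as EC candidates; adjust the jq path if your SeaweedFS version reports a different shape:
+
+```bash
+# List volumes large enough to become EC candidates (assumed threshold: 40MB)
+curl -s http://localhost:9333/vol/status | jq -r '
+  (.Volumes.DataCenters // {})
+  | to_entries[].value | to_entries[].value | to_entries[].value
+  | if . then .[] else empty end
+  | select(.Size > 40 * 1024 * 1024)
+  | "Volume \(.Id): \(.Size / 1048576 | floor) MB, EC candidate"'
+```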
\ No newline at end of file diff --git a/docker/admin_integration/Makefile b/docker/admin_integration/Makefile new file mode 100644 index 000000000..68fb0cec6 --- /dev/null +++ b/docker/admin_integration/Makefile @@ -0,0 +1,346 @@ +# SeaweedFS Admin Integration Test Makefile +# Tests the admin server and worker functionality using official weed commands + +.PHONY: help build build-and-restart restart-workers start stop restart logs clean status test admin-ui worker-logs master-logs admin-logs vacuum-test vacuum-demo vacuum-status vacuum-data vacuum-data-high vacuum-data-low vacuum-continuous vacuum-clean vacuum-help +.DEFAULT_GOAL := help + +COMPOSE_FILE := docker-compose-ec-test.yml +PROJECT_NAME := admin_integration + +build: ## Build SeaweedFS with latest changes and create Docker image + @echo "🔨 Building SeaweedFS with latest changes..." + @echo "1️⃣ Generating admin templates..." + @cd ../../ && make admin-generate + @echo "2️⃣ Building Docker image with latest changes..." + @cd ../ && make build + @echo "3️⃣ Copying binary for local docker-compose..." + @cp ../weed ./weed-local + @echo "✅ Build complete! Updated image: chrislusf/seaweedfs:local" + @echo "💡 Run 'make restart' to apply changes to running services" + +build-and-restart: build ## Build with latest changes and restart services + @echo "🔄 Recreating services with new image..." + @echo "1️⃣ Recreating admin server with new image..." + @docker-compose -f $(COMPOSE_FILE) up -d admin + @sleep 5 + @echo "2️⃣ Recreating workers to reconnect..." + @docker-compose -f $(COMPOSE_FILE) up -d worker1 worker2 worker3 + @echo "✅ All services recreated with latest changes!" + @echo "🌐 Admin UI: http://localhost:23646/" + @echo "💡 Workers will reconnect to the new admin server" + +restart-workers: ## Restart all workers to reconnect to admin server + @echo "🔄 Restarting workers to reconnect to admin server..." + @docker-compose -f $(COMPOSE_FILE) restart worker1 worker2 worker3 + @echo "✅ Workers restarted and will reconnect to admin server" + +help: ## Show this help message + @echo "SeaweedFS Admin Integration Test" + @echo "================================" + @echo "Tests admin server task distribution to workers using official weed commands" + @echo "" + @echo "🏗️ Cluster Management:" + @grep -E '^(start|stop|restart|clean|status|build):.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf " %-18s %s\n", $$1, $$2}' + @echo "" + @echo "🧪 Testing:" + @grep -E '^(test|demo|validate|quick-test):.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf " %-18s %s\n", $$1, $$2}' + @echo "" + @echo "🗑️ Vacuum Testing:" + @grep -E '^vacuum-.*:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf " %-18s %s\n", $$1, $$2}' + @echo "" + @echo "📜 Monitoring:" + @grep -E '^(logs|admin-logs|worker-logs|master-logs|admin-ui):.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf " %-18s %s\n", $$1, $$2}' + @echo "" + @echo "🚀 Quick Start:" + @echo " make start # Start cluster" + @echo " make vacuum-test # Test vacuum tasks" + @echo " make vacuum-help # Vacuum testing guide" + @echo "" + @echo "💡 For detailed vacuum testing: make vacuum-help" + +start: ## Start the complete SeaweedFS cluster with admin and workers + @echo "🚀 Starting SeaweedFS cluster with admin and workers..." + @docker-compose -f $(COMPOSE_FILE) up -d + @echo "✅ Cluster started!" 
+ @echo "" + @echo "📊 Access points:" + @echo " • Admin UI: http://localhost:23646/" + @echo " • Master UI: http://localhost:9333/" + @echo " • Filer: http://localhost:8888/" + @echo "" + @echo "📈 Services starting up..." + @echo " • Master server: ✓" + @echo " • Volume servers: Starting (6 servers)..." + @echo " • Filer: Starting..." + @echo " • Admin server: Starting..." + @echo " • Workers: Starting (3 workers)..." + @echo "" + @echo "⏳ Use 'make status' to check startup progress" + @echo "💡 Use 'make logs' to watch the startup process" + +start-staged: ## Start services in proper order with delays + @echo "🚀 Starting SeaweedFS cluster in stages..." + @echo "" + @echo "Stage 1: Starting Master server..." + @docker-compose -f $(COMPOSE_FILE) up -d master + @sleep 10 + @echo "" + @echo "Stage 2: Starting Volume servers..." + @docker-compose -f $(COMPOSE_FILE) up -d volume1 volume2 volume3 volume4 volume5 volume6 + @sleep 15 + @echo "" + @echo "Stage 3: Starting Filer..." + @docker-compose -f $(COMPOSE_FILE) up -d filer + @sleep 10 + @echo "" + @echo "Stage 4: Starting Admin server..." + @docker-compose -f $(COMPOSE_FILE) up -d admin + @sleep 15 + @echo "" + @echo "Stage 5: Starting Workers..." + @docker-compose -f $(COMPOSE_FILE) up -d worker1 worker2 worker3 + @sleep 10 + @echo "" + @echo "Stage 6: Starting Load generator and Monitor..." + @docker-compose -f $(COMPOSE_FILE) up -d load_generator monitor + @echo "" + @echo "✅ All services started!" + @echo "" + @echo "📊 Access points:" + @echo " • Admin UI: http://localhost:23646/" + @echo " • Master UI: http://localhost:9333/" + @echo " • Filer: http://localhost:8888/" + @echo "" + @echo "⏳ Services are initializing... Use 'make status' to check progress" + +stop: ## Stop all services + @echo "🛑 Stopping SeaweedFS cluster..." + @docker-compose -f $(COMPOSE_FILE) down + @echo "✅ Cluster stopped" + +restart: stop start ## Restart the entire cluster + +clean: ## Stop and remove all containers, networks, and volumes + @echo "🧹 Cleaning up SeaweedFS test environment..." + @docker-compose -f $(COMPOSE_FILE) down -v --remove-orphans + @docker system prune -f + @rm -rf data/ + @echo "✅ Environment cleaned" + +status: ## Check the status of all services + @echo "📊 SeaweedFS Cluster Status" + @echo "==========================" + @docker-compose -f $(COMPOSE_FILE) ps + @echo "" + @echo "📋 Service Health:" + @echo "Master:" + @curl -s http://localhost:9333/cluster/status | jq '.IsLeader' 2>/dev/null || echo " ❌ Master not ready" + @echo "Admin:" + @curl -s http://localhost:23646/ | grep -q "Admin" && echo " ✅ Admin ready" || echo " ❌ Admin not ready" + +logs: ## Show logs from all services + @echo "📜 Following logs from all services..." + @echo "💡 Press Ctrl+C to stop following logs" + @docker-compose -f $(COMPOSE_FILE) logs -f + +admin-logs: ## Show logs from admin server only + @echo "📜 Admin server logs:" + @docker-compose -f $(COMPOSE_FILE) logs -f admin + +worker-logs: ## Show logs from all workers + @echo "📜 Worker logs:" + @docker-compose -f $(COMPOSE_FILE) logs -f worker1 worker2 worker3 + +master-logs: ## Show logs from master server + @echo "📜 Master server logs:" + @docker-compose -f $(COMPOSE_FILE) logs -f master + +admin-ui: ## Open admin UI in browser (macOS) + @echo "🌐 Opening admin UI in browser..." 
+ @open http://localhost:23646/ || echo "💡 Manually open: http://localhost:23646/" + +test: ## Run integration test to verify task assignment and completion + @echo "🧪 Running Admin-Worker Integration Test" + @echo "========================================" + @echo "" + @echo "1️⃣ Checking cluster health..." + @sleep 5 + @curl -s http://localhost:9333/cluster/status | jq '.IsLeader' > /dev/null && echo "✅ Master healthy" || echo "❌ Master not ready" + @curl -s http://localhost:23646/ | grep -q "Admin" && echo "✅ Admin healthy" || echo "❌ Admin not ready" + @echo "" + @echo "2️⃣ Checking worker registration..." + @sleep 10 + @echo "💡 Check admin UI for connected workers: http://localhost:23646/" + @echo "" + @echo "3️⃣ Generating load to trigger EC tasks..." + @echo "📝 Creating test files to fill volumes..." + @echo "Creating large files with random data to trigger EC (targeting ~60MB total to exceed 50MB limit)..." + @for i in {1..12}; do \ + echo "Creating 5MB random file $$i..."; \ + docker run --rm --network admin_integration_seaweed_net -v /tmp:/tmp --entrypoint sh chrislusf/seaweedfs:local -c "dd if=/dev/urandom of=/tmp/largefile$$i.dat bs=1M count=5 2>/dev/null && weed upload -master=master:9333 /tmp/largefile$$i.dat && rm /tmp/largefile$$i.dat"; \ + sleep 3; \ + done + @echo "" + @echo "4️⃣ Waiting for volumes to process large files and reach 50MB limit..." + @echo "This may take a few minutes as we're uploading 60MB of data..." + @sleep 60 + @echo "" + @echo "5️⃣ Checking for EC task creation and assignment..." + @echo "💡 Monitor the admin UI to see:" + @echo " • Tasks being created for volumes needing EC" + @echo " • Workers picking up tasks" + @echo " • Task progress (pending → running → completed)" + @echo " • EC shards being distributed" + @echo "" + @echo "✅ Integration test setup complete!" + @echo "📊 Monitor progress at: http://localhost:23646/" + +quick-test: ## Quick verification that core services are running + @echo "⚡ Quick Health Check" + @echo "====================" + @echo "Master: $$(curl -s http://localhost:9333/cluster/status | jq -r '.IsLeader // "not ready"')" + @echo "Admin: $$(curl -s http://localhost:23646/ | grep -q "Admin" && echo "ready" || echo "not ready")" + @echo "Workers: $$(docker-compose -f $(COMPOSE_FILE) ps worker1 worker2 worker3 | grep -c Up) running" + +validate: ## Validate integration test configuration + @echo "🔍 Validating Integration Test Configuration" + @echo "===========================================" + @chmod +x test-integration.sh + @./test-integration.sh + +demo: start ## Start cluster and run demonstration + @echo "🎭 SeaweedFS Admin-Worker Demo" + @echo "=============================" + @echo "" + @echo "⏳ Waiting for services to start..." + @sleep 45 + @echo "" + @echo "🎯 Demo Overview:" + @echo " • 1 Master server (coordinates cluster)" + @echo " • 6 Volume servers (50MB volume limit)" + @echo " • 1 Admin server (task management)" + @echo " • 3 Workers (execute EC tasks)" + @echo " • Load generator (creates files continuously)" + @echo "" + @echo "📊 Watch the process:" + @echo " 1. Visit: http://localhost:23646/" + @echo " 2. Observe workers connecting" + @echo " 3. Watch tasks being created and assigned" + @echo " 4. 
See tasks progress from pending → completed" + @echo "" + @echo "🔄 The demo will:" + @echo " • Fill volumes to 50MB limit" + @echo " • Admin detects volumes needing EC" + @echo " • Workers receive and execute EC tasks" + @echo " • Tasks complete with shard distribution" + @echo "" + @echo "💡 Use 'make worker-logs' to see worker activity" + @echo "💡 Use 'make admin-logs' to see admin task management" + +# Vacuum Testing Targets +vacuum-test: ## Create test data with garbage and verify vacuum detection + @echo "🧪 SeaweedFS Vacuum Task Testing" + @echo "================================" + @echo "" + @echo "1️⃣ Checking cluster health..." + @curl -s http://localhost:9333/cluster/status | jq '.IsLeader' > /dev/null && echo "✅ Master ready" || (echo "❌ Master not ready. Run 'make start' first." && exit 1) + @curl -s http://localhost:23646/ | grep -q "Admin" && echo "✅ Admin ready" || (echo "❌ Admin not ready. Run 'make start' first." && exit 1) + @echo "" + @echo "2️⃣ Creating test data with garbage..." + @docker-compose -f $(COMPOSE_FILE) exec vacuum-tester go run create_vacuum_test_data.go -files=25 -delete=0.5 -size=200 + @echo "" + @echo "3️⃣ Configuration Instructions:" + @echo " Visit: http://localhost:23646/maintenance/config/vacuum" + @echo " Set for testing:" + @echo " • Enable Vacuum Tasks: ✅ Checked" + @echo " • Garbage Threshold: 0.20 (20%)" + @echo " • Scan Interval: [30] [Seconds]" + @echo " • Min Volume Age: [0] [Minutes]" + @echo " • Max Concurrent: 2" + @echo "" + @echo "4️⃣ Monitor vacuum tasks at: http://localhost:23646/maintenance" + @echo "" + @echo "💡 Use 'make vacuum-status' to check volume garbage ratios" + +vacuum-demo: ## Run automated vacuum testing demonstration + @echo "🎭 Vacuum Task Demo" + @echo "==================" + @echo "" + @echo "⚠️ This demo requires user interaction for configuration" + @echo "💡 Make sure cluster is running with 'make start'" + @echo "" + @docker-compose -f $(COMPOSE_FILE) exec vacuum-tester sh -c "chmod +x demo_vacuum_testing.sh && ./demo_vacuum_testing.sh" + +vacuum-status: ## Check current volume status and garbage ratios + @echo "📊 Current Volume Status" + @echo "=======================" + @docker-compose -f $(COMPOSE_FILE) exec vacuum-tester sh -c "chmod +x check_volumes.sh && ./check_volumes.sh" + +vacuum-data: ## Create test data with configurable parameters + @echo "📁 Creating vacuum test data..." + @echo "Usage: make vacuum-data [FILES=20] [DELETE=0.4] [SIZE=100]" + @echo "" + @docker-compose -f $(COMPOSE_FILE) exec vacuum-tester go run create_vacuum_test_data.go \ + -files=$${FILES:-20} \ + -delete=$${DELETE:-0.4} \ + -size=$${SIZE:-100} + +vacuum-data-high: ## Create high garbage ratio test data (should trigger vacuum) + @echo "📁 Creating high garbage test data (70% garbage)..." + @docker-compose -f $(COMPOSE_FILE) exec vacuum-tester go run create_vacuum_test_data.go -files=30 -delete=0.7 -size=150 + +vacuum-data-low: ## Create low garbage ratio test data (should NOT trigger vacuum) + @echo "📁 Creating low garbage test data (15% garbage)..." + @docker-compose -f $(COMPOSE_FILE) exec vacuum-tester go run create_vacuum_test_data.go -files=30 -delete=0.15 -size=150 + +vacuum-continuous: ## Generate garbage continuously for testing + @echo "🔄 Generating continuous garbage for vacuum testing..." + @echo "Creating 5 rounds of test data with 30-second intervals..." 
+ @for i in {1..5}; do \ + echo "Round $$i: Creating garbage..."; \ + docker-compose -f $(COMPOSE_FILE) exec vacuum-tester go run create_vacuum_test_data.go -files=10 -delete=0.6 -size=100; \ + echo "Waiting 30 seconds..."; \ + sleep 30; \ + done + @echo "✅ Continuous test complete. Check vacuum task activity!" + +vacuum-clean: ## Clean up vacuum test data (removes all volumes!) + @echo "🧹 Cleaning up vacuum test data..." + @echo "⚠️ WARNING: This will delete ALL volumes!" + @read -p "Are you sure? (y/N): " confirm && [ "$$confirm" = "y" ] || exit 1 + @echo "Stopping cluster..." + @docker-compose -f $(COMPOSE_FILE) down + @echo "Removing volume data..." + @rm -rf data/volume*/ + @echo "Restarting cluster..." + @docker-compose -f $(COMPOSE_FILE) up -d + @echo "✅ Clean up complete. Fresh volumes ready for testing." + +vacuum-help: ## Show vacuum testing help and examples + @echo "🧪 Vacuum Testing Commands (Docker-based)" + @echo "==========================================" + @echo "" + @echo "Quick Start:" + @echo " make start # Start SeaweedFS cluster with vacuum-tester" + @echo " make vacuum-test # Create test data and instructions" + @echo " make vacuum-status # Check volume status" + @echo "" + @echo "Data Generation:" + @echo " make vacuum-data-high # High garbage (should trigger)" + @echo " make vacuum-data-low # Low garbage (should NOT trigger)" + @echo " make vacuum-continuous # Continuous garbage generation" + @echo "" + @echo "Monitoring:" + @echo " make vacuum-status # Quick volume status check" + @echo " make vacuum-demo # Full guided demonstration" + @echo "" + @echo "Configuration:" + @echo " Visit: http://localhost:23646/maintenance/config/vacuum" + @echo " Monitor: http://localhost:23646/maintenance" + @echo "" + @echo "Custom Parameters:" + @echo " make vacuum-data FILES=50 DELETE=0.8 SIZE=200" + @echo "" + @echo "💡 All commands now run inside Docker containers" + @echo "Documentation:" + @echo " See: VACUUM_TEST_README.md for complete guide" \ No newline at end of file diff --git a/docker/admin_integration/check_volumes.sh b/docker/admin_integration/check_volumes.sh new file mode 100755 index 000000000..8cc6c14c5 --- /dev/null +++ b/docker/admin_integration/check_volumes.sh @@ -0,0 +1,32 @@ +#!/bin/sh + +echo "📊 Quick Volume Status Check" +echo "============================" +echo "" + +# Check if master is running +MASTER_URL="${MASTER_HOST:-master:9333}" +if ! curl -s http://$MASTER_URL/cluster/status > /dev/null; then + echo "❌ Master server not available at $MASTER_URL" + exit 1 +fi + +echo "🔍 Fetching volume status from master..." +curl -s "http://$MASTER_URL/vol/status" | jq -r ' +if .Volumes and .Volumes.DataCenters then + .Volumes.DataCenters | to_entries[] | .value | to_entries[] | .value | to_entries[] | .value | if . then .[] else empty end | + "Volume \(.Id): + Size: \(.Size | if . < 1024 then "\(.) B" elif . < 1048576 then "\(. / 1024 | floor) KB" elif . < 1073741824 then "\(. / 1048576 * 100 | floor / 100) MB" else "\(. / 1073741824 * 100 | floor / 100) GB" end) + Files: \(.FileCount) active, \(.DeleteCount) deleted + Garbage: \(.DeletedByteCount | if . < 1024 then "\(.) B" elif . < 1048576 then "\(. / 1024 | floor) KB" elif . < 1073741824 then "\(. / 1048576 * 100 | floor / 100) MB" else "\(. 
/ 1073741824 * 100 | floor / 100) GB" end) (\(if .Size > 0 then (.DeletedByteCount / .Size * 100 | floor) else 0 end)%) + Status: \(if (.DeletedByteCount / .Size * 100) > 30 then "🎯 NEEDS VACUUM" else "✅ OK" end) +" +else + "No volumes found" +end' + +echo "" +echo "💡 Legend:" +echo " 🎯 NEEDS VACUUM: >30% garbage ratio" +echo " ✅ OK: <30% garbage ratio" +echo "" \ No newline at end of file diff --git a/docker/admin_integration/create_vacuum_test_data.go b/docker/admin_integration/create_vacuum_test_data.go new file mode 100644 index 000000000..46acdd4cd --- /dev/null +++ b/docker/admin_integration/create_vacuum_test_data.go @@ -0,0 +1,280 @@ +package main + +import ( + "bytes" + "crypto/rand" + "encoding/json" + "flag" + "fmt" + "io" + "log" + "net/http" + "time" +) + +var ( + master = flag.String("master", "master:9333", "SeaweedFS master server address") + fileCount = flag.Int("files", 20, "Number of files to create") + deleteRatio = flag.Float64("delete", 0.4, "Ratio of files to delete (0.0-1.0)") + fileSizeKB = flag.Int("size", 100, "Size of each file in KB") +) + +type AssignResult struct { + Fid string `json:"fid"` + Url string `json:"url"` + PublicUrl string `json:"publicUrl"` + Count int `json:"count"` + Error string `json:"error"` +} + +func main() { + flag.Parse() + + fmt.Println("🧪 Creating fake data for vacuum task testing...") + fmt.Printf("Master: %s\n", *master) + fmt.Printf("Files to create: %d\n", *fileCount) + fmt.Printf("Delete ratio: %.1f%%\n", *deleteRatio*100) + fmt.Printf("File size: %d KB\n", *fileSizeKB) + fmt.Println() + + if *fileCount == 0 { + // Just check volume status + fmt.Println("📊 Checking volume status...") + checkVolumeStatus() + return + } + + // Step 1: Create test files + fmt.Println("📁 Step 1: Creating test files...") + fids := createTestFiles() + + // Step 2: Delete some files to create garbage + fmt.Println("🗑️ Step 2: Deleting files to create garbage...") + deleteFiles(fids) + + // Step 3: Check volume status + fmt.Println("📊 Step 3: Checking volume status...") + checkVolumeStatus() + + // Step 4: Configure vacuum for testing + fmt.Println("⚙️ Step 4: Instructions for testing...") + printTestingInstructions() +} + +func createTestFiles() []string { + var fids []string + + for i := 0; i < *fileCount; i++ { + // Generate random file content + fileData := make([]byte, *fileSizeKB*1024) + rand.Read(fileData) + + // Get file ID assignment + assign, err := assignFileId() + if err != nil { + log.Printf("Failed to assign file ID for file %d: %v", i, err) + continue + } + + // Upload file + err = uploadFile(assign, fileData, fmt.Sprintf("test_file_%d.dat", i)) + if err != nil { + log.Printf("Failed to upload file %d: %v", i, err) + continue + } + + fids = append(fids, assign.Fid) + + if (i+1)%5 == 0 { + fmt.Printf(" Created %d/%d files...\n", i+1, *fileCount) + } + } + + fmt.Printf("✅ Created %d files successfully\n\n", len(fids)) + return fids +} + +func deleteFiles(fids []string) { + deleteCount := int(float64(len(fids)) * *deleteRatio) + + for i := 0; i < deleteCount; i++ { + err := deleteFile(fids[i]) + if err != nil { + log.Printf("Failed to delete file %s: %v", fids[i], err) + continue + } + + if (i+1)%5 == 0 { + fmt.Printf(" Deleted %d/%d files...\n", i+1, deleteCount) + } + } + + fmt.Printf("✅ Deleted %d files (%.1f%% of total)\n\n", deleteCount, *deleteRatio*100) +} + +func assignFileId() (*AssignResult, error) { + resp, err := http.Get(fmt.Sprintf("http://%s/dir/assign", *master)) + if err != nil { + return nil, err + } + defer resp.Body.Close() 
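+	// The master's /dir/assign endpoint responds with a small JSON document
+	// (fid, url, publicUrl, count, and optionally error) that maps onto the
+	// AssignResult struct defined above.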
+ + var result AssignResult + err = json.NewDecoder(resp.Body).Decode(&result) + if err != nil { + return nil, err + } + + if result.Error != "" { + return nil, fmt.Errorf("assignment error: %s", result.Error) + } + + return &result, nil +} + +func uploadFile(assign *AssignResult, data []byte, filename string) error { + url := fmt.Sprintf("http://%s/%s", assign.Url, assign.Fid) + + body := &bytes.Buffer{} + body.Write(data) + + req, err := http.NewRequest("POST", url, body) + if err != nil { + return err + } + + req.Header.Set("Content-Type", "application/octet-stream") + if filename != "" { + req.Header.Set("Content-Disposition", fmt.Sprintf("attachment; filename=\"%s\"", filename)) + } + + client := &http.Client{Timeout: 30 * time.Second} + resp, err := client.Do(req) + if err != nil { + return err + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusCreated && resp.StatusCode != http.StatusOK { + body, _ := io.ReadAll(resp.Body) + return fmt.Errorf("upload failed with status %d: %s", resp.StatusCode, string(body)) + } + + return nil +} + +func deleteFile(fid string) error { + url := fmt.Sprintf("http://%s/%s", *master, fid) + + req, err := http.NewRequest("DELETE", url, nil) + if err != nil { + return err + } + + client := &http.Client{Timeout: 10 * time.Second} + resp, err := client.Do(req) + if err != nil { + return err + } + defer resp.Body.Close() + + return nil +} + +func checkVolumeStatus() { + // Get volume list from master + resp, err := http.Get(fmt.Sprintf("http://%s/vol/status", *master)) + if err != nil { + log.Printf("Failed to get volume status: %v", err) + return + } + defer resp.Body.Close() + + var volumes map[string]interface{} + err = json.NewDecoder(resp.Body).Decode(&volumes) + if err != nil { + log.Printf("Failed to decode volume status: %v", err) + return + } + + fmt.Println("📊 Volume Status Summary:") + + if vols, ok := volumes["Volumes"].([]interface{}); ok { + for _, vol := range vols { + if v, ok := vol.(map[string]interface{}); ok { + id := int(v["Id"].(float64)) + size := uint64(v["Size"].(float64)) + fileCount := int(v["FileCount"].(float64)) + deleteCount := int(v["DeleteCount"].(float64)) + deletedBytes := uint64(v["DeletedByteCount"].(float64)) + + garbageRatio := 0.0 + if size > 0 { + garbageRatio = float64(deletedBytes) / float64(size) * 100 + } + + fmt.Printf(" Volume %d:\n", id) + fmt.Printf(" Size: %s\n", formatBytes(size)) + fmt.Printf(" Files: %d (active), %d (deleted)\n", fileCount, deleteCount) + fmt.Printf(" Garbage: %s (%.1f%%)\n", formatBytes(deletedBytes), garbageRatio) + + if garbageRatio > 30 { + fmt.Printf(" 🎯 This volume should trigger vacuum (>30%% garbage)\n") + } + fmt.Println() + } + } + } +} + +func formatBytes(bytes uint64) string { + if bytes < 1024 { + return fmt.Sprintf("%d B", bytes) + } else if bytes < 1024*1024 { + return fmt.Sprintf("%.1f KB", float64(bytes)/1024) + } else if bytes < 1024*1024*1024 { + return fmt.Sprintf("%.1f MB", float64(bytes)/(1024*1024)) + } else { + return fmt.Sprintf("%.1f GB", float64(bytes)/(1024*1024*1024)) + } +} + +func printTestingInstructions() { + fmt.Println("🧪 Testing Instructions:") + fmt.Println() + fmt.Println("1. 
Configure Vacuum for Testing:") + fmt.Println(" Visit: http://localhost:23646/maintenance/config/vacuum") + fmt.Println(" Set:") + fmt.Printf(" - Garbage Percentage Threshold: 20 (20%% - lower than default 30)\n") + fmt.Printf(" - Scan Interval: [30] [Seconds] (faster than default)\n") + fmt.Printf(" - Min Volume Age: [0] [Minutes] (no age requirement)\n") + fmt.Printf(" - Max Concurrent: 2\n") + fmt.Printf(" - Min Interval: 1m (faster repeat)\n") + fmt.Println() + + fmt.Println("2. Monitor Vacuum Tasks:") + fmt.Println(" Visit: http://localhost:23646/maintenance") + fmt.Println(" Watch for vacuum tasks to appear in the queue") + fmt.Println() + + fmt.Println("3. Manual Vacuum (Optional):") + fmt.Println(" curl -X POST 'http://localhost:9333/vol/vacuum?garbageThreshold=0.20'") + fmt.Println(" (Note: Master API still uses 0.0-1.0 decimal format)") + fmt.Println() + + fmt.Println("4. Check Logs:") + fmt.Println(" Look for messages like:") + fmt.Println(" - 'Vacuum detector found X volumes needing vacuum'") + fmt.Println(" - 'Applied vacuum configuration'") + fmt.Println(" - 'Worker executing task: vacuum'") + fmt.Println() + + fmt.Println("5. Verify Results:") + fmt.Println(" Re-run this script with -files=0 to check volume status") + fmt.Println(" Garbage ratios should decrease after vacuum operations") + fmt.Println() + + fmt.Printf("🚀 Quick test command:\n") + fmt.Printf(" go run create_vacuum_test_data.go -files=0\n") + fmt.Println() +} diff --git a/docker/admin_integration/demo_vacuum_testing.sh b/docker/admin_integration/demo_vacuum_testing.sh new file mode 100755 index 000000000..6835e14cc --- /dev/null +++ b/docker/admin_integration/demo_vacuum_testing.sh @@ -0,0 +1,105 @@ +#!/bin/sh + +echo "🧪 SeaweedFS Vacuum Task Testing Demo" +echo "======================================" +echo "" + +# Check if SeaweedFS is running +echo "📋 Checking SeaweedFS status..." +MASTER_URL="${MASTER_HOST:-master:9333}" +ADMIN_URL="${ADMIN_HOST:-admin:23646}" + +if ! curl -s http://$MASTER_URL/cluster/status > /dev/null; then + echo "❌ SeaweedFS master not running at $MASTER_URL" + echo " Please ensure Docker cluster is running: make start" + exit 1 +fi + +if ! curl -s http://volume1:8080/status > /dev/null; then + echo "❌ SeaweedFS volume servers not running" + echo " Please ensure Docker cluster is running: make start" + exit 1 +fi + +if ! curl -s http://$ADMIN_URL/ > /dev/null; then + echo "❌ SeaweedFS admin server not running at $ADMIN_URL" + echo " Please ensure Docker cluster is running: make start" + exit 1 +fi + +echo "✅ All SeaweedFS components are running" +echo "" + +# Phase 1: Create test data +echo "📁 Phase 1: Creating test data with garbage..." +go run create_vacuum_test_data.go -master=$MASTER_URL -files=15 -delete=0.5 -size=150 +echo "" + +# Phase 2: Check initial status +echo "📊 Phase 2: Checking initial volume status..." +go run create_vacuum_test_data.go -master=$MASTER_URL -files=0 +echo "" + +# Phase 3: Configure vacuum +echo "⚙️ Phase 3: Vacuum configuration instructions..." +echo " 1. Visit: http://localhost:23646/maintenance/config/vacuum" +echo " 2. Set these values for testing:" +echo " - Enable Vacuum Tasks: ✅ Checked" +echo " - Garbage Threshold: 0.30" +echo " - Scan Interval: [30] [Seconds]" +echo " - Min Volume Age: [0] [Minutes]" +echo " - Max Concurrent: 2" +echo " 3. Click 'Save Configuration'" +echo "" + +read -p " Press ENTER after configuring vacuum settings..." +echo "" + +# Phase 4: Monitor tasks +echo "🎯 Phase 4: Monitoring vacuum tasks..." 
+echo " Visit: http://localhost:23646/maintenance" +echo " You should see vacuum tasks appear within 30 seconds" +echo "" + +echo " Waiting 60 seconds for vacuum detection and execution..." +for i in {60..1}; do + printf "\r Countdown: %02d seconds" $i + sleep 1 +done +echo "" +echo "" + +# Phase 5: Check results +echo "📈 Phase 5: Checking results after vacuum..." +go run create_vacuum_test_data.go -master=$MASTER_URL -files=0 +echo "" + +# Phase 6: Create more garbage for continuous testing +echo "🔄 Phase 6: Creating additional garbage for continuous testing..." +echo " Running 3 rounds of garbage creation..." + +for round in {1..3}; do + echo " Round $round: Creating garbage..." + go run create_vacuum_test_data.go -master=$MASTER_URL -files=8 -delete=0.6 -size=100 + echo " Waiting 30 seconds before next round..." + sleep 30 +done + +echo "" +echo "📊 Final volume status:" +go run create_vacuum_test_data.go -master=$MASTER_URL -files=0 +echo "" + +echo "🎉 Demo Complete!" +echo "" +echo "🔍 Things to check:" +echo " 1. Maintenance Queue: http://localhost:23646/maintenance" +echo " 2. Volume Status: http://localhost:9333/vol/status" +echo " 3. Admin Dashboard: http://localhost:23646" +echo "" +echo "💡 Next Steps:" +echo " - Try different garbage thresholds (0.10, 0.50, 0.80)" +echo " - Adjust scan intervals (10s, 1m, 5m)" +echo " - Monitor logs for vacuum operations" +echo " - Test with multiple volumes" +echo "" \ No newline at end of file diff --git a/docker/admin_integration/docker-compose-ec-test.yml b/docker/admin_integration/docker-compose-ec-test.yml new file mode 100644 index 000000000..197c9bda5 --- /dev/null +++ b/docker/admin_integration/docker-compose-ec-test.yml @@ -0,0 +1,240 @@ +name: admin_integration + +networks: + seaweed_net: + driver: bridge + +services: + master: + image: chrislusf/seaweedfs:local + ports: + - "9333:9333" + - "19333:19333" + command: "master -ip=master -mdir=/data -volumeSizeLimitMB=50" + environment: + - WEED_MASTER_VOLUME_GROWTH_COPY_1=1 + - WEED_MASTER_VOLUME_GROWTH_COPY_2=2 + - WEED_MASTER_VOLUME_GROWTH_COPY_OTHER=1 + volumes: + - ./data/master:/data + networks: + - seaweed_net + + volume1: + image: chrislusf/seaweedfs:local + ports: + - "8080:8080" + - "18080:18080" + command: "volume -mserver=master:9333 -ip=volume1 -dir=/data -max=10" + depends_on: + - master + volumes: + - ./data/volume1:/data + networks: + - seaweed_net + + volume2: + image: chrislusf/seaweedfs:local + ports: + - "8081:8080" + - "18081:18080" + command: "volume -mserver=master:9333 -ip=volume2 -dir=/data -max=10" + depends_on: + - master + volumes: + - ./data/volume2:/data + networks: + - seaweed_net + + volume3: + image: chrislusf/seaweedfs:local + ports: + - "8082:8080" + - "18082:18080" + command: "volume -mserver=master:9333 -ip=volume3 -dir=/data -max=10" + depends_on: + - master + volumes: + - ./data/volume3:/data + networks: + - seaweed_net + + volume4: + image: chrislusf/seaweedfs:local + ports: + - "8083:8080" + - "18083:18080" + command: "volume -mserver=master:9333 -ip=volume4 -dir=/data -max=10" + depends_on: + - master + volumes: + - ./data/volume4:/data + networks: + - seaweed_net + + volume5: + image: chrislusf/seaweedfs:local + ports: + - "8084:8080" + - "18084:18080" + command: "volume -mserver=master:9333 -ip=volume5 -dir=/data -max=10" + depends_on: + - master + volumes: + - ./data/volume5:/data + networks: + - seaweed_net + + volume6: + image: chrislusf/seaweedfs:local + ports: + - "8085:8080" + - "18085:18080" + command: "volume -mserver=master:9333 
-ip=volume6 -dir=/data -max=10" + depends_on: + - master + volumes: + - ./data/volume6:/data + networks: + - seaweed_net + + filer: + image: chrislusf/seaweedfs:local + ports: + - "8888:8888" + - "18888:18888" + command: "filer -master=master:9333 -ip=filer" + depends_on: + - master + volumes: + - ./data/filer:/data + networks: + - seaweed_net + + admin: + image: chrislusf/seaweedfs:local + ports: + - "23646:23646" # HTTP admin interface (default port) + - "33646:33646" # gRPC worker communication (23646 + 10000) + command: "admin -port=23646 -masters=master:9333 -dataDir=/data" + depends_on: + - master + - filer + volumes: + - ./data/admin:/data + networks: + - seaweed_net + + worker1: + image: chrislusf/seaweedfs:local + command: "-v=2 worker -admin=admin:23646 -capabilities=erasure_coding,vacuum -maxConcurrent=2" + depends_on: + - admin + volumes: + - ./data/worker1:/data + networks: + - seaweed_net + environment: + - WORKER_ID=worker-1 + + worker2: + image: chrislusf/seaweedfs:local + command: "-v=2 worker -admin=admin:23646 -capabilities=erasure_coding,vacuum -maxConcurrent=2" + depends_on: + - admin + volumes: + - ./data/worker2:/data + networks: + - seaweed_net + environment: + - WORKER_ID=worker-2 + + worker3: + image: chrislusf/seaweedfs:local + command: "-v=2 worker -admin=admin:23646 -capabilities=erasure_coding,vacuum -maxConcurrent=2" + depends_on: + - admin + volumes: + - ./data/worker3:/data + networks: + - seaweed_net + environment: + - WORKER_ID=worker-3 + + load_generator: + image: chrislusf/seaweedfs:local + entrypoint: ["/bin/sh"] + command: > + -c " + echo 'Starting load generator...'; + sleep 30; + echo 'Generating continuous load with 50MB volume limit...'; + while true; do + echo 'Writing test files...'; + echo 'Test file content at $(date)' | /usr/bin/weed upload -server=master:9333; + sleep 5; + echo 'Deleting some files...'; + /usr/bin/weed shell -master=master:9333 <<< 'fs.rm /test_file_*' || true; + sleep 10; + done + " + depends_on: + - master + - filer + - admin + networks: + - seaweed_net + + monitor: + image: alpine:latest + entrypoint: ["/bin/sh"] + command: > + -c " + apk add --no-cache curl jq; + echo 'Starting cluster monitor...'; + sleep 30; + while true; do + echo '=== Cluster Status $(date) ==='; + echo 'Master status:'; + curl -s http://master:9333/cluster/status | jq '.IsLeader, .Peers' || echo 'Master not ready'; + echo; + echo 'Admin status:'; + curl -s http://admin:23646/ | grep -o 'Admin.*Interface' || echo 'Admin not ready'; + echo; + echo 'Volume count by server:'; + curl -s http://master:9333/vol/status | jq '.Volumes | length' || echo 'Volumes not ready'; + echo; + sleep 60; + done + " + depends_on: + - master + - admin + - filer + networks: + - seaweed_net + + vacuum-tester: + image: chrislusf/seaweedfs:local + entrypoint: ["/bin/sh"] + command: > + -c " + echo 'Installing dependencies for vacuum testing...'; + apk add --no-cache jq curl go bash; + echo 'Vacuum tester ready...'; + echo 'Use: docker-compose exec vacuum-tester sh'; + echo 'Available commands: go, weed, curl, jq, bash, sh'; + sleep infinity + " + depends_on: + - master + - admin + - filer + volumes: + - .:/testing + working_dir: /testing + networks: + - seaweed_net + environment: + - MASTER_HOST=master:9333 + - ADMIN_HOST=admin:23646 \ No newline at end of file diff --git a/docker/admin_integration/test-integration.sh b/docker/admin_integration/test-integration.sh new file mode 100755 index 000000000..b355b1dfd --- /dev/null +++ b/docker/admin_integration/test-integration.sh 
@@ -0,0 +1,73 @@ +#!/bin/bash + +set -e + +echo "🧪 Testing SeaweedFS Admin-Worker Integration" +echo "=============================================" + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +cd "$(dirname "$0")" + +echo -e "${BLUE}1. Validating docker-compose configuration...${NC}" +if docker-compose -f docker-compose-ec-test.yml config > /dev/null; then + echo -e "${GREEN}✅ Docker compose configuration is valid${NC}" +else + echo -e "${RED}❌ Docker compose configuration is invalid${NC}" + exit 1 +fi + +echo -e "${BLUE}2. Checking if required ports are available...${NC}" +for port in 9333 8080 8081 8082 8083 8084 8085 8888 23646; do + if lsof -i :$port > /dev/null 2>&1; then + echo -e "${YELLOW}⚠️ Port $port is in use${NC}" + else + echo -e "${GREEN}✅ Port $port is available${NC}" + fi +done + +echo -e "${BLUE}3. Testing worker command syntax...${NC}" +# Test that the worker command in docker-compose has correct syntax +if docker-compose -f docker-compose-ec-test.yml config | grep -q "workingDir=/work"; then + echo -e "${GREEN}✅ Worker working directory option is properly configured${NC}" +else + echo -e "${RED}❌ Worker working directory option is missing${NC}" + exit 1 +fi + +echo -e "${BLUE}4. Verifying admin server configuration...${NC}" +if docker-compose -f docker-compose-ec-test.yml config | grep -q "admin:23646"; then + echo -e "${GREEN}✅ Admin server port configuration is correct${NC}" +else + echo -e "${RED}❌ Admin server port configuration is incorrect${NC}" + exit 1 +fi + +echo -e "${BLUE}5. Checking service dependencies...${NC}" +if docker-compose -f docker-compose-ec-test.yml config | grep -q "depends_on"; then + echo -e "${GREEN}✅ Service dependencies are configured${NC}" +else + echo -e "${YELLOW}⚠️ Service dependencies may not be configured${NC}" +fi + +echo "" +echo -e "${GREEN}🎉 Integration test configuration is ready!${NC}" +echo "" +echo -e "${BLUE}To start the integration test:${NC}" +echo " make start # Start all services" +echo " make health # Check service health" +echo " make logs # View logs" +echo " make stop # Stop all services" +echo "" +echo -e "${BLUE}Key features verified:${NC}" +echo " ✅ Official SeaweedFS images are used" +echo " ✅ Worker working directories are configured" +echo " ✅ Admin-worker communication on correct ports" +echo " ✅ Task-specific directories will be created" +echo " ✅ Load generator will trigger EC tasks" +echo " ✅ Monitor will track progress" \ No newline at end of file diff --git a/weed/admin/config/schema.go b/weed/admin/config/schema.go new file mode 100644 index 000000000..54fb615f9 --- /dev/null +++ b/weed/admin/config/schema.go @@ -0,0 +1,360 @@ +package config + +import ( + "fmt" + "reflect" + "strings" + "time" +) + +// ConfigWithDefaults defines an interface for configurations that can apply their own defaults +type ConfigWithDefaults interface { + // ApplySchemaDefaults applies default values using the provided schema + ApplySchemaDefaults(schema *Schema) error + // Validate validates the configuration + Validate() error +} + +// FieldType defines the type of a configuration field +type FieldType string + +const ( + FieldTypeBool FieldType = "bool" + FieldTypeInt FieldType = "int" + FieldTypeDuration FieldType = "duration" + FieldTypeInterval FieldType = "interval" + FieldTypeString FieldType = "string" + FieldTypeFloat FieldType = "float" +) + +// FieldUnit defines the unit for display purposes +type FieldUnit string + +const ( + 
UnitSeconds FieldUnit = "seconds" + UnitMinutes FieldUnit = "minutes" + UnitHours FieldUnit = "hours" + UnitDays FieldUnit = "days" + UnitCount FieldUnit = "count" + UnitNone FieldUnit = "" +) + +// Field defines a configuration field with all its metadata +type Field struct { + // Field identification + Name string `json:"name"` + JSONName string `json:"json_name"` + Type FieldType `json:"type"` + + // Default value and validation + DefaultValue interface{} `json:"default_value"` + MinValue interface{} `json:"min_value,omitempty"` + MaxValue interface{} `json:"max_value,omitempty"` + Required bool `json:"required"` + + // UI display + DisplayName string `json:"display_name"` + Description string `json:"description"` + HelpText string `json:"help_text"` + Placeholder string `json:"placeholder"` + Unit FieldUnit `json:"unit"` + + // Form rendering + InputType string `json:"input_type"` // "checkbox", "number", "text", "interval", etc. + CSSClasses string `json:"css_classes,omitempty"` +} + +// GetDisplayValue returns the value formatted for display in the specified unit +func (f *Field) GetDisplayValue(value interface{}) interface{} { + if (f.Type == FieldTypeDuration || f.Type == FieldTypeInterval) && f.Unit != UnitSeconds { + if duration, ok := value.(time.Duration); ok { + switch f.Unit { + case UnitMinutes: + return int(duration.Minutes()) + case UnitHours: + return int(duration.Hours()) + case UnitDays: + return int(duration.Hours() / 24) + } + } + if seconds, ok := value.(int); ok { + switch f.Unit { + case UnitMinutes: + return seconds / 60 + case UnitHours: + return seconds / 3600 + case UnitDays: + return seconds / (24 * 3600) + } + } + } + return value +} + +// GetIntervalDisplayValue returns the value and unit for interval fields +func (f *Field) GetIntervalDisplayValue(value interface{}) (int, string) { + if f.Type != FieldTypeInterval { + return 0, "minutes" + } + + seconds := 0 + if duration, ok := value.(time.Duration); ok { + seconds = int(duration.Seconds()) + } else if s, ok := value.(int); ok { + seconds = s + } + + return SecondsToIntervalValueUnit(seconds) +} + +// SecondsToIntervalValueUnit converts seconds to the most appropriate interval unit +func SecondsToIntervalValueUnit(totalSeconds int) (int, string) { + if totalSeconds == 0 { + return 0, "minutes" + } + + // Check if it's evenly divisible by days + if totalSeconds%(24*3600) == 0 { + return totalSeconds / (24 * 3600), "days" + } + + // Check if it's evenly divisible by hours + if totalSeconds%3600 == 0 { + return totalSeconds / 3600, "hours" + } + + // Default to minutes + return totalSeconds / 60, "minutes" +} + +// IntervalValueUnitToSeconds converts interval value and unit to seconds +func IntervalValueUnitToSeconds(value int, unit string) int { + switch unit { + case "days": + return value * 24 * 3600 + case "hours": + return value * 3600 + case "minutes": + return value * 60 + default: + return value * 60 // Default to minutes + } +} + +// ParseDisplayValue converts a display value back to the storage format +func (f *Field) ParseDisplayValue(displayValue interface{}) interface{} { + if (f.Type == FieldTypeDuration || f.Type == FieldTypeInterval) && f.Unit != UnitSeconds { + if val, ok := displayValue.(int); ok { + switch f.Unit { + case UnitMinutes: + return val * 60 + case UnitHours: + return val * 3600 + case UnitDays: + return val * 24 * 3600 + } + } + } + return displayValue +} + +// ParseIntervalFormData parses form data for interval fields (value + unit) +func (f *Field) 
ParseIntervalFormData(valueStr, unitStr string) (int, error) { + if f.Type != FieldTypeInterval { + return 0, fmt.Errorf("field %s is not an interval field", f.Name) + } + + value := 0 + if valueStr != "" { + var err error + value, err = fmt.Sscanf(valueStr, "%d", &value) + if err != nil { + return 0, fmt.Errorf("invalid interval value: %s", valueStr) + } + } + + return IntervalValueUnitToSeconds(value, unitStr), nil +} + +// ValidateValue validates a value against the field constraints +func (f *Field) ValidateValue(value interface{}) error { + if f.Required && (value == nil || value == "" || value == 0) { + return fmt.Errorf("%s is required", f.DisplayName) + } + + if f.MinValue != nil { + if !f.compareValues(value, f.MinValue, ">=") { + return fmt.Errorf("%s must be >= %v", f.DisplayName, f.MinValue) + } + } + + if f.MaxValue != nil { + if !f.compareValues(value, f.MaxValue, "<=") { + return fmt.Errorf("%s must be <= %v", f.DisplayName, f.MaxValue) + } + } + + return nil +} + +// compareValues compares two values based on the operator +func (f *Field) compareValues(a, b interface{}, op string) bool { + switch f.Type { + case FieldTypeInt: + aVal, aOk := a.(int) + bVal, bOk := b.(int) + if !aOk || !bOk { + return false + } + switch op { + case ">=": + return aVal >= bVal + case "<=": + return aVal <= bVal + } + case FieldTypeFloat: + aVal, aOk := a.(float64) + bVal, bOk := b.(float64) + if !aOk || !bOk { + return false + } + switch op { + case ">=": + return aVal >= bVal + case "<=": + return aVal <= bVal + } + } + return true +} + +// Schema provides common functionality for configuration schemas +type Schema struct { + Fields []*Field `json:"fields"` +} + +// GetFieldByName returns a field by its JSON name +func (s *Schema) GetFieldByName(jsonName string) *Field { + for _, field := range s.Fields { + if field.JSONName == jsonName { + return field + } + } + return nil +} + +// ApplyDefaultsToConfig applies defaults to a configuration that implements ConfigWithDefaults +func (s *Schema) ApplyDefaultsToConfig(config ConfigWithDefaults) error { + return config.ApplySchemaDefaults(s) +} + +// ApplyDefaultsToProtobuf applies defaults to protobuf types using reflection +func (s *Schema) ApplyDefaultsToProtobuf(config interface{}) error { + return s.applyDefaultsReflection(config) +} + +// applyDefaultsReflection applies default values using reflection (internal use only) +// Used for protobuf types and embedded struct handling +func (s *Schema) applyDefaultsReflection(config interface{}) error { + configValue := reflect.ValueOf(config) + if configValue.Kind() == reflect.Ptr { + configValue = configValue.Elem() + } + + if configValue.Kind() != reflect.Struct { + return fmt.Errorf("config must be a struct or pointer to struct") + } + + configType := configValue.Type() + + for i := 0; i < configValue.NumField(); i++ { + field := configValue.Field(i) + fieldType := configType.Field(i) + + // Handle embedded structs recursively (before JSON tag check) + if field.Kind() == reflect.Struct && fieldType.Anonymous { + if !field.CanAddr() { + return fmt.Errorf("embedded struct %s is not addressable - config must be a pointer", fieldType.Name) + } + err := s.applyDefaultsReflection(field.Addr().Interface()) + if err != nil { + return fmt.Errorf("failed to apply defaults to embedded struct %s: %v", fieldType.Name, err) + } + continue + } + + // Get JSON tag name + jsonTag := fieldType.Tag.Get("json") + if jsonTag == "" { + continue + } + + // Remove options like ",omitempty" + if commaIdx := 
strings.Index(jsonTag, ","); commaIdx >= 0 { + jsonTag = jsonTag[:commaIdx] + } + + // Find corresponding schema field + schemaField := s.GetFieldByName(jsonTag) + if schemaField == nil { + continue + } + + // Apply default if field is zero value + if field.CanSet() && field.IsZero() { + defaultValue := reflect.ValueOf(schemaField.DefaultValue) + if defaultValue.Type().ConvertibleTo(field.Type()) { + field.Set(defaultValue.Convert(field.Type())) + } + } + } + + return nil +} + +// ValidateConfig validates a configuration against the schema +func (s *Schema) ValidateConfig(config interface{}) []error { + var errors []error + + configValue := reflect.ValueOf(config) + if configValue.Kind() == reflect.Ptr { + configValue = configValue.Elem() + } + + if configValue.Kind() != reflect.Struct { + errors = append(errors, fmt.Errorf("config must be a struct or pointer to struct")) + return errors + } + + configType := configValue.Type() + + for i := 0; i < configValue.NumField(); i++ { + field := configValue.Field(i) + fieldType := configType.Field(i) + + // Get JSON tag name + jsonTag := fieldType.Tag.Get("json") + if jsonTag == "" { + continue + } + + // Remove options like ",omitempty" + if commaIdx := strings.Index(jsonTag, ","); commaIdx > 0 { + jsonTag = jsonTag[:commaIdx] + } + + // Find corresponding schema field + schemaField := s.GetFieldByName(jsonTag) + if schemaField == nil { + continue + } + + // Validate field value + fieldValue := field.Interface() + if err := schemaField.ValidateValue(fieldValue); err != nil { + errors = append(errors, err) + } + } + + return errors +} diff --git a/weed/admin/config/schema_test.go b/weed/admin/config/schema_test.go new file mode 100644 index 000000000..3d0d74a38 --- /dev/null +++ b/weed/admin/config/schema_test.go @@ -0,0 +1,226 @@ +package config + +import ( + "testing" +) + +// Test structs that mirror the actual configuration structure +type TestBaseConfigForSchema struct { + Enabled bool `json:"enabled"` + ScanIntervalSeconds int `json:"scan_interval_seconds"` + MaxConcurrent int `json:"max_concurrent"` +} + +// ApplySchemaDefaults implements ConfigWithDefaults for test struct +func (c *TestBaseConfigForSchema) ApplySchemaDefaults(schema *Schema) error { + return schema.ApplyDefaultsToProtobuf(c) +} + +// Validate implements ConfigWithDefaults for test struct +func (c *TestBaseConfigForSchema) Validate() error { + return nil +} + +type TestTaskConfigForSchema struct { + TestBaseConfigForSchema + TaskSpecificField float64 `json:"task_specific_field"` + AnotherSpecificField string `json:"another_specific_field"` +} + +// ApplySchemaDefaults implements ConfigWithDefaults for test struct +func (c *TestTaskConfigForSchema) ApplySchemaDefaults(schema *Schema) error { + return schema.ApplyDefaultsToProtobuf(c) +} + +// Validate implements ConfigWithDefaults for test struct +func (c *TestTaskConfigForSchema) Validate() error { + return nil +} + +func createTestSchema() *Schema { + return &Schema{ + Fields: []*Field{ + { + Name: "enabled", + JSONName: "enabled", + Type: FieldTypeBool, + DefaultValue: true, + }, + { + Name: "scan_interval_seconds", + JSONName: "scan_interval_seconds", + Type: FieldTypeInt, + DefaultValue: 1800, + }, + { + Name: "max_concurrent", + JSONName: "max_concurrent", + Type: FieldTypeInt, + DefaultValue: 3, + }, + { + Name: "task_specific_field", + JSONName: "task_specific_field", + Type: FieldTypeFloat, + DefaultValue: 0.25, + }, + { + Name: "another_specific_field", + JSONName: "another_specific_field", + Type: FieldTypeString, 
+ DefaultValue: "default_value", + }, + }, + } +} + +func TestApplyDefaults_WithEmbeddedStruct(t *testing.T) { + schema := createTestSchema() + + // Start with zero values + config := &TestTaskConfigForSchema{} + + err := schema.ApplyDefaultsToConfig(config) + if err != nil { + t.Fatalf("ApplyDefaultsToConfig failed: %v", err) + } + + // Verify embedded struct fields got default values + if config.Enabled != true { + t.Errorf("Expected Enabled=true (default), got %v", config.Enabled) + } + + if config.ScanIntervalSeconds != 1800 { + t.Errorf("Expected ScanIntervalSeconds=1800 (default), got %v", config.ScanIntervalSeconds) + } + + if config.MaxConcurrent != 3 { + t.Errorf("Expected MaxConcurrent=3 (default), got %v", config.MaxConcurrent) + } + + // Verify task-specific fields got default values + if config.TaskSpecificField != 0.25 { + t.Errorf("Expected TaskSpecificField=0.25 (default), got %v", config.TaskSpecificField) + } + + if config.AnotherSpecificField != "default_value" { + t.Errorf("Expected AnotherSpecificField='default_value' (default), got %v", config.AnotherSpecificField) + } +} + +func TestApplyDefaults_PartiallySet(t *testing.T) { + schema := createTestSchema() + + // Start with some pre-set values + config := &TestTaskConfigForSchema{ + TestBaseConfigForSchema: TestBaseConfigForSchema{ + Enabled: true, // Non-zero value, should not be overridden + ScanIntervalSeconds: 0, // Should get default + MaxConcurrent: 5, // Non-zero value, should not be overridden + }, + TaskSpecificField: 0.0, // Should get default + AnotherSpecificField: "custom", // Non-zero value, should not be overridden + } + + err := schema.ApplyDefaultsToConfig(config) + if err != nil { + t.Fatalf("ApplyDefaultsToConfig failed: %v", err) + } + + // Verify already-set values are preserved + if config.Enabled != true { + t.Errorf("Expected Enabled=true (pre-set), got %v", config.Enabled) + } + + if config.MaxConcurrent != 5 { + t.Errorf("Expected MaxConcurrent=5 (pre-set), got %v", config.MaxConcurrent) + } + + if config.AnotherSpecificField != "custom" { + t.Errorf("Expected AnotherSpecificField='custom' (pre-set), got %v", config.AnotherSpecificField) + } + + // Verify zero values got defaults + if config.ScanIntervalSeconds != 1800 { + t.Errorf("Expected ScanIntervalSeconds=1800 (default), got %v", config.ScanIntervalSeconds) + } + + if config.TaskSpecificField != 0.25 { + t.Errorf("Expected TaskSpecificField=0.25 (default), got %v", config.TaskSpecificField) + } +} + +func TestApplyDefaults_NonPointer(t *testing.T) { + schema := createTestSchema() + config := TestTaskConfigForSchema{} + // This should fail since we need a pointer to modify the struct + err := schema.ApplyDefaultsToProtobuf(config) + if err == nil { + t.Fatal("Expected error for non-pointer config, but got nil") + } +} + +func TestApplyDefaults_NonStruct(t *testing.T) { + schema := createTestSchema() + var config interface{} = "not a struct" + err := schema.ApplyDefaultsToProtobuf(config) + if err == nil { + t.Fatal("Expected error for non-struct config, but got nil") + } +} + +func TestApplyDefaults_EmptySchema(t *testing.T) { + schema := &Schema{Fields: []*Field{}} + config := &TestTaskConfigForSchema{} + + err := schema.ApplyDefaultsToConfig(config) + if err != nil { + t.Fatalf("ApplyDefaultsToConfig failed for empty schema: %v", err) + } + + // All fields should remain at zero values since no defaults are defined + if config.Enabled != false { + t.Errorf("Expected Enabled=false (zero value), got %v", config.Enabled) + } +} + +func 
TestApplyDefaults_MissingSchemaField(t *testing.T) { + // Schema with fewer fields than the struct + schema := &Schema{ + Fields: []*Field{ + { + Name: "enabled", + JSONName: "enabled", + Type: FieldTypeBool, + DefaultValue: true, + }, + // Note: missing scan_interval_seconds and other fields + }, + } + + config := &TestTaskConfigForSchema{} + err := schema.ApplyDefaultsToConfig(config) + if err != nil { + t.Fatalf("ApplyDefaultsToConfig failed: %v", err) + } + + // Only the field with a schema definition should get a default + if config.Enabled != true { + t.Errorf("Expected Enabled=true (has schema), got %v", config.Enabled) + } + + // Fields without schema should remain at zero values + if config.ScanIntervalSeconds != 0 { + t.Errorf("Expected ScanIntervalSeconds=0 (no schema), got %v", config.ScanIntervalSeconds) + } +} + +func BenchmarkApplyDefaults(b *testing.B) { + schema := createTestSchema() + config := &TestTaskConfigForSchema{} + + b.ResetTimer() + for i := 0; i < b.N; i++ { + _ = schema.ApplyDefaultsToConfig(config) + } +} diff --git a/weed/admin/dash/admin_server.go b/weed/admin/dash/admin_server.go index 6ebade19f..376f3edc7 100644 --- a/weed/admin/dash/admin_server.go +++ b/weed/admin/dash/admin_server.go @@ -25,6 +25,7 @@ import ( "google.golang.org/grpc" "github.com/seaweedfs/seaweedfs/weed/s3api" + "github.com/seaweedfs/seaweedfs/weed/worker/tasks" ) type AdminServer struct { @@ -126,30 +127,67 @@ func NewAdminServer(masters string, templateFS http.FileSystem, dataDir string) } } - // Initialize maintenance system with persistent configuration + // Initialize maintenance system - always initialize even without persistent storage + var maintenanceConfig *maintenance.MaintenanceConfig if server.configPersistence.IsConfigured() { - maintenanceConfig, err := server.configPersistence.LoadMaintenanceConfig() + var err error + maintenanceConfig, err = server.configPersistence.LoadMaintenanceConfig() if err != nil { glog.Errorf("Failed to load maintenance configuration: %v", err) maintenanceConfig = maintenance.DefaultMaintenanceConfig() } - server.InitMaintenanceManager(maintenanceConfig) - // Start maintenance manager if enabled - if maintenanceConfig.Enabled { - go func() { - if err := server.StartMaintenanceManager(); err != nil { - glog.Errorf("Failed to start maintenance manager: %v", err) - } - }() + // Apply new defaults to handle schema changes (like enabling by default) + schema := maintenance.GetMaintenanceConfigSchema() + if err := schema.ApplyDefaultsToProtobuf(maintenanceConfig); err != nil { + glog.Warningf("Failed to apply schema defaults to loaded config: %v", err) } + + // Force enable maintenance system for new default behavior + // This handles the case where old configs had Enabled=false as default + if !maintenanceConfig.Enabled { + glog.V(1).Infof("Enabling maintenance system (new default behavior)") + maintenanceConfig.Enabled = true + } + + glog.V(1).Infof("Maintenance system initialized with persistent configuration (enabled: %v)", maintenanceConfig.Enabled) } else { - glog.V(1).Infof("No data directory configured, maintenance system will run in memory-only mode") + maintenanceConfig = maintenance.DefaultMaintenanceConfig() + glog.V(1).Infof("No data directory configured, maintenance system will run in memory-only mode (enabled: %v)", maintenanceConfig.Enabled) + } + + // Always initialize maintenance manager + server.InitMaintenanceManager(maintenanceConfig) + + // Load saved task configurations from persistence + 
server.loadTaskConfigurationsFromPersistence() + + // Start maintenance manager if enabled + if maintenanceConfig.Enabled { + go func() { + // Give master client a bit of time to connect before starting scans + time.Sleep(2 * time.Second) + if err := server.StartMaintenanceManager(); err != nil { + glog.Errorf("Failed to start maintenance manager: %v", err) + } + }() } return server } +// loadTaskConfigurationsFromPersistence loads saved task configurations from protobuf files +func (s *AdminServer) loadTaskConfigurationsFromPersistence() { + if s.configPersistence == nil || !s.configPersistence.IsConfigured() { + glog.V(1).Infof("Config persistence not available, using default task configurations") + return + } + + // Load task configurations dynamically using the config update registry + configUpdateRegistry := tasks.GetGlobalConfigUpdateRegistry() + configUpdateRegistry.UpdateAllConfigs(s.configPersistence) +} + // GetCredentialManager returns the credential manager func (s *AdminServer) GetCredentialManager() *credential.CredentialManager { return s.credentialManager @@ -852,6 +890,15 @@ func (as *AdminServer) CancelMaintenanceTask(c *gin.Context) { c.JSON(http.StatusOK, gin.H{"success": true, "message": "Task cancelled"}) } +// cancelMaintenanceTask cancels a pending maintenance task +func (as *AdminServer) cancelMaintenanceTask(taskID string) error { + if as.maintenanceManager == nil { + return fmt.Errorf("maintenance manager not initialized") + } + + return as.maintenanceManager.CancelTask(taskID) +} + // GetMaintenanceWorkersAPI returns all maintenance workers func (as *AdminServer) GetMaintenanceWorkersAPI(c *gin.Context) { workers, err := as.getMaintenanceWorkers() @@ -899,13 +946,21 @@ func (as *AdminServer) GetMaintenanceConfigAPI(c *gin.Context) { // UpdateMaintenanceConfigAPI updates maintenance configuration via API func (as *AdminServer) UpdateMaintenanceConfigAPI(c *gin.Context) { - var config MaintenanceConfig - if err := c.ShouldBindJSON(&config); err != nil { + // Parse JSON into a generic map first to handle type conversions + var jsonConfig map[string]interface{} + if err := c.ShouldBindJSON(&jsonConfig); err != nil { c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()}) return } - err := as.updateMaintenanceConfig(&config) + // Convert JSON map to protobuf configuration + config, err := convertJSONToMaintenanceConfig(jsonConfig) + if err != nil { + c.JSON(http.StatusBadRequest, gin.H{"error": "Failed to parse configuration: " + err.Error()}) + return + } + + err = as.updateMaintenanceConfig(config) if err != nil { c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) return @@ -951,17 +1006,36 @@ func (as *AdminServer) getMaintenanceQueueData() (*maintenance.MaintenanceQueueD }, nil } +// GetMaintenanceQueueStats returns statistics for the maintenance queue (exported for handlers) +func (as *AdminServer) GetMaintenanceQueueStats() (*maintenance.QueueStats, error) { + return as.getMaintenanceQueueStats() +} + // getMaintenanceQueueStats returns statistics for the maintenance queue func (as *AdminServer) getMaintenanceQueueStats() (*maintenance.QueueStats, error) { - // This would integrate with the maintenance queue to get real statistics - // For now, return mock data - return &maintenance.QueueStats{ - PendingTasks: 5, - RunningTasks: 2, - CompletedToday: 15, - FailedToday: 1, - TotalTasks: 23, - }, nil + if as.maintenanceManager == nil { + return &maintenance.QueueStats{ + PendingTasks: 0, + RunningTasks: 0, + CompletedToday: 0, + FailedToday: 0, 
+ TotalTasks: 0, + }, nil + } + + // Get real statistics from maintenance manager + stats := as.maintenanceManager.GetStats() + + // Convert MaintenanceStats to QueueStats + queueStats := &maintenance.QueueStats{ + PendingTasks: stats.TasksByStatus[maintenance.TaskStatusPending], + RunningTasks: stats.TasksByStatus[maintenance.TaskStatusAssigned] + stats.TasksByStatus[maintenance.TaskStatusInProgress], + CompletedToday: stats.CompletedToday, + FailedToday: stats.FailedToday, + TotalTasks: stats.TotalTasks, + } + + return queueStats, nil } // getMaintenanceTasks returns all maintenance tasks @@ -1000,15 +1074,6 @@ func (as *AdminServer) getMaintenanceTask(taskID string) (*MaintenanceTask, erro return nil, fmt.Errorf("task %s not found", taskID) } -// cancelMaintenanceTask cancels a pending maintenance task -func (as *AdminServer) cancelMaintenanceTask(taskID string) error { - if as.maintenanceManager == nil { - return fmt.Errorf("maintenance manager not initialized") - } - - return as.maintenanceManager.CancelTask(taskID) -} - // getMaintenanceWorkers returns all maintenance workers func (as *AdminServer) getMaintenanceWorkers() ([]*maintenance.MaintenanceWorker, error) { if as.maintenanceManager == nil { @@ -1110,11 +1175,14 @@ func (as *AdminServer) getMaintenanceConfig() (*maintenance.MaintenanceConfigDat // Load configuration from persistent storage config, err := as.configPersistence.LoadMaintenanceConfig() if err != nil { - glog.Errorf("Failed to load maintenance configuration: %v", err) // Fallback to default configuration - config = DefaultMaintenanceConfig() + config = maintenance.DefaultMaintenanceConfig() } + // Note: Do NOT apply schema defaults to existing config as it overrides saved values + // Only apply defaults when creating new configs or handling fallback cases + // The schema defaults should only be used in the UI for new installations + // Get system stats from maintenance manager if available var systemStats *MaintenanceStats if as.maintenanceManager != nil { @@ -1139,18 +1207,25 @@ func (as *AdminServer) getMaintenanceConfig() (*maintenance.MaintenanceConfigDat } } - return &MaintenanceConfigData{ + configData := &MaintenanceConfigData{ Config: config, IsEnabled: config.Enabled, LastScanTime: systemStats.LastScanTime, NextScanTime: systemStats.NextScanTime, SystemStats: systemStats, MenuItems: maintenance.BuildMaintenanceMenuItems(), - }, nil + } + + return configData, nil } // updateMaintenanceConfig updates maintenance configuration func (as *AdminServer) updateMaintenanceConfig(config *maintenance.MaintenanceConfig) error { + // Use ConfigField validation instead of standalone validation + if err := maintenance.ValidateMaintenanceConfigWithSchema(config); err != nil { + return fmt.Errorf("configuration validation failed: %v", err) + } + // Save configuration to persistent storage if err := as.configPersistence.SaveMaintenanceConfig(config); err != nil { return fmt.Errorf("failed to save maintenance configuration: %w", err) @@ -1175,7 +1250,14 @@ func (as *AdminServer) triggerMaintenanceScan() error { return fmt.Errorf("maintenance manager not initialized") } - return as.maintenanceManager.TriggerScan() + glog.V(1).Infof("Triggering maintenance scan") + err := as.maintenanceManager.TriggerScan() + if err != nil { + glog.Errorf("Failed to trigger maintenance scan: %v", err) + return err + } + glog.V(1).Infof("Maintenance scan triggered successfully") + return nil } // TriggerTopicRetentionPurgeAPI triggers topic retention purge via HTTP API @@ -1265,14 +1347,11 @@ 
func (as *AdminServer) GetMaintenanceWorkersData() (*MaintenanceWorkersData, err } // StartWorkerGrpcServer starts the worker gRPC server -func (s *AdminServer) StartWorkerGrpcServer(httpPort int) error { +func (s *AdminServer) StartWorkerGrpcServer(grpcPort int) error { if s.workerGrpcServer != nil { return fmt.Errorf("worker gRPC server is already running") } - // Calculate gRPC port (HTTP port + 10000) - grpcPort := httpPort + 10000 - s.workerGrpcServer = NewWorkerGrpcServer(s) return s.workerGrpcServer.StartWithTLS(grpcPort) } @@ -1412,7 +1491,7 @@ func (s *AdminServer) UpdateTopicRetention(namespace, name string, enabled bool, } // Create gRPC connection - conn, err := grpc.Dial(brokerAddress, s.grpcDialOption) + conn, err := grpc.NewClient(brokerAddress, s.grpcDialOption) if err != nil { return fmt.Errorf("failed to connect to broker: %w", err) } @@ -1501,3 +1580,161 @@ func extractVersioningFromEntry(entry *filer_pb.Entry) bool { enabled, _ := s3api.LoadVersioningFromExtended(entry) return enabled } + +// GetConfigPersistence returns the config persistence manager +func (as *AdminServer) GetConfigPersistence() *ConfigPersistence { + return as.configPersistence +} + +// convertJSONToMaintenanceConfig converts JSON map to protobuf MaintenanceConfig +func convertJSONToMaintenanceConfig(jsonConfig map[string]interface{}) (*maintenance.MaintenanceConfig, error) { + config := &maintenance.MaintenanceConfig{} + + // Helper function to get int32 from interface{} + getInt32 := func(key string) (int32, error) { + if val, ok := jsonConfig[key]; ok { + switch v := val.(type) { + case int: + return int32(v), nil + case int32: + return v, nil + case int64: + return int32(v), nil + case float64: + return int32(v), nil + default: + return 0, fmt.Errorf("invalid type for %s: expected number, got %T", key, v) + } + } + return 0, nil + } + + // Helper function to get bool from interface{} + getBool := func(key string) bool { + if val, ok := jsonConfig[key]; ok { + if b, ok := val.(bool); ok { + return b + } + } + return false + } + + var err error + + // Convert basic fields + config.Enabled = getBool("enabled") + + if config.ScanIntervalSeconds, err = getInt32("scan_interval_seconds"); err != nil { + return nil, err + } + if config.WorkerTimeoutSeconds, err = getInt32("worker_timeout_seconds"); err != nil { + return nil, err + } + if config.TaskTimeoutSeconds, err = getInt32("task_timeout_seconds"); err != nil { + return nil, err + } + if config.RetryDelaySeconds, err = getInt32("retry_delay_seconds"); err != nil { + return nil, err + } + if config.MaxRetries, err = getInt32("max_retries"); err != nil { + return nil, err + } + if config.CleanupIntervalSeconds, err = getInt32("cleanup_interval_seconds"); err != nil { + return nil, err + } + if config.TaskRetentionSeconds, err = getInt32("task_retention_seconds"); err != nil { + return nil, err + } + + // Convert policy if present + if policyData, ok := jsonConfig["policy"]; ok { + if policyMap, ok := policyData.(map[string]interface{}); ok { + policy := &maintenance.MaintenancePolicy{} + + if globalMaxConcurrent, err := getInt32FromMap(policyMap, "global_max_concurrent"); err != nil { + return nil, err + } else { + policy.GlobalMaxConcurrent = globalMaxConcurrent + } + + if defaultRepeatIntervalSeconds, err := getInt32FromMap(policyMap, "default_repeat_interval_seconds"); err != nil { + return nil, err + } else { + policy.DefaultRepeatIntervalSeconds = defaultRepeatIntervalSeconds + } + + if defaultCheckIntervalSeconds, err := 
getInt32FromMap(policyMap, "default_check_interval_seconds"); err != nil { + return nil, err + } else { + policy.DefaultCheckIntervalSeconds = defaultCheckIntervalSeconds + } + + // Convert task policies if present + if taskPoliciesData, ok := policyMap["task_policies"]; ok { + if taskPoliciesMap, ok := taskPoliciesData.(map[string]interface{}); ok { + policy.TaskPolicies = make(map[string]*maintenance.TaskPolicy) + + for taskType, taskPolicyData := range taskPoliciesMap { + if taskPolicyMap, ok := taskPolicyData.(map[string]interface{}); ok { + taskPolicy := &maintenance.TaskPolicy{} + + taskPolicy.Enabled = getBoolFromMap(taskPolicyMap, "enabled") + + if maxConcurrent, err := getInt32FromMap(taskPolicyMap, "max_concurrent"); err != nil { + return nil, err + } else { + taskPolicy.MaxConcurrent = maxConcurrent + } + + if repeatIntervalSeconds, err := getInt32FromMap(taskPolicyMap, "repeat_interval_seconds"); err != nil { + return nil, err + } else { + taskPolicy.RepeatIntervalSeconds = repeatIntervalSeconds + } + + if checkIntervalSeconds, err := getInt32FromMap(taskPolicyMap, "check_interval_seconds"); err != nil { + return nil, err + } else { + taskPolicy.CheckIntervalSeconds = checkIntervalSeconds + } + + policy.TaskPolicies[taskType] = taskPolicy + } + } + } + } + + config.Policy = policy + } + } + + return config, nil +} + +// Helper functions for map conversion +func getInt32FromMap(m map[string]interface{}, key string) (int32, error) { + if val, ok := m[key]; ok { + switch v := val.(type) { + case int: + return int32(v), nil + case int32: + return v, nil + case int64: + return int32(v), nil + case float64: + return int32(v), nil + default: + return 0, fmt.Errorf("invalid type for %s: expected number, got %T", key, v) + } + } + return 0, nil +} + +func getBoolFromMap(m map[string]interface{}, key string) bool { + if val, ok := m[key]; ok { + if b, ok := val.(bool); ok { + return b + } + } + return false +} diff --git a/weed/admin/dash/collection_management.go b/weed/admin/dash/collection_management.go index a70c82918..03c1e452b 100644 --- a/weed/admin/dash/collection_management.go +++ b/weed/admin/dash/collection_management.go @@ -12,6 +12,7 @@ import ( func (s *AdminServer) GetClusterCollections() (*ClusterCollectionsData, error) { var collections []CollectionInfo var totalVolumes int + var totalEcVolumes int var totalFiles int64 var totalSize int64 collectionMap := make(map[string]*CollectionInfo) @@ -28,6 +29,7 @@ func (s *AdminServer) GetClusterCollections() (*ClusterCollectionsData, error) { for _, rack := range dc.RackInfos { for _, node := range rack.DataNodeInfos { for _, diskInfo := range node.DiskInfos { + // Process regular volumes for _, volInfo := range diskInfo.VolumeInfos { // Extract collection name from volume info collectionName := volInfo.Collection @@ -69,12 +71,13 @@ func (s *AdminServer) GetClusterCollections() (*ClusterCollectionsData, error) { totalSize += int64(volInfo.Size) } else { newCollection := CollectionInfo{ - Name: collectionName, - DataCenter: dc.Id, - VolumeCount: 1, - FileCount: int64(volInfo.FileCount), - TotalSize: int64(volInfo.Size), - DiskTypes: []string{diskType}, + Name: collectionName, + DataCenter: dc.Id, + VolumeCount: 1, + EcVolumeCount: 0, + FileCount: int64(volInfo.FileCount), + TotalSize: int64(volInfo.Size), + DiskTypes: []string{diskType}, } collectionMap[collectionName] = &newCollection totalVolumes++ @@ -82,6 +85,63 @@ func (s *AdminServer) GetClusterCollections() (*ClusterCollectionsData, error) { totalSize += 
int64(volInfo.Size) } } + + // Process EC volumes + ecVolumeMap := make(map[uint32]bool) // Track unique EC volumes to avoid double counting + for _, ecShardInfo := range diskInfo.EcShardInfos { + // Extract collection name from EC shard info + collectionName := ecShardInfo.Collection + if collectionName == "" { + collectionName = "default" // Default collection for EC volumes without explicit collection + } + + // Only count each EC volume once (not per shard) + if !ecVolumeMap[ecShardInfo.Id] { + ecVolumeMap[ecShardInfo.Id] = true + + // Get disk type from disk info, default to hdd if empty + diskType := diskInfo.Type + if diskType == "" { + diskType = "hdd" + } + + // Get or create collection info + if collection, exists := collectionMap[collectionName]; exists { + collection.EcVolumeCount++ + + // Update data center if this collection spans multiple DCs + if collection.DataCenter != dc.Id && collection.DataCenter != "multi" { + collection.DataCenter = "multi" + } + + // Add disk type if not already present + diskTypeExists := false + for _, existingDiskType := range collection.DiskTypes { + if existingDiskType == diskType { + diskTypeExists = true + break + } + } + if !diskTypeExists { + collection.DiskTypes = append(collection.DiskTypes, diskType) + } + + totalEcVolumes++ + } else { + newCollection := CollectionInfo{ + Name: collectionName, + DataCenter: dc.Id, + VolumeCount: 0, + EcVolumeCount: 1, + FileCount: 0, + TotalSize: 0, + DiskTypes: []string{diskType}, + } + collectionMap[collectionName] = &newCollection + totalEcVolumes++ + } + } + } } } } @@ -112,6 +172,7 @@ func (s *AdminServer) GetClusterCollections() (*ClusterCollectionsData, error) { Collections: []CollectionInfo{}, TotalCollections: 0, TotalVolumes: 0, + TotalEcVolumes: 0, TotalFiles: 0, TotalSize: 0, LastUpdated: time.Now(), @@ -122,8 +183,203 @@ func (s *AdminServer) GetClusterCollections() (*ClusterCollectionsData, error) { Collections: collections, TotalCollections: len(collections), TotalVolumes: totalVolumes, + TotalEcVolumes: totalEcVolumes, TotalFiles: totalFiles, TotalSize: totalSize, LastUpdated: time.Now(), }, nil } + +// GetCollectionDetails retrieves detailed information for a specific collection including volumes and EC volumes +func (s *AdminServer) GetCollectionDetails(collectionName string, page int, pageSize int, sortBy string, sortOrder string) (*CollectionDetailsData, error) { + // Set defaults + if page < 1 { + page = 1 + } + if pageSize < 1 || pageSize > 1000 { + pageSize = 25 + } + if sortBy == "" { + sortBy = "volume_id" + } + if sortOrder == "" { + sortOrder = "asc" + } + + var regularVolumes []VolumeWithTopology + var ecVolumes []EcVolumeWithShards + var totalFiles int64 + var totalSize int64 + dataCenters := make(map[string]bool) + diskTypes := make(map[string]bool) + + // Get regular volumes for this collection + regularVolumeData, err := s.GetClusterVolumes(1, 10000, "volume_id", "asc", collectionName) // Get all volumes + if err != nil { + return nil, err + } + + regularVolumes = regularVolumeData.Volumes + totalSize = regularVolumeData.TotalSize + + // Calculate total files from regular volumes + for _, vol := range regularVolumes { + totalFiles += int64(vol.FileCount) + } + + // Collect data centers and disk types from regular volumes + for _, vol := range regularVolumes { + dataCenters[vol.DataCenter] = true + diskTypes[vol.DiskType] = true + } + + // Get EC volumes for this collection + ecVolumeData, err := s.GetClusterEcVolumes(1, 10000, "volume_id", "asc", collectionName) // Get all 
EC volumes + if err != nil { + return nil, err + } + + ecVolumes = ecVolumeData.EcVolumes + + // Collect data centers from EC volumes + for _, ecVol := range ecVolumes { + for _, dc := range ecVol.DataCenters { + dataCenters[dc] = true + } + } + + // Combine all volumes for sorting and pagination + type VolumeForSorting struct { + Type string // "regular" or "ec" + RegularVolume *VolumeWithTopology + EcVolume *EcVolumeWithShards + } + + var allVolumes []VolumeForSorting + for i := range regularVolumes { + allVolumes = append(allVolumes, VolumeForSorting{ + Type: "regular", + RegularVolume: ®ularVolumes[i], + }) + } + for i := range ecVolumes { + allVolumes = append(allVolumes, VolumeForSorting{ + Type: "ec", + EcVolume: &ecVolumes[i], + }) + } + + // Sort all volumes + sort.Slice(allVolumes, func(i, j int) bool { + var less bool + switch sortBy { + case "volume_id": + var idI, idJ uint32 + if allVolumes[i].Type == "regular" { + idI = allVolumes[i].RegularVolume.Id + } else { + idI = allVolumes[i].EcVolume.VolumeID + } + if allVolumes[j].Type == "regular" { + idJ = allVolumes[j].RegularVolume.Id + } else { + idJ = allVolumes[j].EcVolume.VolumeID + } + less = idI < idJ + case "type": + // Sort by type first (regular before ec), then by volume ID + if allVolumes[i].Type == allVolumes[j].Type { + var idI, idJ uint32 + if allVolumes[i].Type == "regular" { + idI = allVolumes[i].RegularVolume.Id + } else { + idI = allVolumes[i].EcVolume.VolumeID + } + if allVolumes[j].Type == "regular" { + idJ = allVolumes[j].RegularVolume.Id + } else { + idJ = allVolumes[j].EcVolume.VolumeID + } + less = idI < idJ + } else { + less = allVolumes[i].Type < allVolumes[j].Type // "ec" < "regular" + } + default: + // Default to volume ID sort + var idI, idJ uint32 + if allVolumes[i].Type == "regular" { + idI = allVolumes[i].RegularVolume.Id + } else { + idI = allVolumes[i].EcVolume.VolumeID + } + if allVolumes[j].Type == "regular" { + idJ = allVolumes[j].RegularVolume.Id + } else { + idJ = allVolumes[j].EcVolume.VolumeID + } + less = idI < idJ + } + + if sortOrder == "desc" { + return !less + } + return less + }) + + // Apply pagination + totalVolumesAndEc := len(allVolumes) + totalPages := (totalVolumesAndEc + pageSize - 1) / pageSize + startIndex := (page - 1) * pageSize + endIndex := startIndex + pageSize + if endIndex > totalVolumesAndEc { + endIndex = totalVolumesAndEc + } + + if startIndex >= totalVolumesAndEc { + startIndex = 0 + endIndex = 0 + } + + // Extract paginated results + var paginatedRegularVolumes []VolumeWithTopology + var paginatedEcVolumes []EcVolumeWithShards + + for i := startIndex; i < endIndex; i++ { + if allVolumes[i].Type == "regular" { + paginatedRegularVolumes = append(paginatedRegularVolumes, *allVolumes[i].RegularVolume) + } else { + paginatedEcVolumes = append(paginatedEcVolumes, *allVolumes[i].EcVolume) + } + } + + // Convert maps to slices + var dcList []string + for dc := range dataCenters { + dcList = append(dcList, dc) + } + sort.Strings(dcList) + + var diskTypeList []string + for diskType := range diskTypes { + diskTypeList = append(diskTypeList, diskType) + } + sort.Strings(diskTypeList) + + return &CollectionDetailsData{ + CollectionName: collectionName, + RegularVolumes: paginatedRegularVolumes, + EcVolumes: paginatedEcVolumes, + TotalVolumes: len(regularVolumes), + TotalEcVolumes: len(ecVolumes), + TotalFiles: totalFiles, + TotalSize: totalSize, + DataCenters: dcList, + DiskTypes: diskTypeList, + LastUpdated: time.Now(), + Page: page, + PageSize: pageSize, + TotalPages: 
totalPages, + SortBy: sortBy, + SortOrder: sortOrder, + }, nil +} diff --git a/weed/admin/dash/config_persistence.go b/weed/admin/dash/config_persistence.go index a2f74f4e7..b6b3074ab 100644 --- a/weed/admin/dash/config_persistence.go +++ b/weed/admin/dash/config_persistence.go @@ -1,23 +1,50 @@ package dash import ( - "encoding/json" "fmt" "os" "path/filepath" "time" "github.com/seaweedfs/seaweedfs/weed/glog" + "github.com/seaweedfs/seaweedfs/weed/pb/worker_pb" + "github.com/seaweedfs/seaweedfs/weed/worker/tasks/balance" + "github.com/seaweedfs/seaweedfs/weed/worker/tasks/erasure_coding" + "github.com/seaweedfs/seaweedfs/weed/worker/tasks/vacuum" + "google.golang.org/protobuf/encoding/protojson" + "google.golang.org/protobuf/proto" ) const ( - // Configuration file names - MaintenanceConfigFile = "maintenance.json" - AdminConfigFile = "admin.json" + // Configuration subdirectory + ConfigSubdir = "conf" + + // Configuration file names (protobuf binary) + MaintenanceConfigFile = "maintenance.pb" + VacuumTaskConfigFile = "task_vacuum.pb" + ECTaskConfigFile = "task_erasure_coding.pb" + BalanceTaskConfigFile = "task_balance.pb" + ReplicationTaskConfigFile = "task_replication.pb" + + // JSON reference files + MaintenanceConfigJSONFile = "maintenance.json" + VacuumTaskConfigJSONFile = "task_vacuum.json" + ECTaskConfigJSONFile = "task_erasure_coding.json" + BalanceTaskConfigJSONFile = "task_balance.json" + ReplicationTaskConfigJSONFile = "task_replication.json" + ConfigDirPermissions = 0755 ConfigFilePermissions = 0644 ) +// Task configuration types +type ( + VacuumTaskConfig = worker_pb.VacuumTaskConfig + ErasureCodingTaskConfig = worker_pb.ErasureCodingTaskConfig + BalanceTaskConfig = worker_pb.BalanceTaskConfig + ReplicationTaskConfig = worker_pb.ReplicationTaskConfig +) + // ConfigPersistence handles saving and loading configuration files type ConfigPersistence struct { dataDir string @@ -30,122 +57,67 @@ func NewConfigPersistence(dataDir string) *ConfigPersistence { } } -// SaveMaintenanceConfig saves maintenance configuration to JSON file +// SaveMaintenanceConfig saves maintenance configuration to protobuf file and JSON reference func (cp *ConfigPersistence) SaveMaintenanceConfig(config *MaintenanceConfig) error { if cp.dataDir == "" { return fmt.Errorf("no data directory specified, cannot save configuration") } - configPath := filepath.Join(cp.dataDir, MaintenanceConfigFile) - - // Create directory if it doesn't exist - if err := os.MkdirAll(cp.dataDir, ConfigDirPermissions); err != nil { + confDir := filepath.Join(cp.dataDir, ConfigSubdir) + if err := os.MkdirAll(confDir, ConfigDirPermissions); err != nil { return fmt.Errorf("failed to create config directory: %w", err) } - // Marshal configuration to JSON - configData, err := json.MarshalIndent(config, "", " ") + // Save as protobuf (primary format) + pbConfigPath := filepath.Join(confDir, MaintenanceConfigFile) + pbData, err := proto.Marshal(config) if err != nil { - return fmt.Errorf("failed to marshal maintenance config: %w", err) + return fmt.Errorf("failed to marshal maintenance config to protobuf: %w", err) } - // Write to file - if err := os.WriteFile(configPath, configData, ConfigFilePermissions); err != nil { - return fmt.Errorf("failed to write maintenance config file: %w", err) + if err := os.WriteFile(pbConfigPath, pbData, ConfigFilePermissions); err != nil { + return fmt.Errorf("failed to write protobuf config file: %w", err) + } + + // Save JSON reference copy for debugging + jsonConfigPath := filepath.Join(confDir, 
MaintenanceConfigJSONFile) + jsonData, err := protojson.MarshalOptions{ + Multiline: true, + Indent: " ", + EmitUnpopulated: true, + }.Marshal(config) + if err != nil { + return fmt.Errorf("failed to marshal maintenance config to JSON: %w", err) + } + + if err := os.WriteFile(jsonConfigPath, jsonData, ConfigFilePermissions); err != nil { + return fmt.Errorf("failed to write JSON reference file: %w", err) } - glog.V(1).Infof("Saved maintenance configuration to %s", configPath) return nil } -// LoadMaintenanceConfig loads maintenance configuration from JSON file +// LoadMaintenanceConfig loads maintenance configuration from protobuf file func (cp *ConfigPersistence) LoadMaintenanceConfig() (*MaintenanceConfig, error) { if cp.dataDir == "" { - glog.V(1).Infof("No data directory specified, using default maintenance configuration") return DefaultMaintenanceConfig(), nil } - configPath := filepath.Join(cp.dataDir, MaintenanceConfigFile) + confDir := filepath.Join(cp.dataDir, ConfigSubdir) + configPath := filepath.Join(confDir, MaintenanceConfigFile) - // Check if file exists - if _, err := os.Stat(configPath); os.IsNotExist(err) { - glog.V(1).Infof("Maintenance config file does not exist, using defaults: %s", configPath) - return DefaultMaintenanceConfig(), nil + // Try to load from protobuf file + if configData, err := os.ReadFile(configPath); err == nil { + var config MaintenanceConfig + if err := proto.Unmarshal(configData, &config); err == nil { + // Always populate policy from separate task configuration files + config.Policy = buildPolicyFromTaskConfigs() + return &config, nil + } } - // Read file - configData, err := os.ReadFile(configPath) - if err != nil { - return nil, fmt.Errorf("failed to read maintenance config file: %w", err) - } - - // Unmarshal JSON - var config MaintenanceConfig - if err := json.Unmarshal(configData, &config); err != nil { - return nil, fmt.Errorf("failed to unmarshal maintenance config: %w", err) - } - - glog.V(1).Infof("Loaded maintenance configuration from %s", configPath) - return &config, nil -} - -// SaveAdminConfig saves general admin configuration to JSON file -func (cp *ConfigPersistence) SaveAdminConfig(config map[string]interface{}) error { - if cp.dataDir == "" { - return fmt.Errorf("no data directory specified, cannot save configuration") - } - - configPath := filepath.Join(cp.dataDir, AdminConfigFile) - - // Create directory if it doesn't exist - if err := os.MkdirAll(cp.dataDir, ConfigDirPermissions); err != nil { - return fmt.Errorf("failed to create config directory: %w", err) - } - - // Marshal configuration to JSON - configData, err := json.MarshalIndent(config, "", " ") - if err != nil { - return fmt.Errorf("failed to marshal admin config: %w", err) - } - - // Write to file - if err := os.WriteFile(configPath, configData, ConfigFilePermissions); err != nil { - return fmt.Errorf("failed to write admin config file: %w", err) - } - - glog.V(1).Infof("Saved admin configuration to %s", configPath) - return nil -} - -// LoadAdminConfig loads general admin configuration from JSON file -func (cp *ConfigPersistence) LoadAdminConfig() (map[string]interface{}, error) { - if cp.dataDir == "" { - glog.V(1).Infof("No data directory specified, using default admin configuration") - return make(map[string]interface{}), nil - } - - configPath := filepath.Join(cp.dataDir, AdminConfigFile) - - // Check if file exists - if _, err := os.Stat(configPath); os.IsNotExist(err) { - glog.V(1).Infof("Admin config file does not exist, using defaults: %s", configPath) - 
return make(map[string]interface{}), nil - } - - // Read file - configData, err := os.ReadFile(configPath) - if err != nil { - return nil, fmt.Errorf("failed to read admin config file: %w", err) - } - - // Unmarshal JSON - var config map[string]interface{} - if err := json.Unmarshal(configData, &config); err != nil { - return nil, fmt.Errorf("failed to unmarshal admin config: %w", err) - } - - glog.V(1).Infof("Loaded admin configuration from %s", configPath) - return config, nil + // File doesn't exist or failed to load, use defaults + return DefaultMaintenanceConfig(), nil } // GetConfigPath returns the path to a configuration file @@ -153,24 +125,35 @@ func (cp *ConfigPersistence) GetConfigPath(filename string) string { if cp.dataDir == "" { return "" } - return filepath.Join(cp.dataDir, filename) + + // All configs go in conf subdirectory + confDir := filepath.Join(cp.dataDir, ConfigSubdir) + return filepath.Join(confDir, filename) } -// ListConfigFiles returns all configuration files in the data directory +// ListConfigFiles returns all configuration files in the conf subdirectory func (cp *ConfigPersistence) ListConfigFiles() ([]string, error) { if cp.dataDir == "" { return nil, fmt.Errorf("no data directory specified") } - files, err := os.ReadDir(cp.dataDir) + confDir := filepath.Join(cp.dataDir, ConfigSubdir) + files, err := os.ReadDir(confDir) if err != nil { + // If conf directory doesn't exist, return empty list + if os.IsNotExist(err) { + return []string{}, nil + } return nil, fmt.Errorf("failed to read config directory: %w", err) } var configFiles []string for _, file := range files { - if !file.IsDir() && filepath.Ext(file.Name()) == ".json" { - configFiles = append(configFiles, file.Name()) + if !file.IsDir() { + ext := filepath.Ext(file.Name()) + if ext == ".json" || ext == ".pb" { + configFiles = append(configFiles, file.Name()) + } } } @@ -183,7 +166,7 @@ func (cp *ConfigPersistence) BackupConfig(filename string) error { return fmt.Errorf("no data directory specified") } - configPath := filepath.Join(cp.dataDir, filename) + configPath := cp.GetConfigPath(filename) if _, err := os.Stat(configPath); os.IsNotExist(err) { return fmt.Errorf("config file does not exist: %s", filename) } @@ -191,7 +174,10 @@ func (cp *ConfigPersistence) BackupConfig(filename string) error { // Create backup filename with timestamp timestamp := time.Now().Format("2006-01-02_15-04-05") backupName := fmt.Sprintf("%s.backup_%s", filename, timestamp) - backupPath := filepath.Join(cp.dataDir, backupName) + + // Determine backup directory (conf subdirectory) + confDir := filepath.Join(cp.dataDir, ConfigSubdir) + backupPath := filepath.Join(confDir, backupName) // Copy file configData, err := os.ReadFile(configPath) @@ -213,7 +199,10 @@ func (cp *ConfigPersistence) RestoreConfig(filename, backupName string) error { return fmt.Errorf("no data directory specified") } - backupPath := filepath.Join(cp.dataDir, backupName) + // Determine backup path (conf subdirectory) + confDir := filepath.Join(cp.dataDir, ConfigSubdir) + backupPath := filepath.Join(confDir, backupName) + if _, err := os.Stat(backupPath); os.IsNotExist(err) { return fmt.Errorf("backup file does not exist: %s", backupName) } @@ -225,7 +214,7 @@ func (cp *ConfigPersistence) RestoreConfig(filename, backupName string) error { } // Write to config file - configPath := filepath.Join(cp.dataDir, filename) + configPath := cp.GetConfigPath(filename) if err := os.WriteFile(configPath, backupData, ConfigFilePermissions); err != nil { return 
fmt.Errorf("failed to restore config: %w", err) } @@ -234,6 +223,364 @@ func (cp *ConfigPersistence) RestoreConfig(filename, backupName string) error { return nil } +// SaveVacuumTaskConfig saves vacuum task configuration to protobuf file +func (cp *ConfigPersistence) SaveVacuumTaskConfig(config *VacuumTaskConfig) error { + return cp.saveTaskConfig(VacuumTaskConfigFile, config) +} + +// SaveVacuumTaskPolicy saves complete vacuum task policy to protobuf file +func (cp *ConfigPersistence) SaveVacuumTaskPolicy(policy *worker_pb.TaskPolicy) error { + return cp.saveTaskConfig(VacuumTaskConfigFile, policy) +} + +// LoadVacuumTaskConfig loads vacuum task configuration from protobuf file +func (cp *ConfigPersistence) LoadVacuumTaskConfig() (*VacuumTaskConfig, error) { + // Load as TaskPolicy and extract vacuum config + if taskPolicy, err := cp.LoadVacuumTaskPolicy(); err == nil && taskPolicy != nil { + if vacuumConfig := taskPolicy.GetVacuumConfig(); vacuumConfig != nil { + return vacuumConfig, nil + } + } + + // Return default config if no valid config found + return &VacuumTaskConfig{ + GarbageThreshold: 0.3, + MinVolumeAgeHours: 24, + MinIntervalSeconds: 7 * 24 * 60 * 60, // 7 days + }, nil +} + +// LoadVacuumTaskPolicy loads complete vacuum task policy from protobuf file +func (cp *ConfigPersistence) LoadVacuumTaskPolicy() (*worker_pb.TaskPolicy, error) { + if cp.dataDir == "" { + // Return default policy if no data directory + return &worker_pb.TaskPolicy{ + Enabled: true, + MaxConcurrent: 2, + RepeatIntervalSeconds: 24 * 3600, // 24 hours in seconds + CheckIntervalSeconds: 6 * 3600, // 6 hours in seconds + TaskConfig: &worker_pb.TaskPolicy_VacuumConfig{ + VacuumConfig: &worker_pb.VacuumTaskConfig{ + GarbageThreshold: 0.3, + MinVolumeAgeHours: 24, + MinIntervalSeconds: 7 * 24 * 60 * 60, // 7 days + }, + }, + }, nil + } + + confDir := filepath.Join(cp.dataDir, ConfigSubdir) + configPath := filepath.Join(confDir, VacuumTaskConfigFile) + + // Check if file exists + if _, err := os.Stat(configPath); os.IsNotExist(err) { + // Return default policy if file doesn't exist + return &worker_pb.TaskPolicy{ + Enabled: true, + MaxConcurrent: 2, + RepeatIntervalSeconds: 24 * 3600, // 24 hours in seconds + CheckIntervalSeconds: 6 * 3600, // 6 hours in seconds + TaskConfig: &worker_pb.TaskPolicy_VacuumConfig{ + VacuumConfig: &worker_pb.VacuumTaskConfig{ + GarbageThreshold: 0.3, + MinVolumeAgeHours: 24, + MinIntervalSeconds: 7 * 24 * 60 * 60, // 7 days + }, + }, + }, nil + } + + // Read file + configData, err := os.ReadFile(configPath) + if err != nil { + return nil, fmt.Errorf("failed to read vacuum task config file: %w", err) + } + + // Try to unmarshal as TaskPolicy + var policy worker_pb.TaskPolicy + if err := proto.Unmarshal(configData, &policy); err == nil { + // Validate that it's actually a TaskPolicy with vacuum config + if policy.GetVacuumConfig() != nil { + glog.V(1).Infof("Loaded vacuum task policy from %s", configPath) + return &policy, nil + } + } + + return nil, fmt.Errorf("failed to unmarshal vacuum task configuration") +} + +// SaveErasureCodingTaskConfig saves EC task configuration to protobuf file +func (cp *ConfigPersistence) SaveErasureCodingTaskConfig(config *ErasureCodingTaskConfig) error { + return cp.saveTaskConfig(ECTaskConfigFile, config) +} + +// SaveErasureCodingTaskPolicy saves complete EC task policy to protobuf file +func (cp *ConfigPersistence) SaveErasureCodingTaskPolicy(policy *worker_pb.TaskPolicy) error { + return cp.saveTaskConfig(ECTaskConfigFile, policy) +} + +// 
LoadErasureCodingTaskConfig loads EC task configuration from protobuf file +func (cp *ConfigPersistence) LoadErasureCodingTaskConfig() (*ErasureCodingTaskConfig, error) { + // Load as TaskPolicy and extract EC config + if taskPolicy, err := cp.LoadErasureCodingTaskPolicy(); err == nil && taskPolicy != nil { + if ecConfig := taskPolicy.GetErasureCodingConfig(); ecConfig != nil { + return ecConfig, nil + } + } + + // Return default config if no valid config found + return &ErasureCodingTaskConfig{ + FullnessRatio: 0.9, + QuietForSeconds: 3600, + MinVolumeSizeMb: 1024, + CollectionFilter: "", + }, nil +} + +// LoadErasureCodingTaskPolicy loads complete EC task policy from protobuf file +func (cp *ConfigPersistence) LoadErasureCodingTaskPolicy() (*worker_pb.TaskPolicy, error) { + if cp.dataDir == "" { + // Return default policy if no data directory + return &worker_pb.TaskPolicy{ + Enabled: true, + MaxConcurrent: 1, + RepeatIntervalSeconds: 168 * 3600, // 1 week in seconds + CheckIntervalSeconds: 24 * 3600, // 24 hours in seconds + TaskConfig: &worker_pb.TaskPolicy_ErasureCodingConfig{ + ErasureCodingConfig: &worker_pb.ErasureCodingTaskConfig{ + FullnessRatio: 0.9, + QuietForSeconds: 3600, + MinVolumeSizeMb: 1024, + CollectionFilter: "", + }, + }, + }, nil + } + + confDir := filepath.Join(cp.dataDir, ConfigSubdir) + configPath := filepath.Join(confDir, ECTaskConfigFile) + + // Check if file exists + if _, err := os.Stat(configPath); os.IsNotExist(err) { + // Return default policy if file doesn't exist + return &worker_pb.TaskPolicy{ + Enabled: true, + MaxConcurrent: 1, + RepeatIntervalSeconds: 168 * 3600, // 1 week in seconds + CheckIntervalSeconds: 24 * 3600, // 24 hours in seconds + TaskConfig: &worker_pb.TaskPolicy_ErasureCodingConfig{ + ErasureCodingConfig: &worker_pb.ErasureCodingTaskConfig{ + FullnessRatio: 0.9, + QuietForSeconds: 3600, + MinVolumeSizeMb: 1024, + CollectionFilter: "", + }, + }, + }, nil + } + + // Read file + configData, err := os.ReadFile(configPath) + if err != nil { + return nil, fmt.Errorf("failed to read EC task config file: %w", err) + } + + // Try to unmarshal as TaskPolicy + var policy worker_pb.TaskPolicy + if err := proto.Unmarshal(configData, &policy); err == nil { + // Validate that it's actually a TaskPolicy with EC config + if policy.GetErasureCodingConfig() != nil { + glog.V(1).Infof("Loaded EC task policy from %s", configPath) + return &policy, nil + } + } + + return nil, fmt.Errorf("failed to unmarshal EC task configuration") +} + +// SaveBalanceTaskConfig saves balance task configuration to protobuf file +func (cp *ConfigPersistence) SaveBalanceTaskConfig(config *BalanceTaskConfig) error { + return cp.saveTaskConfig(BalanceTaskConfigFile, config) +} + +// SaveBalanceTaskPolicy saves complete balance task policy to protobuf file +func (cp *ConfigPersistence) SaveBalanceTaskPolicy(policy *worker_pb.TaskPolicy) error { + return cp.saveTaskConfig(BalanceTaskConfigFile, policy) +} + +// LoadBalanceTaskConfig loads balance task configuration from protobuf file +func (cp *ConfigPersistence) LoadBalanceTaskConfig() (*BalanceTaskConfig, error) { + // Load as TaskPolicy and extract balance config + if taskPolicy, err := cp.LoadBalanceTaskPolicy(); err == nil && taskPolicy != nil { + if balanceConfig := taskPolicy.GetBalanceConfig(); balanceConfig != nil { + return balanceConfig, nil + } + } + + // Return default config if no valid config found + return &BalanceTaskConfig{ + ImbalanceThreshold: 0.1, + MinServerCount: 2, + }, nil +} + +// LoadBalanceTaskPolicy 
loads complete balance task policy from protobuf file +func (cp *ConfigPersistence) LoadBalanceTaskPolicy() (*worker_pb.TaskPolicy, error) { + if cp.dataDir == "" { + // Return default policy if no data directory + return &worker_pb.TaskPolicy{ + Enabled: true, + MaxConcurrent: 1, + RepeatIntervalSeconds: 6 * 3600, // 6 hours in seconds + CheckIntervalSeconds: 12 * 3600, // 12 hours in seconds + TaskConfig: &worker_pb.TaskPolicy_BalanceConfig{ + BalanceConfig: &worker_pb.BalanceTaskConfig{ + ImbalanceThreshold: 0.1, + MinServerCount: 2, + }, + }, + }, nil + } + + confDir := filepath.Join(cp.dataDir, ConfigSubdir) + configPath := filepath.Join(confDir, BalanceTaskConfigFile) + + // Check if file exists + if _, err := os.Stat(configPath); os.IsNotExist(err) { + // Return default policy if file doesn't exist + return &worker_pb.TaskPolicy{ + Enabled: true, + MaxConcurrent: 1, + RepeatIntervalSeconds: 6 * 3600, // 6 hours in seconds + CheckIntervalSeconds: 12 * 3600, // 12 hours in seconds + TaskConfig: &worker_pb.TaskPolicy_BalanceConfig{ + BalanceConfig: &worker_pb.BalanceTaskConfig{ + ImbalanceThreshold: 0.1, + MinServerCount: 2, + }, + }, + }, nil + } + + // Read file + configData, err := os.ReadFile(configPath) + if err != nil { + return nil, fmt.Errorf("failed to read balance task config file: %w", err) + } + + // Try to unmarshal as TaskPolicy + var policy worker_pb.TaskPolicy + if err := proto.Unmarshal(configData, &policy); err == nil { + // Validate that it's actually a TaskPolicy with balance config + if policy.GetBalanceConfig() != nil { + glog.V(1).Infof("Loaded balance task policy from %s", configPath) + return &policy, nil + } + } + + return nil, fmt.Errorf("failed to unmarshal balance task configuration") +} + +// SaveReplicationTaskConfig saves replication task configuration to protobuf file +func (cp *ConfigPersistence) SaveReplicationTaskConfig(config *ReplicationTaskConfig) error { + return cp.saveTaskConfig(ReplicationTaskConfigFile, config) +} + +// LoadReplicationTaskConfig loads replication task configuration from protobuf file +func (cp *ConfigPersistence) LoadReplicationTaskConfig() (*ReplicationTaskConfig, error) { + var config ReplicationTaskConfig + err := cp.loadTaskConfig(ReplicationTaskConfigFile, &config) + if err != nil { + // Return default config if file doesn't exist + if os.IsNotExist(err) { + return &ReplicationTaskConfig{ + TargetReplicaCount: 1, + }, nil + } + return nil, err + } + return &config, nil +} + +// saveTaskConfig is a generic helper for saving task configurations with both protobuf and JSON reference +func (cp *ConfigPersistence) saveTaskConfig(filename string, config proto.Message) error { + if cp.dataDir == "" { + return fmt.Errorf("no data directory specified, cannot save task configuration") + } + + // Create conf subdirectory path + confDir := filepath.Join(cp.dataDir, ConfigSubdir) + configPath := filepath.Join(confDir, filename) + + // Generate JSON reference filename + jsonFilename := filename[:len(filename)-3] + ".json" // Replace .pb with .json + jsonPath := filepath.Join(confDir, jsonFilename) + + // Create conf directory if it doesn't exist + if err := os.MkdirAll(confDir, ConfigDirPermissions); err != nil { + return fmt.Errorf("failed to create config directory: %w", err) + } + + // Marshal configuration to protobuf binary format + configData, err := proto.Marshal(config) + if err != nil { + return fmt.Errorf("failed to marshal task config: %w", err) + } + + // Write protobuf file + if err := os.WriteFile(configPath, configData, 
ConfigFilePermissions); err != nil { + return fmt.Errorf("failed to write task config file: %w", err) + } + + // Marshal configuration to JSON for reference + marshaler := protojson.MarshalOptions{ + Multiline: true, + Indent: " ", + EmitUnpopulated: true, + } + jsonData, err := marshaler.Marshal(config) + if err != nil { + glog.Warningf("Failed to marshal task config to JSON reference: %v", err) + } else { + // Write JSON reference file + if err := os.WriteFile(jsonPath, jsonData, ConfigFilePermissions); err != nil { + glog.Warningf("Failed to write task config JSON reference: %v", err) + } + } + + glog.V(1).Infof("Saved task configuration to %s (with JSON reference)", configPath) + return nil +} + +// loadTaskConfig is a generic helper for loading task configurations from conf subdirectory +func (cp *ConfigPersistence) loadTaskConfig(filename string, config proto.Message) error { + if cp.dataDir == "" { + return os.ErrNotExist // Will trigger default config return + } + + confDir := filepath.Join(cp.dataDir, ConfigSubdir) + configPath := filepath.Join(confDir, filename) + + // Check if file exists + if _, err := os.Stat(configPath); os.IsNotExist(err) { + return err // Will trigger default config return + } + + // Read file + configData, err := os.ReadFile(configPath) + if err != nil { + return fmt.Errorf("failed to read task config file: %w", err) + } + + // Unmarshal protobuf binary data + if err := proto.Unmarshal(configData, config); err != nil { + return fmt.Errorf("failed to unmarshal task config: %w", err) + } + + glog.V(1).Infof("Loaded task configuration from %s", configPath) + return nil +} + // GetDataDir returns the data directory path func (cp *ConfigPersistence) GetDataDir() string { return cp.dataDir @@ -249,6 +596,7 @@ func (cp *ConfigPersistence) GetConfigInfo() map[string]interface{} { info := map[string]interface{}{ "data_dir_configured": cp.IsConfigured(), "data_dir": cp.dataDir, + "config_subdir": ConfigSubdir, } if cp.IsConfigured() { @@ -256,10 +604,18 @@ func (cp *ConfigPersistence) GetConfigInfo() map[string]interface{} { if _, err := os.Stat(cp.dataDir); err == nil { info["data_dir_exists"] = true - // List config files - configFiles, err := cp.ListConfigFiles() - if err == nil { - info["config_files"] = configFiles + // Check if conf subdirectory exists + confDir := filepath.Join(cp.dataDir, ConfigSubdir) + if _, err := os.Stat(confDir); err == nil { + info["conf_dir_exists"] = true + + // List config files + configFiles, err := cp.ListConfigFiles() + if err == nil { + info["config_files"] = configFiles + } + } else { + info["conf_dir_exists"] = false } } else { info["data_dir_exists"] = false @@ -268,3 +624,67 @@ func (cp *ConfigPersistence) GetConfigInfo() map[string]interface{} { return info } + +// buildPolicyFromTaskConfigs loads task configurations from separate files and builds a MaintenancePolicy +func buildPolicyFromTaskConfigs() *worker_pb.MaintenancePolicy { + policy := &worker_pb.MaintenancePolicy{ + GlobalMaxConcurrent: 4, + DefaultRepeatIntervalSeconds: 6 * 3600, // 6 hours in seconds + DefaultCheckIntervalSeconds: 12 * 3600, // 12 hours in seconds + TaskPolicies: make(map[string]*worker_pb.TaskPolicy), + } + + // Load vacuum task configuration + if vacuumConfig := vacuum.LoadConfigFromPersistence(nil); vacuumConfig != nil { + policy.TaskPolicies["vacuum"] = &worker_pb.TaskPolicy{ + Enabled: vacuumConfig.Enabled, + MaxConcurrent: int32(vacuumConfig.MaxConcurrent), + RepeatIntervalSeconds: int32(vacuumConfig.ScanIntervalSeconds), + 
CheckIntervalSeconds: int32(vacuumConfig.ScanIntervalSeconds), + TaskConfig: &worker_pb.TaskPolicy_VacuumConfig{ + VacuumConfig: &worker_pb.VacuumTaskConfig{ + GarbageThreshold: float64(vacuumConfig.GarbageThreshold), + MinVolumeAgeHours: int32(vacuumConfig.MinVolumeAgeSeconds / 3600), // Convert seconds to hours + MinIntervalSeconds: int32(vacuumConfig.MinIntervalSeconds), + }, + }, + } + } + + // Load erasure coding task configuration + if ecConfig := erasure_coding.LoadConfigFromPersistence(nil); ecConfig != nil { + policy.TaskPolicies["erasure_coding"] = &worker_pb.TaskPolicy{ + Enabled: ecConfig.Enabled, + MaxConcurrent: int32(ecConfig.MaxConcurrent), + RepeatIntervalSeconds: int32(ecConfig.ScanIntervalSeconds), + CheckIntervalSeconds: int32(ecConfig.ScanIntervalSeconds), + TaskConfig: &worker_pb.TaskPolicy_ErasureCodingConfig{ + ErasureCodingConfig: &worker_pb.ErasureCodingTaskConfig{ + FullnessRatio: float64(ecConfig.FullnessRatio), + QuietForSeconds: int32(ecConfig.QuietForSeconds), + MinVolumeSizeMb: int32(ecConfig.MinSizeMB), + CollectionFilter: ecConfig.CollectionFilter, + }, + }, + } + } + + // Load balance task configuration + if balanceConfig := balance.LoadConfigFromPersistence(nil); balanceConfig != nil { + policy.TaskPolicies["balance"] = &worker_pb.TaskPolicy{ + Enabled: balanceConfig.Enabled, + MaxConcurrent: int32(balanceConfig.MaxConcurrent), + RepeatIntervalSeconds: int32(balanceConfig.ScanIntervalSeconds), + CheckIntervalSeconds: int32(balanceConfig.ScanIntervalSeconds), + TaskConfig: &worker_pb.TaskPolicy_BalanceConfig{ + BalanceConfig: &worker_pb.BalanceTaskConfig{ + ImbalanceThreshold: float64(balanceConfig.ImbalanceThreshold), + MinServerCount: int32(balanceConfig.MinServerCount), + }, + }, + } + } + + glog.V(1).Infof("Built maintenance policy from separate task configs - %d task policies loaded", len(policy.TaskPolicies)) + return policy +} diff --git a/weed/admin/dash/ec_shard_management.go b/weed/admin/dash/ec_shard_management.go new file mode 100644 index 000000000..272890cf0 --- /dev/null +++ b/weed/admin/dash/ec_shard_management.go @@ -0,0 +1,734 @@ +package dash + +import ( + "context" + "fmt" + "sort" + "time" + + "github.com/seaweedfs/seaweedfs/weed/glog" + "github.com/seaweedfs/seaweedfs/weed/pb" + "github.com/seaweedfs/seaweedfs/weed/pb/master_pb" + "github.com/seaweedfs/seaweedfs/weed/pb/volume_server_pb" + "github.com/seaweedfs/seaweedfs/weed/storage/erasure_coding" +) + +// GetClusterEcShards retrieves cluster EC shards data with pagination, sorting, and filtering +func (s *AdminServer) GetClusterEcShards(page int, pageSize int, sortBy string, sortOrder string, collection string) (*ClusterEcShardsData, error) { + // Set defaults + if page < 1 { + page = 1 + } + if pageSize < 1 || pageSize > 1000 { + pageSize = 100 + } + if sortBy == "" { + sortBy = "volume_id" + } + if sortOrder == "" { + sortOrder = "asc" + } + + var ecShards []EcShardWithInfo + volumeShardsMap := make(map[uint32]map[int]bool) // volumeId -> set of shards present + volumesWithAllShards := 0 + volumesWithMissingShards := 0 + + // Get detailed EC shard information via gRPC + err := s.WithMasterClient(func(client master_pb.SeaweedClient) error { + resp, err := client.VolumeList(context.Background(), &master_pb.VolumeListRequest{}) + if err != nil { + return err + } + + if resp.TopologyInfo != nil { + for _, dc := range resp.TopologyInfo.DataCenterInfos { + for _, rack := range dc.RackInfos { + for _, node := range rack.DataNodeInfos { + for _, diskInfo := range node.DiskInfos { + // 
Process EC shard information + for _, ecShardInfo := range diskInfo.EcShardInfos { + volumeId := ecShardInfo.Id + + // Initialize volume shards map if needed + if volumeShardsMap[volumeId] == nil { + volumeShardsMap[volumeId] = make(map[int]bool) + } + + // Create individual shard entries for each shard this server has + shardBits := ecShardInfo.EcIndexBits + for shardId := 0; shardId < erasure_coding.TotalShardsCount; shardId++ { + if (shardBits & (1 << uint(shardId))) != 0 { + // Mark this shard as present for this volume + volumeShardsMap[volumeId][shardId] = true + + ecShard := EcShardWithInfo{ + VolumeID: volumeId, + ShardID: uint32(shardId), + Collection: ecShardInfo.Collection, + Size: 0, // EC shards don't have individual size in the API response + Server: node.Id, + DataCenter: dc.Id, + Rack: rack.Id, + DiskType: diskInfo.Type, + ModifiedTime: 0, // Not available in current API + EcIndexBits: ecShardInfo.EcIndexBits, + ShardCount: getShardCount(ecShardInfo.EcIndexBits), + } + ecShards = append(ecShards, ecShard) + } + } + } + } + } + } + } + } + + return nil + }) + + if err != nil { + return nil, err + } + + // Calculate volume-level completeness (across all servers) + volumeCompleteness := make(map[uint32]bool) + volumeMissingShards := make(map[uint32][]int) + + for volumeId, shardsPresent := range volumeShardsMap { + var missingShards []int + shardCount := len(shardsPresent) + + // Find which shards are missing for this volume across ALL servers + for shardId := 0; shardId < erasure_coding.TotalShardsCount; shardId++ { + if !shardsPresent[shardId] { + missingShards = append(missingShards, shardId) + } + } + + isComplete := (shardCount == erasure_coding.TotalShardsCount) + volumeCompleteness[volumeId] = isComplete + volumeMissingShards[volumeId] = missingShards + + if isComplete { + volumesWithAllShards++ + } else { + volumesWithMissingShards++ + } + } + + // Update completeness info for each shard based on volume-level completeness + for i := range ecShards { + volumeId := ecShards[i].VolumeID + ecShards[i].IsComplete = volumeCompleteness[volumeId] + ecShards[i].MissingShards = volumeMissingShards[volumeId] + } + + // Filter by collection if specified + if collection != "" { + var filteredShards []EcShardWithInfo + for _, shard := range ecShards { + if shard.Collection == collection { + filteredShards = append(filteredShards, shard) + } + } + ecShards = filteredShards + } + + // Sort the results + sortEcShards(ecShards, sortBy, sortOrder) + + // Calculate statistics for conditional display + dataCenters := make(map[string]bool) + racks := make(map[string]bool) + collections := make(map[string]bool) + + for _, shard := range ecShards { + dataCenters[shard.DataCenter] = true + racks[shard.Rack] = true + if shard.Collection != "" { + collections[shard.Collection] = true + } + } + + // Pagination + totalShards := len(ecShards) + totalPages := (totalShards + pageSize - 1) / pageSize + startIndex := (page - 1) * pageSize + endIndex := startIndex + pageSize + if endIndex > totalShards { + endIndex = totalShards + } + + if startIndex >= totalShards { + startIndex = 0 + endIndex = 0 + } + + paginatedShards := ecShards[startIndex:endIndex] + + // Build response + data := &ClusterEcShardsData{ + EcShards: paginatedShards, + TotalShards: totalShards, + TotalVolumes: len(volumeShardsMap), + LastUpdated: time.Now(), + + // Pagination + CurrentPage: page, + TotalPages: totalPages, + PageSize: pageSize, + + // Sorting + SortBy: sortBy, + SortOrder: sortOrder, + + // Statistics + 
DataCenterCount: len(dataCenters), + RackCount: len(racks), + CollectionCount: len(collections), + + // Conditional display flags + ShowDataCenterColumn: len(dataCenters) > 1, + ShowRackColumn: len(racks) > 1, + ShowCollectionColumn: len(collections) > 1 || collection != "", + + // Filtering + FilterCollection: collection, + + // EC specific statistics + ShardsPerVolume: make(map[uint32]int), // This will be recalculated below + VolumesWithAllShards: volumesWithAllShards, + VolumesWithMissingShards: volumesWithMissingShards, + } + + // Recalculate ShardsPerVolume for the response + for volumeId, shardsPresent := range volumeShardsMap { + data.ShardsPerVolume[volumeId] = len(shardsPresent) + } + + // Set single values when only one exists + if len(dataCenters) == 1 { + for dc := range dataCenters { + data.SingleDataCenter = dc + break + } + } + if len(racks) == 1 { + for rack := range racks { + data.SingleRack = rack + break + } + } + if len(collections) == 1 { + for col := range collections { + data.SingleCollection = col + break + } + } + + return data, nil +} + +// GetClusterEcVolumes retrieves cluster EC volumes data grouped by volume ID with shard locations +func (s *AdminServer) GetClusterEcVolumes(page int, pageSize int, sortBy string, sortOrder string, collection string) (*ClusterEcVolumesData, error) { + // Set defaults + if page < 1 { + page = 1 + } + if pageSize < 1 || pageSize > 1000 { + pageSize = 100 + } + if sortBy == "" { + sortBy = "volume_id" + } + if sortOrder == "" { + sortOrder = "asc" + } + + volumeData := make(map[uint32]*EcVolumeWithShards) + totalShards := 0 + + // Get detailed EC shard information via gRPC + err := s.WithMasterClient(func(client master_pb.SeaweedClient) error { + resp, err := client.VolumeList(context.Background(), &master_pb.VolumeListRequest{}) + if err != nil { + return err + } + + if resp.TopologyInfo != nil { + for _, dc := range resp.TopologyInfo.DataCenterInfos { + for _, rack := range dc.RackInfos { + for _, node := range rack.DataNodeInfos { + for _, diskInfo := range node.DiskInfos { + // Process EC shard information + for _, ecShardInfo := range diskInfo.EcShardInfos { + volumeId := ecShardInfo.Id + + // Initialize volume data if needed + if volumeData[volumeId] == nil { + volumeData[volumeId] = &EcVolumeWithShards{ + VolumeID: volumeId, + Collection: ecShardInfo.Collection, + TotalShards: 0, + IsComplete: false, + MissingShards: []int{}, + ShardLocations: make(map[int]string), + ShardSizes: make(map[int]int64), + DataCenters: []string{}, + Servers: []string{}, + Racks: []string{}, + } + } + + volume := volumeData[volumeId] + + // Track data centers and servers + dcExists := false + for _, existingDc := range volume.DataCenters { + if existingDc == dc.Id { + dcExists = true + break + } + } + if !dcExists { + volume.DataCenters = append(volume.DataCenters, dc.Id) + } + + serverExists := false + for _, existingServer := range volume.Servers { + if existingServer == node.Id { + serverExists = true + break + } + } + if !serverExists { + volume.Servers = append(volume.Servers, node.Id) + } + + // Track racks + rackExists := false + for _, existingRack := range volume.Racks { + if existingRack == rack.Id { + rackExists = true + break + } + } + if !rackExists { + volume.Racks = append(volume.Racks, rack.Id) + } + + // Process each shard this server has for this volume + shardBits := ecShardInfo.EcIndexBits + for shardId := 0; shardId < erasure_coding.TotalShardsCount; shardId++ { + if (shardBits & (1 << uint(shardId))) != 0 { + // Record shard 
location + volume.ShardLocations[shardId] = node.Id + totalShards++ + } + } + } + } + } + } + } + } + + return nil + }) + + if err != nil { + return nil, err + } + + // Collect shard size information from volume servers + for volumeId, volume := range volumeData { + // Group servers by volume to minimize gRPC calls + serverHasVolume := make(map[string]bool) + for _, server := range volume.Servers { + serverHasVolume[server] = true + } + + // Query each server for shard sizes + for server := range serverHasVolume { + err := s.WithVolumeServerClient(pb.ServerAddress(server), func(client volume_server_pb.VolumeServerClient) error { + resp, err := client.VolumeEcShardsInfo(context.Background(), &volume_server_pb.VolumeEcShardsInfoRequest{ + VolumeId: volumeId, + }) + if err != nil { + glog.V(1).Infof("Failed to get EC shard info from %s for volume %d: %v", server, volumeId, err) + return nil // Continue with other servers, don't fail the entire request + } + + // Update shard sizes + for _, shardInfo := range resp.EcShardInfos { + volume.ShardSizes[int(shardInfo.ShardId)] = shardInfo.Size + } + + return nil + }) + if err != nil { + glog.V(1).Infof("Failed to connect to volume server %s: %v", server, err) + } + } + } + + // Calculate completeness for each volume + completeVolumes := 0 + incompleteVolumes := 0 + + for _, volume := range volumeData { + volume.TotalShards = len(volume.ShardLocations) + + // Find missing shards + var missingShards []int + for shardId := 0; shardId < erasure_coding.TotalShardsCount; shardId++ { + if _, exists := volume.ShardLocations[shardId]; !exists { + missingShards = append(missingShards, shardId) + } + } + + volume.MissingShards = missingShards + volume.IsComplete = (len(missingShards) == 0) + + if volume.IsComplete { + completeVolumes++ + } else { + incompleteVolumes++ + } + } + + // Convert map to slice + var ecVolumes []EcVolumeWithShards + for _, volume := range volumeData { + // Filter by collection if specified + if collection == "" || volume.Collection == collection { + ecVolumes = append(ecVolumes, *volume) + } + } + + // Sort the results + sortEcVolumes(ecVolumes, sortBy, sortOrder) + + // Calculate statistics for conditional display + dataCenters := make(map[string]bool) + collections := make(map[string]bool) + + for _, volume := range ecVolumes { + for _, dc := range volume.DataCenters { + dataCenters[dc] = true + } + if volume.Collection != "" { + collections[volume.Collection] = true + } + } + + // Pagination + totalVolumes := len(ecVolumes) + totalPages := (totalVolumes + pageSize - 1) / pageSize + startIndex := (page - 1) * pageSize + endIndex := startIndex + pageSize + if endIndex > totalVolumes { + endIndex = totalVolumes + } + + if startIndex >= totalVolumes { + startIndex = 0 + endIndex = 0 + } + + paginatedVolumes := ecVolumes[startIndex:endIndex] + + // Build response + data := &ClusterEcVolumesData{ + EcVolumes: paginatedVolumes, + TotalVolumes: totalVolumes, + LastUpdated: time.Now(), + + // Pagination + Page: page, + PageSize: pageSize, + TotalPages: totalPages, + + // Sorting + SortBy: sortBy, + SortOrder: sortOrder, + + // Filtering + Collection: collection, + + // Conditional display flags + ShowDataCenterColumn: len(dataCenters) > 1, + ShowRackColumn: false, // We don't track racks in this view for simplicity + ShowCollectionColumn: len(collections) > 1 || collection != "", + + // Statistics + CompleteVolumes: completeVolumes, + IncompleteVolumes: incompleteVolumes, + TotalShards: totalShards, + } + + return data, nil +} + +// 
sortEcVolumes sorts EC volumes based on the specified field and order +func sortEcVolumes(volumes []EcVolumeWithShards, sortBy string, sortOrder string) { + sort.Slice(volumes, func(i, j int) bool { + var less bool + switch sortBy { + case "volume_id": + less = volumes[i].VolumeID < volumes[j].VolumeID + case "collection": + if volumes[i].Collection == volumes[j].Collection { + less = volumes[i].VolumeID < volumes[j].VolumeID + } else { + less = volumes[i].Collection < volumes[j].Collection + } + case "total_shards": + if volumes[i].TotalShards == volumes[j].TotalShards { + less = volumes[i].VolumeID < volumes[j].VolumeID + } else { + less = volumes[i].TotalShards < volumes[j].TotalShards + } + case "completeness": + // Complete volumes first, then by volume ID + if volumes[i].IsComplete == volumes[j].IsComplete { + less = volumes[i].VolumeID < volumes[j].VolumeID + } else { + less = volumes[i].IsComplete && !volumes[j].IsComplete + } + default: + less = volumes[i].VolumeID < volumes[j].VolumeID + } + + if sortOrder == "desc" { + return !less + } + return less + }) +} + +// getShardCount returns the number of shards represented by the bitmap +func getShardCount(ecIndexBits uint32) int { + count := 0 + for i := 0; i < erasure_coding.TotalShardsCount; i++ { + if (ecIndexBits & (1 << uint(i))) != 0 { + count++ + } + } + return count +} + +// getMissingShards returns a slice of missing shard IDs for a volume +func getMissingShards(ecIndexBits uint32) []int { + var missing []int + for i := 0; i < erasure_coding.TotalShardsCount; i++ { + if (ecIndexBits & (1 << uint(i))) == 0 { + missing = append(missing, i) + } + } + return missing +} + +// sortEcShards sorts EC shards based on the specified field and order +func sortEcShards(shards []EcShardWithInfo, sortBy string, sortOrder string) { + sort.Slice(shards, func(i, j int) bool { + var less bool + switch sortBy { + case "shard_id": + less = shards[i].ShardID < shards[j].ShardID + case "server": + if shards[i].Server == shards[j].Server { + less = shards[i].ShardID < shards[j].ShardID // Secondary sort by shard ID + } else { + less = shards[i].Server < shards[j].Server + } + case "data_center": + if shards[i].DataCenter == shards[j].DataCenter { + less = shards[i].ShardID < shards[j].ShardID // Secondary sort by shard ID + } else { + less = shards[i].DataCenter < shards[j].DataCenter + } + case "rack": + if shards[i].Rack == shards[j].Rack { + less = shards[i].ShardID < shards[j].ShardID // Secondary sort by shard ID + } else { + less = shards[i].Rack < shards[j].Rack + } + default: + less = shards[i].ShardID < shards[j].ShardID + } + + if sortOrder == "desc" { + return !less + } + return less + }) +} + +// GetEcVolumeDetails retrieves detailed information about a specific EC volume +func (s *AdminServer) GetEcVolumeDetails(volumeID uint32, sortBy string, sortOrder string) (*EcVolumeDetailsData, error) { + // Set defaults + if sortBy == "" { + sortBy = "shard_id" + } + if sortOrder == "" { + sortOrder = "asc" + } + + var shards []EcShardWithInfo + var collection string + dataCenters := make(map[string]bool) + servers := make(map[string]bool) + + // Get detailed EC shard information for the specific volume via gRPC + err := s.WithMasterClient(func(client master_pb.SeaweedClient) error { + resp, err := client.VolumeList(context.Background(), &master_pb.VolumeListRequest{}) + if err != nil { + return err + } + + if resp.TopologyInfo != nil { + for _, dc := range resp.TopologyInfo.DataCenterInfos { + for _, rack := range dc.RackInfos { + for _, node 
:= range rack.DataNodeInfos { + for _, diskInfo := range node.DiskInfos { + // Process EC shard information for this specific volume + for _, ecShardInfo := range diskInfo.EcShardInfos { + if ecShardInfo.Id == volumeID { + collection = ecShardInfo.Collection + dataCenters[dc.Id] = true + servers[node.Id] = true + + // Create individual shard entries for each shard this server has + shardBits := ecShardInfo.EcIndexBits + for shardId := 0; shardId < erasure_coding.TotalShardsCount; shardId++ { + if (shardBits & (1 << uint(shardId))) != 0 { + ecShard := EcShardWithInfo{ + VolumeID: ecShardInfo.Id, + ShardID: uint32(shardId), + Collection: ecShardInfo.Collection, + Size: 0, // EC shards don't have individual size in the API response + Server: node.Id, + DataCenter: dc.Id, + Rack: rack.Id, + DiskType: diskInfo.Type, + ModifiedTime: 0, // Not available in current API + EcIndexBits: ecShardInfo.EcIndexBits, + ShardCount: getShardCount(ecShardInfo.EcIndexBits), + } + shards = append(shards, ecShard) + } + } + } + } + } + } + } + } + } + + return nil + }) + + if err != nil { + return nil, err + } + + if len(shards) == 0 { + return nil, fmt.Errorf("EC volume %d not found", volumeID) + } + + // Collect shard size information from volume servers + shardSizeMap := make(map[string]map[uint32]uint64) // server -> shardId -> size + for _, shard := range shards { + server := shard.Server + if _, exists := shardSizeMap[server]; !exists { + // Query this server for shard sizes + err := s.WithVolumeServerClient(pb.ServerAddress(server), func(client volume_server_pb.VolumeServerClient) error { + resp, err := client.VolumeEcShardsInfo(context.Background(), &volume_server_pb.VolumeEcShardsInfoRequest{ + VolumeId: volumeID, + }) + if err != nil { + glog.V(1).Infof("Failed to get EC shard info from %s for volume %d: %v", server, volumeID, err) + return nil // Continue with other servers, don't fail the entire request + } + + // Store shard sizes for this server + shardSizeMap[server] = make(map[uint32]uint64) + for _, shardInfo := range resp.EcShardInfos { + shardSizeMap[server][shardInfo.ShardId] = uint64(shardInfo.Size) + } + + return nil + }) + if err != nil { + glog.V(1).Infof("Failed to connect to volume server %s: %v", server, err) + } + } + } + + // Update shard sizes in the shards array + for i := range shards { + server := shards[i].Server + shardId := shards[i].ShardID + if serverSizes, exists := shardSizeMap[server]; exists { + if size, exists := serverSizes[shardId]; exists { + shards[i].Size = size + } + } + } + + // Calculate completeness based on unique shard IDs + foundShards := make(map[int]bool) + for _, shard := range shards { + foundShards[int(shard.ShardID)] = true + } + + totalUniqueShards := len(foundShards) + isComplete := (totalUniqueShards == erasure_coding.TotalShardsCount) + + // Calculate missing shards + var missingShards []int + for i := 0; i < erasure_coding.TotalShardsCount; i++ { + if !foundShards[i] { + missingShards = append(missingShards, i) + } + } + + // Update completeness info for each shard + for i := range shards { + shards[i].IsComplete = isComplete + shards[i].MissingShards = missingShards + } + + // Sort shards based on parameters + sortEcShards(shards, sortBy, sortOrder) + + // Convert maps to slices + var dcList []string + for dc := range dataCenters { + dcList = append(dcList, dc) + } + var serverList []string + for server := range servers { + serverList = append(serverList, server) + } + + data := &EcVolumeDetailsData{ + VolumeID: volumeID, + Collection: 
collection, + Shards: shards, + TotalShards: totalUniqueShards, + IsComplete: isComplete, + MissingShards: missingShards, + DataCenters: dcList, + Servers: serverList, + LastUpdated: time.Now(), + SortBy: sortBy, + SortOrder: sortOrder, + } + + return data, nil +} diff --git a/weed/admin/dash/middleware.go b/weed/admin/dash/middleware.go index ce538d7ca..a4cfedfd0 100644 --- a/weed/admin/dash/middleware.go +++ b/weed/admin/dash/middleware.go @@ -25,3 +25,26 @@ func RequireAuth() gin.HandlerFunc { c.Next() } } + +// RequireAuthAPI checks if user is authenticated for API endpoints +// Returns JSON error instead of redirecting to login page +func RequireAuthAPI() gin.HandlerFunc { + return func(c *gin.Context) { + session := sessions.Default(c) + authenticated := session.Get("authenticated") + username := session.Get("username") + + if authenticated != true || username == nil { + c.JSON(http.StatusUnauthorized, gin.H{ + "error": "Authentication required", + "message": "Please log in to access this endpoint", + }) + c.Abort() + return + } + + // Set username in context for use in handlers + c.Set("username", username) + c.Next() + } +} diff --git a/weed/admin/dash/types.go b/weed/admin/dash/types.go index 60f499229..f098fad8c 100644 --- a/weed/admin/dash/types.go +++ b/weed/admin/dash/types.go @@ -135,6 +135,84 @@ type ClusterVolumesData struct { FilterCollection string `json:"filter_collection"` } +// ClusterEcShardsData represents the data for the cluster EC shards page +type ClusterEcShardsData struct { + Username string `json:"username"` + EcShards []EcShardWithInfo `json:"ec_shards"` + TotalShards int `json:"total_shards"` + TotalVolumes int `json:"total_volumes"` + LastUpdated time.Time `json:"last_updated"` + + // Pagination + CurrentPage int `json:"current_page"` + TotalPages int `json:"total_pages"` + PageSize int `json:"page_size"` + + // Sorting + SortBy string `json:"sort_by"` + SortOrder string `json:"sort_order"` + + // Statistics + DataCenterCount int `json:"datacenter_count"` + RackCount int `json:"rack_count"` + CollectionCount int `json:"collection_count"` + + // Conditional display flags + ShowDataCenterColumn bool `json:"show_datacenter_column"` + ShowRackColumn bool `json:"show_rack_column"` + ShowCollectionColumn bool `json:"show_collection_column"` + + // Single values when only one exists + SingleDataCenter string `json:"single_datacenter"` + SingleRack string `json:"single_rack"` + SingleCollection string `json:"single_collection"` + + // Filtering + FilterCollection string `json:"filter_collection"` + + // EC specific statistics + ShardsPerVolume map[uint32]int `json:"shards_per_volume"` // VolumeID -> shard count + VolumesWithAllShards int `json:"volumes_with_all_shards"` // Volumes with all 14 shards + VolumesWithMissingShards int `json:"volumes_with_missing_shards"` // Volumes missing shards +} + +// EcShardWithInfo represents an EC shard with its topology information +type EcShardWithInfo struct { + VolumeID uint32 `json:"volume_id"` + ShardID uint32 `json:"shard_id"` + Collection string `json:"collection"` + Size uint64 `json:"size"` + Server string `json:"server"` + DataCenter string `json:"datacenter"` + Rack string `json:"rack"` + DiskType string `json:"disk_type"` + ModifiedTime int64 `json:"modified_time"` + + // EC specific fields + EcIndexBits uint32 `json:"ec_index_bits"` // Bitmap of which shards this server has + ShardCount int `json:"shard_count"` // Number of shards this server has for this volume + IsComplete bool `json:"is_complete"` // True if this 
volume has all 14 shards + MissingShards []int `json:"missing_shards"` // List of missing shard IDs +} + +// EcVolumeDetailsData represents the data for the EC volume details page +type EcVolumeDetailsData struct { + Username string `json:"username"` + VolumeID uint32 `json:"volume_id"` + Collection string `json:"collection"` + Shards []EcShardWithInfo `json:"shards"` + TotalShards int `json:"total_shards"` + IsComplete bool `json:"is_complete"` + MissingShards []int `json:"missing_shards"` + DataCenters []string `json:"datacenters"` + Servers []string `json:"servers"` + LastUpdated time.Time `json:"last_updated"` + + // Sorting + SortBy string `json:"sort_by"` + SortOrder string `json:"sort_order"` +} + type VolumeDetailsData struct { Volume VolumeWithTopology `json:"volume"` Replicas []VolumeWithTopology `json:"replicas"` @@ -145,12 +223,13 @@ type VolumeDetailsData struct { // Collection management structures type CollectionInfo struct { - Name string `json:"name"` - DataCenter string `json:"datacenter"` - VolumeCount int `json:"volume_count"` - FileCount int64 `json:"file_count"` - TotalSize int64 `json:"total_size"` - DiskTypes []string `json:"disk_types"` + Name string `json:"name"` + DataCenter string `json:"datacenter"` + VolumeCount int `json:"volume_count"` + EcVolumeCount int `json:"ec_volume_count"` + FileCount int64 `json:"file_count"` + TotalSize int64 `json:"total_size"` + DiskTypes []string `json:"disk_types"` } type ClusterCollectionsData struct { @@ -158,6 +237,7 @@ type ClusterCollectionsData struct { Collections []CollectionInfo `json:"collections"` TotalCollections int `json:"total_collections"` TotalVolumes int `json:"total_volumes"` + TotalEcVolumes int `json:"total_ec_volumes"` TotalFiles int64 `json:"total_files"` TotalSize int64 `json:"total_size"` LastUpdated time.Time `json:"last_updated"` @@ -376,3 +456,74 @@ type MaintenanceWorkersData struct { } // Maintenance system types are now in weed/admin/maintenance package + +// EcVolumeWithShards represents an EC volume with its shard distribution +type EcVolumeWithShards struct { + VolumeID uint32 `json:"volume_id"` + Collection string `json:"collection"` + TotalShards int `json:"total_shards"` + IsComplete bool `json:"is_complete"` + MissingShards []int `json:"missing_shards"` + ShardLocations map[int]string `json:"shard_locations"` // shardId -> server + ShardSizes map[int]int64 `json:"shard_sizes"` // shardId -> size in bytes + DataCenters []string `json:"data_centers"` + Servers []string `json:"servers"` + Racks []string `json:"racks"` + ModifiedTime int64 `json:"modified_time"` +} + +// ClusterEcVolumesData represents the response for clustered EC volumes view +type ClusterEcVolumesData struct { + EcVolumes []EcVolumeWithShards `json:"ec_volumes"` + TotalVolumes int `json:"total_volumes"` + LastUpdated time.Time `json:"last_updated"` + + // Pagination + Page int `json:"page"` + PageSize int `json:"page_size"` + TotalPages int `json:"total_pages"` + + // Sorting + SortBy string `json:"sort_by"` + SortOrder string `json:"sort_order"` + + // Filtering + Collection string `json:"collection"` + + // Conditional display flags + ShowDataCenterColumn bool `json:"show_datacenter_column"` + ShowRackColumn bool `json:"show_rack_column"` + ShowCollectionColumn bool `json:"show_collection_column"` + + // Statistics + CompleteVolumes int `json:"complete_volumes"` + IncompleteVolumes int `json:"incomplete_volumes"` + TotalShards int `json:"total_shards"` + + // User context + Username string `json:"username"` +} + +// 
Collection detail page structures +type CollectionDetailsData struct { + Username string `json:"username"` + CollectionName string `json:"collection_name"` + RegularVolumes []VolumeWithTopology `json:"regular_volumes"` + EcVolumes []EcVolumeWithShards `json:"ec_volumes"` + TotalVolumes int `json:"total_volumes"` + TotalEcVolumes int `json:"total_ec_volumes"` + TotalFiles int64 `json:"total_files"` + TotalSize int64 `json:"total_size"` + DataCenters []string `json:"data_centers"` + DiskTypes []string `json:"disk_types"` + LastUpdated time.Time `json:"last_updated"` + + // Pagination + Page int `json:"page"` + PageSize int `json:"page_size"` + TotalPages int `json:"total_pages"` + + // Sorting + SortBy string `json:"sort_by"` + SortOrder string `json:"sort_order"` +} diff --git a/weed/admin/dash/worker_grpc_server.go b/weed/admin/dash/worker_grpc_server.go index 36f97261a..3b4312235 100644 --- a/weed/admin/dash/worker_grpc_server.go +++ b/weed/admin/dash/worker_grpc_server.go @@ -319,27 +319,41 @@ func (s *WorkerGrpcServer) handleHeartbeat(conn *WorkerConnection, heartbeat *wo // handleTaskRequest processes task requests from workers func (s *WorkerGrpcServer) handleTaskRequest(conn *WorkerConnection, request *worker_pb.TaskRequest) { + // glog.Infof("DEBUG handleTaskRequest: Worker %s requesting tasks with capabilities %v", conn.workerID, conn.capabilities) + if s.adminServer.maintenanceManager == nil { + glog.Infof("DEBUG handleTaskRequest: maintenance manager is nil") return } // Get next task from maintenance manager task := s.adminServer.maintenanceManager.GetNextTask(conn.workerID, conn.capabilities) + // glog.Infof("DEBUG handleTaskRequest: GetNextTask returned task: %v", task != nil) if task != nil { + glog.Infof("DEBUG handleTaskRequest: Assigning task %s (type: %s) to worker %s", task.ID, task.Type, conn.workerID) + + // Use typed params directly - master client should already be configured in the params + var taskParams *worker_pb.TaskParams + if task.TypedParams != nil { + taskParams = task.TypedParams + } else { + // Create basic params if none exist + taskParams = &worker_pb.TaskParams{ + VolumeId: task.VolumeID, + Server: task.Server, + Collection: task.Collection, + } + } + // Send task assignment assignment := &worker_pb.AdminMessage{ Timestamp: time.Now().Unix(), Message: &worker_pb.AdminMessage_TaskAssignment{ TaskAssignment: &worker_pb.TaskAssignment{ - TaskId: task.ID, - TaskType: string(task.Type), - Params: &worker_pb.TaskParams{ - VolumeId: task.VolumeID, - Server: task.Server, - Collection: task.Collection, - Parameters: convertTaskParameters(task.Parameters), - }, + TaskId: task.ID, + TaskType: string(task.Type), + Params: taskParams, Priority: int32(task.Priority), CreatedTime: time.Now().Unix(), }, @@ -348,10 +362,12 @@ func (s *WorkerGrpcServer) handleTaskRequest(conn *WorkerConnection, request *wo select { case conn.outgoing <- assignment: - glog.V(2).Infof("Assigned task %s to worker %s", task.ID, conn.workerID) + glog.Infof("DEBUG handleTaskRequest: Successfully assigned task %s to worker %s", task.ID, conn.workerID) case <-time.After(time.Second): glog.Warningf("Failed to send task assignment to worker %s", conn.workerID) } + } else { + // glog.Infof("DEBUG handleTaskRequest: No tasks available for worker %s", conn.workerID) } } diff --git a/weed/admin/handlers/admin_handlers.go b/weed/admin/handlers/admin_handlers.go index 76a123a4f..d28dc9e53 100644 --- a/weed/admin/handlers/admin_handlers.go +++ b/weed/admin/handlers/admin_handlers.go @@ -78,6 +78,9 @@ 
func (h *AdminHandlers) SetupRoutes(r *gin.Engine, authRequired bool, username, protected.GET("/cluster/volumes", h.clusterHandlers.ShowClusterVolumes) protected.GET("/cluster/volumes/:id/:server", h.clusterHandlers.ShowVolumeDetails) protected.GET("/cluster/collections", h.clusterHandlers.ShowClusterCollections) + protected.GET("/cluster/collections/:name", h.clusterHandlers.ShowCollectionDetails) + protected.GET("/cluster/ec-shards", h.clusterHandlers.ShowClusterEcShards) + protected.GET("/cluster/ec-volumes/:id", h.clusterHandlers.ShowEcVolumeDetails) // Message Queue management routes protected.GET("/mq/brokers", h.mqHandlers.ShowBrokers) @@ -93,7 +96,8 @@ func (h *AdminHandlers) SetupRoutes(r *gin.Engine, authRequired bool, username, protected.POST("/maintenance/config/:taskType", h.maintenanceHandlers.UpdateTaskConfig) // API routes for AJAX calls - api := protected.Group("/api") + api := r.Group("/api") + api.Use(dash.RequireAuthAPI()) // Use API-specific auth middleware { api.GET("/cluster/topology", h.clusterHandlers.GetClusterTopology) api.GET("/cluster/masters", h.clusterHandlers.GetMasters) @@ -198,6 +202,9 @@ func (h *AdminHandlers) SetupRoutes(r *gin.Engine, authRequired bool, username, r.GET("/cluster/volumes", h.clusterHandlers.ShowClusterVolumes) r.GET("/cluster/volumes/:id/:server", h.clusterHandlers.ShowVolumeDetails) r.GET("/cluster/collections", h.clusterHandlers.ShowClusterCollections) + r.GET("/cluster/collections/:name", h.clusterHandlers.ShowCollectionDetails) + r.GET("/cluster/ec-shards", h.clusterHandlers.ShowClusterEcShards) + r.GET("/cluster/ec-volumes/:id", h.clusterHandlers.ShowEcVolumeDetails) // Message Queue management routes r.GET("/mq/brokers", h.mqHandlers.ShowBrokers) diff --git a/weed/admin/handlers/cluster_handlers.go b/weed/admin/handlers/cluster_handlers.go index 03f7e88a0..32b89acd1 100644 --- a/weed/admin/handlers/cluster_handlers.go +++ b/weed/admin/handlers/cluster_handlers.go @@ -1,6 +1,7 @@ package handlers import ( + "math" "net/http" "strconv" @@ -161,6 +162,129 @@ func (h *ClusterHandlers) ShowClusterCollections(c *gin.Context) { } } +// ShowCollectionDetails renders the collection detail page +func (h *ClusterHandlers) ShowCollectionDetails(c *gin.Context) { + collectionName := c.Param("name") + if collectionName == "" { + c.JSON(http.StatusBadRequest, gin.H{"error": "Collection name is required"}) + return + } + + // Parse query parameters + page, _ := strconv.Atoi(c.DefaultQuery("page", "1")) + pageSize, _ := strconv.Atoi(c.DefaultQuery("page_size", "25")) + sortBy := c.DefaultQuery("sort_by", "volume_id") + sortOrder := c.DefaultQuery("sort_order", "asc") + + // Get collection details data (volumes and EC volumes) + collectionDetailsData, err := h.adminServer.GetCollectionDetails(collectionName, page, pageSize, sortBy, sortOrder) + if err != nil { + c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to get collection details: " + err.Error()}) + return + } + + // Set username + username := c.GetString("username") + if username == "" { + username = "admin" + } + collectionDetailsData.Username = username + + // Render HTML template + c.Header("Content-Type", "text/html") + collectionDetailsComponent := app.CollectionDetails(*collectionDetailsData) + layoutComponent := layout.Layout(c, collectionDetailsComponent) + err = layoutComponent.Render(c.Request.Context(), c.Writer) + if err != nil { + c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to render template: " + err.Error()}) + return + } +} + +// 
ShowClusterEcShards handles the cluster EC shards page (individual shards view) +func (h *ClusterHandlers) ShowClusterEcShards(c *gin.Context) { + // Parse query parameters + page, _ := strconv.Atoi(c.DefaultQuery("page", "1")) + pageSize, _ := strconv.Atoi(c.DefaultQuery("page_size", "100")) + sortBy := c.DefaultQuery("sort_by", "volume_id") + sortOrder := c.DefaultQuery("sort_order", "asc") + collection := c.DefaultQuery("collection", "") + + // Get data from admin server + data, err := h.adminServer.GetClusterEcVolumes(page, pageSize, sortBy, sortOrder, collection) + if err != nil { + c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) + return + } + + // Set username + username := c.GetString("username") + if username == "" { + username = "admin" + } + data.Username = username + + // Render template + c.Header("Content-Type", "text/html") + ecVolumesComponent := app.ClusterEcVolumes(*data) + layoutComponent := layout.Layout(c, ecVolumesComponent) + err = layoutComponent.Render(c.Request.Context(), c.Writer) + if err != nil { + c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) + return + } +} + +// ShowEcVolumeDetails renders the EC volume details page +func (h *ClusterHandlers) ShowEcVolumeDetails(c *gin.Context) { + volumeIDStr := c.Param("id") + + if volumeIDStr == "" { + c.JSON(http.StatusBadRequest, gin.H{"error": "Volume ID is required"}) + return + } + + volumeID, err := strconv.Atoi(volumeIDStr) + if err != nil { + c.JSON(http.StatusBadRequest, gin.H{"error": "Invalid volume ID"}) + return + } + + // Check that volumeID is within uint32 range + if volumeID < 0 || volumeID > int(math.MaxUint32) { + c.JSON(http.StatusBadRequest, gin.H{"error": "Volume ID out of range"}) + return + } + + // Parse sorting parameters + sortBy := c.DefaultQuery("sort_by", "shard_id") + sortOrder := c.DefaultQuery("sort_order", "asc") + + // Get EC volume details + ecVolumeDetails, err := h.adminServer.GetEcVolumeDetails(uint32(volumeID), sortBy, sortOrder) + if err != nil { + c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to get EC volume details: " + err.Error()}) + return + } + + // Set username + username := c.GetString("username") + if username == "" { + username = "admin" + } + ecVolumeDetails.Username = username + + // Render HTML template + c.Header("Content-Type", "text/html") + ecVolumeDetailsComponent := app.EcVolumeDetails(*ecVolumeDetails) + layoutComponent := layout.Layout(c, ecVolumeDetailsComponent) + err = layoutComponent.Render(c.Request.Context(), c.Writer) + if err != nil { + c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to render template: " + err.Error()}) + return + } +} + // ShowClusterMasters renders the cluster masters page func (h *ClusterHandlers) ShowClusterMasters(c *gin.Context) { // Get cluster masters data diff --git a/weed/admin/handlers/maintenance_handlers.go b/weed/admin/handlers/maintenance_handlers.go index 4b1f91387..1e2337272 100644 --- a/weed/admin/handlers/maintenance_handlers.go +++ b/weed/admin/handlers/maintenance_handlers.go @@ -1,16 +1,24 @@ package handlers import ( + "fmt" "net/http" + "reflect" + "strconv" + "strings" "time" "github.com/gin-gonic/gin" + "github.com/seaweedfs/seaweedfs/weed/admin/config" "github.com/seaweedfs/seaweedfs/weed/admin/dash" "github.com/seaweedfs/seaweedfs/weed/admin/maintenance" "github.com/seaweedfs/seaweedfs/weed/admin/view/app" - "github.com/seaweedfs/seaweedfs/weed/admin/view/components" "github.com/seaweedfs/seaweedfs/weed/admin/view/layout" + 
"github.com/seaweedfs/seaweedfs/weed/glog" "github.com/seaweedfs/seaweedfs/weed/worker/tasks" + "github.com/seaweedfs/seaweedfs/weed/worker/tasks/balance" + "github.com/seaweedfs/seaweedfs/weed/worker/tasks/erasure_coding" + "github.com/seaweedfs/seaweedfs/weed/worker/tasks/vacuum" "github.com/seaweedfs/seaweedfs/weed/worker/types" ) @@ -30,19 +38,31 @@ func NewMaintenanceHandlers(adminServer *dash.AdminServer) *MaintenanceHandlers func (h *MaintenanceHandlers) ShowMaintenanceQueue(c *gin.Context) { data, err := h.getMaintenanceQueueData() if err != nil { + glog.Infof("DEBUG ShowMaintenanceQueue: error getting data: %v", err) c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) return } + glog.Infof("DEBUG ShowMaintenanceQueue: got data with %d tasks", len(data.Tasks)) + if data.Stats != nil { + glog.Infof("DEBUG ShowMaintenanceQueue: stats = {pending: %d, running: %d, completed: %d}", + data.Stats.PendingTasks, data.Stats.RunningTasks, data.Stats.CompletedToday) + } else { + glog.Infof("DEBUG ShowMaintenanceQueue: stats is nil") + } + // Render HTML template c.Header("Content-Type", "text/html") maintenanceComponent := app.MaintenanceQueue(data) layoutComponent := layout.Layout(c, maintenanceComponent) err = layoutComponent.Render(c.Request.Context(), c.Writer) if err != nil { + glog.Infof("DEBUG ShowMaintenanceQueue: render error: %v", err) c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to render template: " + err.Error()}) return } + + glog.Infof("DEBUG ShowMaintenanceQueue: template rendered successfully") } // ShowMaintenanceWorkers displays the maintenance workers page @@ -72,9 +92,12 @@ func (h *MaintenanceHandlers) ShowMaintenanceConfig(c *gin.Context) { return } - // Render HTML template + // Get the schema for dynamic form rendering + schema := maintenance.GetMaintenanceConfigSchema() + + // Render HTML template using schema-driven approach c.Header("Content-Type", "text/html") - configComponent := app.MaintenanceConfig(config) + configComponent := app.MaintenanceConfigSchema(config, schema) layoutComponent := layout.Layout(c, configComponent) err = layoutComponent.Render(c.Request.Context(), c.Writer) if err != nil { @@ -87,20 +110,20 @@ func (h *MaintenanceHandlers) ShowMaintenanceConfig(c *gin.Context) { func (h *MaintenanceHandlers) ShowTaskConfig(c *gin.Context) { taskTypeName := c.Param("taskType") - // Get the task type - taskType := maintenance.GetMaintenanceTaskType(taskTypeName) - if taskType == "" { - c.JSON(http.StatusNotFound, gin.H{"error": "Task type not found"}) + // Get the schema for this task type + schema := tasks.GetTaskConfigSchema(taskTypeName) + if schema == nil { + c.JSON(http.StatusNotFound, gin.H{"error": "Task type not found or no schema available"}) return } - // Get the UI provider for this task type + // Get the UI provider for current configuration uiRegistry := tasks.GetGlobalUIRegistry() typesRegistry := tasks.GetGlobalTypesRegistry() var provider types.TaskUIProvider for workerTaskType := range typesRegistry.GetAllDetectors() { - if string(workerTaskType) == string(taskType) { + if string(workerTaskType) == taskTypeName { provider = uiRegistry.GetProvider(workerTaskType) break } @@ -111,73 +134,23 @@ func (h *MaintenanceHandlers) ShowTaskConfig(c *gin.Context) { return } - // Try to get templ UI provider first - temporarily disabled - // templUIProvider := getTemplUIProvider(taskType) - var configSections []components.ConfigSectionData + // Get current configuration + currentConfig := provider.GetCurrentConfig() - 
// Temporarily disabled templ UI provider - // if templUIProvider != nil { - // // Use the new templ-based UI provider - // currentConfig := templUIProvider.GetCurrentConfig() - // sections, err := templUIProvider.RenderConfigSections(currentConfig) - // if err != nil { - // c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to render configuration sections: " + err.Error()}) - // return - // } - // configSections = sections - // } else { - // Fallback to basic configuration for providers that haven't been migrated yet - configSections = []components.ConfigSectionData{ - { - Title: "Configuration Settings", - Icon: "fas fa-cogs", - Description: "Configure task detection and scheduling parameters", - Fields: []interface{}{ - components.CheckboxFieldData{ - FormFieldData: components.FormFieldData{ - Name: "enabled", - Label: "Enable Task", - Description: "Whether this task type should be enabled", - }, - Checked: true, - }, - components.NumberFieldData{ - FormFieldData: components.FormFieldData{ - Name: "max_concurrent", - Label: "Max Concurrent Tasks", - Description: "Maximum number of concurrent tasks", - Required: true, - }, - Value: 2, - Step: "1", - Min: floatPtr(1), - }, - components.DurationFieldData{ - FormFieldData: components.FormFieldData{ - Name: "scan_interval", - Label: "Scan Interval", - Description: "How often to scan for tasks", - Required: true, - }, - Value: "30m", - }, - }, - }, - } - // } // End of disabled templ UI provider else block + // Note: Do NOT apply schema defaults to current config as it overrides saved values + // Only apply defaults when creating new configs, not when displaying existing ones - // Create task configuration data using templ components - configData := &app.TaskConfigTemplData{ - TaskType: taskType, - TaskName: provider.GetDisplayName(), - TaskIcon: provider.GetIcon(), - Description: provider.GetDescription(), - ConfigSections: configSections, + // Create task configuration data + configData := &maintenance.TaskConfigData{ + TaskType: maintenance.MaintenanceTaskType(taskTypeName), + TaskName: schema.DisplayName, + TaskIcon: schema.Icon, + Description: schema.Description, } - // Render HTML template using templ components + // Render HTML template using schema-based approach c.Header("Content-Type", "text/html") - taskConfigComponent := app.TaskConfigTempl(configData) + taskConfigComponent := app.TaskConfigSchema(configData, schema, currentConfig) layoutComponent := layout.Layout(c, taskConfigComponent) err := layoutComponent.Render(c.Request.Context(), c.Writer) if err != nil { @@ -186,19 +159,10 @@ func (h *MaintenanceHandlers) ShowTaskConfig(c *gin.Context) { } } -// UpdateTaskConfig updates configuration for a specific task type +// UpdateTaskConfig updates task configuration from form func (h *MaintenanceHandlers) UpdateTaskConfig(c *gin.Context) { taskTypeName := c.Param("taskType") - - // Get the task type - taskType := maintenance.GetMaintenanceTaskType(taskTypeName) - if taskType == "" { - c.JSON(http.StatusNotFound, gin.H{"error": "Task type not found"}) - return - } - - // Try to get templ UI provider first - temporarily disabled - // templUIProvider := getTemplUIProvider(taskType) + taskType := types.TaskType(taskTypeName) // Parse form data err := c.Request.ParseForm() @@ -207,31 +171,100 @@ func (h *MaintenanceHandlers) UpdateTaskConfig(c *gin.Context) { return } - // Convert form data to map - formData := make(map[string][]string) + // Debug logging - show received form data + glog.V(1).Infof("Received form data for 
task type %s:", taskTypeName) for key, values := range c.Request.PostForm { - formData[key] = values + glog.V(1).Infof(" %s: %v", key, values) } - var config interface{} + // Get the task configuration schema + schema := tasks.GetTaskConfigSchema(taskTypeName) + if schema == nil { + c.JSON(http.StatusNotFound, gin.H{"error": "Schema not found for task type: " + taskTypeName}) + return + } - // Temporarily disabled templ UI provider - // if templUIProvider != nil { - // // Use the new templ-based UI provider - // config, err = templUIProvider.ParseConfigForm(formData) - // if err != nil { - // c.JSON(http.StatusBadRequest, gin.H{"error": "Failed to parse configuration: " + err.Error()}) - // return - // } - // // Apply configuration using templ provider - // err = templUIProvider.ApplyConfig(config) - // if err != nil { - // c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to apply configuration: " + err.Error()}) - // return - // } - // } else { - // Fallback to old UI provider for tasks that haven't been migrated yet - // Fallback to old UI provider for tasks that haven't been migrated yet + // Create a new config instance based on task type and apply schema defaults + var config TaskConfig + switch taskType { + case types.TaskTypeVacuum: + config = &vacuum.Config{} + case types.TaskTypeBalance: + config = &balance.Config{} + case types.TaskTypeErasureCoding: + config = &erasure_coding.Config{} + default: + c.JSON(http.StatusBadRequest, gin.H{"error": "Unsupported task type: " + taskTypeName}) + return + } + + // Apply schema defaults first using type-safe method + if err := schema.ApplyDefaultsToConfig(config); err != nil { + c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to apply defaults: " + err.Error()}) + return + } + + // First, get the current configuration to preserve existing values + currentUIRegistry := tasks.GetGlobalUIRegistry() + currentTypesRegistry := tasks.GetGlobalTypesRegistry() + + var currentProvider types.TaskUIProvider + for workerTaskType := range currentTypesRegistry.GetAllDetectors() { + if string(workerTaskType) == string(taskType) { + currentProvider = currentUIRegistry.GetProvider(workerTaskType) + break + } + } + + if currentProvider != nil { + // Copy current config values to the new config + currentConfig := currentProvider.GetCurrentConfig() + if currentConfigProtobuf, ok := currentConfig.(TaskConfig); ok { + // Apply current values using protobuf directly - no map conversion needed! 
+ currentPolicy := currentConfigProtobuf.ToTaskPolicy() + if err := config.FromTaskPolicy(currentPolicy); err != nil { + glog.Warningf("Failed to load current config for %s: %v", taskTypeName, err) + } + } + } + + // Parse form data using schema-based approach (this will override with new values) + err = h.parseTaskConfigFromForm(c.Request.PostForm, schema, config) + if err != nil { + c.JSON(http.StatusBadRequest, gin.H{"error": "Failed to parse configuration: " + err.Error()}) + return + } + + // Debug logging - show parsed config values + switch taskType { + case types.TaskTypeVacuum: + if vacuumConfig, ok := config.(*vacuum.Config); ok { + glog.V(1).Infof("Parsed vacuum config - GarbageThreshold: %f, MinVolumeAgeSeconds: %d, MinIntervalSeconds: %d", + vacuumConfig.GarbageThreshold, vacuumConfig.MinVolumeAgeSeconds, vacuumConfig.MinIntervalSeconds) + } + case types.TaskTypeErasureCoding: + if ecConfig, ok := config.(*erasure_coding.Config); ok { + glog.V(1).Infof("Parsed EC config - FullnessRatio: %f, QuietForSeconds: %d, MinSizeMB: %d, CollectionFilter: '%s'", + ecConfig.FullnessRatio, ecConfig.QuietForSeconds, ecConfig.MinSizeMB, ecConfig.CollectionFilter) + } + case types.TaskTypeBalance: + if balanceConfig, ok := config.(*balance.Config); ok { + glog.V(1).Infof("Parsed balance config - Enabled: %v, MaxConcurrent: %d, ScanIntervalSeconds: %d, ImbalanceThreshold: %f, MinServerCount: %d", + balanceConfig.Enabled, balanceConfig.MaxConcurrent, balanceConfig.ScanIntervalSeconds, balanceConfig.ImbalanceThreshold, balanceConfig.MinServerCount) + } + } + + // Validate the configuration + if validationErrors := schema.ValidateConfig(config); len(validationErrors) > 0 { + errorMessages := make([]string, len(validationErrors)) + for i, err := range validationErrors { + errorMessages[i] = err.Error() + } + c.JSON(http.StatusBadRequest, gin.H{"error": "Configuration validation failed", "details": errorMessages}) + return + } + + // Apply configuration using UIProvider uiRegistry := tasks.GetGlobalUIRegistry() typesRegistry := tasks.GetGlobalTypesRegistry() @@ -248,25 +281,153 @@ func (h *MaintenanceHandlers) UpdateTaskConfig(c *gin.Context) { return } - // Parse configuration from form using old provider - config, err = provider.ParseConfigForm(formData) - if err != nil { - c.JSON(http.StatusBadRequest, gin.H{"error": "Failed to parse configuration: " + err.Error()}) - return - } - - // Apply configuration using old provider - err = provider.ApplyConfig(config) + // Apply configuration using provider + err = provider.ApplyTaskConfig(config) if err != nil { c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to apply configuration: " + err.Error()}) return } - // } // End of disabled templ UI provider else block + + // Save task configuration to protobuf file using ConfigPersistence + if h.adminServer != nil && h.adminServer.GetConfigPersistence() != nil { + err = h.saveTaskConfigToProtobuf(taskType, config) + if err != nil { + glog.Warningf("Failed to save task config to protobuf file: %v", err) + // Don't fail the request, just log the warning + } + } + + // Trigger a configuration reload in the maintenance manager + if h.adminServer != nil { + if manager := h.adminServer.GetMaintenanceManager(); manager != nil { + err = manager.ReloadTaskConfigurations() + if err != nil { + glog.Warningf("Failed to reload task configurations: %v", err) + } else { + glog.V(1).Infof("Successfully reloaded task configurations after updating %s", taskTypeName) + } + } + } // Redirect back to task 
configuration page c.Redirect(http.StatusSeeOther, "/maintenance/config/"+taskTypeName) } +// parseTaskConfigFromForm parses form data using schema definitions +func (h *MaintenanceHandlers) parseTaskConfigFromForm(formData map[string][]string, schema *tasks.TaskConfigSchema, config interface{}) error { + configValue := reflect.ValueOf(config) + if configValue.Kind() == reflect.Ptr { + configValue = configValue.Elem() + } + + if configValue.Kind() != reflect.Struct { + return fmt.Errorf("config must be a struct or pointer to struct") + } + + configType := configValue.Type() + + for i := 0; i < configValue.NumField(); i++ { + field := configValue.Field(i) + fieldType := configType.Field(i) + + // Handle embedded structs recursively + if fieldType.Anonymous && field.Kind() == reflect.Struct { + err := h.parseTaskConfigFromForm(formData, schema, field.Addr().Interface()) + if err != nil { + return fmt.Errorf("error parsing embedded struct %s: %w", fieldType.Name, err) + } + continue + } + + // Get JSON tag name + jsonTag := fieldType.Tag.Get("json") + if jsonTag == "" { + continue + } + + // Remove options like ",omitempty" + if commaIdx := strings.Index(jsonTag, ","); commaIdx > 0 { + jsonTag = jsonTag[:commaIdx] + } + + // Find corresponding schema field + schemaField := schema.GetFieldByName(jsonTag) + if schemaField == nil { + continue + } + + // Parse value based on field type + if err := h.parseFieldFromForm(formData, schemaField, field); err != nil { + return fmt.Errorf("error parsing field %s: %w", schemaField.DisplayName, err) + } + } + + return nil +} + +// parseFieldFromForm parses a single field value from form data +func (h *MaintenanceHandlers) parseFieldFromForm(formData map[string][]string, schemaField *config.Field, fieldValue reflect.Value) error { + if !fieldValue.CanSet() { + return nil + } + + switch schemaField.Type { + case config.FieldTypeBool: + // Checkbox fields - present means true, absent means false + _, exists := formData[schemaField.JSONName] + fieldValue.SetBool(exists) + + case config.FieldTypeInt: + if values, ok := formData[schemaField.JSONName]; ok && len(values) > 0 { + if intVal, err := strconv.Atoi(values[0]); err != nil { + return fmt.Errorf("invalid integer value: %s", values[0]) + } else { + fieldValue.SetInt(int64(intVal)) + } + } + + case config.FieldTypeFloat: + if values, ok := formData[schemaField.JSONName]; ok && len(values) > 0 { + if floatVal, err := strconv.ParseFloat(values[0], 64); err != nil { + return fmt.Errorf("invalid float value: %s", values[0]) + } else { + fieldValue.SetFloat(floatVal) + } + } + + case config.FieldTypeString: + if values, ok := formData[schemaField.JSONName]; ok && len(values) > 0 { + fieldValue.SetString(values[0]) + } + + case config.FieldTypeInterval: + // Parse interval fields with value + unit + valueKey := schemaField.JSONName + "_value" + unitKey := schemaField.JSONName + "_unit" + + if valueStrs, ok := formData[valueKey]; ok && len(valueStrs) > 0 { + value, err := strconv.Atoi(valueStrs[0]) + if err != nil { + return fmt.Errorf("invalid interval value: %s", valueStrs[0]) + } + + unit := "minutes" // default + if unitStrs, ok := formData[unitKey]; ok && len(unitStrs) > 0 { + unit = unitStrs[0] + } + + // Convert to seconds + seconds := config.IntervalValueUnitToSeconds(value, unit) + fieldValue.SetInt(int64(seconds)) + } + + default: + return fmt.Errorf("unsupported field type: %s", schemaField.Type) + } + + return nil +} + // UpdateMaintenanceConfig updates maintenance configuration from form func (h 
*MaintenanceHandlers) UpdateMaintenanceConfig(c *gin.Context) { var config maintenance.MaintenanceConfig @@ -302,36 +463,50 @@ func (h *MaintenanceHandlers) getMaintenanceQueueData() (*maintenance.Maintenanc return nil, err } - return &maintenance.MaintenanceQueueData{ + data := &maintenance.MaintenanceQueueData{ Tasks: tasks, Workers: workers, Stats: stats, LastUpdated: time.Now(), - }, nil + } + + return data, nil } func (h *MaintenanceHandlers) getMaintenanceQueueStats() (*maintenance.QueueStats, error) { - // This would integrate with the maintenance queue to get real statistics - // For now, return mock data - return &maintenance.QueueStats{ - PendingTasks: 5, - RunningTasks: 2, - CompletedToday: 15, - FailedToday: 1, - TotalTasks: 23, - }, nil + // Use the exported method from AdminServer + return h.adminServer.GetMaintenanceQueueStats() } func (h *MaintenanceHandlers) getMaintenanceTasks() ([]*maintenance.MaintenanceTask, error) { - // This would integrate with the maintenance queue to get real tasks - // For now, return mock data - return []*maintenance.MaintenanceTask{}, nil + // Call the maintenance manager directly to get all tasks + if h.adminServer == nil { + return []*maintenance.MaintenanceTask{}, nil + } + + manager := h.adminServer.GetMaintenanceManager() + if manager == nil { + return []*maintenance.MaintenanceTask{}, nil + } + + // Get ALL tasks using empty parameters - this should match what the API returns + allTasks := manager.GetTasks("", "", 0) + return allTasks, nil } func (h *MaintenanceHandlers) getMaintenanceWorkers() ([]*maintenance.MaintenanceWorker, error) { - // This would integrate with the maintenance system to get real workers - // For now, return mock data - return []*maintenance.MaintenanceWorker{}, nil + // Get workers from the admin server's maintenance manager + if h.adminServer == nil { + return []*maintenance.MaintenanceWorker{}, nil + } + + if h.adminServer.GetMaintenanceManager() == nil { + return []*maintenance.MaintenanceWorker{}, nil + } + + // Get workers from the maintenance manager + workers := h.adminServer.GetMaintenanceManager().GetWorkers() + return workers, nil } func (h *MaintenanceHandlers) getMaintenanceConfig() (*maintenance.MaintenanceConfigData, error) { @@ -344,40 +519,25 @@ func (h *MaintenanceHandlers) updateMaintenanceConfig(config *maintenance.Mainte return h.adminServer.UpdateMaintenanceConfigData(config) } -// floatPtr is a helper function to create float64 pointers -func floatPtr(f float64) *float64 { - return &f -} +// saveTaskConfigToProtobuf saves task configuration to protobuf file +func (h *MaintenanceHandlers) saveTaskConfigToProtobuf(taskType types.TaskType, config TaskConfig) error { + configPersistence := h.adminServer.GetConfigPersistence() + if configPersistence == nil { + return fmt.Errorf("config persistence not available") + } -// Global templ UI registry - temporarily disabled -// var globalTemplUIRegistry *types.UITemplRegistry + // Use the new ToTaskPolicy method - much simpler and more maintainable! 
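+	// Convert the strongly typed config into a worker_pb.TaskPolicy once, then
+	// dispatch to the task-specific persistence method for the given task type.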
+ taskPolicy := config.ToTaskPolicy() -// initTemplUIRegistry initializes the global templ UI registry - temporarily disabled -func initTemplUIRegistry() { - // Temporarily disabled due to missing types - // if globalTemplUIRegistry == nil { - // globalTemplUIRegistry = types.NewUITemplRegistry() - // // Register vacuum templ UI provider using shared instances - // vacuumDetector, vacuumScheduler := vacuum.GetSharedInstances() - // vacuum.RegisterUITempl(globalTemplUIRegistry, vacuumDetector, vacuumScheduler) - // // Register erasure coding templ UI provider using shared instances - // erasureCodingDetector, erasureCodingScheduler := erasure_coding.GetSharedInstances() - // erasure_coding.RegisterUITempl(globalTemplUIRegistry, erasureCodingDetector, erasureCodingScheduler) - // // Register balance templ UI provider using shared instances - // balanceDetector, balanceScheduler := balance.GetSharedInstances() - // balance.RegisterUITempl(globalTemplUIRegistry, balanceDetector, balanceScheduler) - // } -} - -// getTemplUIProvider gets the templ UI provider for a task type - temporarily disabled -func getTemplUIProvider(taskType maintenance.MaintenanceTaskType) interface{} { - // initTemplUIRegistry() - // Convert maintenance task type to worker task type - // typesRegistry := tasks.GetGlobalTypesRegistry() - // for workerTaskType := range typesRegistry.GetAllDetectors() { - // if string(workerTaskType) == string(taskType) { - // return globalTemplUIRegistry.GetProvider(workerTaskType) - // } - // } - return nil + // Save using task-specific methods + switch taskType { + case types.TaskTypeVacuum: + return configPersistence.SaveVacuumTaskPolicy(taskPolicy) + case types.TaskTypeErasureCoding: + return configPersistence.SaveErasureCodingTaskPolicy(taskPolicy) + case types.TaskTypeBalance: + return configPersistence.SaveBalanceTaskPolicy(taskPolicy) + default: + return fmt.Errorf("unsupported task type for protobuf persistence: %s", taskType) + } } diff --git a/weed/admin/handlers/maintenance_handlers_test.go b/weed/admin/handlers/maintenance_handlers_test.go new file mode 100644 index 000000000..fa5a365f1 --- /dev/null +++ b/weed/admin/handlers/maintenance_handlers_test.go @@ -0,0 +1,389 @@ +package handlers + +import ( + "net/url" + "testing" + + "github.com/seaweedfs/seaweedfs/weed/admin/config" + "github.com/seaweedfs/seaweedfs/weed/worker/tasks" + "github.com/seaweedfs/seaweedfs/weed/worker/tasks/balance" + "github.com/seaweedfs/seaweedfs/weed/worker/tasks/base" + "github.com/seaweedfs/seaweedfs/weed/worker/tasks/erasure_coding" + "github.com/seaweedfs/seaweedfs/weed/worker/tasks/vacuum" +) + +func TestParseTaskConfigFromForm_WithEmbeddedStruct(t *testing.T) { + // Create a maintenance handlers instance for testing + h := &MaintenanceHandlers{} + + // Test with balance config + t.Run("Balance Config", func(t *testing.T) { + // Simulate form data + formData := url.Values{ + "enabled": {"on"}, // checkbox field + "scan_interval_seconds_value": {"30"}, // interval field + "scan_interval_seconds_unit": {"minutes"}, // interval unit + "max_concurrent": {"2"}, // number field + "imbalance_threshold": {"0.15"}, // float field + "min_server_count": {"3"}, // number field + } + + // Get schema + schema := tasks.GetTaskConfigSchema("balance") + if schema == nil { + t.Fatal("Failed to get balance schema") + } + + // Create config instance + config := &balance.Config{} + + // Parse form data + err := h.parseTaskConfigFromForm(formData, schema, config) + if err != nil { + t.Fatalf("Failed to parse form 
data: %v", err) + } + + // Verify embedded struct fields were set correctly + if !config.Enabled { + t.Errorf("Expected Enabled=true, got %v", config.Enabled) + } + + if config.ScanIntervalSeconds != 1800 { // 30 minutes * 60 + t.Errorf("Expected ScanIntervalSeconds=1800, got %v", config.ScanIntervalSeconds) + } + + if config.MaxConcurrent != 2 { + t.Errorf("Expected MaxConcurrent=2, got %v", config.MaxConcurrent) + } + + // Verify balance-specific fields were set correctly + if config.ImbalanceThreshold != 0.15 { + t.Errorf("Expected ImbalanceThreshold=0.15, got %v", config.ImbalanceThreshold) + } + + if config.MinServerCount != 3 { + t.Errorf("Expected MinServerCount=3, got %v", config.MinServerCount) + } + }) + + // Test with vacuum config + t.Run("Vacuum Config", func(t *testing.T) { + // Simulate form data + formData := url.Values{ + // "enabled" field omitted to simulate unchecked checkbox + "scan_interval_seconds_value": {"4"}, // interval field + "scan_interval_seconds_unit": {"hours"}, // interval unit + "max_concurrent": {"3"}, // number field + "garbage_threshold": {"0.4"}, // float field + "min_volume_age_seconds_value": {"2"}, // interval field + "min_volume_age_seconds_unit": {"days"}, // interval unit + "min_interval_seconds_value": {"1"}, // interval field + "min_interval_seconds_unit": {"days"}, // interval unit + } + + // Get schema + schema := tasks.GetTaskConfigSchema("vacuum") + if schema == nil { + t.Fatal("Failed to get vacuum schema") + } + + // Create config instance + config := &vacuum.Config{} + + // Parse form data + err := h.parseTaskConfigFromForm(formData, schema, config) + if err != nil { + t.Fatalf("Failed to parse form data: %v", err) + } + + // Verify embedded struct fields were set correctly + if config.Enabled { + t.Errorf("Expected Enabled=false, got %v", config.Enabled) + } + + if config.ScanIntervalSeconds != 14400 { // 4 hours * 3600 + t.Errorf("Expected ScanIntervalSeconds=14400, got %v", config.ScanIntervalSeconds) + } + + if config.MaxConcurrent != 3 { + t.Errorf("Expected MaxConcurrent=3, got %v", config.MaxConcurrent) + } + + // Verify vacuum-specific fields were set correctly + if config.GarbageThreshold != 0.4 { + t.Errorf("Expected GarbageThreshold=0.4, got %v", config.GarbageThreshold) + } + + if config.MinVolumeAgeSeconds != 172800 { // 2 days * 86400 + t.Errorf("Expected MinVolumeAgeSeconds=172800, got %v", config.MinVolumeAgeSeconds) + } + + if config.MinIntervalSeconds != 86400 { // 1 day * 86400 + t.Errorf("Expected MinIntervalSeconds=86400, got %v", config.MinIntervalSeconds) + } + }) + + // Test with erasure coding config + t.Run("Erasure Coding Config", func(t *testing.T) { + // Simulate form data + formData := url.Values{ + "enabled": {"on"}, // checkbox field + "scan_interval_seconds_value": {"2"}, // interval field + "scan_interval_seconds_unit": {"hours"}, // interval unit + "max_concurrent": {"1"}, // number field + "quiet_for_seconds_value": {"10"}, // interval field + "quiet_for_seconds_unit": {"minutes"}, // interval unit + "fullness_ratio": {"0.85"}, // float field + "collection_filter": {"test_collection"}, // string field + "min_size_mb": {"50"}, // number field + } + + // Get schema + schema := tasks.GetTaskConfigSchema("erasure_coding") + if schema == nil { + t.Fatal("Failed to get erasure_coding schema") + } + + // Create config instance + config := &erasure_coding.Config{} + + // Parse form data + err := h.parseTaskConfigFromForm(formData, schema, config) + if err != nil { + t.Fatalf("Failed to parse form data: %v", 
err) + } + + // Verify embedded struct fields were set correctly + if !config.Enabled { + t.Errorf("Expected Enabled=true, got %v", config.Enabled) + } + + if config.ScanIntervalSeconds != 7200 { // 2 hours * 3600 + t.Errorf("Expected ScanIntervalSeconds=7200, got %v", config.ScanIntervalSeconds) + } + + if config.MaxConcurrent != 1 { + t.Errorf("Expected MaxConcurrent=1, got %v", config.MaxConcurrent) + } + + // Verify erasure coding-specific fields were set correctly + if config.QuietForSeconds != 600 { // 10 minutes * 60 + t.Errorf("Expected QuietForSeconds=600, got %v", config.QuietForSeconds) + } + + if config.FullnessRatio != 0.85 { + t.Errorf("Expected FullnessRatio=0.85, got %v", config.FullnessRatio) + } + + if config.CollectionFilter != "test_collection" { + t.Errorf("Expected CollectionFilter='test_collection', got %v", config.CollectionFilter) + } + + if config.MinSizeMB != 50 { + t.Errorf("Expected MinSizeMB=50, got %v", config.MinSizeMB) + } + }) +} + +func TestConfigurationValidation(t *testing.T) { + // Test that config structs can be validated and converted to protobuf format + taskTypes := []struct { + name string + config interface{} + }{ + { + "balance", + &balance.Config{ + BaseConfig: base.BaseConfig{ + Enabled: true, + ScanIntervalSeconds: 2400, + MaxConcurrent: 3, + }, + ImbalanceThreshold: 0.18, + MinServerCount: 4, + }, + }, + { + "vacuum", + &vacuum.Config{ + BaseConfig: base.BaseConfig{ + Enabled: false, + ScanIntervalSeconds: 7200, + MaxConcurrent: 2, + }, + GarbageThreshold: 0.35, + MinVolumeAgeSeconds: 86400, + MinIntervalSeconds: 604800, + }, + }, + { + "erasure_coding", + &erasure_coding.Config{ + BaseConfig: base.BaseConfig{ + Enabled: true, + ScanIntervalSeconds: 3600, + MaxConcurrent: 1, + }, + QuietForSeconds: 900, + FullnessRatio: 0.9, + CollectionFilter: "important", + MinSizeMB: 100, + }, + }, + } + + for _, test := range taskTypes { + t.Run(test.name, func(t *testing.T) { + // Test that configs can be converted to protobuf TaskPolicy + switch cfg := test.config.(type) { + case *balance.Config: + policy := cfg.ToTaskPolicy() + if policy == nil { + t.Fatal("ToTaskPolicy returned nil") + } + if policy.Enabled != cfg.Enabled { + t.Errorf("Expected Enabled=%v, got %v", cfg.Enabled, policy.Enabled) + } + if policy.MaxConcurrent != int32(cfg.MaxConcurrent) { + t.Errorf("Expected MaxConcurrent=%v, got %v", cfg.MaxConcurrent, policy.MaxConcurrent) + } + case *vacuum.Config: + policy := cfg.ToTaskPolicy() + if policy == nil { + t.Fatal("ToTaskPolicy returned nil") + } + if policy.Enabled != cfg.Enabled { + t.Errorf("Expected Enabled=%v, got %v", cfg.Enabled, policy.Enabled) + } + if policy.MaxConcurrent != int32(cfg.MaxConcurrent) { + t.Errorf("Expected MaxConcurrent=%v, got %v", cfg.MaxConcurrent, policy.MaxConcurrent) + } + case *erasure_coding.Config: + policy := cfg.ToTaskPolicy() + if policy == nil { + t.Fatal("ToTaskPolicy returned nil") + } + if policy.Enabled != cfg.Enabled { + t.Errorf("Expected Enabled=%v, got %v", cfg.Enabled, policy.Enabled) + } + if policy.MaxConcurrent != int32(cfg.MaxConcurrent) { + t.Errorf("Expected MaxConcurrent=%v, got %v", cfg.MaxConcurrent, policy.MaxConcurrent) + } + default: + t.Fatalf("Unknown config type: %T", test.config) + } + + // Test that configs can be validated + switch cfg := test.config.(type) { + case *balance.Config: + if err := cfg.Validate(); err != nil { + t.Errorf("Validation failed: %v", err) + } + case *vacuum.Config: + if err := cfg.Validate(); err != nil { + t.Errorf("Validation failed: %v", err) + 
} + case *erasure_coding.Config: + if err := cfg.Validate(); err != nil { + t.Errorf("Validation failed: %v", err) + } + } + }) + } +} + +func TestParseFieldFromForm_EdgeCases(t *testing.T) { + h := &MaintenanceHandlers{} + + // Test checkbox parsing (boolean fields) + t.Run("Checkbox Fields", func(t *testing.T) { + tests := []struct { + name string + formData url.Values + expectedValue bool + }{ + {"Checked checkbox", url.Values{"test_field": {"on"}}, true}, + {"Unchecked checkbox", url.Values{}, false}, + {"Empty value checkbox", url.Values{"test_field": {""}}, true}, // Present but empty means checked + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + schema := &tasks.TaskConfigSchema{ + Schema: config.Schema{ + Fields: []*config.Field{ + { + JSONName: "test_field", + Type: config.FieldTypeBool, + InputType: "checkbox", + }, + }, + }, + } + + type TestConfig struct { + TestField bool `json:"test_field"` + } + + config := &TestConfig{} + err := h.parseTaskConfigFromForm(test.formData, schema, config) + if err != nil { + t.Fatalf("parseTaskConfigFromForm failed: %v", err) + } + + if config.TestField != test.expectedValue { + t.Errorf("Expected %v, got %v", test.expectedValue, config.TestField) + } + }) + } + }) + + // Test interval parsing + t.Run("Interval Fields", func(t *testing.T) { + tests := []struct { + name string + value string + unit string + expectedSecs int + }{ + {"Minutes", "30", "minutes", 1800}, + {"Hours", "2", "hours", 7200}, + {"Days", "1", "days", 86400}, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + formData := url.Values{ + "test_field_value": {test.value}, + "test_field_unit": {test.unit}, + } + + schema := &tasks.TaskConfigSchema{ + Schema: config.Schema{ + Fields: []*config.Field{ + { + JSONName: "test_field", + Type: config.FieldTypeInterval, + InputType: "interval", + }, + }, + }, + } + + type TestConfig struct { + TestField int `json:"test_field"` + } + + config := &TestConfig{} + err := h.parseTaskConfigFromForm(formData, schema, config) + if err != nil { + t.Fatalf("parseTaskConfigFromForm failed: %v", err) + } + + if config.TestField != test.expectedSecs { + t.Errorf("Expected %d seconds, got %d", test.expectedSecs, config.TestField) + } + }) + } + }) +} diff --git a/weed/admin/handlers/task_config_interface.go b/weed/admin/handlers/task_config_interface.go new file mode 100644 index 000000000..dd22c5250 --- /dev/null +++ b/weed/admin/handlers/task_config_interface.go @@ -0,0 +1,25 @@ +package handlers + +import ( + "github.com/seaweedfs/seaweedfs/weed/admin/config" + "github.com/seaweedfs/seaweedfs/weed/pb/worker_pb" +) + +// TaskConfig defines the interface that all task configuration types must implement +type TaskConfig interface { + config.ConfigWithDefaults // Extends ConfigWithDefaults for type-safe schema operations + + // Common methods from BaseConfig + IsEnabled() bool + SetEnabled(enabled bool) + + // Protobuf serialization methods - no more map[string]interface{}! 
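+	// ToTaskPolicy serializes the config into its protobuf form and FromTaskPolicy
+	// restores it; in this change the vacuum, balance and erasure_coding Config
+	// types implement both methods.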
+ ToTaskPolicy() *worker_pb.TaskPolicy + FromTaskPolicy(policy *worker_pb.TaskPolicy) error +} + +// TaskConfigProvider defines the interface for creating specific task config types +type TaskConfigProvider interface { + NewConfig() TaskConfig + GetTaskType() string +} diff --git a/weed/admin/maintenance/config_schema.go b/weed/admin/maintenance/config_schema.go new file mode 100644 index 000000000..c911ad59c --- /dev/null +++ b/weed/admin/maintenance/config_schema.go @@ -0,0 +1,190 @@ +package maintenance + +import ( + "github.com/seaweedfs/seaweedfs/weed/admin/config" +) + +// Type aliases for backward compatibility +type ConfigFieldType = config.FieldType +type ConfigFieldUnit = config.FieldUnit +type ConfigField = config.Field + +// Constant aliases for backward compatibility +const ( + FieldTypeBool = config.FieldTypeBool + FieldTypeInt = config.FieldTypeInt + FieldTypeDuration = config.FieldTypeDuration + FieldTypeInterval = config.FieldTypeInterval + FieldTypeString = config.FieldTypeString + FieldTypeFloat = config.FieldTypeFloat +) + +const ( + UnitSeconds = config.UnitSeconds + UnitMinutes = config.UnitMinutes + UnitHours = config.UnitHours + UnitDays = config.UnitDays + UnitCount = config.UnitCount + UnitNone = config.UnitNone +) + +// Function aliases for backward compatibility +var ( + SecondsToIntervalValueUnit = config.SecondsToIntervalValueUnit + IntervalValueUnitToSeconds = config.IntervalValueUnitToSeconds +) + +// MaintenanceConfigSchema defines the schema for maintenance configuration +type MaintenanceConfigSchema struct { + config.Schema // Embed common schema functionality +} + +// GetMaintenanceConfigSchema returns the schema for maintenance configuration +func GetMaintenanceConfigSchema() *MaintenanceConfigSchema { + return &MaintenanceConfigSchema{ + Schema: config.Schema{ + Fields: []*config.Field{ + { + Name: "enabled", + JSONName: "enabled", + Type: config.FieldTypeBool, + DefaultValue: true, + Required: false, + DisplayName: "Enable Maintenance System", + Description: "When enabled, the system will automatically scan for and execute maintenance tasks", + HelpText: "Toggle this to enable or disable the entire maintenance system", + InputType: "checkbox", + CSSClasses: "form-check-input", + }, + { + Name: "scan_interval_seconds", + JSONName: "scan_interval_seconds", + Type: config.FieldTypeInterval, + DefaultValue: 30 * 60, // 30 minutes in seconds + MinValue: 1 * 60, // 1 minute + MaxValue: 24 * 60 * 60, // 24 hours + Required: true, + DisplayName: "Scan Interval", + Description: "How often to scan for maintenance tasks", + HelpText: "The system will check for new maintenance tasks at this interval", + Placeholder: "30", + Unit: config.UnitMinutes, + InputType: "interval", + CSSClasses: "form-control", + }, + { + Name: "worker_timeout_seconds", + JSONName: "worker_timeout_seconds", + Type: config.FieldTypeInterval, + DefaultValue: 5 * 60, // 5 minutes + MinValue: 1 * 60, // 1 minute + MaxValue: 60 * 60, // 1 hour + Required: true, + DisplayName: "Worker Timeout", + Description: "How long to wait for worker heartbeat before considering it inactive", + HelpText: "Workers that don't send heartbeats within this time are considered offline", + Placeholder: "5", + Unit: config.UnitMinutes, + InputType: "interval", + CSSClasses: "form-control", + }, + { + Name: "task_timeout_seconds", + JSONName: "task_timeout_seconds", + Type: config.FieldTypeInterval, + DefaultValue: 2 * 60 * 60, // 2 hours + MinValue: 10 * 60, // 10 minutes + MaxValue: 24 * 60 * 60, // 24 hours + 
Required: true, + DisplayName: "Task Timeout", + Description: "Maximum time allowed for a task to complete", + HelpText: "Tasks that exceed this duration will be marked as failed", + Placeholder: "2", + Unit: config.UnitHours, + InputType: "interval", + CSSClasses: "form-control", + }, + { + Name: "retry_delay_seconds", + JSONName: "retry_delay_seconds", + Type: config.FieldTypeInterval, + DefaultValue: 15 * 60, // 15 minutes + MinValue: 1 * 60, // 1 minute + MaxValue: 24 * 60 * 60, // 24 hours + Required: true, + DisplayName: "Retry Delay", + Description: "How long to wait before retrying a failed task", + HelpText: "Failed tasks will be retried after this delay", + Placeholder: "15", + Unit: config.UnitMinutes, + InputType: "interval", + CSSClasses: "form-control", + }, + { + Name: "max_retries", + JSONName: "max_retries", + Type: config.FieldTypeInt, + DefaultValue: 3, + MinValue: 0, + MaxValue: 10, + Required: true, + DisplayName: "Max Retries", + Description: "Maximum number of times to retry a failed task", + HelpText: "Tasks that fail more than this many times will be marked as permanently failed", + Placeholder: "3", + Unit: config.UnitCount, + InputType: "number", + CSSClasses: "form-control", + }, + { + Name: "cleanup_interval_seconds", + JSONName: "cleanup_interval_seconds", + Type: config.FieldTypeInterval, + DefaultValue: 24 * 60 * 60, // 24 hours + MinValue: 1 * 60 * 60, // 1 hour + MaxValue: 7 * 24 * 60 * 60, // 7 days + Required: true, + DisplayName: "Cleanup Interval", + Description: "How often to run maintenance cleanup operations", + HelpText: "Removes old task records and temporary files at this interval", + Placeholder: "24", + Unit: config.UnitHours, + InputType: "interval", + CSSClasses: "form-control", + }, + { + Name: "task_retention_seconds", + JSONName: "task_retention_seconds", + Type: config.FieldTypeInterval, + DefaultValue: 7 * 24 * 60 * 60, // 7 days + MinValue: 1 * 24 * 60 * 60, // 1 day + MaxValue: 30 * 24 * 60 * 60, // 30 days + Required: true, + DisplayName: "Task Retention", + Description: "How long to keep completed task records", + HelpText: "Task history older than this duration will be automatically deleted", + Placeholder: "7", + Unit: config.UnitDays, + InputType: "interval", + CSSClasses: "form-control", + }, + { + Name: "global_max_concurrent", + JSONName: "global_max_concurrent", + Type: config.FieldTypeInt, + DefaultValue: 10, + MinValue: 1, + MaxValue: 100, + Required: true, + DisplayName: "Global Max Concurrent Tasks", + Description: "Maximum number of maintenance tasks that can run simultaneously across all workers", + HelpText: "Limits the total number of maintenance operations to control system load", + Placeholder: "10", + Unit: config.UnitCount, + InputType: "number", + CSSClasses: "form-control", + }, + }, + }, + } +} diff --git a/weed/admin/maintenance/config_verification.go b/weed/admin/maintenance/config_verification.go new file mode 100644 index 000000000..0ac40aad1 --- /dev/null +++ b/weed/admin/maintenance/config_verification.go @@ -0,0 +1,124 @@ +package maintenance + +import ( + "fmt" + + "github.com/seaweedfs/seaweedfs/weed/pb/worker_pb" +) + +// VerifyProtobufConfig demonstrates that the protobuf configuration system is working +func VerifyProtobufConfig() error { + // Create configuration manager + configManager := NewMaintenanceConfigManager() + config := configManager.GetConfig() + + // Verify basic configuration + if !config.Enabled { + return fmt.Errorf("expected config to be enabled by default") + } + + if 
config.ScanIntervalSeconds != 30*60 { + return fmt.Errorf("expected scan interval to be 1800 seconds, got %d", config.ScanIntervalSeconds) + } + + // Verify policy configuration + if config.Policy == nil { + return fmt.Errorf("expected policy to be configured") + } + + if config.Policy.GlobalMaxConcurrent != 4 { + return fmt.Errorf("expected global max concurrent to be 4, got %d", config.Policy.GlobalMaxConcurrent) + } + + // Verify task policies + vacuumPolicy := config.Policy.TaskPolicies["vacuum"] + if vacuumPolicy == nil { + return fmt.Errorf("expected vacuum policy to be configured") + } + + if !vacuumPolicy.Enabled { + return fmt.Errorf("expected vacuum policy to be enabled") + } + + // Verify typed configuration access + vacuumConfig := vacuumPolicy.GetVacuumConfig() + if vacuumConfig == nil { + return fmt.Errorf("expected vacuum config to be accessible") + } + + if vacuumConfig.GarbageThreshold != 0.3 { + return fmt.Errorf("expected garbage threshold to be 0.3, got %f", vacuumConfig.GarbageThreshold) + } + + // Verify helper functions work + if !IsTaskEnabled(config.Policy, "vacuum") { + return fmt.Errorf("expected vacuum task to be enabled via helper function") + } + + maxConcurrent := GetMaxConcurrent(config.Policy, "vacuum") + if maxConcurrent != 2 { + return fmt.Errorf("expected vacuum max concurrent to be 2, got %d", maxConcurrent) + } + + // Verify erasure coding configuration + ecPolicy := config.Policy.TaskPolicies["erasure_coding"] + if ecPolicy == nil { + return fmt.Errorf("expected EC policy to be configured") + } + + ecConfig := ecPolicy.GetErasureCodingConfig() + if ecConfig == nil { + return fmt.Errorf("expected EC config to be accessible") + } + + // Verify configurable EC fields only + if ecConfig.FullnessRatio <= 0 || ecConfig.FullnessRatio > 1 { + return fmt.Errorf("expected EC config to have valid fullness ratio (0-1), got %f", ecConfig.FullnessRatio) + } + + return nil +} + +// GetProtobufConfigSummary returns a summary of the current protobuf configuration +func GetProtobufConfigSummary() string { + configManager := NewMaintenanceConfigManager() + config := configManager.GetConfig() + + summary := fmt.Sprintf("SeaweedFS Protobuf Maintenance Configuration:\n") + summary += fmt.Sprintf(" Enabled: %v\n", config.Enabled) + summary += fmt.Sprintf(" Scan Interval: %d seconds\n", config.ScanIntervalSeconds) + summary += fmt.Sprintf(" Max Retries: %d\n", config.MaxRetries) + summary += fmt.Sprintf(" Global Max Concurrent: %d\n", config.Policy.GlobalMaxConcurrent) + summary += fmt.Sprintf(" Task Policies: %d configured\n", len(config.Policy.TaskPolicies)) + + for taskType, policy := range config.Policy.TaskPolicies { + summary += fmt.Sprintf(" %s: enabled=%v, max_concurrent=%d\n", + taskType, policy.Enabled, policy.MaxConcurrent) + } + + return summary +} + +// CreateCustomConfig demonstrates creating a custom protobuf configuration +func CreateCustomConfig() *worker_pb.MaintenanceConfig { + return &worker_pb.MaintenanceConfig{ + Enabled: true, + ScanIntervalSeconds: 60 * 60, // 1 hour + MaxRetries: 5, + Policy: &worker_pb.MaintenancePolicy{ + GlobalMaxConcurrent: 8, + TaskPolicies: map[string]*worker_pb.TaskPolicy{ + "custom_vacuum": { + Enabled: true, + MaxConcurrent: 4, + TaskConfig: &worker_pb.TaskPolicy_VacuumConfig{ + VacuumConfig: &worker_pb.VacuumTaskConfig{ + GarbageThreshold: 0.5, + MinVolumeAgeHours: 48, + }, + }, + }, + }, + }, + } +} diff --git a/weed/admin/maintenance/maintenance_config_proto.go b/weed/admin/maintenance/maintenance_config_proto.go new file 
mode 100644 index 000000000..67a6b74be --- /dev/null +++ b/weed/admin/maintenance/maintenance_config_proto.go @@ -0,0 +1,287 @@ +package maintenance + +import ( + "fmt" + "time" + + "github.com/seaweedfs/seaweedfs/weed/pb/worker_pb" +) + +// MaintenanceConfigManager handles protobuf-based configuration +type MaintenanceConfigManager struct { + config *worker_pb.MaintenanceConfig +} + +// NewMaintenanceConfigManager creates a new config manager with defaults +func NewMaintenanceConfigManager() *MaintenanceConfigManager { + return &MaintenanceConfigManager{ + config: DefaultMaintenanceConfigProto(), + } +} + +// DefaultMaintenanceConfigProto returns default configuration as protobuf +func DefaultMaintenanceConfigProto() *worker_pb.MaintenanceConfig { + return &worker_pb.MaintenanceConfig{ + Enabled: true, + ScanIntervalSeconds: 30 * 60, // 30 minutes + WorkerTimeoutSeconds: 5 * 60, // 5 minutes + TaskTimeoutSeconds: 2 * 60 * 60, // 2 hours + RetryDelaySeconds: 15 * 60, // 15 minutes + MaxRetries: 3, + CleanupIntervalSeconds: 24 * 60 * 60, // 24 hours + TaskRetentionSeconds: 7 * 24 * 60 * 60, // 7 days + // Policy field will be populated dynamically from separate task configuration files + Policy: nil, + } +} + +// GetConfig returns the current configuration +func (mcm *MaintenanceConfigManager) GetConfig() *worker_pb.MaintenanceConfig { + return mcm.config +} + +// Type-safe configuration accessors + +// GetVacuumConfig returns vacuum-specific configuration for a task type +func (mcm *MaintenanceConfigManager) GetVacuumConfig(taskType string) *worker_pb.VacuumTaskConfig { + if policy := mcm.getTaskPolicy(taskType); policy != nil { + if vacuumConfig := policy.GetVacuumConfig(); vacuumConfig != nil { + return vacuumConfig + } + } + // Return defaults if not configured + return &worker_pb.VacuumTaskConfig{ + GarbageThreshold: 0.3, + MinVolumeAgeHours: 24, + MinIntervalSeconds: 7 * 24 * 60 * 60, // 7 days + } +} + +// GetErasureCodingConfig returns EC-specific configuration for a task type +func (mcm *MaintenanceConfigManager) GetErasureCodingConfig(taskType string) *worker_pb.ErasureCodingTaskConfig { + if policy := mcm.getTaskPolicy(taskType); policy != nil { + if ecConfig := policy.GetErasureCodingConfig(); ecConfig != nil { + return ecConfig + } + } + // Return defaults if not configured + return &worker_pb.ErasureCodingTaskConfig{ + FullnessRatio: 0.95, + QuietForSeconds: 3600, + MinVolumeSizeMb: 100, + CollectionFilter: "", + } +} + +// GetBalanceConfig returns balance-specific configuration for a task type +func (mcm *MaintenanceConfigManager) GetBalanceConfig(taskType string) *worker_pb.BalanceTaskConfig { + if policy := mcm.getTaskPolicy(taskType); policy != nil { + if balanceConfig := policy.GetBalanceConfig(); balanceConfig != nil { + return balanceConfig + } + } + // Return defaults if not configured + return &worker_pb.BalanceTaskConfig{ + ImbalanceThreshold: 0.2, + MinServerCount: 2, + } +} + +// GetReplicationConfig returns replication-specific configuration for a task type +func (mcm *MaintenanceConfigManager) GetReplicationConfig(taskType string) *worker_pb.ReplicationTaskConfig { + if policy := mcm.getTaskPolicy(taskType); policy != nil { + if replicationConfig := policy.GetReplicationConfig(); replicationConfig != nil { + return replicationConfig + } + } + // Return defaults if not configured + return &worker_pb.ReplicationTaskConfig{ + TargetReplicaCount: 2, + } +} + +// Typed convenience methods for getting task configurations + +// GetVacuumTaskConfigForType returns vacuum 
configuration for a specific task type +func (mcm *MaintenanceConfigManager) GetVacuumTaskConfigForType(taskType string) *worker_pb.VacuumTaskConfig { + return GetVacuumTaskConfig(mcm.config.Policy, MaintenanceTaskType(taskType)) +} + +// GetErasureCodingTaskConfigForType returns erasure coding configuration for a specific task type +func (mcm *MaintenanceConfigManager) GetErasureCodingTaskConfigForType(taskType string) *worker_pb.ErasureCodingTaskConfig { + return GetErasureCodingTaskConfig(mcm.config.Policy, MaintenanceTaskType(taskType)) +} + +// GetBalanceTaskConfigForType returns balance configuration for a specific task type +func (mcm *MaintenanceConfigManager) GetBalanceTaskConfigForType(taskType string) *worker_pb.BalanceTaskConfig { + return GetBalanceTaskConfig(mcm.config.Policy, MaintenanceTaskType(taskType)) +} + +// GetReplicationTaskConfigForType returns replication configuration for a specific task type +func (mcm *MaintenanceConfigManager) GetReplicationTaskConfigForType(taskType string) *worker_pb.ReplicationTaskConfig { + return GetReplicationTaskConfig(mcm.config.Policy, MaintenanceTaskType(taskType)) +} + +// Helper methods + +func (mcm *MaintenanceConfigManager) getTaskPolicy(taskType string) *worker_pb.TaskPolicy { + if mcm.config.Policy != nil && mcm.config.Policy.TaskPolicies != nil { + return mcm.config.Policy.TaskPolicies[taskType] + } + return nil +} + +// IsTaskEnabled returns whether a task type is enabled +func (mcm *MaintenanceConfigManager) IsTaskEnabled(taskType string) bool { + if policy := mcm.getTaskPolicy(taskType); policy != nil { + return policy.Enabled + } + return false +} + +// GetMaxConcurrent returns the max concurrent limit for a task type +func (mcm *MaintenanceConfigManager) GetMaxConcurrent(taskType string) int32 { + if policy := mcm.getTaskPolicy(taskType); policy != nil { + return policy.MaxConcurrent + } + return 1 // Default +} + +// GetRepeatInterval returns the repeat interval for a task type in seconds +func (mcm *MaintenanceConfigManager) GetRepeatInterval(taskType string) int32 { + if policy := mcm.getTaskPolicy(taskType); policy != nil { + return policy.RepeatIntervalSeconds + } + return mcm.config.Policy.DefaultRepeatIntervalSeconds +} + +// GetCheckInterval returns the check interval for a task type in seconds +func (mcm *MaintenanceConfigManager) GetCheckInterval(taskType string) int32 { + if policy := mcm.getTaskPolicy(taskType); policy != nil { + return policy.CheckIntervalSeconds + } + return mcm.config.Policy.DefaultCheckIntervalSeconds +} + +// Duration accessor methods + +// GetScanInterval returns the scan interval as a time.Duration +func (mcm *MaintenanceConfigManager) GetScanInterval() time.Duration { + return time.Duration(mcm.config.ScanIntervalSeconds) * time.Second +} + +// GetWorkerTimeout returns the worker timeout as a time.Duration +func (mcm *MaintenanceConfigManager) GetWorkerTimeout() time.Duration { + return time.Duration(mcm.config.WorkerTimeoutSeconds) * time.Second +} + +// GetTaskTimeout returns the task timeout as a time.Duration +func (mcm *MaintenanceConfigManager) GetTaskTimeout() time.Duration { + return time.Duration(mcm.config.TaskTimeoutSeconds) * time.Second +} + +// GetRetryDelay returns the retry delay as a time.Duration +func (mcm *MaintenanceConfigManager) GetRetryDelay() time.Duration { + return time.Duration(mcm.config.RetryDelaySeconds) * time.Second +} + +// GetCleanupInterval returns the cleanup interval as a time.Duration +func (mcm *MaintenanceConfigManager) GetCleanupInterval() 
time.Duration {
+	return time.Duration(mcm.config.CleanupIntervalSeconds) * time.Second
+}
+
+// GetTaskRetention returns the task retention period as a time.Duration
+func (mcm *MaintenanceConfigManager) GetTaskRetention() time.Duration {
+	return time.Duration(mcm.config.TaskRetentionSeconds) * time.Second
+}
+
+// ValidateMaintenanceConfigWithSchema validates protobuf maintenance configuration using ConfigField rules
+func ValidateMaintenanceConfigWithSchema(config *worker_pb.MaintenanceConfig) error {
+	if config == nil {
+		return fmt.Errorf("configuration cannot be nil")
+	}
+
+	// Get the schema to access field validation rules
+	schema := GetMaintenanceConfigSchema()
+
+	// Validate each field individually using the ConfigField rules
+	if err := validateFieldWithSchema(schema, "enabled", config.Enabled); err != nil {
+		return err
+	}
+
+	if err := validateFieldWithSchema(schema, "scan_interval_seconds", int(config.ScanIntervalSeconds)); err != nil {
+		return err
+	}
+
+	if err := validateFieldWithSchema(schema, "worker_timeout_seconds", int(config.WorkerTimeoutSeconds)); err != nil {
+		return err
+	}
+
+	if err := validateFieldWithSchema(schema, "task_timeout_seconds", int(config.TaskTimeoutSeconds)); err != nil {
+		return err
+	}
+
+	if err := validateFieldWithSchema(schema, "retry_delay_seconds", int(config.RetryDelaySeconds)); err != nil {
+		return err
+	}
+
+	if err := validateFieldWithSchema(schema, "max_retries", int(config.MaxRetries)); err != nil {
+		return err
+	}
+
+	if err := validateFieldWithSchema(schema, "cleanup_interval_seconds", int(config.CleanupIntervalSeconds)); err != nil {
+		return err
+	}
+
+	if err := validateFieldWithSchema(schema, "task_retention_seconds", int(config.TaskRetentionSeconds)); err != nil {
+		return err
+	}
+
+	// Validate policy fields if present
+	if config.Policy != nil {
+		if err := validatePolicyField("global_max_concurrent", int(config.Policy.GlobalMaxConcurrent)); err != nil {
+			return err
+		}
+
+		if err := validatePolicyField("default_repeat_interval_seconds", int(config.Policy.DefaultRepeatIntervalSeconds)); err != nil {
+			return err
+		}
+
+		if err := validatePolicyField("default_check_interval_seconds", int(config.Policy.DefaultCheckIntervalSeconds)); err != nil {
+			return err
+		}
+	}
+
+	return nil
+}
+
+// validateFieldWithSchema validates a single field using its ConfigField definition
+func validateFieldWithSchema(schema *MaintenanceConfigSchema, fieldName string, value interface{}) error {
+	field := schema.GetFieldByName(fieldName)
+	if field == nil {
+		// Field not in schema, skip validation
+		return nil
+	}
+
+	return field.ValidateValue(value)
+}
+
+// validatePolicyField validates policy fields (simplified validation for now).
+// The field names match the callers above; interval values are expressed in seconds.
+func validatePolicyField(fieldName string, value int) error {
+	switch fieldName {
+	case "global_max_concurrent":
+		if value < 1 || value > 20 {
+			return fmt.Errorf("Global Max Concurrent must be between 1 and 20, got %d", value)
+		}
+	case "default_repeat_interval_seconds":
+		if value < 60*60 || value > 168*60*60 {
+			return fmt.Errorf("Default Repeat Interval must be between 1 and 168 hours (3600-604800 seconds), got %d seconds", value)
+		}
+	case "default_check_interval_seconds":
+		if value < 60*60 || value > 168*60*60 {
+			return fmt.Errorf("Default Check Interval must be between 1 and 168 hours (3600-604800 seconds), got %d seconds", value)
+		}
+	}
+	return nil
+}
diff --git a/weed/admin/maintenance/maintenance_integration.go b/weed/admin/maintenance/maintenance_integration.go
index 9a965d38a..1bdd7ffcc 100644
--- a/weed/admin/maintenance/maintenance_integration.go +++ b/weed/admin/maintenance/maintenance_integration.go @@ -1,11 +1,20 @@ package maintenance import ( + "context" + "fmt" "time" + "github.com/seaweedfs/seaweedfs/weed/admin/topology" "github.com/seaweedfs/seaweedfs/weed/glog" + "github.com/seaweedfs/seaweedfs/weed/operation" + "github.com/seaweedfs/seaweedfs/weed/pb" + "github.com/seaweedfs/seaweedfs/weed/pb/master_pb" + "github.com/seaweedfs/seaweedfs/weed/pb/worker_pb" "github.com/seaweedfs/seaweedfs/weed/worker/tasks" "github.com/seaweedfs/seaweedfs/weed/worker/types" + "google.golang.org/grpc" + "google.golang.org/grpc/credentials/insecure" ) // MaintenanceIntegration bridges the task system with existing maintenance @@ -17,6 +26,12 @@ type MaintenanceIntegration struct { maintenanceQueue *MaintenanceQueue maintenancePolicy *MaintenancePolicy + // Pending operations tracker + pendingOperations *PendingOperations + + // Active topology for task detection and target selection + activeTopology *topology.ActiveTopology + // Type conversion maps taskTypeMap map[types.TaskType]MaintenanceTaskType revTaskTypeMap map[MaintenanceTaskType]types.TaskType @@ -31,8 +46,12 @@ func NewMaintenanceIntegration(queue *MaintenanceQueue, policy *MaintenancePolic uiRegistry: tasks.GetGlobalUIRegistry(), // Use global UI registry with auto-registered UI providers maintenanceQueue: queue, maintenancePolicy: policy, + pendingOperations: NewPendingOperations(), } + // Initialize active topology with 10 second recent task window + integration.activeTopology = topology.NewActiveTopology(10) + // Initialize type conversion maps integration.initializeTypeMaps() @@ -96,7 +115,7 @@ func (s *MaintenanceIntegration) registerAllTasks() { s.buildTaskTypeMappings() // Configure tasks from policy - s.configureTasksFromPolicy() + s.ConfigureTasksFromPolicy() registeredTaskTypes := make([]string, 0, len(s.taskTypeMap)) for _, maintenanceTaskType := range s.taskTypeMap { @@ -105,8 +124,8 @@ func (s *MaintenanceIntegration) registerAllTasks() { glog.V(1).Infof("Registered tasks: %v", registeredTaskTypes) } -// configureTasksFromPolicy dynamically configures all registered tasks based on the maintenance policy -func (s *MaintenanceIntegration) configureTasksFromPolicy() { +// ConfigureTasksFromPolicy dynamically configures all registered tasks based on the maintenance policy +func (s *MaintenanceIntegration) ConfigureTasksFromPolicy() { if s.maintenancePolicy == nil { return } @@ -143,7 +162,7 @@ func (s *MaintenanceIntegration) configureDetectorFromPolicy(taskType types.Task // Convert task system type to maintenance task type for policy lookup maintenanceTaskType, exists := s.taskTypeMap[taskType] if exists { - enabled := s.maintenancePolicy.IsTaskEnabled(maintenanceTaskType) + enabled := IsTaskEnabled(s.maintenancePolicy, maintenanceTaskType) basicDetector.SetEnabled(enabled) glog.V(3).Infof("Set enabled=%v for detector %s", enabled, taskType) } @@ -172,14 +191,14 @@ func (s *MaintenanceIntegration) configureSchedulerFromPolicy(taskType types.Tas // Set enabled status if scheduler supports it if enableableScheduler, ok := scheduler.(interface{ SetEnabled(bool) }); ok { - enabled := s.maintenancePolicy.IsTaskEnabled(maintenanceTaskType) + enabled := IsTaskEnabled(s.maintenancePolicy, maintenanceTaskType) enableableScheduler.SetEnabled(enabled) glog.V(3).Infof("Set enabled=%v for scheduler %s", enabled, taskType) } // Set max concurrent if scheduler supports it if concurrentScheduler, ok := scheduler.(interface{ 
SetMaxConcurrent(int) }); ok { - maxConcurrent := s.maintenancePolicy.GetMaxConcurrent(maintenanceTaskType) + maxConcurrent := GetMaxConcurrent(s.maintenancePolicy, maintenanceTaskType) if maxConcurrent > 0 { concurrentScheduler.SetMaxConcurrent(maxConcurrent) glog.V(3).Infof("Set max concurrent=%d for scheduler %s", maxConcurrent, taskType) @@ -193,11 +212,20 @@ func (s *MaintenanceIntegration) configureSchedulerFromPolicy(taskType types.Tas // ScanWithTaskDetectors performs a scan using the task system func (s *MaintenanceIntegration) ScanWithTaskDetectors(volumeMetrics []*types.VolumeHealthMetrics) ([]*TaskDetectionResult, error) { + // Note: ActiveTopology gets updated from topology info instead of volume metrics + glog.V(2).Infof("Processed %d volume metrics for task detection", len(volumeMetrics)) + + // Filter out volumes with pending operations to avoid duplicates + filteredMetrics := s.pendingOperations.FilterVolumeMetricsExcludingPending(volumeMetrics) + + glog.V(1).Infof("Scanning %d volumes (filtered from %d) excluding pending operations", + len(filteredMetrics), len(volumeMetrics)) + var allResults []*TaskDetectionResult // Create cluster info clusterInfo := &types.ClusterInfo{ - TotalVolumes: len(volumeMetrics), + TotalVolumes: len(filteredMetrics), LastUpdated: time.Now(), } @@ -209,17 +237,26 @@ func (s *MaintenanceIntegration) ScanWithTaskDetectors(volumeMetrics []*types.Vo glog.V(2).Infof("Running detection for task type: %s", taskType) - results, err := detector.ScanForTasks(volumeMetrics, clusterInfo) + results, err := detector.ScanForTasks(filteredMetrics, clusterInfo) if err != nil { glog.Errorf("Failed to scan for %s tasks: %v", taskType, err) continue } - // Convert results to existing system format + // Convert results to existing system format and check for conflicts for _, result := range results { existingResult := s.convertToExistingFormat(result) if existingResult != nil { - allResults = append(allResults, existingResult) + // Double-check for conflicts with pending operations + opType := s.mapMaintenanceTaskTypeToPendingOperationType(existingResult.TaskType) + if !s.pendingOperations.WouldConflictWithPending(existingResult.VolumeID, opType) { + // Plan destination for operations that need it + s.planDestinationForTask(existingResult, opType) + allResults = append(allResults, existingResult) + } else { + glog.V(2).Infof("Skipping task %s for volume %d due to conflict with pending operation", + existingResult.TaskType, existingResult.VolumeID) + } } } @@ -229,6 +266,11 @@ func (s *MaintenanceIntegration) ScanWithTaskDetectors(volumeMetrics []*types.Vo return allResults, nil } +// UpdateTopologyInfo updates the volume shard tracker with topology information for empty servers +func (s *MaintenanceIntegration) UpdateTopologyInfo(topologyInfo *master_pb.TopologyInfo) error { + return s.activeTopology.UpdateTopology(topologyInfo) +} + // convertToExistingFormat converts task results to existing system format using dynamic mapping func (s *MaintenanceIntegration) convertToExistingFormat(result *types.TaskDetectionResult) *TaskDetectionResult { // Convert types using mapping tables @@ -241,49 +283,62 @@ func (s *MaintenanceIntegration) convertToExistingFormat(result *types.TaskDetec existingPriority, exists := s.priorityMap[result.Priority] if !exists { - glog.Warningf("Unknown priority %d, defaulting to normal", result.Priority) + glog.Warningf("Unknown priority %s, defaulting to normal", result.Priority) existingPriority = PriorityNormal } return 
&TaskDetectionResult{ - TaskType: existingType, - VolumeID: result.VolumeID, - Server: result.Server, - Collection: result.Collection, - Priority: existingPriority, - Reason: result.Reason, - Parameters: result.Parameters, - ScheduleAt: result.ScheduleAt, + TaskType: existingType, + VolumeID: result.VolumeID, + Server: result.Server, + Collection: result.Collection, + Priority: existingPriority, + Reason: result.Reason, + TypedParams: result.TypedParams, + ScheduleAt: result.ScheduleAt, } } // CanScheduleWithTaskSchedulers determines if a task can be scheduled using task schedulers with dynamic type conversion func (s *MaintenanceIntegration) CanScheduleWithTaskSchedulers(task *MaintenanceTask, runningTasks []*MaintenanceTask, availableWorkers []*MaintenanceWorker) bool { + glog.Infof("DEBUG CanScheduleWithTaskSchedulers: Checking task %s (type: %s)", task.ID, task.Type) + // Convert existing types to task types using mapping taskType, exists := s.revTaskTypeMap[task.Type] if !exists { - glog.V(2).Infof("Unknown task type %s for scheduling, falling back to existing logic", task.Type) + glog.Infof("DEBUG CanScheduleWithTaskSchedulers: Unknown task type %s for scheduling, falling back to existing logic", task.Type) return false // Fallback to existing logic for unknown types } + glog.Infof("DEBUG CanScheduleWithTaskSchedulers: Mapped task type %s to %s", task.Type, taskType) + // Convert task objects taskObject := s.convertTaskToTaskSystem(task) if taskObject == nil { - glog.V(2).Infof("Failed to convert task %s for scheduling", task.ID) + glog.Infof("DEBUG CanScheduleWithTaskSchedulers: Failed to convert task %s for scheduling", task.ID) return false } + glog.Infof("DEBUG CanScheduleWithTaskSchedulers: Successfully converted task %s", task.ID) + runningTaskObjects := s.convertTasksToTaskSystem(runningTasks) workerObjects := s.convertWorkersToTaskSystem(availableWorkers) + glog.Infof("DEBUG CanScheduleWithTaskSchedulers: Converted %d running tasks and %d workers", len(runningTaskObjects), len(workerObjects)) + // Get the appropriate scheduler scheduler := s.taskRegistry.GetScheduler(taskType) if scheduler == nil { - glog.V(2).Infof("No scheduler found for task type %s", taskType) + glog.Infof("DEBUG CanScheduleWithTaskSchedulers: No scheduler found for task type %s", taskType) return false } - return scheduler.CanScheduleNow(taskObject, runningTaskObjects, workerObjects) + glog.Infof("DEBUG CanScheduleWithTaskSchedulers: Found scheduler for task type %s", taskType) + + canSchedule := scheduler.CanScheduleNow(taskObject, runningTaskObjects, workerObjects) + glog.Infof("DEBUG CanScheduleWithTaskSchedulers: Scheduler decision for task %s: %v", task.ID, canSchedule) + + return canSchedule } // convertTaskToTaskSystem converts existing task to task system format using dynamic mapping @@ -304,14 +359,14 @@ func (s *MaintenanceIntegration) convertTaskToTaskSystem(task *MaintenanceTask) } return &types.Task{ - ID: task.ID, - Type: taskType, - Priority: priority, - VolumeID: task.VolumeID, - Server: task.Server, - Collection: task.Collection, - Parameters: task.Parameters, - CreatedAt: task.CreatedAt, + ID: task.ID, + Type: taskType, + Priority: priority, + VolumeID: task.VolumeID, + Server: task.Server, + Collection: task.Collection, + TypedParams: task.TypedParams, + CreatedAt: task.CreatedAt, } } @@ -407,3 +462,463 @@ func (s *MaintenanceIntegration) GetAllTaskStats() []*types.TaskStats { return stats } + +// mapMaintenanceTaskTypeToPendingOperationType converts a maintenance task type to a 
pending operation type +func (s *MaintenanceIntegration) mapMaintenanceTaskTypeToPendingOperationType(taskType MaintenanceTaskType) PendingOperationType { + switch taskType { + case MaintenanceTaskType("balance"): + return OpTypeVolumeBalance + case MaintenanceTaskType("erasure_coding"): + return OpTypeErasureCoding + case MaintenanceTaskType("vacuum"): + return OpTypeVacuum + case MaintenanceTaskType("replication"): + return OpTypeReplication + default: + // For other task types, assume they're volume operations + return OpTypeVolumeMove + } +} + +// GetPendingOperations returns the pending operations tracker +func (s *MaintenanceIntegration) GetPendingOperations() *PendingOperations { + return s.pendingOperations +} + +// GetActiveTopology returns the active topology for task detection +func (s *MaintenanceIntegration) GetActiveTopology() *topology.ActiveTopology { + return s.activeTopology +} + +// planDestinationForTask plans the destination for a task that requires it and creates typed protobuf parameters +func (s *MaintenanceIntegration) planDestinationForTask(task *TaskDetectionResult, opType PendingOperationType) { + // Only plan destinations for operations that move volumes/shards + if opType == OpTypeVacuum { + // For vacuum tasks, create VacuumTaskParams + s.createVacuumTaskParams(task) + return + } + + glog.V(1).Infof("Planning destination for %s task on volume %d (server: %s)", task.TaskType, task.VolumeID, task.Server) + + // Use ActiveTopology for destination planning + destinationPlan, err := s.planDestinationWithActiveTopology(task, opType) + + if err != nil { + glog.Warningf("Failed to plan primary destination for %s task volume %d: %v", + task.TaskType, task.VolumeID, err) + // Don't return here - still try to create task params which might work with multiple destinations + } + + // Create typed protobuf parameters based on operation type + switch opType { + case OpTypeErasureCoding: + if destinationPlan == nil { + glog.Warningf("Cannot create EC task for volume %d: destination planning failed", task.VolumeID) + return + } + s.createErasureCodingTaskParams(task, destinationPlan) + case OpTypeVolumeMove, OpTypeVolumeBalance: + if destinationPlan == nil { + glog.Warningf("Cannot create balance task for volume %d: destination planning failed", task.VolumeID) + return + } + s.createBalanceTaskParams(task, destinationPlan.(*topology.DestinationPlan)) + case OpTypeReplication: + if destinationPlan == nil { + glog.Warningf("Cannot create replication task for volume %d: destination planning failed", task.VolumeID) + return + } + s.createReplicationTaskParams(task, destinationPlan.(*topology.DestinationPlan)) + default: + glog.V(2).Infof("Unknown operation type for task %s: %v", task.TaskType, opType) + } + + if destinationPlan != nil { + switch plan := destinationPlan.(type) { + case *topology.DestinationPlan: + glog.V(1).Infof("Completed destination planning for %s task on volume %d: %s -> %s", + task.TaskType, task.VolumeID, task.Server, plan.TargetNode) + case *topology.MultiDestinationPlan: + glog.V(1).Infof("Completed EC destination planning for volume %d: %s -> %d destinations (racks: %d, DCs: %d)", + task.VolumeID, task.Server, len(plan.Plans), plan.SuccessfulRack, plan.SuccessfulDCs) + } + } else { + glog.V(1).Infof("Completed destination planning for %s task on volume %d: no destination planned", + task.TaskType, task.VolumeID) + } +} + +// createVacuumTaskParams creates typed parameters for vacuum tasks +func (s *MaintenanceIntegration) createVacuumTaskParams(task 
*TaskDetectionResult) { + // Get configuration from policy instead of using hard-coded values + vacuumConfig := GetVacuumTaskConfig(s.maintenancePolicy, MaintenanceTaskType("vacuum")) + + // Use configured values or defaults if config is not available + garbageThreshold := 0.3 // Default 30% + verifyChecksum := true // Default to verify + batchSize := int32(1000) // Default batch size + workingDir := "/tmp/seaweedfs_vacuum_work" // Default working directory + + if vacuumConfig != nil { + garbageThreshold = vacuumConfig.GarbageThreshold + // Note: VacuumTaskConfig has GarbageThreshold, MinVolumeAgeHours, MinIntervalSeconds + // Other fields like VerifyChecksum, BatchSize, WorkingDir would need to be added + // to the protobuf definition if they should be configurable + } + + // Create typed protobuf parameters + task.TypedParams = &worker_pb.TaskParams{ + VolumeId: task.VolumeID, + Server: task.Server, + Collection: task.Collection, + TaskParams: &worker_pb.TaskParams_VacuumParams{ + VacuumParams: &worker_pb.VacuumTaskParams{ + GarbageThreshold: garbageThreshold, + ForceVacuum: false, + BatchSize: batchSize, + WorkingDir: workingDir, + VerifyChecksum: verifyChecksum, + }, + }, + } +} + +// planDestinationWithActiveTopology uses ActiveTopology to plan destinations +func (s *MaintenanceIntegration) planDestinationWithActiveTopology(task *TaskDetectionResult, opType PendingOperationType) (interface{}, error) { + // Get source node information from topology + var sourceRack, sourceDC string + + // Extract rack and DC from topology info + topologyInfo := s.activeTopology.GetTopologyInfo() + if topologyInfo != nil { + for _, dc := range topologyInfo.DataCenterInfos { + for _, rack := range dc.RackInfos { + for _, dataNodeInfo := range rack.DataNodeInfos { + if dataNodeInfo.Id == task.Server { + sourceDC = dc.Id + sourceRack = rack.Id + break + } + } + if sourceRack != "" { + break + } + } + if sourceDC != "" { + break + } + } + } + + switch opType { + case OpTypeVolumeBalance, OpTypeVolumeMove: + // Plan single destination for balance operation + return s.activeTopology.PlanBalanceDestination(task.VolumeID, task.Server, sourceRack, sourceDC, 0) + + case OpTypeErasureCoding: + // Plan multiple destinations for EC operation using adaptive shard counts + // Start with the default configuration, but fall back to smaller configurations if insufficient disks + totalShards := s.getOptimalECShardCount() + multiPlan, err := s.activeTopology.PlanECDestinations(task.VolumeID, task.Server, sourceRack, sourceDC, totalShards) + if err != nil { + return nil, err + } + if multiPlan != nil && len(multiPlan.Plans) > 0 { + // Return the multi-destination plan for EC + return multiPlan, nil + } + return nil, fmt.Errorf("no EC destinations found") + + default: + return nil, fmt.Errorf("unsupported operation type for destination planning: %v", opType) + } +} + +// createErasureCodingTaskParams creates typed parameters for EC tasks +func (s *MaintenanceIntegration) createErasureCodingTaskParams(task *TaskDetectionResult, destinationPlan interface{}) { + // Determine EC shard counts based on the number of planned destinations + multiPlan, ok := destinationPlan.(*topology.MultiDestinationPlan) + if !ok { + glog.Warningf("EC task for volume %d received unexpected destination plan type", task.VolumeID) + task.TypedParams = nil + return + } + + // Use adaptive shard configuration based on actual planned destinations + totalShards := len(multiPlan.Plans) + dataShards, parityShards := s.getECShardCounts(totalShards) + + // 
Extract disk-aware destinations from the multi-destination plan + var destinations []*worker_pb.ECDestination + var allConflicts []string + + for _, plan := range multiPlan.Plans { + allConflicts = append(allConflicts, plan.Conflicts...) + + // Create disk-aware destination + destinations = append(destinations, &worker_pb.ECDestination{ + Node: plan.TargetNode, + DiskId: plan.TargetDisk, + Rack: plan.TargetRack, + DataCenter: plan.TargetDC, + PlacementScore: plan.PlacementScore, + }) + } + + glog.V(1).Infof("EC destination planning for volume %d: got %d destinations (%d+%d shards) across %d racks and %d DCs", + task.VolumeID, len(destinations), dataShards, parityShards, multiPlan.SuccessfulRack, multiPlan.SuccessfulDCs) + + if len(destinations) == 0 { + glog.Warningf("No destinations available for EC task volume %d - rejecting task", task.VolumeID) + task.TypedParams = nil + return + } + + // Collect existing EC shard locations for cleanup + existingShardLocations := s.collectExistingEcShardLocations(task.VolumeID) + + // Create EC task parameters + ecParams := &worker_pb.ErasureCodingTaskParams{ + Destinations: destinations, // Disk-aware destinations + DataShards: dataShards, + ParityShards: parityShards, + WorkingDir: "/tmp/seaweedfs_ec_work", + MasterClient: "localhost:9333", + CleanupSource: true, + ExistingShardLocations: existingShardLocations, // Pass existing shards for cleanup + } + + // Add placement conflicts if any + if len(allConflicts) > 0 { + // Remove duplicates + conflictMap := make(map[string]bool) + var uniqueConflicts []string + for _, conflict := range allConflicts { + if !conflictMap[conflict] { + conflictMap[conflict] = true + uniqueConflicts = append(uniqueConflicts, conflict) + } + } + ecParams.PlacementConflicts = uniqueConflicts + } + + // Wrap in TaskParams + task.TypedParams = &worker_pb.TaskParams{ + VolumeId: task.VolumeID, + Server: task.Server, + Collection: task.Collection, + TaskParams: &worker_pb.TaskParams_ErasureCodingParams{ + ErasureCodingParams: ecParams, + }, + } + + glog.V(1).Infof("Created EC task params with %d destinations for volume %d", + len(destinations), task.VolumeID) +} + +// createBalanceTaskParams creates typed parameters for balance/move tasks +func (s *MaintenanceIntegration) createBalanceTaskParams(task *TaskDetectionResult, destinationPlan *topology.DestinationPlan) { + // balanceConfig could be used for future config options like ImbalanceThreshold, MinServerCount + + // Create balance task parameters + balanceParams := &worker_pb.BalanceTaskParams{ + DestNode: destinationPlan.TargetNode, + EstimatedSize: destinationPlan.ExpectedSize, + DestRack: destinationPlan.TargetRack, + DestDc: destinationPlan.TargetDC, + PlacementScore: destinationPlan.PlacementScore, + ForceMove: false, // Default to false + TimeoutSeconds: 300, // Default 5 minutes + } + + // Add placement conflicts if any + if len(destinationPlan.Conflicts) > 0 { + balanceParams.PlacementConflicts = destinationPlan.Conflicts + } + + // Note: balanceConfig would have ImbalanceThreshold, MinServerCount if needed for future enhancements + + // Wrap in TaskParams + task.TypedParams = &worker_pb.TaskParams{ + VolumeId: task.VolumeID, + Server: task.Server, + Collection: task.Collection, + TaskParams: &worker_pb.TaskParams_BalanceParams{ + BalanceParams: balanceParams, + }, + } + + glog.V(1).Infof("Created balance task params for volume %d: %s -> %s (score: %.2f)", + task.VolumeID, task.Server, destinationPlan.TargetNode, destinationPlan.PlacementScore) +} + +// 
createReplicationTaskParams creates typed parameters for replication tasks
+func (s *MaintenanceIntegration) createReplicationTaskParams(task *TaskDetectionResult, destinationPlan *topology.DestinationPlan) {
+	// replicationConfig could be used for future config options like TargetReplicaCount
+
+	// Create replication task parameters
+	replicationParams := &worker_pb.ReplicationTaskParams{
+		DestNode:       destinationPlan.TargetNode,
+		DestRack:       destinationPlan.TargetRack,
+		DestDc:         destinationPlan.TargetDC,
+		PlacementScore: destinationPlan.PlacementScore,
+	}
+
+	// Add placement conflicts if any
+	if len(destinationPlan.Conflicts) > 0 {
+		replicationParams.PlacementConflicts = destinationPlan.Conflicts
+	}
+
+	// Note: replicationConfig would have TargetReplicaCount if needed for future enhancements
+
+	// Wrap in TaskParams
+	task.TypedParams = &worker_pb.TaskParams{
+		VolumeId:   task.VolumeID,
+		Server:     task.Server,
+		Collection: task.Collection,
+		TaskParams: &worker_pb.TaskParams_ReplicationParams{
+			ReplicationParams: replicationParams,
+		},
+	}
+
+	glog.V(1).Infof("Created replication task params for volume %d: %s -> %s",
+		task.VolumeID, task.Server, destinationPlan.TargetNode)
+}
+
+// getOptimalECShardCount returns the optimal number of EC shards based on available disks
+// Uses a simplified approach to avoid blocking during UI access
+func (s *MaintenanceIntegration) getOptimalECShardCount() int {
+	// Try to get available disks quickly, but don't block if topology is busy
+	availableDisks := s.getAvailableDisksQuickly()
+
+	// EC configurations in order of preference: (data+parity=total)
+	// Use smaller configurations for smaller clusters
+	if availableDisks >= 14 {
+		glog.V(1).Infof("Using default EC configuration: 10+4=14 shards for %d available disks", availableDisks)
+		return 14 // Default: 10+4
+	} else if availableDisks >= 6 {
+		glog.V(1).Infof("Using small cluster EC configuration: 4+2=6 shards for %d available disks", availableDisks)
+		return 6 // Small cluster: 4+2
+	} else if availableDisks >= 4 {
+		glog.V(1).Infof("Using minimal EC configuration: 3+1=4 shards for %d available disks", availableDisks)
+		return 4 // Minimal: 3+1
+	} else {
+		glog.V(1).Infof("Using very small cluster EC configuration: 2+1=3 shards for %d available disks", availableDisks)
+		return 3 // Very small: 2+1
+	}
+}
+
+// getAvailableDisksQuickly returns available disk count with a fast path to avoid UI blocking
+func (s *MaintenanceIntegration) getAvailableDisksQuickly() int {
+	// Use ActiveTopology's optimized disk counting if available
+	// Query with the EC task type and an empty node filter for a general availability check
+	allDisks := s.activeTopology.GetAvailableDisks(topology.TaskTypeErasureCoding, "")
+	if len(allDisks) > 0 {
+		return len(allDisks)
+	}
+
+	// Fallback: try to count from topology but don't hold locks for too long
+	topologyInfo := s.activeTopology.GetTopologyInfo()
+	return s.countAvailableDisks(topologyInfo)
+}
+
+// countAvailableDisks counts the total number of available disks in the topology
+func (s *MaintenanceIntegration) countAvailableDisks(topologyInfo *master_pb.TopologyInfo) int {
+	if topologyInfo == nil {
+		return 0
+	}
+
+	diskCount := 0
+	for _, dc := range topologyInfo.DataCenterInfos {
+		for _, rack := range dc.RackInfos {
+			for _, node := range rack.DataNodeInfos {
+				diskCount += len(node.DiskInfos)
+			}
+		}
+	}
+
+	return diskCount
+}
+
+// getECShardCounts determines data and parity shard counts for a given total
+func (s *MaintenanceIntegration) 
getECShardCounts(totalShards int) (int32, int32) { + // Map total shards to (data, parity) configurations + switch totalShards { + case 14: + return 10, 4 // Default: 10+4 + case 9: + return 6, 3 // Medium: 6+3 + case 6: + return 4, 2 // Small: 4+2 + case 4: + return 3, 1 // Minimal: 3+1 + case 3: + return 2, 1 // Very small: 2+1 + default: + // For any other total, try to maintain roughly 3:1 or 4:1 ratio + if totalShards >= 4 { + parityShards := totalShards / 4 + if parityShards < 1 { + parityShards = 1 + } + dataShards := totalShards - parityShards + return int32(dataShards), int32(parityShards) + } + // Fallback for very small clusters + return int32(totalShards - 1), 1 + } +} + +// collectExistingEcShardLocations queries the master for existing EC shard locations during planning +func (s *MaintenanceIntegration) collectExistingEcShardLocations(volumeId uint32) []*worker_pb.ExistingECShardLocation { + var existingShardLocations []*worker_pb.ExistingECShardLocation + + // Use insecure connection for simplicity - in production this might be configurable + grpcDialOption := grpc.WithTransportCredentials(insecure.NewCredentials()) + + err := operation.WithMasterServerClient(false, pb.ServerAddress("localhost:9333"), grpcDialOption, + func(masterClient master_pb.SeaweedClient) error { + req := &master_pb.LookupEcVolumeRequest{ + VolumeId: volumeId, + } + resp, err := masterClient.LookupEcVolume(context.Background(), req) + if err != nil { + // If volume doesn't exist as EC volume, that's fine - just no existing shards + glog.V(1).Infof("LookupEcVolume for volume %d returned: %v (this is normal if no existing EC shards)", volumeId, err) + return nil + } + + // Group shard locations by server + serverShardMap := make(map[string][]uint32) + for _, shardIdLocation := range resp.ShardIdLocations { + shardId := uint32(shardIdLocation.ShardId) + for _, location := range shardIdLocation.Locations { + serverAddr := pb.NewServerAddressFromLocation(location) + serverShardMap[string(serverAddr)] = append(serverShardMap[string(serverAddr)], shardId) + } + } + + // Convert to protobuf format + for serverAddr, shardIds := range serverShardMap { + existingShardLocations = append(existingShardLocations, &worker_pb.ExistingECShardLocation{ + Node: serverAddr, + ShardIds: shardIds, + }) + } + + return nil + }) + + if err != nil { + glog.Errorf("Failed to lookup existing EC shards from master for volume %d: %v", volumeId, err) + // Return empty list - cleanup will be skipped but task can continue + return []*worker_pb.ExistingECShardLocation{} + } + + if len(existingShardLocations) > 0 { + glog.V(1).Infof("Found existing EC shards for volume %d on %d servers during planning", volumeId, len(existingShardLocations)) + } + + return existingShardLocations +} diff --git a/weed/admin/maintenance/maintenance_manager.go b/weed/admin/maintenance/maintenance_manager.go index 5d87d817e..4aab137e0 100644 --- a/weed/admin/maintenance/maintenance_manager.go +++ b/weed/admin/maintenance/maintenance_manager.go @@ -7,8 +7,76 @@ import ( "time" "github.com/seaweedfs/seaweedfs/weed/glog" + "github.com/seaweedfs/seaweedfs/weed/pb/worker_pb" + "github.com/seaweedfs/seaweedfs/weed/worker/tasks/balance" + "github.com/seaweedfs/seaweedfs/weed/worker/tasks/erasure_coding" + "github.com/seaweedfs/seaweedfs/weed/worker/tasks/vacuum" ) +// buildPolicyFromTaskConfigs loads task configurations from separate files and builds a MaintenancePolicy +func buildPolicyFromTaskConfigs() *worker_pb.MaintenancePolicy { + policy := 
&worker_pb.MaintenancePolicy{ + GlobalMaxConcurrent: 4, + DefaultRepeatIntervalSeconds: 6 * 3600, // 6 hours in seconds + DefaultCheckIntervalSeconds: 12 * 3600, // 12 hours in seconds + TaskPolicies: make(map[string]*worker_pb.TaskPolicy), + } + + // Load vacuum task configuration + if vacuumConfig := vacuum.LoadConfigFromPersistence(nil); vacuumConfig != nil { + policy.TaskPolicies["vacuum"] = &worker_pb.TaskPolicy{ + Enabled: vacuumConfig.Enabled, + MaxConcurrent: int32(vacuumConfig.MaxConcurrent), + RepeatIntervalSeconds: int32(vacuumConfig.ScanIntervalSeconds), + CheckIntervalSeconds: int32(vacuumConfig.ScanIntervalSeconds), + TaskConfig: &worker_pb.TaskPolicy_VacuumConfig{ + VacuumConfig: &worker_pb.VacuumTaskConfig{ + GarbageThreshold: float64(vacuumConfig.GarbageThreshold), + MinVolumeAgeHours: int32(vacuumConfig.MinVolumeAgeSeconds / 3600), // Convert seconds to hours + MinIntervalSeconds: int32(vacuumConfig.MinIntervalSeconds), + }, + }, + } + } + + // Load erasure coding task configuration + if ecConfig := erasure_coding.LoadConfigFromPersistence(nil); ecConfig != nil { + policy.TaskPolicies["erasure_coding"] = &worker_pb.TaskPolicy{ + Enabled: ecConfig.Enabled, + MaxConcurrent: int32(ecConfig.MaxConcurrent), + RepeatIntervalSeconds: int32(ecConfig.ScanIntervalSeconds), + CheckIntervalSeconds: int32(ecConfig.ScanIntervalSeconds), + TaskConfig: &worker_pb.TaskPolicy_ErasureCodingConfig{ + ErasureCodingConfig: &worker_pb.ErasureCodingTaskConfig{ + FullnessRatio: float64(ecConfig.FullnessRatio), + QuietForSeconds: int32(ecConfig.QuietForSeconds), + MinVolumeSizeMb: int32(ecConfig.MinSizeMB), + CollectionFilter: ecConfig.CollectionFilter, + }, + }, + } + } + + // Load balance task configuration + if balanceConfig := balance.LoadConfigFromPersistence(nil); balanceConfig != nil { + policy.TaskPolicies["balance"] = &worker_pb.TaskPolicy{ + Enabled: balanceConfig.Enabled, + MaxConcurrent: int32(balanceConfig.MaxConcurrent), + RepeatIntervalSeconds: int32(balanceConfig.ScanIntervalSeconds), + CheckIntervalSeconds: int32(balanceConfig.ScanIntervalSeconds), + TaskConfig: &worker_pb.TaskPolicy_BalanceConfig{ + BalanceConfig: &worker_pb.BalanceTaskConfig{ + ImbalanceThreshold: float64(balanceConfig.ImbalanceThreshold), + MinServerCount: int32(balanceConfig.MinServerCount), + }, + }, + } + } + + glog.V(1).Infof("Built maintenance policy from separate task configs - %d task policies loaded", len(policy.TaskPolicies)) + return policy +} + // MaintenanceManager coordinates the maintenance system type MaintenanceManager struct { config *MaintenanceConfig @@ -18,11 +86,12 @@ type MaintenanceManager struct { running bool stopChan chan struct{} // Error handling and backoff - errorCount int - lastError error - lastErrorTime time.Time - backoffDelay time.Duration - mutex sync.RWMutex + errorCount int + lastError error + lastErrorTime time.Time + backoffDelay time.Duration + mutex sync.RWMutex + scanInProgress bool } // NewMaintenanceManager creates a new maintenance manager @@ -31,8 +100,15 @@ func NewMaintenanceManager(adminClient AdminClient, config *MaintenanceConfig) * config = DefaultMaintenanceConfig() } - queue := NewMaintenanceQueue(config.Policy) - scanner := NewMaintenanceScanner(adminClient, config.Policy, queue) + // Use the policy from the config (which is populated from separate task files in LoadMaintenanceConfig) + policy := config.Policy + if policy == nil { + // Fallback: build policy from separate task configuration files if not already populated + policy = 
buildPolicyFromTaskConfigs() + } + + queue := NewMaintenanceQueue(policy) + scanner := NewMaintenanceScanner(adminClient, policy, queue) return &MaintenanceManager{ config: config, @@ -125,23 +201,14 @@ func (mm *MaintenanceManager) scanLoop() { return case <-ticker.C: glog.V(1).Infof("Performing maintenance scan every %v", scanInterval) - mm.performScan() - // Adjust ticker interval based on error state - mm.mutex.RLock() - currentInterval := scanInterval - if mm.errorCount > 0 { - // Use backoff delay when there are errors - currentInterval = mm.backoffDelay - if currentInterval > scanInterval { - // Don't make it longer than the configured interval * 10 - maxInterval := scanInterval * 10 - if currentInterval > maxInterval { - currentInterval = maxInterval - } - } + // Use the same synchronization as TriggerScan to prevent concurrent scans + if err := mm.triggerScanInternal(false); err != nil { + glog.V(1).Infof("Scheduled scan skipped: %v", err) } - mm.mutex.RUnlock() + + // Adjust ticker interval based on error state (read error state safely) + currentInterval := mm.getScanInterval(scanInterval) // Reset ticker with new interval if needed if currentInterval != scanInterval { @@ -152,6 +219,26 @@ func (mm *MaintenanceManager) scanLoop() { } } +// getScanInterval safely reads the current scan interval with error backoff +func (mm *MaintenanceManager) getScanInterval(baseInterval time.Duration) time.Duration { + mm.mutex.RLock() + defer mm.mutex.RUnlock() + + if mm.errorCount > 0 { + // Use backoff delay when there are errors + currentInterval := mm.backoffDelay + if currentInterval > baseInterval { + // Don't make it longer than the configured interval * 10 + maxInterval := baseInterval * 10 + if currentInterval > maxInterval { + currentInterval = maxInterval + } + } + return currentInterval + } + return baseInterval +} + // cleanupLoop periodically cleans up old tasks and stale workers func (mm *MaintenanceManager) cleanupLoop() { cleanupInterval := time.Duration(mm.config.CleanupIntervalSeconds) * time.Second @@ -170,25 +257,54 @@ func (mm *MaintenanceManager) cleanupLoop() { // performScan executes a maintenance scan with error handling and backoff func (mm *MaintenanceManager) performScan() { - mm.mutex.Lock() - defer mm.mutex.Unlock() + defer func() { + // Always reset scan in progress flag when done + mm.mutex.Lock() + mm.scanInProgress = false + mm.mutex.Unlock() + }() - glog.V(2).Infof("Starting maintenance scan") + glog.Infof("Starting maintenance scan...") results, err := mm.scanner.ScanForMaintenanceTasks() if err != nil { + // Handle scan error + mm.mutex.Lock() mm.handleScanError(err) + mm.mutex.Unlock() + glog.Warningf("Maintenance scan failed: %v", err) return } - // Scan succeeded, reset error tracking - mm.resetErrorTracking() + // Scan succeeded - update state and process results + mm.handleScanSuccess(results) +} - if len(results) > 0 { +// handleScanSuccess processes successful scan results with proper lock management +func (mm *MaintenanceManager) handleScanSuccess(results []*TaskDetectionResult) { + // Update manager state first + mm.mutex.Lock() + mm.resetErrorTracking() + taskCount := len(results) + mm.mutex.Unlock() + + if taskCount > 0 { + // Count tasks by type for logging (outside of lock) + taskCounts := make(map[MaintenanceTaskType]int) + for _, result := range results { + taskCounts[result.TaskType]++ + } + + // Add tasks to queue (no manager lock held) mm.queue.AddTasksFromResults(results) - glog.V(1).Infof("Maintenance scan completed: added %d tasks", 
len(results)) + + // Log detailed scan results + glog.Infof("Maintenance scan completed: found %d tasks", taskCount) + for taskType, count := range taskCounts { + glog.Infof(" - %s: %d tasks", taskType, count) + } } else { - glog.V(2).Infof("Maintenance scan completed: no tasks needed") + glog.Infof("Maintenance scan completed: no maintenance tasks needed") } } @@ -272,8 +388,19 @@ func (mm *MaintenanceManager) performCleanup() { removedTasks := mm.queue.CleanupOldTasks(taskRetention) removedWorkers := mm.queue.RemoveStaleWorkers(workerTimeout) - if removedTasks > 0 || removedWorkers > 0 { - glog.V(1).Infof("Cleanup completed: removed %d old tasks and %d stale workers", removedTasks, removedWorkers) + // Clean up stale pending operations (operations running for more than 4 hours) + staleOperationTimeout := 4 * time.Hour + removedOperations := 0 + if mm.scanner != nil && mm.scanner.integration != nil { + pendingOps := mm.scanner.integration.GetPendingOperations() + if pendingOps != nil { + removedOperations = pendingOps.CleanupStaleOperations(staleOperationTimeout) + } + } + + if removedTasks > 0 || removedWorkers > 0 || removedOperations > 0 { + glog.V(1).Infof("Cleanup completed: removed %d old tasks, %d stale workers, and %d stale operations", + removedTasks, removedWorkers, removedOperations) } } @@ -311,6 +438,21 @@ func (mm *MaintenanceManager) GetStats() *MaintenanceStats { return stats } +// ReloadTaskConfigurations reloads task configurations from the current policy +func (mm *MaintenanceManager) ReloadTaskConfigurations() error { + mm.mutex.Lock() + defer mm.mutex.Unlock() + + // Trigger configuration reload in the integration layer + if mm.scanner != nil && mm.scanner.integration != nil { + mm.scanner.integration.ConfigureTasksFromPolicy() + glog.V(1).Infof("Task configurations reloaded from policy") + return nil + } + + return fmt.Errorf("integration not available for configuration reload") +} + // GetErrorState returns the current error state for monitoring func (mm *MaintenanceManager) GetErrorState() (errorCount int, lastError error, backoffDelay time.Duration) { mm.mutex.RLock() @@ -330,10 +472,29 @@ func (mm *MaintenanceManager) GetWorkers() []*MaintenanceWorker { // TriggerScan manually triggers a maintenance scan func (mm *MaintenanceManager) TriggerScan() error { + return mm.triggerScanInternal(true) +} + +// triggerScanInternal handles both manual and automatic scan triggers +func (mm *MaintenanceManager) triggerScanInternal(isManual bool) error { if !mm.running { return fmt.Errorf("maintenance manager is not running") } + // Prevent multiple concurrent scans + mm.mutex.Lock() + if mm.scanInProgress { + mm.mutex.Unlock() + if isManual { + glog.V(1).Infof("Manual scan already in progress, ignoring trigger request") + } else { + glog.V(2).Infof("Automatic scan already in progress, ignoring scheduled scan") + } + return fmt.Errorf("scan already in progress") + } + mm.scanInProgress = true + mm.mutex.Unlock() + go mm.performScan() return nil } diff --git a/weed/admin/maintenance/maintenance_queue.go b/weed/admin/maintenance/maintenance_queue.go index 580a98718..ca402bd4d 100644 --- a/weed/admin/maintenance/maintenance_queue.go +++ b/weed/admin/maintenance/maintenance_queue.go @@ -1,10 +1,13 @@ package maintenance import ( + "crypto/rand" + "fmt" "sort" "time" "github.com/seaweedfs/seaweedfs/weed/glog" + "github.com/seaweedfs/seaweedfs/weed/pb/worker_pb" ) // NewMaintenanceQueue creates a new maintenance queue @@ -24,11 +27,18 @@ func (mq *MaintenanceQueue) 
SetIntegration(integration *MaintenanceIntegration) glog.V(1).Infof("Maintenance queue configured with integration") } -// AddTask adds a new maintenance task to the queue +// AddTask adds a new maintenance task to the queue with deduplication func (mq *MaintenanceQueue) AddTask(task *MaintenanceTask) { mq.mutex.Lock() defer mq.mutex.Unlock() + // Check for duplicate tasks (same type + volume + not completed) + if mq.hasDuplicateTask(task) { + glog.V(1).Infof("Task skipped (duplicate): %s for volume %d on %s (already queued or running)", + task.Type, task.VolumeID, task.Server) + return + } + task.ID = generateTaskID() task.Status = TaskStatusPending task.CreatedAt = time.Now() @@ -45,19 +55,48 @@ func (mq *MaintenanceQueue) AddTask(task *MaintenanceTask) { return mq.pendingTasks[i].ScheduledAt.Before(mq.pendingTasks[j].ScheduledAt) }) - glog.V(2).Infof("Added maintenance task %s: %s for volume %d", task.ID, task.Type, task.VolumeID) + scheduleInfo := "" + if !task.ScheduledAt.IsZero() && time.Until(task.ScheduledAt) > time.Minute { + scheduleInfo = fmt.Sprintf(", scheduled for %v", task.ScheduledAt.Format("15:04:05")) + } + + glog.Infof("Task queued: %s (%s) volume %d on %s, priority %d%s, reason: %s", + task.ID, task.Type, task.VolumeID, task.Server, task.Priority, scheduleInfo, task.Reason) +} + +// hasDuplicateTask checks if a similar task already exists (same type, volume, and not completed) +func (mq *MaintenanceQueue) hasDuplicateTask(newTask *MaintenanceTask) bool { + for _, existingTask := range mq.tasks { + if existingTask.Type == newTask.Type && + existingTask.VolumeID == newTask.VolumeID && + existingTask.Server == newTask.Server && + (existingTask.Status == TaskStatusPending || + existingTask.Status == TaskStatusAssigned || + existingTask.Status == TaskStatusInProgress) { + return true + } + } + return false } // AddTasksFromResults converts detection results to tasks and adds them to the queue func (mq *MaintenanceQueue) AddTasksFromResults(results []*TaskDetectionResult) { for _, result := range results { + // Validate that task has proper typed parameters + if result.TypedParams == nil { + glog.Warningf("Rejecting invalid task: %s for volume %d on %s - no typed parameters (insufficient destinations or planning failed)", + result.TaskType, result.VolumeID, result.Server) + continue + } + task := &MaintenanceTask{ - Type: result.TaskType, - Priority: result.Priority, - VolumeID: result.VolumeID, - Server: result.Server, - Collection: result.Collection, - Parameters: result.Parameters, + Type: result.TaskType, + Priority: result.Priority, + VolumeID: result.VolumeID, + Server: result.Server, + Collection: result.Collection, + // Copy typed protobuf parameters + TypedParams: result.TypedParams, Reason: result.Reason, ScheduledAt: result.ScheduleAt, } @@ -67,57 +106,92 @@ func (mq *MaintenanceQueue) AddTasksFromResults(results []*TaskDetectionResult) // GetNextTask returns the next available task for a worker func (mq *MaintenanceQueue) GetNextTask(workerID string, capabilities []MaintenanceTaskType) *MaintenanceTask { - mq.mutex.Lock() - defer mq.mutex.Unlock() + // Use read lock for initial checks and search + mq.mutex.RLock() worker, exists := mq.workers[workerID] if !exists { + mq.mutex.RUnlock() + glog.V(2).Infof("Task assignment failed for worker %s: worker not registered", workerID) return nil } // Check if worker has capacity if worker.CurrentLoad >= worker.MaxConcurrent { + mq.mutex.RUnlock() + glog.V(2).Infof("Task assignment failed for worker %s: at capacity (%d/%d)", 
workerID, worker.CurrentLoad, worker.MaxConcurrent) return nil } now := time.Now() + var selectedTask *MaintenanceTask + var selectedIndex int = -1 - // Find the next suitable task + // Find the next suitable task (using read lock) for i, task := range mq.pendingTasks { // Check if it's time to execute the task if task.ScheduledAt.After(now) { + glog.V(3).Infof("Task %s skipped for worker %s: scheduled for future (%v)", task.ID, workerID, task.ScheduledAt) continue } // Check if worker can handle this task type if !mq.workerCanHandle(task.Type, capabilities) { + glog.V(3).Infof("Task %s (%s) skipped for worker %s: capability mismatch (worker has: %v)", task.ID, task.Type, workerID, capabilities) continue } - // Check scheduling logic - use simplified system if available, otherwise fallback + // Check if this task type needs a cooldown period if !mq.canScheduleTaskNow(task) { + glog.V(3).Infof("Task %s (%s) skipped for worker %s: scheduling constraints not met", task.ID, task.Type, workerID) continue } - // Assign task to worker - task.Status = TaskStatusAssigned - task.WorkerID = workerID - startTime := now - task.StartedAt = &startTime - - // Remove from pending tasks - mq.pendingTasks = append(mq.pendingTasks[:i], mq.pendingTasks[i+1:]...) - - // Update worker - worker.CurrentTask = task - worker.CurrentLoad++ - worker.Status = "busy" - - glog.V(2).Infof("Assigned task %s to worker %s", task.ID, workerID) - return task + // Found a suitable task + selectedTask = task + selectedIndex = i + break } - return nil + // Release read lock + mq.mutex.RUnlock() + + // If no task found, return nil + if selectedTask == nil { + glog.V(2).Infof("No suitable tasks available for worker %s (checked %d pending tasks)", workerID, len(mq.pendingTasks)) + return nil + } + + // Now acquire write lock to actually assign the task + mq.mutex.Lock() + defer mq.mutex.Unlock() + + // Re-check that the task is still available (it might have been assigned to another worker) + if selectedIndex >= len(mq.pendingTasks) || mq.pendingTasks[selectedIndex].ID != selectedTask.ID { + glog.V(2).Infof("Task %s no longer available for worker %s: assigned to another worker", selectedTask.ID, workerID) + return nil + } + + // Assign the task + selectedTask.Status = TaskStatusAssigned + selectedTask.WorkerID = workerID + selectedTask.StartedAt = &now + + // Remove from pending tasks + mq.pendingTasks = append(mq.pendingTasks[:selectedIndex], mq.pendingTasks[selectedIndex+1:]...) 
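+	// Note: the pending task list may have been modified between releasing the
+	// read lock and acquiring the write lock above, which is why the index and
+	// task ID were re-validated before this removal.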
+ + // Update worker load + if worker, exists := mq.workers[workerID]; exists { + worker.CurrentLoad++ + } + + // Track pending operation + mq.trackPendingOperation(selectedTask) + + glog.Infof("Task assigned: %s (%s) → worker %s (volume %d, server %s)", + selectedTask.ID, selectedTask.Type, workerID, selectedTask.VolumeID, selectedTask.Server) + + return selectedTask } // CompleteTask marks a task as completed @@ -127,12 +201,19 @@ func (mq *MaintenanceQueue) CompleteTask(taskID string, error string) { task, exists := mq.tasks[taskID] if !exists { + glog.Warningf("Attempted to complete non-existent task: %s", taskID) return } completedTime := time.Now() task.CompletedAt = &completedTime + // Calculate task duration + var duration time.Duration + if task.StartedAt != nil { + duration = completedTime.Sub(*task.StartedAt) + } + if error != "" { task.Status = TaskStatusFailed task.Error = error @@ -148,14 +229,17 @@ func (mq *MaintenanceQueue) CompleteTask(taskID string, error string) { task.ScheduledAt = time.Now().Add(15 * time.Minute) // Retry delay mq.pendingTasks = append(mq.pendingTasks, task) - glog.V(2).Infof("Retrying task %s (attempt %d/%d)", taskID, task.RetryCount, task.MaxRetries) + glog.Warningf("Task failed, scheduling retry: %s (%s) attempt %d/%d, worker %s, duration %v, error: %s", + taskID, task.Type, task.RetryCount, task.MaxRetries, task.WorkerID, duration, error) } else { - glog.Errorf("Task %s failed permanently after %d retries: %s", taskID, task.MaxRetries, error) + glog.Errorf("Task failed permanently: %s (%s) worker %s, duration %v, after %d retries: %s", + taskID, task.Type, task.WorkerID, duration, task.MaxRetries, error) } } else { task.Status = TaskStatusCompleted task.Progress = 100 - glog.V(2).Infof("Task %s completed successfully", taskID) + glog.Infof("Task completed: %s (%s) worker %s, duration %v, volume %d", + taskID, task.Type, task.WorkerID, duration, task.VolumeID) } // Update worker @@ -168,6 +252,11 @@ func (mq *MaintenanceQueue) CompleteTask(taskID string, error string) { } } } + + // Remove pending operation (unless it's being retried) + if task.Status != TaskStatusPending { + mq.removePendingOperation(taskID) + } } // UpdateTaskProgress updates the progress of a running task @@ -176,8 +265,26 @@ func (mq *MaintenanceQueue) UpdateTaskProgress(taskID string, progress float64) defer mq.mutex.RUnlock() if task, exists := mq.tasks[taskID]; exists { + oldProgress := task.Progress task.Progress = progress task.Status = TaskStatusInProgress + + // Update pending operation status + mq.updatePendingOperationStatus(taskID, "in_progress") + + // Log progress at significant milestones or changes + if progress == 0 { + glog.V(1).Infof("Task started: %s (%s) worker %s, volume %d", + taskID, task.Type, task.WorkerID, task.VolumeID) + } else if progress >= 100 { + glog.V(1).Infof("Task progress: %s (%s) worker %s, %.1f%% complete", + taskID, task.Type, task.WorkerID, progress) + } else if progress-oldProgress >= 25 { // Log every 25% increment + glog.V(1).Infof("Task progress: %s (%s) worker %s, %.1f%% complete", + taskID, task.Type, task.WorkerID, progress) + } + } else { + glog.V(2).Infof("Progress update for unknown task: %s (%.1f%%)", taskID, progress) } } @@ -186,12 +293,25 @@ func (mq *MaintenanceQueue) RegisterWorker(worker *MaintenanceWorker) { mq.mutex.Lock() defer mq.mutex.Unlock() + isNewWorker := true + if existingWorker, exists := mq.workers[worker.ID]; exists { + isNewWorker = false + glog.Infof("Worker reconnected: %s at %s (capabilities: %v, max 
concurrent: %d)", + worker.ID, worker.Address, worker.Capabilities, worker.MaxConcurrent) + + // Preserve current load when reconnecting + worker.CurrentLoad = existingWorker.CurrentLoad + } else { + glog.Infof("Worker registered: %s at %s (capabilities: %v, max concurrent: %d)", + worker.ID, worker.Address, worker.Capabilities, worker.MaxConcurrent) + } + worker.LastHeartbeat = time.Now() worker.Status = "active" - worker.CurrentLoad = 0 + if isNewWorker { + worker.CurrentLoad = 0 + } mq.workers[worker.ID] = worker - - glog.V(1).Infof("Registered maintenance worker %s at %s", worker.ID, worker.Address) } // UpdateWorkerHeartbeat updates worker heartbeat @@ -200,7 +320,15 @@ func (mq *MaintenanceQueue) UpdateWorkerHeartbeat(workerID string) { defer mq.mutex.Unlock() if worker, exists := mq.workers[workerID]; exists { + lastSeen := worker.LastHeartbeat worker.LastHeartbeat = time.Now() + + // Log if worker was offline for a while + if time.Since(lastSeen) > 2*time.Minute { + glog.Infof("Worker %s heartbeat resumed after %v", workerID, time.Since(lastSeen)) + } + } else { + glog.V(2).Infof("Heartbeat from unknown worker: %s", workerID) } } @@ -255,7 +383,7 @@ func (mq *MaintenanceQueue) getRepeatPreventionInterval(taskType MaintenanceTask // Fallback to policy configuration if no scheduler available or scheduler doesn't provide default if mq.policy != nil { - repeatIntervalHours := mq.policy.GetRepeatInterval(taskType) + repeatIntervalHours := GetRepeatInterval(mq.policy, taskType) if repeatIntervalHours > 0 { interval := time.Duration(repeatIntervalHours) * time.Hour glog.V(3).Infof("Using policy configuration repeat interval for %s: %v", taskType, interval) @@ -311,10 +439,23 @@ func (mq *MaintenanceQueue) GetWorkers() []*MaintenanceWorker { func generateTaskID() string { const charset = "abcdefghijklmnopqrstuvwxyz0123456789" b := make([]byte, 8) - for i := range b { - b[i] = charset[i%len(charset)] + randBytes := make([]byte, 8) + + // Generate random bytes + if _, err := rand.Read(randBytes); err != nil { + // Fallback to timestamp-based ID if crypto/rand fails + timestamp := time.Now().UnixNano() + return fmt.Sprintf("task-%d", timestamp) } - return string(b) + + // Convert random bytes to charset + for i := range b { + b[i] = charset[int(randBytes[i])%len(charset)] + } + + // Add timestamp suffix to ensure uniqueness + timestamp := time.Now().Unix() % 10000 // last 4 digits of timestamp + return fmt.Sprintf("%s-%04d", string(b), timestamp) } // CleanupOldTasks removes old completed and failed tasks @@ -427,19 +568,31 @@ func (mq *MaintenanceQueue) workerCanHandle(taskType MaintenanceTaskType, capabi // canScheduleTaskNow determines if a task can be scheduled using task schedulers or fallback logic func (mq *MaintenanceQueue) canScheduleTaskNow(task *MaintenanceTask) bool { + glog.V(2).Infof("Checking if task %s (type: %s) can be scheduled", task.ID, task.Type) + + // TEMPORARY FIX: Skip integration task scheduler which is being overly restrictive + // Use fallback logic directly for now + glog.V(2).Infof("Using fallback logic for task scheduling") + canExecute := mq.canExecuteTaskType(task.Type) + glog.V(2).Infof("Fallback decision for task %s: %v", task.ID, canExecute) + return canExecute + + // NOTE: Original integration code disabled temporarily // Try task scheduling logic first - if mq.integration != nil { - // Get all running tasks and available workers - runningTasks := mq.getRunningTasks() - availableWorkers := mq.getAvailableWorkers() + /* + if mq.integration != nil { + 
glog.Infof("DEBUG canScheduleTaskNow: Using integration task scheduler") + // Get all running tasks and available workers + runningTasks := mq.getRunningTasks() + availableWorkers := mq.getAvailableWorkers() - canSchedule := mq.integration.CanScheduleWithTaskSchedulers(task, runningTasks, availableWorkers) - glog.V(3).Infof("Task scheduler decision for task %s (%s): %v", task.ID, task.Type, canSchedule) - return canSchedule - } + glog.Infof("DEBUG canScheduleTaskNow: Running tasks: %d, Available workers: %d", len(runningTasks), len(availableWorkers)) - // Fallback to hardcoded logic - return mq.canExecuteTaskType(task.Type) + canSchedule := mq.integration.CanScheduleWithTaskSchedulers(task, runningTasks, availableWorkers) + glog.Infof("DEBUG canScheduleTaskNow: Task scheduler decision for task %s (%s): %v", task.ID, task.Type, canSchedule) + return canSchedule + } + */ } // canExecuteTaskType checks if we can execute more tasks of this type (concurrency limits) - fallback logic @@ -465,7 +618,7 @@ func (mq *MaintenanceQueue) getMaxConcurrentForTaskType(taskType MaintenanceTask // Fallback to policy configuration if no scheduler available or scheduler doesn't provide default if mq.policy != nil { - maxConcurrent := mq.policy.GetMaxConcurrent(taskType) + maxConcurrent := GetMaxConcurrent(mq.policy, taskType) if maxConcurrent > 0 { glog.V(3).Infof("Using policy configuration max concurrent for %s: %d", taskType, maxConcurrent) return maxConcurrent @@ -498,3 +651,108 @@ func (mq *MaintenanceQueue) getAvailableWorkers() []*MaintenanceWorker { } return availableWorkers } + +// trackPendingOperation adds a task to the pending operations tracker +func (mq *MaintenanceQueue) trackPendingOperation(task *MaintenanceTask) { + if mq.integration == nil { + return + } + + pendingOps := mq.integration.GetPendingOperations() + if pendingOps == nil { + return + } + + // Skip tracking for tasks without proper typed parameters + if task.TypedParams == nil { + glog.V(2).Infof("Skipping pending operation tracking for task %s - no typed parameters", task.ID) + return + } + + // Map maintenance task type to pending operation type + var opType PendingOperationType + switch task.Type { + case MaintenanceTaskType("balance"): + opType = OpTypeVolumeBalance + case MaintenanceTaskType("erasure_coding"): + opType = OpTypeErasureCoding + case MaintenanceTaskType("vacuum"): + opType = OpTypeVacuum + case MaintenanceTaskType("replication"): + opType = OpTypeReplication + default: + opType = OpTypeVolumeMove + } + + // Determine destination node and estimated size from typed parameters + destNode := "" + estimatedSize := uint64(1024 * 1024 * 1024) // Default 1GB estimate + + switch params := task.TypedParams.TaskParams.(type) { + case *worker_pb.TaskParams_ErasureCodingParams: + if params.ErasureCodingParams != nil { + if len(params.ErasureCodingParams.Destinations) > 0 { + destNode = params.ErasureCodingParams.Destinations[0].Node + } + if params.ErasureCodingParams.EstimatedShardSize > 0 { + estimatedSize = params.ErasureCodingParams.EstimatedShardSize + } + } + case *worker_pb.TaskParams_BalanceParams: + if params.BalanceParams != nil { + destNode = params.BalanceParams.DestNode + if params.BalanceParams.EstimatedSize > 0 { + estimatedSize = params.BalanceParams.EstimatedSize + } + } + case *worker_pb.TaskParams_ReplicationParams: + if params.ReplicationParams != nil { + destNode = params.ReplicationParams.DestNode + if params.ReplicationParams.EstimatedSize > 0 { + estimatedSize = params.ReplicationParams.EstimatedSize 
+ } + } + } + + operation := &PendingOperation{ + VolumeID: task.VolumeID, + OperationType: opType, + SourceNode: task.Server, + DestNode: destNode, + TaskID: task.ID, + StartTime: time.Now(), + EstimatedSize: estimatedSize, + Collection: task.Collection, + Status: "assigned", + } + + pendingOps.AddOperation(operation) +} + +// removePendingOperation removes a task from the pending operations tracker +func (mq *MaintenanceQueue) removePendingOperation(taskID string) { + if mq.integration == nil { + return + } + + pendingOps := mq.integration.GetPendingOperations() + if pendingOps == nil { + return + } + + pendingOps.RemoveOperation(taskID) +} + +// updatePendingOperationStatus updates the status of a pending operation +func (mq *MaintenanceQueue) updatePendingOperationStatus(taskID string, status string) { + if mq.integration == nil { + return + } + + pendingOps := mq.integration.GetPendingOperations() + if pendingOps == nil { + return + } + + pendingOps.UpdateOperationStatus(taskID, status) +} diff --git a/weed/admin/maintenance/maintenance_queue_test.go b/weed/admin/maintenance/maintenance_queue_test.go new file mode 100644 index 000000000..2c38471a0 --- /dev/null +++ b/weed/admin/maintenance/maintenance_queue_test.go @@ -0,0 +1,353 @@ +package maintenance + +import ( + "testing" + + "github.com/seaweedfs/seaweedfs/weed/pb/worker_pb" +) + +// Test suite for canScheduleTaskNow() function and related scheduling logic +// +// This test suite ensures that: +// 1. The fallback scheduling logic works correctly when no integration is present +// 2. Task concurrency limits are properly enforced per task type +// 3. Different task types don't interfere with each other's concurrency limits +// 4. Custom policies with higher concurrency limits work correctly +// 5. Edge cases (nil tasks, empty task types) are handled gracefully +// 6. Helper functions (GetRunningTaskCount, canExecuteTaskType, etc.) work correctly +// +// Background: The canScheduleTaskNow() function is critical for task assignment. +// It was previously failing due to an overly restrictive integration scheduler, +// so we implemented a temporary fix that bypasses the integration and uses +// fallback logic based on simple concurrency limits per task type. 
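+
+// Roughly speaking, the fallback path reduces the scheduling decision exercised by
+// these tests to a per-type concurrency check:
+//
+//	running := mq.GetRunningTaskCount(task.Type)        // counts assigned + in_progress tasks
+//	limit := mq.getMaxConcurrentForTaskType(task.Type)  // from policy, defaults to 1
+//	canSchedule := running < limit
+//
+// With the default limit of 1, two tasks of the same type never run concurrently,
+// while tasks of different types do not affect each other; a policy can raise
+// MaxConcurrent for a task type to allow more parallelism.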
+ +func TestCanScheduleTaskNow_FallbackLogic(t *testing.T) { + // Test the current implementation which uses fallback logic + mq := &MaintenanceQueue{ + tasks: make(map[string]*MaintenanceTask), + pendingTasks: []*MaintenanceTask{}, + workers: make(map[string]*MaintenanceWorker), + policy: nil, // No policy for default behavior + integration: nil, // No integration to force fallback + } + + task := &MaintenanceTask{ + ID: "test-task-1", + Type: MaintenanceTaskType("erasure_coding"), + Status: TaskStatusPending, + } + + // Should return true with fallback logic (no running tasks, default max concurrent = 1) + result := mq.canScheduleTaskNow(task) + if !result { + t.Errorf("Expected canScheduleTaskNow to return true with fallback logic, got false") + } +} + +func TestCanScheduleTaskNow_FallbackWithRunningTasks(t *testing.T) { + // Test fallback logic when there are already running tasks + mq := &MaintenanceQueue{ + tasks: map[string]*MaintenanceTask{ + "running-task": { + ID: "running-task", + Type: MaintenanceTaskType("erasure_coding"), + Status: TaskStatusInProgress, + }, + }, + pendingTasks: []*MaintenanceTask{}, + workers: make(map[string]*MaintenanceWorker), + policy: nil, + integration: nil, + } + + task := &MaintenanceTask{ + ID: "test-task-2", + Type: MaintenanceTaskType("erasure_coding"), + Status: TaskStatusPending, + } + + // Should return false because max concurrent is 1 and we have 1 running task + result := mq.canScheduleTaskNow(task) + if result { + t.Errorf("Expected canScheduleTaskNow to return false when at capacity, got true") + } +} + +func TestCanScheduleTaskNow_DifferentTaskTypes(t *testing.T) { + // Test that different task types don't interfere with each other + mq := &MaintenanceQueue{ + tasks: map[string]*MaintenanceTask{ + "running-ec-task": { + ID: "running-ec-task", + Type: MaintenanceTaskType("erasure_coding"), + Status: TaskStatusInProgress, + }, + }, + pendingTasks: []*MaintenanceTask{}, + workers: make(map[string]*MaintenanceWorker), + policy: nil, + integration: nil, + } + + // Test vacuum task when EC task is running + vacuumTask := &MaintenanceTask{ + ID: "vacuum-task", + Type: MaintenanceTaskType("vacuum"), + Status: TaskStatusPending, + } + + // Should return true because vacuum and erasure_coding are different task types + result := mq.canScheduleTaskNow(vacuumTask) + if !result { + t.Errorf("Expected canScheduleTaskNow to return true for different task type, got false") + } + + // Test another EC task when one is already running + ecTask := &MaintenanceTask{ + ID: "ec-task", + Type: MaintenanceTaskType("erasure_coding"), + Status: TaskStatusPending, + } + + // Should return false because max concurrent for EC is 1 and we have 1 running + result = mq.canScheduleTaskNow(ecTask) + if result { + t.Errorf("Expected canScheduleTaskNow to return false for same task type at capacity, got true") + } +} + +func TestCanScheduleTaskNow_WithIntegration(t *testing.T) { + // Test with a real MaintenanceIntegration (will use fallback logic in current implementation) + policy := &MaintenancePolicy{ + TaskPolicies: make(map[string]*worker_pb.TaskPolicy), + GlobalMaxConcurrent: 10, + DefaultRepeatIntervalSeconds: 24 * 60 * 60, // 24 hours in seconds + DefaultCheckIntervalSeconds: 60 * 60, // 1 hour in seconds + } + mq := NewMaintenanceQueue(policy) + + // Create a basic integration (this would normally be more complex) + integration := NewMaintenanceIntegration(mq, policy) + mq.SetIntegration(integration) + + task := &MaintenanceTask{ + ID: "test-task-3", + Type: 
MaintenanceTaskType("erasure_coding"), + Status: TaskStatusPending, + } + + // With our current implementation (fallback logic), this should return true + result := mq.canScheduleTaskNow(task) + if !result { + t.Errorf("Expected canScheduleTaskNow to return true with fallback logic, got false") + } +} + +func TestGetRunningTaskCount(t *testing.T) { + // Test the helper function used by fallback logic + mq := &MaintenanceQueue{ + tasks: map[string]*MaintenanceTask{ + "task1": { + ID: "task1", + Type: MaintenanceTaskType("erasure_coding"), + Status: TaskStatusInProgress, + }, + "task2": { + ID: "task2", + Type: MaintenanceTaskType("erasure_coding"), + Status: TaskStatusAssigned, + }, + "task3": { + ID: "task3", + Type: MaintenanceTaskType("vacuum"), + Status: TaskStatusInProgress, + }, + "task4": { + ID: "task4", + Type: MaintenanceTaskType("erasure_coding"), + Status: TaskStatusCompleted, + }, + }, + pendingTasks: []*MaintenanceTask{}, + workers: make(map[string]*MaintenanceWorker), + } + + // Should count 2 running EC tasks (in_progress + assigned) + ecCount := mq.GetRunningTaskCount(MaintenanceTaskType("erasure_coding")) + if ecCount != 2 { + t.Errorf("Expected 2 running EC tasks, got %d", ecCount) + } + + // Should count 1 running vacuum task + vacuumCount := mq.GetRunningTaskCount(MaintenanceTaskType("vacuum")) + if vacuumCount != 1 { + t.Errorf("Expected 1 running vacuum task, got %d", vacuumCount) + } + + // Should count 0 running balance tasks + balanceCount := mq.GetRunningTaskCount(MaintenanceTaskType("balance")) + if balanceCount != 0 { + t.Errorf("Expected 0 running balance tasks, got %d", balanceCount) + } +} + +func TestCanExecuteTaskType(t *testing.T) { + // Test the fallback logic helper function + mq := &MaintenanceQueue{ + tasks: map[string]*MaintenanceTask{ + "running-task": { + ID: "running-task", + Type: MaintenanceTaskType("erasure_coding"), + Status: TaskStatusInProgress, + }, + }, + pendingTasks: []*MaintenanceTask{}, + workers: make(map[string]*MaintenanceWorker), + policy: nil, // Will use default max concurrent = 1 + integration: nil, + } + + // Should return false for EC (1 running, max = 1) + result := mq.canExecuteTaskType(MaintenanceTaskType("erasure_coding")) + if result { + t.Errorf("Expected canExecuteTaskType to return false for EC at capacity, got true") + } + + // Should return true for vacuum (0 running, max = 1) + result = mq.canExecuteTaskType(MaintenanceTaskType("vacuum")) + if !result { + t.Errorf("Expected canExecuteTaskType to return true for vacuum, got false") + } +} + +func TestGetMaxConcurrentForTaskType_DefaultBehavior(t *testing.T) { + // Test the default behavior when no policy or integration is set + mq := &MaintenanceQueue{ + tasks: make(map[string]*MaintenanceTask), + pendingTasks: []*MaintenanceTask{}, + workers: make(map[string]*MaintenanceWorker), + policy: nil, + integration: nil, + } + + // Should return default value of 1 + maxConcurrent := mq.getMaxConcurrentForTaskType(MaintenanceTaskType("erasure_coding")) + if maxConcurrent != 1 { + t.Errorf("Expected default max concurrent to be 1, got %d", maxConcurrent) + } + + maxConcurrent = mq.getMaxConcurrentForTaskType(MaintenanceTaskType("vacuum")) + if maxConcurrent != 1 { + t.Errorf("Expected default max concurrent to be 1, got %d", maxConcurrent) + } +} + +// Test edge cases and error conditions +func TestCanScheduleTaskNow_NilTask(t *testing.T) { + mq := &MaintenanceQueue{ + tasks: make(map[string]*MaintenanceTask), + pendingTasks: []*MaintenanceTask{}, + workers: 
make(map[string]*MaintenanceWorker), + policy: nil, + integration: nil, + } + + // This should panic with a nil task, so we expect and catch the panic + defer func() { + if r := recover(); r == nil { + t.Errorf("Expected canScheduleTaskNow to panic with nil task, but it didn't") + } + }() + + // This should panic + mq.canScheduleTaskNow(nil) +} + +func TestCanScheduleTaskNow_EmptyTaskType(t *testing.T) { + mq := &MaintenanceQueue{ + tasks: make(map[string]*MaintenanceTask), + pendingTasks: []*MaintenanceTask{}, + workers: make(map[string]*MaintenanceWorker), + policy: nil, + integration: nil, + } + + task := &MaintenanceTask{ + ID: "empty-type-task", + Type: MaintenanceTaskType(""), // Empty task type + Status: TaskStatusPending, + } + + // Should handle empty task type gracefully + result := mq.canScheduleTaskNow(task) + if !result { + t.Errorf("Expected canScheduleTaskNow to handle empty task type, got false") + } +} + +func TestCanScheduleTaskNow_WithPolicy(t *testing.T) { + // Test with a policy that allows higher concurrency + policy := &MaintenancePolicy{ + TaskPolicies: map[string]*worker_pb.TaskPolicy{ + string(MaintenanceTaskType("erasure_coding")): { + Enabled: true, + MaxConcurrent: 3, + RepeatIntervalSeconds: 60 * 60, // 1 hour + CheckIntervalSeconds: 60 * 60, // 1 hour + }, + string(MaintenanceTaskType("vacuum")): { + Enabled: true, + MaxConcurrent: 2, + RepeatIntervalSeconds: 60 * 60, // 1 hour + CheckIntervalSeconds: 60 * 60, // 1 hour + }, + }, + GlobalMaxConcurrent: 10, + DefaultRepeatIntervalSeconds: 24 * 60 * 60, // 24 hours in seconds + DefaultCheckIntervalSeconds: 60 * 60, // 1 hour in seconds + } + + mq := &MaintenanceQueue{ + tasks: map[string]*MaintenanceTask{ + "running-task-1": { + ID: "running-task-1", + Type: MaintenanceTaskType("erasure_coding"), + Status: TaskStatusInProgress, + }, + "running-task-2": { + ID: "running-task-2", + Type: MaintenanceTaskType("erasure_coding"), + Status: TaskStatusAssigned, + }, + }, + pendingTasks: []*MaintenanceTask{}, + workers: make(map[string]*MaintenanceWorker), + policy: policy, + integration: nil, + } + + task := &MaintenanceTask{ + ID: "test-task-policy", + Type: MaintenanceTaskType("erasure_coding"), + Status: TaskStatusPending, + } + + // Should return true because we have 2 running EC tasks but max is 3 + result := mq.canScheduleTaskNow(task) + if !result { + t.Errorf("Expected canScheduleTaskNow to return true with policy allowing 3 concurrent, got false") + } + + // Add one more running task to reach the limit + mq.tasks["running-task-3"] = &MaintenanceTask{ + ID: "running-task-3", + Type: MaintenanceTaskType("erasure_coding"), + Status: TaskStatusInProgress, + } + + // Should return false because we now have 3 running EC tasks (at limit) + result = mq.canScheduleTaskNow(task) + if result { + t.Errorf("Expected canScheduleTaskNow to return false when at policy limit, got true") + } +} diff --git a/weed/admin/maintenance/maintenance_scanner.go b/weed/admin/maintenance/maintenance_scanner.go index 271765ef8..ef41b78ed 100644 --- a/weed/admin/maintenance/maintenance_scanner.go +++ b/weed/admin/maintenance/maintenance_scanner.go @@ -43,7 +43,18 @@ func (ms *MaintenanceScanner) ScanForMaintenanceTasks() ([]*TaskDetectionResult, // Convert metrics to task system format taskMetrics := ms.convertToTaskMetrics(volumeMetrics) - // Use task detection system + // Update topology information for complete cluster view (including empty servers) + // This must happen before task detection to ensure EC placement can consider all 
servers + if ms.lastTopologyInfo != nil { + if err := ms.integration.UpdateTopologyInfo(ms.lastTopologyInfo); err != nil { + glog.Errorf("Failed to update topology info for empty servers: %v", err) + // Don't fail the scan - continue with just volume-bearing servers + } else { + glog.V(1).Infof("Updated topology info for complete cluster view including empty servers") + } + } + + // Use task detection system with complete cluster information results, err := ms.integration.ScanWithTaskDetectors(taskMetrics) if err != nil { glog.Errorf("Task scanning failed: %v", err) @@ -62,25 +73,60 @@ func (ms *MaintenanceScanner) ScanForMaintenanceTasks() ([]*TaskDetectionResult, // getVolumeHealthMetrics collects health information for all volumes func (ms *MaintenanceScanner) getVolumeHealthMetrics() ([]*VolumeHealthMetrics, error) { var metrics []*VolumeHealthMetrics + var volumeSizeLimitMB uint64 + glog.V(1).Infof("Collecting volume health metrics from master") err := ms.adminClient.WithMasterClient(func(client master_pb.SeaweedClient) error { + // First, get volume size limit from master configuration + configResp, err := client.GetMasterConfiguration(context.Background(), &master_pb.GetMasterConfigurationRequest{}) + if err != nil { + glog.Warningf("Failed to get volume size limit from master: %v", err) + volumeSizeLimitMB = 30000 // Default to 30GB if we can't get from master + } else { + volumeSizeLimitMB = uint64(configResp.VolumeSizeLimitMB) + } + + // Now get volume list resp, err := client.VolumeList(context.Background(), &master_pb.VolumeListRequest{}) if err != nil { return err } if resp.TopologyInfo == nil { + glog.Warningf("No topology info received from master") return nil } + volumeSizeLimitBytes := volumeSizeLimitMB * 1024 * 1024 // Convert MB to bytes + + // Track all nodes discovered in topology + var allNodesInTopology []string + var nodesWithVolumes []string + var nodesWithoutVolumes []string + for _, dc := range resp.TopologyInfo.DataCenterInfos { + glog.V(2).Infof("Processing datacenter: %s", dc.Id) for _, rack := range dc.RackInfos { + glog.V(2).Infof("Processing rack: %s in datacenter: %s", rack.Id, dc.Id) for _, node := range rack.DataNodeInfos { - for _, diskInfo := range node.DiskInfos { + allNodesInTopology = append(allNodesInTopology, node.Id) + glog.V(2).Infof("Found volume server in topology: %s (disks: %d)", node.Id, len(node.DiskInfos)) + + hasVolumes := false + // Process each disk on this node + for diskType, diskInfo := range node.DiskInfos { + if len(diskInfo.VolumeInfos) > 0 { + hasVolumes = true + glog.V(2).Infof("Volume server %s disk %s has %d volumes", node.Id, diskType, len(diskInfo.VolumeInfos)) + } + + // Process volumes on this specific disk for _, volInfo := range diskInfo.VolumeInfos { metric := &VolumeHealthMetrics{ VolumeID: volInfo.Id, Server: node.Id, + DiskType: diskType, // Track which disk this volume is on + DiskId: volInfo.DiskId, // Use disk ID from volume info Collection: volInfo.Collection, Size: volInfo.Size, DeletedBytes: volInfo.DeletedByteCount, @@ -94,31 +140,58 @@ func (ms *MaintenanceScanner) getVolumeHealthMetrics() ([]*VolumeHealthMetrics, // Calculate derived metrics if metric.Size > 0 { metric.GarbageRatio = float64(metric.DeletedBytes) / float64(metric.Size) - // Calculate fullness ratio (would need volume size limit) - // metric.FullnessRatio = float64(metric.Size) / float64(volumeSizeLimit) + // Calculate fullness ratio using actual volume size limit from master + metric.FullnessRatio = float64(metric.Size) / 
float64(volumeSizeLimitBytes) } metric.Age = time.Since(metric.LastModified) + glog.V(3).Infof("Volume %d on %s:%s (ID %d): size=%d, limit=%d, fullness=%.2f", + metric.VolumeID, metric.Server, metric.DiskType, metric.DiskId, metric.Size, volumeSizeLimitBytes, metric.FullnessRatio) + metrics = append(metrics, metric) } } + + if hasVolumes { + nodesWithVolumes = append(nodesWithVolumes, node.Id) + } else { + nodesWithoutVolumes = append(nodesWithoutVolumes, node.Id) + glog.V(1).Infof("Volume server %s found in topology but has no volumes", node.Id) + } } } } + glog.Infof("Topology discovery complete:") + glog.Infof(" - Total volume servers in topology: %d (%v)", len(allNodesInTopology), allNodesInTopology) + glog.Infof(" - Volume servers with volumes: %d (%v)", len(nodesWithVolumes), nodesWithVolumes) + glog.Infof(" - Volume servers without volumes: %d (%v)", len(nodesWithoutVolumes), nodesWithoutVolumes) + glog.Infof("Note: Maintenance system will track empty servers separately from volume metrics.") + + // Store topology info for volume shard tracker + ms.lastTopologyInfo = resp.TopologyInfo + return nil }) if err != nil { + glog.Errorf("Failed to get volume health metrics: %v", err) return nil, err } + glog.V(1).Infof("Successfully collected metrics for %d actual volumes with disk ID information", len(metrics)) + // Count actual replicas and identify EC volumes ms.enrichVolumeMetrics(metrics) return metrics, nil } +// getTopologyInfo returns the last collected topology information +func (ms *MaintenanceScanner) getTopologyInfo() *master_pb.TopologyInfo { + return ms.lastTopologyInfo +} + // enrichVolumeMetrics adds additional information like replica counts func (ms *MaintenanceScanner) enrichVolumeMetrics(metrics []*VolumeHealthMetrics) { // Group volumes by ID to count replicas @@ -127,13 +200,17 @@ func (ms *MaintenanceScanner) enrichVolumeMetrics(metrics []*VolumeHealthMetrics volumeGroups[metric.VolumeID] = append(volumeGroups[metric.VolumeID], metric) } - // Update replica counts - for _, group := range volumeGroups { - actualReplicas := len(group) - for _, metric := range group { - metric.ReplicaCount = actualReplicas + // Update replica counts for actual volumes + for volumeID, replicas := range volumeGroups { + replicaCount := len(replicas) + for _, replica := range replicas { + replica.ReplicaCount = replicaCount } + glog.V(3).Infof("Volume %d has %d replicas", volumeID, replicaCount) } + + // TODO: Identify EC volumes by checking volume structure + // This would require querying volume servers for EC shard information } // convertToTaskMetrics converts existing volume metrics to task system format @@ -144,6 +221,8 @@ func (ms *MaintenanceScanner) convertToTaskMetrics(metrics []*VolumeHealthMetric simplified = append(simplified, &types.VolumeHealthMetrics{ VolumeID: metric.VolumeID, Server: metric.Server, + DiskType: metric.DiskType, + DiskId: metric.DiskId, Collection: metric.Collection, Size: metric.Size, DeletedBytes: metric.DeletedBytes, @@ -159,5 +238,6 @@ func (ms *MaintenanceScanner) convertToTaskMetrics(metrics []*VolumeHealthMetric }) } + glog.V(2).Infof("Converted %d volume metrics with disk ID information for task detection", len(simplified)) return simplified } diff --git a/weed/admin/maintenance/maintenance_types.go b/weed/admin/maintenance/maintenance_types.go index 6b8c2e9a0..e863b26e6 100644 --- a/weed/admin/maintenance/maintenance_types.go +++ b/weed/admin/maintenance/maintenance_types.go @@ -8,6 +8,7 @@ import ( "github.com/seaweedfs/seaweedfs/weed/glog" 
"github.com/seaweedfs/seaweedfs/weed/pb/master_pb" + "github.com/seaweedfs/seaweedfs/weed/pb/worker_pb" "github.com/seaweedfs/seaweedfs/weed/worker/tasks" "github.com/seaweedfs/seaweedfs/weed/worker/types" ) @@ -96,7 +97,7 @@ type MaintenanceTask struct { VolumeID uint32 `json:"volume_id,omitempty"` Server string `json:"server,omitempty"` Collection string `json:"collection,omitempty"` - Parameters map[string]interface{} `json:"parameters,omitempty"` + TypedParams *worker_pb.TaskParams `json:"typed_params,omitempty"` Reason string `json:"reason"` CreatedAt time.Time `json:"created_at"` ScheduledAt time.Time `json:"scheduled_at"` @@ -109,90 +110,149 @@ type MaintenanceTask struct { MaxRetries int `json:"max_retries"` } +// MaintenanceConfig holds configuration for the maintenance system +// DEPRECATED: Use worker_pb.MaintenanceConfig instead +type MaintenanceConfig = worker_pb.MaintenanceConfig + +// MaintenancePolicy defines policies for maintenance operations +// DEPRECATED: Use worker_pb.MaintenancePolicy instead +type MaintenancePolicy = worker_pb.MaintenancePolicy + // TaskPolicy represents configuration for a specific task type -type TaskPolicy struct { - Enabled bool `json:"enabled"` - MaxConcurrent int `json:"max_concurrent"` - RepeatInterval int `json:"repeat_interval"` // Hours to wait before repeating - CheckInterval int `json:"check_interval"` // Hours between checks - Configuration map[string]interface{} `json:"configuration"` // Task-specific config +// DEPRECATED: Use worker_pb.TaskPolicy instead +type TaskPolicy = worker_pb.TaskPolicy + +// Default configuration values +func DefaultMaintenanceConfig() *MaintenanceConfig { + return DefaultMaintenanceConfigProto() } -// MaintenancePolicy defines policies for maintenance operations using a dynamic structure -type MaintenancePolicy struct { - // Task-specific policies mapped by task type - TaskPolicies map[MaintenanceTaskType]*TaskPolicy `json:"task_policies"` +// Policy helper functions (since we can't add methods to type aliases) - // Global policy settings - GlobalMaxConcurrent int `json:"global_max_concurrent"` // Overall limit across all task types - DefaultRepeatInterval int `json:"default_repeat_interval"` // Default hours if task doesn't specify - DefaultCheckInterval int `json:"default_check_interval"` // Default hours for periodic checks -} - -// GetTaskPolicy returns the policy for a specific task type, creating generic defaults if needed -func (mp *MaintenancePolicy) GetTaskPolicy(taskType MaintenanceTaskType) *TaskPolicy { +// GetTaskPolicy returns the policy for a specific task type +func GetTaskPolicy(mp *MaintenancePolicy, taskType MaintenanceTaskType) *TaskPolicy { if mp.TaskPolicies == nil { - mp.TaskPolicies = make(map[MaintenanceTaskType]*TaskPolicy) + return nil } - - policy, exists := mp.TaskPolicies[taskType] - if !exists { - // Create generic default policy using global settings - no hardcoded fallbacks - policy = &TaskPolicy{ - Enabled: false, // Conservative default - require explicit enabling - MaxConcurrent: 1, // Conservative default concurrency - RepeatInterval: mp.DefaultRepeatInterval, // Use configured default, 0 if not set - CheckInterval: mp.DefaultCheckInterval, // Use configured default, 0 if not set - Configuration: make(map[string]interface{}), - } - mp.TaskPolicies[taskType] = policy - } - - return policy + return mp.TaskPolicies[string(taskType)] } // SetTaskPolicy sets the policy for a specific task type -func (mp *MaintenancePolicy) SetTaskPolicy(taskType MaintenanceTaskType, policy 
*TaskPolicy) { +func SetTaskPolicy(mp *MaintenancePolicy, taskType MaintenanceTaskType, policy *TaskPolicy) { if mp.TaskPolicies == nil { - mp.TaskPolicies = make(map[MaintenanceTaskType]*TaskPolicy) + mp.TaskPolicies = make(map[string]*TaskPolicy) } - mp.TaskPolicies[taskType] = policy + mp.TaskPolicies[string(taskType)] = policy } // IsTaskEnabled returns whether a task type is enabled -func (mp *MaintenancePolicy) IsTaskEnabled(taskType MaintenanceTaskType) bool { - policy := mp.GetTaskPolicy(taskType) +func IsTaskEnabled(mp *MaintenancePolicy, taskType MaintenanceTaskType) bool { + policy := GetTaskPolicy(mp, taskType) + if policy == nil { + return false + } return policy.Enabled } // GetMaxConcurrent returns the max concurrent limit for a task type -func (mp *MaintenancePolicy) GetMaxConcurrent(taskType MaintenanceTaskType) int { - policy := mp.GetTaskPolicy(taskType) - return policy.MaxConcurrent +func GetMaxConcurrent(mp *MaintenancePolicy, taskType MaintenanceTaskType) int { + policy := GetTaskPolicy(mp, taskType) + if policy == nil { + return 1 + } + return int(policy.MaxConcurrent) } // GetRepeatInterval returns the repeat interval for a task type -func (mp *MaintenancePolicy) GetRepeatInterval(taskType MaintenanceTaskType) int { - policy := mp.GetTaskPolicy(taskType) - return policy.RepeatInterval -} - -// GetTaskConfig returns a configuration value for a task type -func (mp *MaintenancePolicy) GetTaskConfig(taskType MaintenanceTaskType, key string) (interface{}, bool) { - policy := mp.GetTaskPolicy(taskType) - value, exists := policy.Configuration[key] - return value, exists -} - -// SetTaskConfig sets a configuration value for a task type -func (mp *MaintenancePolicy) SetTaskConfig(taskType MaintenanceTaskType, key string, value interface{}) { - policy := mp.GetTaskPolicy(taskType) - if policy.Configuration == nil { - policy.Configuration = make(map[string]interface{}) +func GetRepeatInterval(mp *MaintenancePolicy, taskType MaintenanceTaskType) int { + policy := GetTaskPolicy(mp, taskType) + if policy == nil { + return int(mp.DefaultRepeatIntervalSeconds) } - policy.Configuration[key] = value + return int(policy.RepeatIntervalSeconds) } +// GetVacuumTaskConfig returns the vacuum task configuration +func GetVacuumTaskConfig(mp *MaintenancePolicy, taskType MaintenanceTaskType) *worker_pb.VacuumTaskConfig { + policy := GetTaskPolicy(mp, taskType) + if policy == nil { + return nil + } + return policy.GetVacuumConfig() +} + +// GetErasureCodingTaskConfig returns the erasure coding task configuration +func GetErasureCodingTaskConfig(mp *MaintenancePolicy, taskType MaintenanceTaskType) *worker_pb.ErasureCodingTaskConfig { + policy := GetTaskPolicy(mp, taskType) + if policy == nil { + return nil + } + return policy.GetErasureCodingConfig() +} + +// GetBalanceTaskConfig returns the balance task configuration +func GetBalanceTaskConfig(mp *MaintenancePolicy, taskType MaintenanceTaskType) *worker_pb.BalanceTaskConfig { + policy := GetTaskPolicy(mp, taskType) + if policy == nil { + return nil + } + return policy.GetBalanceConfig() +} + +// GetReplicationTaskConfig returns the replication task configuration +func GetReplicationTaskConfig(mp *MaintenancePolicy, taskType MaintenanceTaskType) *worker_pb.ReplicationTaskConfig { + policy := GetTaskPolicy(mp, taskType) + if policy == nil { + return nil + } + return policy.GetReplicationConfig() +} + +// Note: GetTaskConfig was removed - use typed getters: GetVacuumTaskConfig, GetErasureCodingTaskConfig, GetBalanceTaskConfig, or 
GetReplicationTaskConfig + +// SetVacuumTaskConfig sets the vacuum task configuration +func SetVacuumTaskConfig(mp *MaintenancePolicy, taskType MaintenanceTaskType, config *worker_pb.VacuumTaskConfig) { + policy := GetTaskPolicy(mp, taskType) + if policy != nil { + policy.TaskConfig = &worker_pb.TaskPolicy_VacuumConfig{ + VacuumConfig: config, + } + } +} + +// SetErasureCodingTaskConfig sets the erasure coding task configuration +func SetErasureCodingTaskConfig(mp *MaintenancePolicy, taskType MaintenanceTaskType, config *worker_pb.ErasureCodingTaskConfig) { + policy := GetTaskPolicy(mp, taskType) + if policy != nil { + policy.TaskConfig = &worker_pb.TaskPolicy_ErasureCodingConfig{ + ErasureCodingConfig: config, + } + } +} + +// SetBalanceTaskConfig sets the balance task configuration +func SetBalanceTaskConfig(mp *MaintenancePolicy, taskType MaintenanceTaskType, config *worker_pb.BalanceTaskConfig) { + policy := GetTaskPolicy(mp, taskType) + if policy != nil { + policy.TaskConfig = &worker_pb.TaskPolicy_BalanceConfig{ + BalanceConfig: config, + } + } +} + +// SetReplicationTaskConfig sets the replication task configuration +func SetReplicationTaskConfig(mp *MaintenancePolicy, taskType MaintenanceTaskType, config *worker_pb.ReplicationTaskConfig) { + policy := GetTaskPolicy(mp, taskType) + if policy != nil { + policy.TaskConfig = &worker_pb.TaskPolicy_ReplicationConfig{ + ReplicationConfig: config, + } + } +} + +// SetTaskConfig sets a configuration value for a task type (legacy method - use typed setters above) +// Note: SetTaskConfig was removed - use typed setters: SetVacuumTaskConfig, SetErasureCodingTaskConfig, SetBalanceTaskConfig, or SetReplicationTaskConfig + // MaintenanceWorker represents a worker instance type MaintenanceWorker struct { ID string `json:"id"` @@ -217,29 +277,32 @@ type MaintenanceQueue struct { // MaintenanceScanner analyzes the cluster and generates maintenance tasks type MaintenanceScanner struct { - adminClient AdminClient - policy *MaintenancePolicy - queue *MaintenanceQueue - lastScan map[MaintenanceTaskType]time.Time - integration *MaintenanceIntegration + adminClient AdminClient + policy *MaintenancePolicy + queue *MaintenanceQueue + lastScan map[MaintenanceTaskType]time.Time + integration *MaintenanceIntegration + lastTopologyInfo *master_pb.TopologyInfo } // TaskDetectionResult represents the result of scanning for maintenance needs type TaskDetectionResult struct { - TaskType MaintenanceTaskType `json:"task_type"` - VolumeID uint32 `json:"volume_id,omitempty"` - Server string `json:"server,omitempty"` - Collection string `json:"collection,omitempty"` - Priority MaintenanceTaskPriority `json:"priority"` - Reason string `json:"reason"` - Parameters map[string]interface{} `json:"parameters,omitempty"` - ScheduleAt time.Time `json:"schedule_at"` + TaskType MaintenanceTaskType `json:"task_type"` + VolumeID uint32 `json:"volume_id,omitempty"` + Server string `json:"server,omitempty"` + Collection string `json:"collection,omitempty"` + Priority MaintenanceTaskPriority `json:"priority"` + Reason string `json:"reason"` + TypedParams *worker_pb.TaskParams `json:"typed_params,omitempty"` + ScheduleAt time.Time `json:"schedule_at"` } -// VolumeHealthMetrics contains health information about a volume +// VolumeHealthMetrics represents the health metrics for a volume type VolumeHealthMetrics struct { VolumeID uint32 `json:"volume_id"` Server string `json:"server"` + DiskType string `json:"disk_type"` // Disk type (e.g., "hdd", "ssd") or disk path (e.g., "/data1") + 
DiskId uint32 `json:"disk_id"` // ID of the disk in Store.Locations array Collection string `json:"collection"` Size uint64 `json:"size"` DeletedBytes uint64 `json:"deleted_bytes"` @@ -267,38 +330,6 @@ type MaintenanceStats struct { NextScanTime time.Time `json:"next_scan_time"` } -// MaintenanceConfig holds configuration for the maintenance system -type MaintenanceConfig struct { - Enabled bool `json:"enabled"` - ScanIntervalSeconds int `json:"scan_interval_seconds"` // How often to scan for maintenance needs (in seconds) - WorkerTimeoutSeconds int `json:"worker_timeout_seconds"` // Worker heartbeat timeout (in seconds) - TaskTimeoutSeconds int `json:"task_timeout_seconds"` // Individual task timeout (in seconds) - RetryDelaySeconds int `json:"retry_delay_seconds"` // Delay between retries (in seconds) - MaxRetries int `json:"max_retries"` // Default max retries for tasks - CleanupIntervalSeconds int `json:"cleanup_interval_seconds"` // How often to clean up old tasks (in seconds) - TaskRetentionSeconds int `json:"task_retention_seconds"` // How long to keep completed/failed tasks (in seconds) - Policy *MaintenancePolicy `json:"policy"` -} - -// Default configuration values -func DefaultMaintenanceConfig() *MaintenanceConfig { - return &MaintenanceConfig{ - Enabled: false, // Disabled by default for safety - ScanIntervalSeconds: 30 * 60, // 30 minutes - WorkerTimeoutSeconds: 5 * 60, // 5 minutes - TaskTimeoutSeconds: 2 * 60 * 60, // 2 hours - RetryDelaySeconds: 15 * 60, // 15 minutes - MaxRetries: 3, - CleanupIntervalSeconds: 24 * 60 * 60, // 24 hours - TaskRetentionSeconds: 7 * 24 * 60 * 60, // 7 days - Policy: &MaintenancePolicy{ - GlobalMaxConcurrent: 4, - DefaultRepeatInterval: 6, - DefaultCheckInterval: 12, - }, - } -} - // MaintenanceQueueData represents data for the queue visualization UI type MaintenanceQueueData struct { Tasks []*MaintenanceTask `json:"tasks"` @@ -380,10 +411,10 @@ type ClusterReplicationTask struct { // from all registered tasks using their UI providers func BuildMaintenancePolicyFromTasks() *MaintenancePolicy { policy := &MaintenancePolicy{ - TaskPolicies: make(map[MaintenanceTaskType]*TaskPolicy), - GlobalMaxConcurrent: 4, - DefaultRepeatInterval: 6, - DefaultCheckInterval: 12, + TaskPolicies: make(map[string]*TaskPolicy), + GlobalMaxConcurrent: 4, + DefaultRepeatIntervalSeconds: 6 * 3600, // 6 hours in seconds + DefaultCheckIntervalSeconds: 12 * 3600, // 12 hours in seconds } // Get all registered task types from the UI registry @@ -399,32 +430,23 @@ func BuildMaintenancePolicyFromTasks() *MaintenancePolicy { // Create task policy from UI configuration taskPolicy := &TaskPolicy{ - Enabled: true, // Default enabled - MaxConcurrent: 2, // Default concurrency - RepeatInterval: policy.DefaultRepeatInterval, - CheckInterval: policy.DefaultCheckInterval, - Configuration: make(map[string]interface{}), + Enabled: true, // Default enabled + MaxConcurrent: 2, // Default concurrency + RepeatIntervalSeconds: policy.DefaultRepeatIntervalSeconds, + CheckIntervalSeconds: policy.DefaultCheckIntervalSeconds, } - // Extract configuration from UI provider's config - if configMap, ok := defaultConfig.(map[string]interface{}); ok { - // Copy all configuration values - for key, value := range configMap { - taskPolicy.Configuration[key] = value + // Extract configuration using TaskConfig interface - no more map conversions! 
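+		// For illustration only: a task's UI configuration type (e.g. a hypothetical
+		// vacuumConfig with its own Enabled/MaxConcurrent fields) can satisfy the
+		// interface assertion below by returning its protobuf form, roughly:
+		//
+		//	func (c *vacuumConfig) ToTaskPolicy() *worker_pb.TaskPolicy {
+		//		return &worker_pb.TaskPolicy{
+		//			Enabled:               c.Enabled,
+		//			MaxConcurrent:         int32(c.MaxConcurrent),
+		//			RepeatIntervalSeconds: int32(c.RepeatIntervalSeconds),
+		//			CheckIntervalSeconds:  int32(c.CheckIntervalSeconds),
+		//		}
+		//	}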
+ if taskConfig, ok := defaultConfig.(interface{ ToTaskPolicy() *worker_pb.TaskPolicy }); ok { + // Use protobuf directly for clean, type-safe config extraction + pbTaskPolicy := taskConfig.ToTaskPolicy() + taskPolicy.Enabled = pbTaskPolicy.Enabled + taskPolicy.MaxConcurrent = pbTaskPolicy.MaxConcurrent + if pbTaskPolicy.RepeatIntervalSeconds > 0 { + taskPolicy.RepeatIntervalSeconds = pbTaskPolicy.RepeatIntervalSeconds } - - // Extract common fields - if enabled, exists := configMap["enabled"]; exists { - if enabledBool, ok := enabled.(bool); ok { - taskPolicy.Enabled = enabledBool - } - } - if maxConcurrent, exists := configMap["max_concurrent"]; exists { - if maxConcurrentInt, ok := maxConcurrent.(int); ok { - taskPolicy.MaxConcurrent = maxConcurrentInt - } else if maxConcurrentFloat, ok := maxConcurrent.(float64); ok { - taskPolicy.MaxConcurrent = int(maxConcurrentFloat) - } + if pbTaskPolicy.CheckIntervalSeconds > 0 { + taskPolicy.CheckIntervalSeconds = pbTaskPolicy.CheckIntervalSeconds } } @@ -432,24 +454,24 @@ func BuildMaintenancePolicyFromTasks() *MaintenancePolicy { var scheduler types.TaskScheduler = typesRegistry.GetScheduler(taskType) if scheduler != nil { if taskPolicy.MaxConcurrent <= 0 { - taskPolicy.MaxConcurrent = scheduler.GetMaxConcurrent() + taskPolicy.MaxConcurrent = int32(scheduler.GetMaxConcurrent()) } - // Convert default repeat interval to hours + // Convert default repeat interval to seconds if repeatInterval := scheduler.GetDefaultRepeatInterval(); repeatInterval > 0 { - taskPolicy.RepeatInterval = int(repeatInterval.Hours()) + taskPolicy.RepeatIntervalSeconds = int32(repeatInterval.Seconds()) } } // Also get defaults from detector if available (using types.TaskDetector explicitly) var detector types.TaskDetector = typesRegistry.GetDetector(taskType) if detector != nil { - // Convert scan interval to check interval (hours) + // Convert scan interval to check interval (seconds) if scanInterval := detector.ScanInterval(); scanInterval > 0 { - taskPolicy.CheckInterval = int(scanInterval.Hours()) + taskPolicy.CheckIntervalSeconds = int32(scanInterval.Seconds()) } } - policy.TaskPolicies[maintenanceTaskType] = taskPolicy + policy.TaskPolicies[string(maintenanceTaskType)] = taskPolicy glog.V(3).Infof("Built policy for task type %s: enabled=%v, max_concurrent=%d", maintenanceTaskType, taskPolicy.Enabled, taskPolicy.MaxConcurrent) } @@ -558,3 +580,8 @@ func BuildMaintenanceMenuItems() []*MaintenanceMenuItem { return menuItems } + +// Helper functions to extract configuration fields + +// Note: Removed getVacuumConfigField, getErasureCodingConfigField, getBalanceConfigField, getReplicationConfigField +// These were orphaned after removing GetTaskConfig - use typed getters instead diff --git a/weed/admin/maintenance/maintenance_worker.go b/weed/admin/maintenance/maintenance_worker.go index ab2157f24..96e17f9e9 100644 --- a/weed/admin/maintenance/maintenance_worker.go +++ b/weed/admin/maintenance/maintenance_worker.go @@ -7,6 +7,7 @@ import ( "time" "github.com/seaweedfs/seaweedfs/weed/glog" + "github.com/seaweedfs/seaweedfs/weed/worker" "github.com/seaweedfs/seaweedfs/weed/worker/tasks" "github.com/seaweedfs/seaweedfs/weed/worker/types" @@ -145,15 +146,20 @@ func NewMaintenanceWorkerService(workerID, address, adminServer string) *Mainten func (mws *MaintenanceWorkerService) executeGenericTask(task *MaintenanceTask) error { glog.V(2).Infof("Executing generic task %s: %s for volume %d", task.ID, task.Type, task.VolumeID) + // Validate that task has proper typed parameters + 
if task.TypedParams == nil { + return fmt.Errorf("task %s has no typed parameters - task was not properly planned (insufficient destinations)", task.ID) + } + // Convert MaintenanceTask to types.TaskType taskType := types.TaskType(string(task.Type)) // Create task parameters taskParams := types.TaskParams{ - VolumeID: task.VolumeID, - Server: task.Server, - Collection: task.Collection, - Parameters: task.Parameters, + VolumeID: task.VolumeID, + Server: task.Server, + Collection: task.Collection, + TypedParams: task.TypedParams, } // Create task instance using the registry @@ -396,10 +402,19 @@ func NewMaintenanceWorkerCommand(workerID, address, adminServer string) *Mainten // Run starts the maintenance worker as a standalone service func (mwc *MaintenanceWorkerCommand) Run() error { - // Generate worker ID if not provided + // Generate or load persistent worker ID if not provided if mwc.workerService.workerID == "" { - hostname, _ := os.Hostname() - mwc.workerService.workerID = fmt.Sprintf("worker-%s-%d", hostname, time.Now().Unix()) + // Get current working directory for worker ID persistence + wd, err := os.Getwd() + if err != nil { + return fmt.Errorf("failed to get working directory: %w", err) + } + + workerID, err := worker.GenerateOrLoadWorkerID(wd) + if err != nil { + return fmt.Errorf("failed to generate or load worker ID: %w", err) + } + mwc.workerService.workerID = workerID } // Start the worker service diff --git a/weed/admin/maintenance/pending_operations.go b/weed/admin/maintenance/pending_operations.go new file mode 100644 index 000000000..16130b4c9 --- /dev/null +++ b/weed/admin/maintenance/pending_operations.go @@ -0,0 +1,311 @@ +package maintenance + +import ( + "sync" + "time" + + "github.com/seaweedfs/seaweedfs/weed/glog" + "github.com/seaweedfs/seaweedfs/weed/worker/types" +) + +// PendingOperationType represents the type of pending operation +type PendingOperationType string + +const ( + OpTypeVolumeMove PendingOperationType = "volume_move" + OpTypeVolumeBalance PendingOperationType = "volume_balance" + OpTypeErasureCoding PendingOperationType = "erasure_coding" + OpTypeVacuum PendingOperationType = "vacuum" + OpTypeReplication PendingOperationType = "replication" +) + +// PendingOperation represents a pending volume/shard operation +type PendingOperation struct { + VolumeID uint32 `json:"volume_id"` + OperationType PendingOperationType `json:"operation_type"` + SourceNode string `json:"source_node"` + DestNode string `json:"dest_node,omitempty"` // Empty for non-movement operations + TaskID string `json:"task_id"` + StartTime time.Time `json:"start_time"` + EstimatedSize uint64 `json:"estimated_size"` // Bytes + Collection string `json:"collection"` + Status string `json:"status"` // "assigned", "in_progress", "completing" +} + +// PendingOperations tracks all pending volume/shard operations +type PendingOperations struct { + // Operations by volume ID for conflict detection + byVolumeID map[uint32]*PendingOperation + + // Operations by task ID for updates + byTaskID map[string]*PendingOperation + + // Operations by node for capacity calculations + bySourceNode map[string][]*PendingOperation + byDestNode map[string][]*PendingOperation + + mutex sync.RWMutex +} + +// NewPendingOperations creates a new pending operations tracker +func NewPendingOperations() *PendingOperations { + return &PendingOperations{ + byVolumeID: make(map[uint32]*PendingOperation), + byTaskID: make(map[string]*PendingOperation), + bySourceNode: make(map[string][]*PendingOperation), + byDestNode: 
make(map[string][]*PendingOperation),
+	}
+}
+
+// AddOperation adds a pending operation
+func (po *PendingOperations) AddOperation(op *PendingOperation) {
+	po.mutex.Lock()
+	defer po.mutex.Unlock()
+
+	// Check for existing operation on this volume
+	if existing, exists := po.byVolumeID[op.VolumeID]; exists {
+		glog.V(1).Infof("Replacing existing pending operation on volume %d: %s -> %s",
+			op.VolumeID, existing.TaskID, op.TaskID)
+		po.removeOperationUnlocked(existing)
+	}
+
+	// Add new operation
+	po.byVolumeID[op.VolumeID] = op
+	po.byTaskID[op.TaskID] = op
+
+	// Add to node indexes
+	po.bySourceNode[op.SourceNode] = append(po.bySourceNode[op.SourceNode], op)
+	if op.DestNode != "" {
+		po.byDestNode[op.DestNode] = append(po.byDestNode[op.DestNode], op)
+	}
+
+	glog.V(2).Infof("Added pending operation: volume %d, type %s, task %s, %s -> %s",
+		op.VolumeID, op.OperationType, op.TaskID, op.SourceNode, op.DestNode)
+}
+
+// RemoveOperation removes a completed operation
+func (po *PendingOperations) RemoveOperation(taskID string) {
+	po.mutex.Lock()
+	defer po.mutex.Unlock()
+
+	if op, exists := po.byTaskID[taskID]; exists {
+		po.removeOperationUnlocked(op)
+		glog.V(2).Infof("Removed completed operation: volume %d, task %s", op.VolumeID, taskID)
+	}
+}
+
+// removeOperationUnlocked removes an operation (must hold lock)
+func (po *PendingOperations) removeOperationUnlocked(op *PendingOperation) {
+	delete(po.byVolumeID, op.VolumeID)
+	delete(po.byTaskID, op.TaskID)
+
+	// Remove from source node list
+	if ops, exists := po.bySourceNode[op.SourceNode]; exists {
+		for i, other := range ops {
+			if other.TaskID == op.TaskID {
+				po.bySourceNode[op.SourceNode] = append(ops[:i], ops[i+1:]...)
+				break
+			}
+		}
+	}
+
+	// Remove from dest node list
+	if op.DestNode != "" {
+		if ops, exists := po.byDestNode[op.DestNode]; exists {
+			for i, other := range ops {
+				if other.TaskID == op.TaskID {
+					po.byDestNode[op.DestNode] = append(ops[:i], ops[i+1:]...)
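+					// Entry i has been removed in place; no need to scan further for this task ID.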
+ break + } + } + } + } +} + +// HasPendingOperationOnVolume checks if a volume has a pending operation +func (po *PendingOperations) HasPendingOperationOnVolume(volumeID uint32) bool { + po.mutex.RLock() + defer po.mutex.RUnlock() + + _, exists := po.byVolumeID[volumeID] + return exists +} + +// GetPendingOperationOnVolume returns the pending operation on a volume +func (po *PendingOperations) GetPendingOperationOnVolume(volumeID uint32) *PendingOperation { + po.mutex.RLock() + defer po.mutex.RUnlock() + + return po.byVolumeID[volumeID] +} + +// WouldConflictWithPending checks if a new operation would conflict with pending ones +func (po *PendingOperations) WouldConflictWithPending(volumeID uint32, opType PendingOperationType) bool { + po.mutex.RLock() + defer po.mutex.RUnlock() + + if existing, exists := po.byVolumeID[volumeID]; exists { + // Volume already has a pending operation + glog.V(3).Infof("Volume %d conflict: already has %s operation (task %s)", + volumeID, existing.OperationType, existing.TaskID) + return true + } + + return false +} + +// GetPendingCapacityImpactForNode calculates pending capacity changes for a node +func (po *PendingOperations) GetPendingCapacityImpactForNode(nodeID string) (incoming uint64, outgoing uint64) { + po.mutex.RLock() + defer po.mutex.RUnlock() + + // Calculate outgoing capacity (volumes leaving this node) + if ops, exists := po.bySourceNode[nodeID]; exists { + for _, op := range ops { + // Only count movement operations + if op.DestNode != "" { + outgoing += op.EstimatedSize + } + } + } + + // Calculate incoming capacity (volumes coming to this node) + if ops, exists := po.byDestNode[nodeID]; exists { + for _, op := range ops { + incoming += op.EstimatedSize + } + } + + return incoming, outgoing +} + +// FilterVolumeMetricsExcludingPending filters out volumes with pending operations +func (po *PendingOperations) FilterVolumeMetricsExcludingPending(metrics []*types.VolumeHealthMetrics) []*types.VolumeHealthMetrics { + po.mutex.RLock() + defer po.mutex.RUnlock() + + var filtered []*types.VolumeHealthMetrics + excludedCount := 0 + + for _, metric := range metrics { + if _, hasPending := po.byVolumeID[metric.VolumeID]; !hasPending { + filtered = append(filtered, metric) + } else { + excludedCount++ + glog.V(3).Infof("Excluding volume %d from scan due to pending operation", metric.VolumeID) + } + } + + if excludedCount > 0 { + glog.V(1).Infof("Filtered out %d volumes with pending operations from %d total volumes", + excludedCount, len(metrics)) + } + + return filtered +} + +// GetNodeCapacityProjection calculates projected capacity for a node +func (po *PendingOperations) GetNodeCapacityProjection(nodeID string, currentUsed uint64, totalCapacity uint64) NodeCapacityProjection { + incoming, outgoing := po.GetPendingCapacityImpactForNode(nodeID) + + projectedUsed := currentUsed + incoming - outgoing + projectedFree := totalCapacity - projectedUsed + + return NodeCapacityProjection{ + NodeID: nodeID, + CurrentUsed: currentUsed, + TotalCapacity: totalCapacity, + PendingIncoming: incoming, + PendingOutgoing: outgoing, + ProjectedUsed: projectedUsed, + ProjectedFree: projectedFree, + } +} + +// GetAllPendingOperations returns all pending operations +func (po *PendingOperations) GetAllPendingOperations() []*PendingOperation { + po.mutex.RLock() + defer po.mutex.RUnlock() + + var operations []*PendingOperation + for _, op := range po.byVolumeID { + operations = append(operations, op) + } + + return operations +} + +// UpdateOperationStatus updates the status 
of a pending operation +func (po *PendingOperations) UpdateOperationStatus(taskID string, status string) { + po.mutex.Lock() + defer po.mutex.Unlock() + + if op, exists := po.byTaskID[taskID]; exists { + op.Status = status + glog.V(3).Infof("Updated operation status: task %s, volume %d -> %s", taskID, op.VolumeID, status) + } +} + +// CleanupStaleOperations removes operations that have been running too long +func (po *PendingOperations) CleanupStaleOperations(maxAge time.Duration) int { + po.mutex.Lock() + defer po.mutex.Unlock() + + cutoff := time.Now().Add(-maxAge) + var staleOps []*PendingOperation + + for _, op := range po.byVolumeID { + if op.StartTime.Before(cutoff) { + staleOps = append(staleOps, op) + } + } + + for _, op := range staleOps { + po.removeOperationUnlocked(op) + glog.Warningf("Removed stale pending operation: volume %d, task %s, age %v", + op.VolumeID, op.TaskID, time.Since(op.StartTime)) + } + + return len(staleOps) +} + +// NodeCapacityProjection represents projected capacity for a node +type NodeCapacityProjection struct { + NodeID string `json:"node_id"` + CurrentUsed uint64 `json:"current_used"` + TotalCapacity uint64 `json:"total_capacity"` + PendingIncoming uint64 `json:"pending_incoming"` + PendingOutgoing uint64 `json:"pending_outgoing"` + ProjectedUsed uint64 `json:"projected_used"` + ProjectedFree uint64 `json:"projected_free"` +} + +// GetStats returns statistics about pending operations +func (po *PendingOperations) GetStats() PendingOperationsStats { + po.mutex.RLock() + defer po.mutex.RUnlock() + + stats := PendingOperationsStats{ + TotalOperations: len(po.byVolumeID), + ByType: make(map[PendingOperationType]int), + ByStatus: make(map[string]int), + } + + var totalSize uint64 + for _, op := range po.byVolumeID { + stats.ByType[op.OperationType]++ + stats.ByStatus[op.Status]++ + totalSize += op.EstimatedSize + } + + stats.TotalEstimatedSize = totalSize + return stats +} + +// PendingOperationsStats provides statistics about pending operations +type PendingOperationsStats struct { + TotalOperations int `json:"total_operations"` + ByType map[PendingOperationType]int `json:"by_type"` + ByStatus map[string]int `json:"by_status"` + TotalEstimatedSize uint64 `json:"total_estimated_size"` +} diff --git a/weed/admin/maintenance/pending_operations_test.go b/weed/admin/maintenance/pending_operations_test.go new file mode 100644 index 000000000..64bb591fb --- /dev/null +++ b/weed/admin/maintenance/pending_operations_test.go @@ -0,0 +1,250 @@ +package maintenance + +import ( + "testing" + "time" + + "github.com/seaweedfs/seaweedfs/weed/worker/types" +) + +func TestPendingOperations_ConflictDetection(t *testing.T) { + pendingOps := NewPendingOperations() + + // Add a pending erasure coding operation on volume 123 + op := &PendingOperation{ + VolumeID: 123, + OperationType: OpTypeErasureCoding, + SourceNode: "node1", + TaskID: "task-001", + StartTime: time.Now(), + EstimatedSize: 1024 * 1024 * 1024, // 1GB + Collection: "test", + Status: "assigned", + } + + pendingOps.AddOperation(op) + + // Test conflict detection + if !pendingOps.HasPendingOperationOnVolume(123) { + t.Errorf("Expected volume 123 to have pending operation") + } + + if !pendingOps.WouldConflictWithPending(123, OpTypeVacuum) { + t.Errorf("Expected conflict when trying to add vacuum operation on volume 123") + } + + if pendingOps.HasPendingOperationOnVolume(124) { + t.Errorf("Expected volume 124 to have no pending operation") + } + + if pendingOps.WouldConflictWithPending(124, OpTypeVacuum) { + 
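+		// Reaching this branch would be wrong: conflicts are keyed purely by volume ID,
+		// and volume 124 has no pending operation, regardless of the requested type.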
t.Errorf("Expected no conflict for volume 124") + } +} + +func TestPendingOperations_CapacityProjection(t *testing.T) { + pendingOps := NewPendingOperations() + + // Add operation moving volume from node1 to node2 + op1 := &PendingOperation{ + VolumeID: 100, + OperationType: OpTypeVolumeMove, + SourceNode: "node1", + DestNode: "node2", + TaskID: "task-001", + StartTime: time.Now(), + EstimatedSize: 2 * 1024 * 1024 * 1024, // 2GB + Collection: "test", + Status: "in_progress", + } + + // Add operation moving volume from node3 to node1 + op2 := &PendingOperation{ + VolumeID: 101, + OperationType: OpTypeVolumeMove, + SourceNode: "node3", + DestNode: "node1", + TaskID: "task-002", + StartTime: time.Now(), + EstimatedSize: 1 * 1024 * 1024 * 1024, // 1GB + Collection: "test", + Status: "assigned", + } + + pendingOps.AddOperation(op1) + pendingOps.AddOperation(op2) + + // Test capacity impact for node1 + incoming, outgoing := pendingOps.GetPendingCapacityImpactForNode("node1") + expectedIncoming := uint64(1 * 1024 * 1024 * 1024) // 1GB incoming + expectedOutgoing := uint64(2 * 1024 * 1024 * 1024) // 2GB outgoing + + if incoming != expectedIncoming { + t.Errorf("Expected incoming capacity %d, got %d", expectedIncoming, incoming) + } + + if outgoing != expectedOutgoing { + t.Errorf("Expected outgoing capacity %d, got %d", expectedOutgoing, outgoing) + } + + // Test projection for node1 + currentUsed := uint64(10 * 1024 * 1024 * 1024) // 10GB current + totalCapacity := uint64(50 * 1024 * 1024 * 1024) // 50GB total + + projection := pendingOps.GetNodeCapacityProjection("node1", currentUsed, totalCapacity) + + expectedProjectedUsed := currentUsed + incoming - outgoing // 10 + 1 - 2 = 9GB + expectedProjectedFree := totalCapacity - expectedProjectedUsed // 50 - 9 = 41GB + + if projection.ProjectedUsed != expectedProjectedUsed { + t.Errorf("Expected projected used %d, got %d", expectedProjectedUsed, projection.ProjectedUsed) + } + + if projection.ProjectedFree != expectedProjectedFree { + t.Errorf("Expected projected free %d, got %d", expectedProjectedFree, projection.ProjectedFree) + } +} + +func TestPendingOperations_VolumeFiltering(t *testing.T) { + pendingOps := NewPendingOperations() + + // Create volume metrics + metrics := []*types.VolumeHealthMetrics{ + {VolumeID: 100, Server: "node1"}, + {VolumeID: 101, Server: "node2"}, + {VolumeID: 102, Server: "node3"}, + {VolumeID: 103, Server: "node1"}, + } + + // Add pending operations on volumes 101 and 103 + op1 := &PendingOperation{ + VolumeID: 101, + OperationType: OpTypeVacuum, + SourceNode: "node2", + TaskID: "task-001", + StartTime: time.Now(), + EstimatedSize: 1024 * 1024 * 1024, + Status: "in_progress", + } + + op2 := &PendingOperation{ + VolumeID: 103, + OperationType: OpTypeErasureCoding, + SourceNode: "node1", + TaskID: "task-002", + StartTime: time.Now(), + EstimatedSize: 2 * 1024 * 1024 * 1024, + Status: "assigned", + } + + pendingOps.AddOperation(op1) + pendingOps.AddOperation(op2) + + // Filter metrics + filtered := pendingOps.FilterVolumeMetricsExcludingPending(metrics) + + // Should only have volumes 100 and 102 (101 and 103 are filtered out) + if len(filtered) != 2 { + t.Errorf("Expected 2 filtered metrics, got %d", len(filtered)) + } + + // Check that correct volumes remain + foundVolumes := make(map[uint32]bool) + for _, metric := range filtered { + foundVolumes[metric.VolumeID] = true + } + + if !foundVolumes[100] || !foundVolumes[102] { + t.Errorf("Expected volumes 100 and 102 to remain after filtering") + } + + if foundVolumes[101] 
|| foundVolumes[103] { + t.Errorf("Expected volumes 101 and 103 to be filtered out") + } +} + +func TestPendingOperations_OperationLifecycle(t *testing.T) { + pendingOps := NewPendingOperations() + + // Add operation + op := &PendingOperation{ + VolumeID: 200, + OperationType: OpTypeVolumeBalance, + SourceNode: "node1", + DestNode: "node2", + TaskID: "task-balance-001", + StartTime: time.Now(), + EstimatedSize: 1024 * 1024 * 1024, + Status: "assigned", + } + + pendingOps.AddOperation(op) + + // Check it exists + if !pendingOps.HasPendingOperationOnVolume(200) { + t.Errorf("Expected volume 200 to have pending operation") + } + + // Update status + pendingOps.UpdateOperationStatus("task-balance-001", "in_progress") + + retrievedOp := pendingOps.GetPendingOperationOnVolume(200) + if retrievedOp == nil { + t.Errorf("Expected to retrieve pending operation for volume 200") + } else if retrievedOp.Status != "in_progress" { + t.Errorf("Expected operation status to be 'in_progress', got '%s'", retrievedOp.Status) + } + + // Complete operation + pendingOps.RemoveOperation("task-balance-001") + + if pendingOps.HasPendingOperationOnVolume(200) { + t.Errorf("Expected volume 200 to have no pending operation after removal") + } +} + +func TestPendingOperations_StaleCleanup(t *testing.T) { + pendingOps := NewPendingOperations() + + // Add recent operation + recentOp := &PendingOperation{ + VolumeID: 300, + OperationType: OpTypeVacuum, + SourceNode: "node1", + TaskID: "task-recent", + StartTime: time.Now(), + EstimatedSize: 1024 * 1024 * 1024, + Status: "in_progress", + } + + // Add stale operation (24 hours ago) + staleOp := &PendingOperation{ + VolumeID: 301, + OperationType: OpTypeErasureCoding, + SourceNode: "node2", + TaskID: "task-stale", + StartTime: time.Now().Add(-24 * time.Hour), + EstimatedSize: 2 * 1024 * 1024 * 1024, + Status: "in_progress", + } + + pendingOps.AddOperation(recentOp) + pendingOps.AddOperation(staleOp) + + // Clean up operations older than 1 hour + removedCount := pendingOps.CleanupStaleOperations(1 * time.Hour) + + if removedCount != 1 { + t.Errorf("Expected to remove 1 stale operation, removed %d", removedCount) + } + + // Recent operation should still exist + if !pendingOps.HasPendingOperationOnVolume(300) { + t.Errorf("Expected recent operation on volume 300 to still exist") + } + + // Stale operation should be removed + if pendingOps.HasPendingOperationOnVolume(301) { + t.Errorf("Expected stale operation on volume 301 to be removed") + } +} diff --git a/weed/admin/static/css/admin.css b/weed/admin/static/css/admin.css index c69876060..a945d320e 100644 --- a/weed/admin/static/css/admin.css +++ b/weed/admin/static/css/admin.css @@ -9,6 +9,7 @@ z-index: 100; padding: 48px 0 0; box-shadow: inset -1px 0 0 rgba(0, 0, 0, .1); + overflow-y: auto; } .sidebar-heading { diff --git a/weed/admin/topology/active_topology.go b/weed/admin/topology/active_topology.go new file mode 100644 index 000000000..9ce63bfa7 --- /dev/null +++ b/weed/admin/topology/active_topology.go @@ -0,0 +1,741 @@ +package topology + +import ( + "fmt" + "sync" + "time" + + "github.com/seaweedfs/seaweedfs/weed/glog" + "github.com/seaweedfs/seaweedfs/weed/pb/master_pb" +) + +// TaskType represents different types of maintenance operations +type TaskType string + +// TaskStatus represents the current status of a task +type TaskStatus string + +// Common task type constants +const ( + TaskTypeVacuum TaskType = "vacuum" + TaskTypeBalance TaskType = "balance" + TaskTypeErasureCoding TaskType = "erasure_coding" + 
	TaskTypeReplication   TaskType = "replication"
+)
+
+// Common task status constants
+const (
+	TaskStatusPending    TaskStatus = "pending"
+	TaskStatusInProgress TaskStatus = "in_progress"
+	TaskStatusCompleted  TaskStatus = "completed"
+)
+
+// taskState represents the current state of tasks affecting the topology (internal)
+type taskState struct {
+	VolumeID     uint32     `json:"volume_id"`
+	TaskType     TaskType   `json:"task_type"`
+	SourceServer string     `json:"source_server"`
+	SourceDisk   uint32     `json:"source_disk"`
+	TargetServer string     `json:"target_server,omitempty"`
+	TargetDisk   uint32     `json:"target_disk,omitempty"`
+	Status       TaskStatus `json:"status"`
+	StartedAt    time.Time  `json:"started_at"`
+	CompletedAt  time.Time  `json:"completed_at,omitempty"`
+}
+
+// DiskInfo represents a disk with its current state and ongoing tasks (public for external access)
+type DiskInfo struct {
+	NodeID     string              `json:"node_id"`
+	DiskID     uint32              `json:"disk_id"`
+	DiskType   string              `json:"disk_type"`
+	DataCenter string              `json:"data_center"`
+	Rack       string              `json:"rack"`
+	DiskInfo   *master_pb.DiskInfo `json:"disk_info"`
+	LoadCount  int                 `json:"load_count"` // Number of active tasks
+}
+
+// activeDisk represents internal disk state (private)
+type activeDisk struct {
+	*DiskInfo
+	pendingTasks  []*taskState
+	assignedTasks []*taskState
+	recentTasks   []*taskState // Completed in last N seconds
+}
+
+// activeNode represents a node with its disks (private)
+type activeNode struct {
+	nodeID     string
+	dataCenter string
+	rack       string
+	nodeInfo   *master_pb.DataNodeInfo
+	disks      map[uint32]*activeDisk // DiskID -> activeDisk
+}
+
+// ActiveTopology provides a real-time view of cluster state with task awareness
+type ActiveTopology struct {
+	// Core topology from master
+	topologyInfo *master_pb.TopologyInfo
+	lastUpdated  time.Time
+
+	// Structured topology for easy access (private)
+	nodes map[string]*activeNode // NodeID -> activeNode
+	disks map[string]*activeDisk // "NodeID:DiskID" -> activeDisk
+
+	// Task states affecting the topology (private)
+	pendingTasks  map[string]*taskState
+	assignedTasks map[string]*taskState
+	recentTasks   map[string]*taskState
+
+	// Configuration
+	recentTaskWindowSeconds int
+
+	// Synchronization
+	mutex sync.RWMutex
+}
+
+// NewActiveTopology creates a new ActiveTopology instance
+func NewActiveTopology(recentTaskWindowSeconds int) *ActiveTopology {
+	if recentTaskWindowSeconds <= 0 {
+		recentTaskWindowSeconds = 10 // Default 10 seconds
+	}
+
+	return &ActiveTopology{
+		nodes:                   make(map[string]*activeNode),
+		disks:                   make(map[string]*activeDisk),
+		pendingTasks:            make(map[string]*taskState),
+		assignedTasks:           make(map[string]*taskState),
+		recentTasks:             make(map[string]*taskState),
+		recentTaskWindowSeconds: recentTaskWindowSeconds,
+	}
+}
+
+// UpdateTopology updates the topology information from master
+func (at *ActiveTopology) UpdateTopology(topologyInfo *master_pb.TopologyInfo) error {
+	at.mutex.Lock()
+	defer at.mutex.Unlock()
+
+	at.topologyInfo = topologyInfo
+	at.lastUpdated = time.Now()
+
+	// Rebuild structured topology
+	at.nodes = make(map[string]*activeNode)
+	at.disks = make(map[string]*activeDisk)
+
+	for _, dc := range topologyInfo.DataCenterInfos {
+		for _, rack := range dc.RackInfos {
+			for _, nodeInfo := range rack.DataNodeInfos {
+				node := &activeNode{
+					nodeID:     nodeInfo.Id,
+					dataCenter: dc.Id,
+					rack:       rack.Id,
+					nodeInfo:   nodeInfo,
+					disks:      make(map[uint32]*activeDisk),
+				}
+
+				// Add disks for this node
+				for diskType, diskInfo := range nodeInfo.DiskInfos {
+					disk :=
&activeDisk{ + DiskInfo: &DiskInfo{ + NodeID: nodeInfo.Id, + DiskID: diskInfo.DiskId, + DiskType: diskType, + DataCenter: dc.Id, + Rack: rack.Id, + DiskInfo: diskInfo, + }, + } + + diskKey := fmt.Sprintf("%s:%d", nodeInfo.Id, diskInfo.DiskId) + node.disks[diskInfo.DiskId] = disk + at.disks[diskKey] = disk + } + + at.nodes[nodeInfo.Id] = node + } + } + } + + // Reassign task states to updated topology + at.reassignTaskStates() + + glog.V(1).Infof("ActiveTopology updated: %d nodes, %d disks", len(at.nodes), len(at.disks)) + return nil +} + +// AddPendingTask adds a pending task to the topology +func (at *ActiveTopology) AddPendingTask(taskID string, taskType TaskType, volumeID uint32, + sourceServer string, sourceDisk uint32, targetServer string, targetDisk uint32) { + at.mutex.Lock() + defer at.mutex.Unlock() + + task := &taskState{ + VolumeID: volumeID, + TaskType: taskType, + SourceServer: sourceServer, + SourceDisk: sourceDisk, + TargetServer: targetServer, + TargetDisk: targetDisk, + Status: TaskStatusPending, + StartedAt: time.Now(), + } + + at.pendingTasks[taskID] = task + at.assignTaskToDisk(task) +} + +// AssignTask moves a task from pending to assigned +func (at *ActiveTopology) AssignTask(taskID string) error { + at.mutex.Lock() + defer at.mutex.Unlock() + + task, exists := at.pendingTasks[taskID] + if !exists { + return fmt.Errorf("pending task %s not found", taskID) + } + + delete(at.pendingTasks, taskID) + task.Status = TaskStatusInProgress + at.assignedTasks[taskID] = task + at.reassignTaskStates() + + return nil +} + +// CompleteTask moves a task from assigned to recent +func (at *ActiveTopology) CompleteTask(taskID string) error { + at.mutex.Lock() + defer at.mutex.Unlock() + + task, exists := at.assignedTasks[taskID] + if !exists { + return fmt.Errorf("assigned task %s not found", taskID) + } + + delete(at.assignedTasks, taskID) + task.Status = TaskStatusCompleted + task.CompletedAt = time.Now() + at.recentTasks[taskID] = task + at.reassignTaskStates() + + // Clean up old recent tasks + at.cleanupRecentTasks() + + return nil +} + +// GetAvailableDisks returns disks that can accept new tasks of the given type +func (at *ActiveTopology) GetAvailableDisks(taskType TaskType, excludeNodeID string) []*DiskInfo { + at.mutex.RLock() + defer at.mutex.RUnlock() + + var available []*DiskInfo + + for _, disk := range at.disks { + if disk.NodeID == excludeNodeID { + continue // Skip excluded node + } + + if at.isDiskAvailable(disk, taskType) { + // Create a copy with current load count + diskCopy := *disk.DiskInfo + diskCopy.LoadCount = len(disk.pendingTasks) + len(disk.assignedTasks) + available = append(available, &diskCopy) + } + } + + return available +} + +// GetDiskLoad returns the current load on a disk (number of active tasks) +func (at *ActiveTopology) GetDiskLoad(nodeID string, diskID uint32) int { + at.mutex.RLock() + defer at.mutex.RUnlock() + + diskKey := fmt.Sprintf("%s:%d", nodeID, diskID) + disk, exists := at.disks[diskKey] + if !exists { + return 0 + } + + return len(disk.pendingTasks) + len(disk.assignedTasks) +} + +// HasRecentTaskForVolume checks if a volume had a recent task (to avoid immediate re-detection) +func (at *ActiveTopology) HasRecentTaskForVolume(volumeID uint32, taskType TaskType) bool { + at.mutex.RLock() + defer at.mutex.RUnlock() + + for _, task := range at.recentTasks { + if task.VolumeID == volumeID && task.TaskType == taskType { + return true + } + } + + return false +} + +// GetAllNodes returns information about all nodes (public interface) +func 
(at *ActiveTopology) GetAllNodes() map[string]*master_pb.DataNodeInfo { + at.mutex.RLock() + defer at.mutex.RUnlock() + + result := make(map[string]*master_pb.DataNodeInfo) + for nodeID, node := range at.nodes { + result[nodeID] = node.nodeInfo + } + return result +} + +// GetTopologyInfo returns the current topology information (read-only access) +func (at *ActiveTopology) GetTopologyInfo() *master_pb.TopologyInfo { + at.mutex.RLock() + defer at.mutex.RUnlock() + return at.topologyInfo +} + +// GetNodeDisks returns all disks for a specific node +func (at *ActiveTopology) GetNodeDisks(nodeID string) []*DiskInfo { + at.mutex.RLock() + defer at.mutex.RUnlock() + + node, exists := at.nodes[nodeID] + if !exists { + return nil + } + + var disks []*DiskInfo + for _, disk := range node.disks { + diskCopy := *disk.DiskInfo + diskCopy.LoadCount = len(disk.pendingTasks) + len(disk.assignedTasks) + disks = append(disks, &diskCopy) + } + + return disks +} + +// DestinationPlan represents a planned destination for a volume/shard operation +type DestinationPlan struct { + TargetNode string `json:"target_node"` + TargetDisk uint32 `json:"target_disk"` + TargetRack string `json:"target_rack"` + TargetDC string `json:"target_dc"` + ExpectedSize uint64 `json:"expected_size"` + PlacementScore float64 `json:"placement_score"` + Conflicts []string `json:"conflicts"` +} + +// MultiDestinationPlan represents multiple planned destinations for operations like EC +type MultiDestinationPlan struct { + Plans []*DestinationPlan `json:"plans"` + TotalShards int `json:"total_shards"` + SuccessfulRack int `json:"successful_racks"` + SuccessfulDCs int `json:"successful_dcs"` +} + +// PlanBalanceDestination finds the best destination for a balance operation +func (at *ActiveTopology) PlanBalanceDestination(volumeID uint32, sourceNode string, sourceRack string, sourceDC string, volumeSize uint64) (*DestinationPlan, error) { + at.mutex.RLock() + defer at.mutex.RUnlock() + + // Get available disks, excluding the source node + availableDisks := at.getAvailableDisksForPlanning(TaskTypeBalance, sourceNode) + if len(availableDisks) == 0 { + return nil, fmt.Errorf("no available disks for balance operation") + } + + // Score each disk for balance placement + bestDisk := at.selectBestBalanceDestination(availableDisks, sourceRack, sourceDC, volumeSize) + if bestDisk == nil { + return nil, fmt.Errorf("no suitable destination found for balance operation") + } + + return &DestinationPlan{ + TargetNode: bestDisk.NodeID, + TargetDisk: bestDisk.DiskID, + TargetRack: bestDisk.Rack, + TargetDC: bestDisk.DataCenter, + ExpectedSize: volumeSize, + PlacementScore: at.calculatePlacementScore(bestDisk, sourceRack, sourceDC), + Conflicts: at.checkPlacementConflicts(bestDisk, TaskTypeBalance), + }, nil +} + +// PlanECDestinations finds multiple destinations for EC shard distribution +func (at *ActiveTopology) PlanECDestinations(volumeID uint32, sourceNode string, sourceRack string, sourceDC string, shardsNeeded int) (*MultiDestinationPlan, error) { + at.mutex.RLock() + defer at.mutex.RUnlock() + + // Get available disks for EC placement + availableDisks := at.getAvailableDisksForPlanning(TaskTypeErasureCoding, "") + if len(availableDisks) < shardsNeeded { + return nil, fmt.Errorf("insufficient disks for EC placement: need %d, have %d", shardsNeeded, len(availableDisks)) + } + + // Select best disks for EC placement with rack/DC diversity + selectedDisks := at.selectBestECDestinations(availableDisks, sourceRack, sourceDC, shardsNeeded) + if 
len(selectedDisks) < shardsNeeded { + return nil, fmt.Errorf("could not find %d suitable destinations for EC placement", shardsNeeded) + } + + var plans []*DestinationPlan + rackCount := make(map[string]int) + dcCount := make(map[string]int) + + for _, disk := range selectedDisks { + plan := &DestinationPlan{ + TargetNode: disk.NodeID, + TargetDisk: disk.DiskID, + TargetRack: disk.Rack, + TargetDC: disk.DataCenter, + ExpectedSize: 0, // EC shards don't have predetermined size + PlacementScore: at.calculatePlacementScore(disk, sourceRack, sourceDC), + Conflicts: at.checkPlacementConflicts(disk, TaskTypeErasureCoding), + } + plans = append(plans, plan) + + // Count rack and DC diversity + rackKey := fmt.Sprintf("%s:%s", disk.DataCenter, disk.Rack) + rackCount[rackKey]++ + dcCount[disk.DataCenter]++ + } + + return &MultiDestinationPlan{ + Plans: plans, + TotalShards: len(plans), + SuccessfulRack: len(rackCount), + SuccessfulDCs: len(dcCount), + }, nil +} + +// getAvailableDisksForPlanning returns disks available for destination planning +func (at *ActiveTopology) getAvailableDisksForPlanning(taskType TaskType, excludeNodeID string) []*activeDisk { + var available []*activeDisk + + for _, disk := range at.disks { + if excludeNodeID != "" && disk.NodeID == excludeNodeID { + continue // Skip excluded node + } + + if at.isDiskAvailable(disk, taskType) { + available = append(available, disk) + } + } + + return available +} + +// selectBestBalanceDestination selects the best disk for balance operation +func (at *ActiveTopology) selectBestBalanceDestination(disks []*activeDisk, sourceRack string, sourceDC string, volumeSize uint64) *activeDisk { + if len(disks) == 0 { + return nil + } + + var bestDisk *activeDisk + bestScore := -1.0 + + for _, disk := range disks { + score := at.calculateBalanceScore(disk, sourceRack, sourceDC, volumeSize) + if score > bestScore { + bestScore = score + bestDisk = disk + } + } + + return bestDisk +} + +// selectBestECDestinations selects multiple disks for EC shard placement with diversity +func (at *ActiveTopology) selectBestECDestinations(disks []*activeDisk, sourceRack string, sourceDC string, shardsNeeded int) []*activeDisk { + if len(disks) == 0 { + return nil + } + + // Group disks by rack and DC for diversity + rackGroups := make(map[string][]*activeDisk) + for _, disk := range disks { + rackKey := fmt.Sprintf("%s:%s", disk.DataCenter, disk.Rack) + rackGroups[rackKey] = append(rackGroups[rackKey], disk) + } + + var selected []*activeDisk + usedRacks := make(map[string]bool) + + // First pass: select one disk from each rack for maximum diversity + for rackKey, rackDisks := range rackGroups { + if len(selected) >= shardsNeeded { + break + } + + // Select best disk from this rack + bestDisk := at.selectBestFromRack(rackDisks, sourceRack, sourceDC) + if bestDisk != nil { + selected = append(selected, bestDisk) + usedRacks[rackKey] = true + } + } + + // Second pass: if we need more disks, select from racks we've already used + if len(selected) < shardsNeeded { + for _, disk := range disks { + if len(selected) >= shardsNeeded { + break + } + + // Skip if already selected + alreadySelected := false + for _, sel := range selected { + if sel.NodeID == disk.NodeID && sel.DiskID == disk.DiskID { + alreadySelected = true + break + } + } + + if !alreadySelected && at.isDiskAvailable(disk, TaskTypeErasureCoding) { + selected = append(selected, disk) + } + } + } + + return selected +} + +// selectBestFromRack selects the best disk from a rack +func (at *ActiveTopology) 
selectBestFromRack(disks []*activeDisk, sourceRack string, sourceDC string) *activeDisk { + if len(disks) == 0 { + return nil + } + + var bestDisk *activeDisk + bestScore := -1.0 + + for _, disk := range disks { + if !at.isDiskAvailable(disk, TaskTypeErasureCoding) { + continue + } + + score := at.calculateECScore(disk, sourceRack, sourceDC) + if score > bestScore { + bestScore = score + bestDisk = disk + } + } + + return bestDisk +} + +// calculateBalanceScore calculates placement score for balance operations +func (at *ActiveTopology) calculateBalanceScore(disk *activeDisk, sourceRack string, sourceDC string, volumeSize uint64) float64 { + score := 0.0 + + // Prefer disks with lower load + activeLoad := len(disk.pendingTasks) + len(disk.assignedTasks) + score += (2.0 - float64(activeLoad)) * 40.0 // Max 80 points for load + + // Prefer disks with more free space + if disk.DiskInfo.DiskInfo.MaxVolumeCount > 0 { + freeRatio := float64(disk.DiskInfo.DiskInfo.MaxVolumeCount-disk.DiskInfo.DiskInfo.VolumeCount) / float64(disk.DiskInfo.DiskInfo.MaxVolumeCount) + score += freeRatio * 20.0 // Max 20 points for free space + } + + // Rack diversity bonus (prefer different rack) + if disk.Rack != sourceRack { + score += 10.0 + } + + // DC diversity bonus (prefer different DC) + if disk.DataCenter != sourceDC { + score += 5.0 + } + + return score +} + +// calculateECScore calculates placement score for EC operations +func (at *ActiveTopology) calculateECScore(disk *activeDisk, sourceRack string, sourceDC string) float64 { + score := 0.0 + + // Prefer disks with lower load + activeLoad := len(disk.pendingTasks) + len(disk.assignedTasks) + score += (2.0 - float64(activeLoad)) * 30.0 // Max 60 points for load + + // Prefer disks with more free space + if disk.DiskInfo.DiskInfo.MaxVolumeCount > 0 { + freeRatio := float64(disk.DiskInfo.DiskInfo.MaxVolumeCount-disk.DiskInfo.DiskInfo.VolumeCount) / float64(disk.DiskInfo.DiskInfo.MaxVolumeCount) + score += freeRatio * 20.0 // Max 20 points for free space + } + + // Strong rack diversity preference for EC + if disk.Rack != sourceRack { + score += 20.0 + } + + // Strong DC diversity preference for EC + if disk.DataCenter != sourceDC { + score += 15.0 + } + + return score +} + +// calculatePlacementScore calculates overall placement quality score +func (at *ActiveTopology) calculatePlacementScore(disk *activeDisk, sourceRack string, sourceDC string) float64 { + score := 0.0 + + // Load factor + activeLoad := len(disk.pendingTasks) + len(disk.assignedTasks) + loadScore := (2.0 - float64(activeLoad)) / 2.0 // Normalize to 0-1 + score += loadScore * 0.4 + + // Capacity factor + if disk.DiskInfo.DiskInfo.MaxVolumeCount > 0 { + freeRatio := float64(disk.DiskInfo.DiskInfo.MaxVolumeCount-disk.DiskInfo.DiskInfo.VolumeCount) / float64(disk.DiskInfo.DiskInfo.MaxVolumeCount) + score += freeRatio * 0.3 + } + + // Diversity factor + diversityScore := 0.0 + if disk.Rack != sourceRack { + diversityScore += 0.5 + } + if disk.DataCenter != sourceDC { + diversityScore += 0.5 + } + score += diversityScore * 0.3 + + return score // Score between 0.0 and 1.0 +} + +// checkPlacementConflicts checks for placement rule violations +func (at *ActiveTopology) checkPlacementConflicts(disk *activeDisk, taskType TaskType) []string { + var conflicts []string + + // Check load limits + activeLoad := len(disk.pendingTasks) + len(disk.assignedTasks) + if activeLoad >= 2 { + conflicts = append(conflicts, fmt.Sprintf("disk_load_high_%d", activeLoad)) + } + + // Check capacity limits + if 
disk.DiskInfo.DiskInfo.MaxVolumeCount > 0 { + usageRatio := float64(disk.DiskInfo.DiskInfo.VolumeCount) / float64(disk.DiskInfo.DiskInfo.MaxVolumeCount) + if usageRatio > 0.9 { + conflicts = append(conflicts, "disk_capacity_high") + } + } + + // Check for conflicting task types + for _, task := range disk.assignedTasks { + if at.areTaskTypesConflicting(task.TaskType, taskType) { + conflicts = append(conflicts, fmt.Sprintf("task_conflict_%s", task.TaskType)) + } + } + + return conflicts +} + +// Private methods + +// reassignTaskStates assigns tasks to the appropriate disks +func (at *ActiveTopology) reassignTaskStates() { + // Clear existing task assignments + for _, disk := range at.disks { + disk.pendingTasks = nil + disk.assignedTasks = nil + disk.recentTasks = nil + } + + // Reassign pending tasks + for _, task := range at.pendingTasks { + at.assignTaskToDisk(task) + } + + // Reassign assigned tasks + for _, task := range at.assignedTasks { + at.assignTaskToDisk(task) + } + + // Reassign recent tasks + for _, task := range at.recentTasks { + at.assignTaskToDisk(task) + } +} + +// assignTaskToDisk assigns a task to the appropriate disk(s) +func (at *ActiveTopology) assignTaskToDisk(task *taskState) { + // Assign to source disk + sourceKey := fmt.Sprintf("%s:%d", task.SourceServer, task.SourceDisk) + if sourceDisk, exists := at.disks[sourceKey]; exists { + switch task.Status { + case TaskStatusPending: + sourceDisk.pendingTasks = append(sourceDisk.pendingTasks, task) + case TaskStatusInProgress: + sourceDisk.assignedTasks = append(sourceDisk.assignedTasks, task) + case TaskStatusCompleted: + sourceDisk.recentTasks = append(sourceDisk.recentTasks, task) + } + } + + // Assign to target disk if it exists and is different from source + if task.TargetServer != "" && (task.TargetServer != task.SourceServer || task.TargetDisk != task.SourceDisk) { + targetKey := fmt.Sprintf("%s:%d", task.TargetServer, task.TargetDisk) + if targetDisk, exists := at.disks[targetKey]; exists { + switch task.Status { + case TaskStatusPending: + targetDisk.pendingTasks = append(targetDisk.pendingTasks, task) + case TaskStatusInProgress: + targetDisk.assignedTasks = append(targetDisk.assignedTasks, task) + case TaskStatusCompleted: + targetDisk.recentTasks = append(targetDisk.recentTasks, task) + } + } + } +} + +// isDiskAvailable checks if a disk can accept new tasks +func (at *ActiveTopology) isDiskAvailable(disk *activeDisk, taskType TaskType) bool { + // Check if disk has too many active tasks + activeLoad := len(disk.pendingTasks) + len(disk.assignedTasks) + if activeLoad >= 2 { // Max 2 concurrent tasks per disk + return false + } + + // Check for conflicting task types + for _, task := range disk.assignedTasks { + if at.areTaskTypesConflicting(task.TaskType, taskType) { + return false + } + } + + return true +} + +// areTaskTypesConflicting checks if two task types conflict +func (at *ActiveTopology) areTaskTypesConflicting(existing, new TaskType) bool { + // Examples of conflicting task types + conflictMap := map[TaskType][]TaskType{ + TaskTypeVacuum: {TaskTypeBalance, TaskTypeErasureCoding}, + TaskTypeBalance: {TaskTypeVacuum, TaskTypeErasureCoding}, + TaskTypeErasureCoding: {TaskTypeVacuum, TaskTypeBalance}, + } + + if conflicts, exists := conflictMap[existing]; exists { + for _, conflictType := range conflicts { + if conflictType == new { + return true + } + } + } + + return false +} + +// cleanupRecentTasks removes old recent tasks +func (at *ActiveTopology) cleanupRecentTasks() { + cutoff := 
time.Now().Add(-time.Duration(at.recentTaskWindowSeconds) * time.Second) + + for taskID, task := range at.recentTasks { + if task.CompletedAt.Before(cutoff) { + delete(at.recentTasks, taskID) + } + } +} diff --git a/weed/admin/topology/active_topology_test.go b/weed/admin/topology/active_topology_test.go new file mode 100644 index 000000000..9f2f09c29 --- /dev/null +++ b/weed/admin/topology/active_topology_test.go @@ -0,0 +1,654 @@ +package topology + +import ( + "testing" + "time" + + "github.com/seaweedfs/seaweedfs/weed/glog" + "github.com/seaweedfs/seaweedfs/weed/pb/master_pb" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// TestActiveTopologyBasicOperations tests basic topology management +func TestActiveTopologyBasicOperations(t *testing.T) { + topology := NewActiveTopology(10) + assert.NotNil(t, topology) + assert.Equal(t, 10, topology.recentTaskWindowSeconds) + + // Test empty topology + assert.Equal(t, 0, len(topology.nodes)) + assert.Equal(t, 0, len(topology.disks)) + assert.Equal(t, 0, len(topology.pendingTasks)) +} + +// TestActiveTopologyUpdate tests topology updates from master +func TestActiveTopologyUpdate(t *testing.T) { + topology := NewActiveTopology(10) + + // Create sample topology info + topologyInfo := createSampleTopology() + + err := topology.UpdateTopology(topologyInfo) + require.NoError(t, err) + + // Verify topology structure + assert.Equal(t, 2, len(topology.nodes)) // 2 nodes + assert.Equal(t, 4, len(topology.disks)) // 4 disks total (2 per node) + + // Verify node structure + node1, exists := topology.nodes["10.0.0.1:8080"] + require.True(t, exists) + assert.Equal(t, "dc1", node1.dataCenter) + assert.Equal(t, "rack1", node1.rack) + assert.Equal(t, 2, len(node1.disks)) + + // Verify disk structure + disk1, exists := topology.disks["10.0.0.1:8080:0"] + require.True(t, exists) + assert.Equal(t, uint32(0), disk1.DiskID) + assert.Equal(t, "hdd", disk1.DiskType) + assert.Equal(t, "dc1", disk1.DataCenter) +} + +// TestTaskLifecycle tests the complete task lifecycle +func TestTaskLifecycle(t *testing.T) { + topology := NewActiveTopology(10) + topology.UpdateTopology(createSampleTopology()) + + taskID := "balance-001" + + // 1. Add pending task + topology.AddPendingTask(taskID, TaskTypeBalance, 1001, + "10.0.0.1:8080", 0, "10.0.0.2:8080", 1) + + // Verify pending state + assert.Equal(t, 1, len(topology.pendingTasks)) + assert.Equal(t, 0, len(topology.assignedTasks)) + assert.Equal(t, 0, len(topology.recentTasks)) + + task := topology.pendingTasks[taskID] + assert.Equal(t, TaskStatusPending, task.Status) + assert.Equal(t, uint32(1001), task.VolumeID) + + // Verify task assigned to disks + sourceDisk := topology.disks["10.0.0.1:8080:0"] + targetDisk := topology.disks["10.0.0.2:8080:1"] + assert.Equal(t, 1, len(sourceDisk.pendingTasks)) + assert.Equal(t, 1, len(targetDisk.pendingTasks)) + + // 2. Assign task + err := topology.AssignTask(taskID) + require.NoError(t, err) + + // Verify assigned state + assert.Equal(t, 0, len(topology.pendingTasks)) + assert.Equal(t, 1, len(topology.assignedTasks)) + assert.Equal(t, 0, len(topology.recentTasks)) + + task = topology.assignedTasks[taskID] + assert.Equal(t, TaskStatusInProgress, task.Status) + + // Verify task moved to assigned on disks + assert.Equal(t, 0, len(sourceDisk.pendingTasks)) + assert.Equal(t, 1, len(sourceDisk.assignedTasks)) + assert.Equal(t, 0, len(targetDisk.pendingTasks)) + assert.Equal(t, 1, len(targetDisk.assignedTasks)) + + // 3. 
Complete task + err = topology.CompleteTask(taskID) + require.NoError(t, err) + + // Verify completed state + assert.Equal(t, 0, len(topology.pendingTasks)) + assert.Equal(t, 0, len(topology.assignedTasks)) + assert.Equal(t, 1, len(topology.recentTasks)) + + task = topology.recentTasks[taskID] + assert.Equal(t, TaskStatusCompleted, task.Status) + assert.False(t, task.CompletedAt.IsZero()) +} + +// TestTaskDetectionScenarios tests various task detection scenarios +func TestTaskDetectionScenarios(t *testing.T) { + tests := []struct { + name string + scenario func() *ActiveTopology + expectedTasks map[string]bool // taskType -> shouldDetect + }{ + { + name: "Empty cluster - no tasks needed", + scenario: func() *ActiveTopology { + topology := NewActiveTopology(10) + topology.UpdateTopology(createEmptyTopology()) + return topology + }, + expectedTasks: map[string]bool{ + "balance": false, + "vacuum": false, + "ec": false, + }, + }, + { + name: "Unbalanced cluster - balance task needed", + scenario: func() *ActiveTopology { + topology := NewActiveTopology(10) + topology.UpdateTopology(createUnbalancedTopology()) + return topology + }, + expectedTasks: map[string]bool{ + "balance": true, + "vacuum": false, + "ec": false, + }, + }, + { + name: "High garbage ratio - vacuum task needed", + scenario: func() *ActiveTopology { + topology := NewActiveTopology(10) + topology.UpdateTopology(createHighGarbageTopology()) + return topology + }, + expectedTasks: map[string]bool{ + "balance": false, + "vacuum": true, + "ec": false, + }, + }, + { + name: "Large volumes - EC task needed", + scenario: func() *ActiveTopology { + topology := NewActiveTopology(10) + topology.UpdateTopology(createLargeVolumeTopology()) + return topology + }, + expectedTasks: map[string]bool{ + "balance": false, + "vacuum": false, + "ec": true, + }, + }, + { + name: "Recent tasks - no immediate re-detection", + scenario: func() *ActiveTopology { + topology := NewActiveTopology(10) + topology.UpdateTopology(createUnbalancedTopology()) + // Add recent balance task + topology.recentTasks["recent-balance"] = &taskState{ + VolumeID: 1001, + TaskType: TaskTypeBalance, + Status: TaskStatusCompleted, + CompletedAt: time.Now().Add(-5 * time.Second), // 5 seconds ago + } + return topology + }, + expectedTasks: map[string]bool{ + "balance": false, // Should not detect due to recent task + "vacuum": false, + "ec": false, + }, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + topology := tt.scenario() + + // Test balance task detection + shouldDetectBalance := tt.expectedTasks["balance"] + actualDetectBalance := !topology.HasRecentTaskForVolume(1001, TaskTypeBalance) + if shouldDetectBalance { + assert.True(t, actualDetectBalance, "Should detect balance task") + } else { + // Note: In real implementation, task detection would be more sophisticated + // This is a simplified test of the recent task prevention mechanism + } + + // Test that recent tasks prevent re-detection + if len(topology.recentTasks) > 0 { + for _, task := range topology.recentTasks { + hasRecent := topology.HasRecentTaskForVolume(task.VolumeID, task.TaskType) + assert.True(t, hasRecent, "Should find recent task for volume %d", task.VolumeID) + } + } + }) + } +} + +// TestTargetSelectionScenarios tests target selection for different task types +func TestTargetSelectionScenarios(t *testing.T) { + tests := []struct { + name string + topology *ActiveTopology + taskType TaskType + excludeNode string + expectedTargets int + expectedBestTarget string + }{ 
+ { + name: "Balance task - find least loaded disk", + topology: createTopologyWithLoad(), + taskType: TaskTypeBalance, + excludeNode: "10.0.0.1:8080", // Exclude source node + expectedTargets: 2, // 2 disks on other node + }, + { + name: "EC task - find multiple available disks", + topology: createTopologyForEC(), + taskType: TaskTypeErasureCoding, + excludeNode: "", // Don't exclude any nodes + expectedTargets: 4, // All 4 disks available + }, + { + name: "Vacuum task - avoid conflicting disks", + topology: createTopologyWithConflicts(), + taskType: TaskTypeVacuum, + excludeNode: "", + expectedTargets: 1, // Only 1 disk without conflicts (conflicts exclude more disks) + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + availableDisks := tt.topology.GetAvailableDisks(tt.taskType, tt.excludeNode) + assert.Equal(t, tt.expectedTargets, len(availableDisks), + "Expected %d available disks, got %d", tt.expectedTargets, len(availableDisks)) + + // Verify disks are actually available + for _, disk := range availableDisks { + assert.NotEqual(t, tt.excludeNode, disk.NodeID, + "Available disk should not be on excluded node") + + load := tt.topology.GetDiskLoad(disk.NodeID, disk.DiskID) + assert.Less(t, load, 2, "Disk load should be less than 2") + } + }) + } +} + +// TestDiskLoadCalculation tests disk load calculation +func TestDiskLoadCalculation(t *testing.T) { + topology := NewActiveTopology(10) + topology.UpdateTopology(createSampleTopology()) + + // Initially no load + load := topology.GetDiskLoad("10.0.0.1:8080", 0) + assert.Equal(t, 0, load) + + // Add pending task + topology.AddPendingTask("task1", TaskTypeBalance, 1001, + "10.0.0.1:8080", 0, "10.0.0.2:8080", 1) + + // Check load increased + load = topology.GetDiskLoad("10.0.0.1:8080", 0) + assert.Equal(t, 1, load) + + // Add another task to same disk + topology.AddPendingTask("task2", TaskTypeVacuum, 1002, + "10.0.0.1:8080", 0, "", 0) + + load = topology.GetDiskLoad("10.0.0.1:8080", 0) + assert.Equal(t, 2, load) + + // Move one task to assigned + topology.AssignTask("task1") + + // Load should still be 2 (1 pending + 1 assigned) + load = topology.GetDiskLoad("10.0.0.1:8080", 0) + assert.Equal(t, 2, load) + + // Complete one task + topology.CompleteTask("task1") + + // Load should decrease to 1 + load = topology.GetDiskLoad("10.0.0.1:8080", 0) + assert.Equal(t, 1, load) +} + +// TestTaskConflictDetection tests task conflict detection +func TestTaskConflictDetection(t *testing.T) { + topology := NewActiveTopology(10) + topology.UpdateTopology(createSampleTopology()) + + // Add a balance task + topology.AddPendingTask("balance1", TaskTypeBalance, 1001, + "10.0.0.1:8080", 0, "10.0.0.2:8080", 1) + topology.AssignTask("balance1") + + // Try to get available disks for vacuum (conflicts with balance) + availableDisks := topology.GetAvailableDisks(TaskTypeVacuum, "") + + // Source disk should not be available due to conflict + sourceDiskAvailable := false + for _, disk := range availableDisks { + if disk.NodeID == "10.0.0.1:8080" && disk.DiskID == 0 { + sourceDiskAvailable = true + break + } + } + assert.False(t, sourceDiskAvailable, "Source disk should not be available due to task conflict") +} + +// TestPublicInterfaces tests the public interface methods +func TestPublicInterfaces(t *testing.T) { + topology := NewActiveTopology(10) + topology.UpdateTopology(createSampleTopology()) + + // Test GetAllNodes + nodes := topology.GetAllNodes() + assert.Equal(t, 2, len(nodes)) + assert.Contains(t, nodes, "10.0.0.1:8080") + 
assert.Contains(t, nodes, "10.0.0.2:8080") + + // Test GetNodeDisks + disks := topology.GetNodeDisks("10.0.0.1:8080") + assert.Equal(t, 2, len(disks)) + + // Test with non-existent node + disks = topology.GetNodeDisks("non-existent") + assert.Nil(t, disks) +} + +// Helper functions to create test topologies + +func createSampleTopology() *master_pb.TopologyInfo { + return &master_pb.TopologyInfo{ + DataCenterInfos: []*master_pb.DataCenterInfo{ + { + Id: "dc1", + RackInfos: []*master_pb.RackInfo{ + { + Id: "rack1", + DataNodeInfos: []*master_pb.DataNodeInfo{ + { + Id: "10.0.0.1:8080", + DiskInfos: map[string]*master_pb.DiskInfo{ + "hdd": {DiskId: 0, VolumeCount: 10, MaxVolumeCount: 100}, + "ssd": {DiskId: 1, VolumeCount: 5, MaxVolumeCount: 50}, + }, + }, + { + Id: "10.0.0.2:8080", + DiskInfos: map[string]*master_pb.DiskInfo{ + "hdd": {DiskId: 0, VolumeCount: 8, MaxVolumeCount: 100}, + "ssd": {DiskId: 1, VolumeCount: 3, MaxVolumeCount: 50}, + }, + }, + }, + }, + }, + }, + }, + } +} + +func createEmptyTopology() *master_pb.TopologyInfo { + return &master_pb.TopologyInfo{ + DataCenterInfos: []*master_pb.DataCenterInfo{ + { + Id: "dc1", + RackInfos: []*master_pb.RackInfo{ + { + Id: "rack1", + DataNodeInfos: []*master_pb.DataNodeInfo{ + { + Id: "10.0.0.1:8080", + DiskInfos: map[string]*master_pb.DiskInfo{ + "hdd": {DiskId: 0, VolumeCount: 0, MaxVolumeCount: 100}, + }, + }, + }, + }, + }, + }, + }, + } +} + +func createUnbalancedTopology() *master_pb.TopologyInfo { + return &master_pb.TopologyInfo{ + DataCenterInfos: []*master_pb.DataCenterInfo{ + { + Id: "dc1", + RackInfos: []*master_pb.RackInfo{ + { + Id: "rack1", + DataNodeInfos: []*master_pb.DataNodeInfo{ + { + Id: "10.0.0.1:8080", + DiskInfos: map[string]*master_pb.DiskInfo{ + "hdd": {DiskId: 0, VolumeCount: 90, MaxVolumeCount: 100}, // Very loaded + }, + }, + { + Id: "10.0.0.2:8080", + DiskInfos: map[string]*master_pb.DiskInfo{ + "hdd": {DiskId: 0, VolumeCount: 10, MaxVolumeCount: 100}, // Lightly loaded + }, + }, + }, + }, + }, + }, + }, + } +} + +func createHighGarbageTopology() *master_pb.TopologyInfo { + // In a real implementation, this would include volume-level garbage metrics + return createSampleTopology() +} + +func createLargeVolumeTopology() *master_pb.TopologyInfo { + // In a real implementation, this would include volume-level size metrics + return createSampleTopology() +} + +func createTopologyWithLoad() *ActiveTopology { + topology := NewActiveTopology(10) + topology.UpdateTopology(createSampleTopology()) + + // Add some existing tasks to create load + topology.AddPendingTask("existing1", TaskTypeVacuum, 2001, + "10.0.0.1:8080", 0, "", 0) + topology.AssignTask("existing1") + + return topology +} + +func createTopologyForEC() *ActiveTopology { + topology := NewActiveTopology(10) + topology.UpdateTopology(createSampleTopology()) + return topology +} + +func createTopologyWithConflicts() *ActiveTopology { + topology := NewActiveTopology(10) + topology.UpdateTopology(createSampleTopology()) + + // Add conflicting tasks + topology.AddPendingTask("balance1", TaskTypeBalance, 3001, + "10.0.0.1:8080", 0, "10.0.0.2:8080", 0) + topology.AssignTask("balance1") + + topology.AddPendingTask("ec1", TaskTypeErasureCoding, 3002, + "10.0.0.1:8080", 1, "", 0) + topology.AssignTask("ec1") + + return topology +} + +// TestDestinationPlanning tests destination planning functionality +func TestDestinationPlanning(t *testing.T) { + topology := NewActiveTopology(10) + topology.UpdateTopology(createSampleTopology()) + + // Test balance destination 
planning + t.Run("Balance destination planning", func(t *testing.T) { + plan, err := topology.PlanBalanceDestination(1001, "10.0.0.1:8080", "rack1", "dc1", 1024*1024) // 1MB + require.NoError(t, err) + require.NotNil(t, plan) + + // Should not target the source node + assert.NotEqual(t, "10.0.0.1:8080", plan.TargetNode) + assert.Equal(t, "10.0.0.2:8080", plan.TargetNode) + assert.NotEmpty(t, plan.TargetRack) + assert.NotEmpty(t, plan.TargetDC) + assert.Greater(t, plan.PlacementScore, 0.0) + }) + + // Test EC destination planning + t.Run("EC destination planning", func(t *testing.T) { + multiPlan, err := topology.PlanECDestinations(1002, "10.0.0.1:8080", "rack1", "dc1", 3) // Ask for 3 shards - source node can be included + require.NoError(t, err) + require.NotNil(t, multiPlan) + assert.Greater(t, len(multiPlan.Plans), 0) + assert.LessOrEqual(t, len(multiPlan.Plans), 3) // Should get at most 3 shards + assert.Equal(t, len(multiPlan.Plans), multiPlan.TotalShards) + + // Check that all plans have valid target nodes + for _, plan := range multiPlan.Plans { + assert.NotEmpty(t, plan.TargetNode) + assert.NotEmpty(t, plan.TargetRack) + assert.NotEmpty(t, plan.TargetDC) + assert.GreaterOrEqual(t, plan.PlacementScore, 0.0) + } + + // Check diversity metrics + assert.GreaterOrEqual(t, multiPlan.SuccessfulRack, 1) + assert.GreaterOrEqual(t, multiPlan.SuccessfulDCs, 1) + }) + + // Test destination planning with load + t.Run("Destination planning considers load", func(t *testing.T) { + // Add load to one disk + topology.AddPendingTask("task1", TaskTypeBalance, 2001, + "10.0.0.2:8080", 0, "", 0) + + plan, err := topology.PlanBalanceDestination(1003, "10.0.0.1:8080", "rack1", "dc1", 1024*1024) + require.NoError(t, err) + require.NotNil(t, plan) + + // Should prefer less loaded disk (disk 1 over disk 0 on node2) + assert.Equal(t, "10.0.0.2:8080", plan.TargetNode) + assert.Equal(t, uint32(1), plan.TargetDisk) // Should prefer SSD (disk 1) which has no load + }) + + // Test insufficient destinations + t.Run("Handle insufficient destinations", func(t *testing.T) { + // Try to plan for more EC shards than available disks + multiPlan, err := topology.PlanECDestinations(1004, "10.0.0.1:8080", "rack1", "dc1", 100) + + // Should get an error for insufficient disks + assert.Error(t, err) + assert.Nil(t, multiPlan) + }) +} + +// TestDestinationPlanningWithActiveTopology tests the integration between task detection and destination planning +func TestDestinationPlanningWithActiveTopology(t *testing.T) { + topology := NewActiveTopology(10) + topology.UpdateTopology(createUnbalancedTopology()) + + // Test that tasks are created with destinations + t.Run("Balance task with destination", func(t *testing.T) { + // Simulate what the balance detector would create + sourceNode := "10.0.0.1:8080" // Overloaded node + volumeID := uint32(1001) + + plan, err := topology.PlanBalanceDestination(volumeID, sourceNode, "rack1", "dc1", 1024*1024) + require.NoError(t, err) + require.NotNil(t, plan) + + // Verify the destination is different from source + assert.NotEqual(t, sourceNode, plan.TargetNode) + assert.Equal(t, "10.0.0.2:8080", plan.TargetNode) // Should be the lightly loaded node + + // Verify placement quality + assert.Greater(t, plan.PlacementScore, 0.0) + assert.LessOrEqual(t, plan.PlacementScore, 1.0) + }) + + // Test task state integration + t.Run("Task state affects future planning", func(t *testing.T) { + volumeID := uint32(1002) + sourceNode := "10.0.0.1:8080" + targetNode := "10.0.0.2:8080" + + // Plan first 
destination + plan1, err := topology.PlanBalanceDestination(volumeID, sourceNode, "rack1", "dc1", 1024*1024) + require.NoError(t, err) + require.NotNil(t, plan1) + + // Add a pending task to the target + topology.AddPendingTask("task1", TaskTypeBalance, volumeID, sourceNode, 0, targetNode, 0) + + // Plan another destination - should consider the pending task load + plan2, err := topology.PlanBalanceDestination(1003, sourceNode, "rack1", "dc1", 1024*1024) + require.NoError(t, err) + require.NotNil(t, plan2) + + // The placement score should reflect the increased load + // (This test might need adjustment based on the actual scoring algorithm) + glog.V(1).Infof("Plan1 score: %.3f, Plan2 score: %.3f", plan1.PlacementScore, plan2.PlacementScore) + }) +} + +// TestECDestinationPlanningDetailed tests the EC destination planning with multiple shards +func TestECDestinationPlanningDetailed(t *testing.T) { + topology := NewActiveTopology(10) + topology.UpdateTopology(createSampleTopology()) + + t.Run("EC multiple destinations", func(t *testing.T) { + // Plan for 3 EC shards (now including source node, we have 4 disks total) + multiPlan, err := topology.PlanECDestinations(1005, "10.0.0.1:8080", "rack1", "dc1", 3) + require.NoError(t, err) + require.NotNil(t, multiPlan) + + // Should get 3 destinations (can include source node's disks) + assert.Equal(t, 3, len(multiPlan.Plans)) + assert.Equal(t, 3, multiPlan.TotalShards) + + // Count node distribution - source node can now be included + nodeCount := make(map[string]int) + for _, plan := range multiPlan.Plans { + nodeCount[plan.TargetNode]++ + } + + // Should distribute across available nodes (both nodes can be used) + assert.GreaterOrEqual(t, len(nodeCount), 1, "Should use at least 1 node") + assert.LessOrEqual(t, len(nodeCount), 2, "Should use at most 2 nodes") + glog.V(1).Infof("EC destinations node distribution: %v", nodeCount) + + glog.V(1).Infof("EC destinations: %d plans across %d racks, %d DCs", + multiPlan.TotalShards, multiPlan.SuccessfulRack, multiPlan.SuccessfulDCs) + }) + + t.Run("EC destination planning with task conflicts", func(t *testing.T) { + // Create a fresh topology for this test to avoid conflicts from previous test + freshTopology := NewActiveTopology(10) + freshTopology.UpdateTopology(createSampleTopology()) + + // Add tasks to create conflicts on some disks + freshTopology.AddPendingTask("conflict1", TaskTypeVacuum, 2001, "10.0.0.2:8080", 0, "", 0) + freshTopology.AddPendingTask("conflict2", TaskTypeBalance, 2002, "10.0.0.1:8080", 0, "", 0) + freshTopology.AssignTask("conflict1") + freshTopology.AssignTask("conflict2") + + // Plan EC destinations - should still succeed using available disks + multiPlan, err := freshTopology.PlanECDestinations(1006, "10.0.0.1:8080", "rack1", "dc1", 2) + require.NoError(t, err) + require.NotNil(t, multiPlan) + + // Should get destinations (using disks that don't have conflicts) + assert.GreaterOrEqual(t, len(multiPlan.Plans), 1) + assert.LessOrEqual(t, len(multiPlan.Plans), 2) + + // Available disks should be: node1/disk1 and node2/disk1 (since disk0 on both nodes have conflicts) + for _, plan := range multiPlan.Plans { + assert.Equal(t, uint32(1), plan.TargetDisk, "Should prefer disk 1 which has no conflicts") + } + + glog.V(1).Infof("EC destination planning with conflicts: found %d destinations", len(multiPlan.Plans)) + }) +} diff --git a/weed/admin/view/app/cluster_collections.templ b/weed/admin/view/app/cluster_collections.templ index 9099fe112..d4765ea86 100644 --- 
a/weed/admin/view/app/cluster_collections.templ
+++ b/weed/admin/view/app/cluster_collections.templ
@@ -22,7 +22,7 @@ templ ClusterCollections(data dash.ClusterCollectionsData) {