package erasure_coding

import (
	"fmt"
	"strings"
	"time"

	"github.com/seaweedfs/seaweedfs/weed/admin/topology"
	"github.com/seaweedfs/seaweedfs/weed/glog"
	"github.com/seaweedfs/seaweedfs/weed/pb/worker_pb"
	"github.com/seaweedfs/seaweedfs/weed/storage/erasure_coding"
	"github.com/seaweedfs/seaweedfs/weed/worker/tasks/base"
	"github.com/seaweedfs/seaweedfs/weed/worker/types"
)
// Detection implements the detection logic for erasure coding tasks
func Detection(metrics []*types.VolumeHealthMetrics, clusterInfo *types.ClusterInfo, config base.TaskConfig) ([]*types.TaskDetectionResult, error) {
	if !config.IsEnabled() {
		return nil, nil
	}
	ecConfig, ok := config.(*Config)
	if !ok {
		return nil, fmt.Errorf("invalid config type for erasure coding detection: %T", config)
	}

	var results []*types.TaskDetectionResult
	now := time.Now()
	quietThreshold := time.Duration(ecConfig.QuietForSeconds) * time.Second
	minSizeBytes := uint64(ecConfig.MinSizeMB) * 1024 * 1024 // configurable minimum volume size
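	// Example (illustrative values, not necessarily the defaults): with
	// QuietForSeconds=3600 and MinSizeMB=1024, only volumes that have been idle
	// for at least an hour and hold at least 1024*1024*1024 bytes (1 GiB) can
	// pass the gates below.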
	skippedAlreadyEC := 0
	skippedTooSmall := 0
	skippedCollectionFilter := 0
	skippedQuietTime := 0
	skippedFullness := 0

	for _, metric := range metrics {
		// Skip if the volume is already erasure coded
		if metric.IsECVolume {
			skippedAlreadyEC++
			continue
		}

		// Check minimum size requirement
		if metric.Size < minSizeBytes {
			skippedTooSmall++
			continue
		}

		// Check collection filter if specified
		if ecConfig.CollectionFilter != "" {
			// Parse comma-separated collections
			allowedCollections := make(map[string]bool)
			for _, collection := range strings.Split(ecConfig.CollectionFilter, ",") {
				allowedCollections[strings.TrimSpace(collection)] = true
			}
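			// Example: CollectionFilter "pictures, logs" yields the set
			// {"pictures", "logs"}; a volume in collection "tmp" (or with an
			// empty collection name) is skipped by the check below.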
			// Skip if volume's collection is not in the allowed list
			if !allowedCollections[metric.Collection] {
				skippedCollectionFilter++
				continue
			}
		}

		// Check quiet duration and fullness criteria
		if metric.Age >= quietThreshold && metric.FullnessRatio >= ecConfig.FullnessRatio {
			// Generate task ID for ActiveTopology integration
			taskID := fmt.Sprintf("ec_vol_%d_%d", metric.VolumeID, now.Unix())

			result := &types.TaskDetectionResult{
				TaskID:     taskID, // Link to ActiveTopology pending task
				TaskType:   types.TaskTypeErasureCoding,
				VolumeID:   metric.VolumeID,
				Server:     metric.Server,
				Collection: metric.Collection,
				Priority:   types.TaskPriorityLow, // EC is not urgent
				Reason: fmt.Sprintf("Volume meets EC criteria: quiet for %.1fs (>%ds), fullness=%.1f%% (>%.1f%%), size=%.1fMB (>%dMB)",
					metric.Age.Seconds(), ecConfig.QuietForSeconds, metric.FullnessRatio*100, ecConfig.FullnessRatio*100,
					float64(metric.Size)/(1024*1024), ecConfig.MinSizeMB),
				ScheduleAt: now,
			}

			// Plan EC destinations if ActiveTopology is available
			if clusterInfo.ActiveTopology != nil {
				multiPlan, err := planECDestinations(clusterInfo.ActiveTopology, metric, ecConfig)
				if err != nil {
					glog.Warningf("Failed to plan EC destinations for volume %d: %v", metric.VolumeID, err)
					continue // Skip this volume if destination planning fails
				}

				// Calculate the expected shard size for the EC operation:
				// each data shard is approximately volumeSize / dataShards.
				expectedShardSize := uint64(metric.Size) / uint64(erasure_coding.DataShardsCount)
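				// Worked example (assuming the default 10+4 Reed-Solomon layout,
				// i.e. DataShardsCount=10 and ParityShardsCount=4): a 30 GiB volume
				// yields data shards of roughly 3 GiB each; the 4 parity shards are
				// the same size, so the shards total about 14/10 of the original
				// bytes spread across the destination disks.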
				// Add pending EC shard task to ActiveTopology for capacity management.
				// Extract shard destinations from multiPlan.
				var shardDestinations []string
				var shardDiskIDs []uint32
				for _, plan := range multiPlan.Plans {
					shardDestinations = append(shardDestinations, plan.TargetNode)
					shardDiskIDs = append(shardDiskIDs, plan.TargetDisk)
				}

				// Find all volume replica locations (server + disk) from topology
				replicaLocations := findVolumeReplicaLocations(clusterInfo.ActiveTopology, metric.VolumeID, metric.Collection)
				if len(replicaLocations) == 0 {
					glog.Warningf("No replica locations found for volume %d, skipping EC", metric.VolumeID)
					continue
				}

				// Find existing EC shards from previous failed attempts
				existingECShards := findExistingECShards(clusterInfo.ActiveTopology, metric.VolumeID, metric.Collection)

				// Combine volume replicas and existing EC shards for cleanup
				var allSourceLocations []topology.TaskSourceLocation

				// Add volume replicas (will free volume slots)
				for _, replica := range replicaLocations {
					allSourceLocations = append(allSourceLocations, topology.TaskSourceLocation{
						ServerID:    replica.ServerID,
						DiskID:      replica.DiskID,
						CleanupType: topology.CleanupVolumeReplica,
					})
				}

				// Add existing EC shards (will free shard slots)
				duplicateCheck := make(map[string]bool)
				for _, replica := range replicaLocations {
					key := fmt.Sprintf("%s:%d", replica.ServerID, replica.DiskID)
					duplicateCheck[key] = true
				}
				for _, shard := range existingECShards {
					key := fmt.Sprintf("%s:%d", shard.ServerID, shard.DiskID)
					if !duplicateCheck[key] { // Avoid duplicates when EC shards share a disk with volume replicas
						allSourceLocations = append(allSourceLocations, topology.TaskSourceLocation{
							ServerID:    shard.ServerID,
							DiskID:      shard.DiskID,
							CleanupType: topology.CleanupECShards,
						})
						duplicateCheck[key] = true
					}
				}
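				// The dedup key is "server:diskID" (e.g. "10.0.0.1:8080:0" for a
				// hypothetical server address), so a disk that holds both a volume
				// replica and leftover EC shards is registered as a cleanup source
				// only once.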
				glog.V(2).Infof("Found %d volume replicas and %d existing EC shards for volume %d (total %d cleanup sources)",
					len(replicaLocations), len(existingECShards), metric.VolumeID, len(allSourceLocations))

				// Convert TaskSourceLocation to TaskSourceSpec
				sources := make([]topology.TaskSourceSpec, len(allSourceLocations))
				for i, srcLoc := range allSourceLocations {
					sources[i] = topology.TaskSourceSpec{
						ServerID:    srcLoc.ServerID,
						DiskID:      srcLoc.DiskID,
						CleanupType: srcLoc.CleanupType,
					}
				}

				// Convert shard destinations to TaskDestinationSpec
				destinations := make([]topology.TaskDestinationSpec, len(shardDestinations))
				shardImpact := topology.CalculateECShardStorageImpact(1, int64(expectedShardSize)) // 1 shard per destination
				shardSize := int64(expectedShardSize)
				for i, dest := range shardDestinations {
					destinations[i] = topology.TaskDestinationSpec{
						ServerID:      dest,
						DiskID:        shardDiskIDs[i],
						StorageImpact: &shardImpact,
						EstimatedSize: &shardSize,
					}
				}

				err = clusterInfo.ActiveTopology.AddPendingTask(topology.TaskSpec{
					TaskID:       taskID,
					TaskType:     topology.TaskTypeErasureCoding,
					VolumeID:     metric.VolumeID,
					VolumeSize:   int64(metric.Size),
					Sources:      sources,
					Destinations: destinations,
				})
				if err != nil {
					glog.Warningf("Failed to add pending EC shard task to ActiveTopology for volume %d: %v", metric.VolumeID, err)
					continue // Skip this volume if topology task addition fails
				}

				glog.V(2).Infof("Added pending EC shard task %s to ActiveTopology for volume %d with %d cleanup sources and %d shard destinations",
					taskID, metric.VolumeID, len(allSourceLocations), len(multiPlan.Plans))

				// Build the list of distinct replica servers (for legacy worker compatibility)
				var replicas []string
				serverSet := make(map[string]struct{})
				for _, loc := range replicaLocations {
					if _, found := serverSet[loc.ServerID]; !found {
						replicas = append(replicas, loc.ServerID)
						serverSet[loc.ServerID] = struct{}{}
					}
				}

				glog.V(1).Infof("Found %d replicas for volume %d: %v", len(replicas), metric.VolumeID, replicas)

				// Create typed parameters with EC destination information and replicas
				result.TypedParams = &worker_pb.TaskParams{
					TaskId:     taskID, // Link to ActiveTopology pending task
					VolumeId:   metric.VolumeID,
					Server:     metric.Server,
					Collection: metric.Collection,
					VolumeSize: metric.Size, // Store original volume size for tracking changes
					Replicas:   replicas,    // Include all volume replicas for deletion
					TaskParams: &worker_pb.TaskParams_ErasureCodingParams{
						ErasureCodingParams: createECTaskParams(multiPlan),
					},
				}

				glog.V(1).Infof("Planned EC destinations for volume %d: %d shards across %d racks, %d DCs",
					metric.VolumeID, len(multiPlan.Plans), multiPlan.SuccessfulRack, multiPlan.SuccessfulDCs)
			} else {
				glog.Warningf("No ActiveTopology available for destination planning in EC detection")
				continue // Skip this volume if no topology is available
			}
			results = append(results, result)
		} else {
			// Track why this volume was skipped for the debug summary below
			if metric.Age < quietThreshold {
				skippedQuietTime++
			}
			if metric.FullnessRatio < ecConfig.FullnessRatio {
				skippedFullness++
			}
		}
	}

	// Log a debug summary if no tasks were created
	if len(results) == 0 && len(metrics) > 0 {
		totalVolumes := len(metrics)
		glog.V(1).Infof("EC detection: No tasks created for %d volumes (skipped: %d already EC, %d too small, %d filtered, %d not quiet, %d not full)",
			totalVolumes, skippedAlreadyEC, skippedTooSmall, skippedCollectionFilter, skippedQuietTime, skippedFullness)

		// Show details for the first few non-EC volumes
		for i, metric := range metrics {
			if i >= 3 || metric.IsECVolume { // Limit to the first 3 non-EC volumes
				continue
			}
			sizeMB := float64(metric.Size) / (1024 * 1024)
			glog.Infof("ERASURE CODING: Volume %d: size=%.1fMB (need ≥%dMB), age=%s (need ≥%s), fullness=%.1f%% (need ≥%.1f%%)",
				metric.VolumeID, sizeMB, ecConfig.MinSizeMB, metric.Age.Truncate(time.Minute), quietThreshold.Truncate(time.Minute),
				metric.FullnessRatio*100, ecConfig.FullnessRatio*100)
		}
	}

	return results, nil
}

// planECDestinations plans the destinations for an erasure coding operation.
// This function implements EC destination planning logic directly in the detection phase.
func planECDestinations(activeTopology *topology.ActiveTopology, metric *types.VolumeHealthMetrics, ecConfig *Config) (*topology.MultiDestinationPlan, error) {
	// Calculate the expected shard size for the EC operation
	expectedShardSize := uint64(metric.Size) / uint64(erasure_coding.DataShardsCount)

	// Get source node information from topology
	var sourceRack, sourceDC string

	// Extract rack and DC from topology info
	topologyInfo := activeTopology.GetTopologyInfo()
	if topologyInfo != nil {
		for _, dc := range topologyInfo.DataCenterInfos {
			for _, rack := range dc.RackInfos {
				for _, dataNodeInfo := range rack.DataNodeInfos {
					if dataNodeInfo.Id == metric.Server {
						sourceDC = dc.Id
						sourceRack = rack.Id
						break
					}
				}
				if sourceRack != "" {
					break
				}
			}
			if sourceDC != "" {
				break
			}
		}
	}

	// Get available disks for EC placement, using effective capacity (which accounts
	// for pending and active tasks). A disk needs at least 1 available volume slot
	// to be considered for shard placement.
	availableDisks := activeTopology.GetDisksWithEffectiveCapacity(topology.TaskTypeErasureCoding, metric.Server, 1)
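	// Effective capacity is presumably the disk's free volume slots minus slots
	// already reserved by pending/active tasks; e.g. a disk with 5 free slots and
	// 2 reserved by queued tasks would report 3 and still qualify here.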
	if len(availableDisks) < erasure_coding.MinTotalDisks {
		return nil, fmt.Errorf("insufficient disks for EC placement: need %d, have %d (considering pending/active tasks)", erasure_coding.MinTotalDisks, len(availableDisks))
	}

	// Select the best disks for EC placement with rack/DC diversity
	selectedDisks := selectBestECDestinations(availableDisks, sourceRack, sourceDC, erasure_coding.TotalShardsCount)
	if len(selectedDisks) < erasure_coding.MinTotalDisks {
		return nil, fmt.Errorf("found %d disks, but could not find %d suitable destinations for EC placement", len(selectedDisks), erasure_coding.MinTotalDisks)
	}

	var plans []*topology.DestinationPlan
	rackCount := make(map[string]int)
	dcCount := make(map[string]int)
	for _, disk := range selectedDisks {
		plan := &topology.DestinationPlan{
			TargetNode:     disk.NodeID,
			TargetDisk:     disk.DiskID,
			TargetRack:     disk.Rack,
			TargetDC:       disk.DataCenter,
			ExpectedSize:   expectedShardSize, // Calculated EC shard size
			PlacementScore: calculateECScore(disk, sourceRack, sourceDC),
			Conflicts:      checkECPlacementConflicts(disk, sourceRack, sourceDC),
		}
		plans = append(plans, plan)

		// Count rack and DC diversity
		rackKey := fmt.Sprintf("%s:%s", disk.DataCenter, disk.Rack)
		rackCount[rackKey]++
		dcCount[disk.DataCenter]++
	}

	// Log capacity utilization information using ActiveTopology's encapsulated logic
	totalEffectiveCapacity := int64(0)
	for _, plan := range plans {
		effectiveCapacity := activeTopology.GetEffectiveAvailableCapacity(plan.TargetNode, plan.TargetDisk)
		totalEffectiveCapacity += effectiveCapacity
	}

	glog.V(1).Infof("Planned EC destinations for volume %d (size=%d bytes): expected shard size=%d bytes, %d shards across %d racks, %d DCs, total effective capacity=%d slots",
		metric.VolumeID, metric.Size, expectedShardSize, len(plans), len(rackCount), len(dcCount), totalEffectiveCapacity)

	// Log storage impact for the EC task (source only - EC has multiple targets handled individually)
	sourceChange, _ := topology.CalculateTaskStorageImpact(topology.TaskTypeErasureCoding, int64(metric.Size))
	glog.V(2).Infof("EC task capacity management: source_reserves_with_zero_impact={VolumeSlots:%d, ShardSlots:%d}, %d_targets_will_receive_shards, estimated_size=%d",
		sourceChange.VolumeSlots, sourceChange.ShardSlots, len(plans), metric.Size)
	glog.V(2).Infof("EC source reserves capacity but with zero StorageSlotChange impact")

	return &topology.MultiDestinationPlan{
		Plans:          plans,
		TotalShards:    len(plans),
		SuccessfulRack: len(rackCount),
		SuccessfulDCs:  len(dcCount),
	}, nil
}

// createECTaskParams creates EC task parameters from the multi-destination plan
func createECTaskParams(multiPlan *topology.MultiDestinationPlan) *worker_pb.ErasureCodingTaskParams {
	var destinations []*worker_pb.ECDestination
	for _, plan := range multiPlan.Plans {
		destination := &worker_pb.ECDestination{
			Node:           plan.TargetNode,
			DiskId:         plan.TargetDisk,
			Rack:           plan.TargetRack,
			DataCenter:     plan.TargetDC,
			PlacementScore: plan.PlacementScore,
		}
		destinations = append(destinations, destination)
	}

	// Collect placement conflicts from all destinations
	var placementConflicts []string
	for _, plan := range multiPlan.Plans {
		placementConflicts = append(placementConflicts, plan.Conflicts...)
	}

	return &worker_pb.ErasureCodingTaskParams{
		Destinations:       destinations,
		DataShards:         erasure_coding.DataShardsCount,   // Standard data shards
		ParityShards:       erasure_coding.ParityShardsCount, // Standard parity shards
		PlacementConflicts: placementConflicts,
	}
}

// selectBestECDestinations selects multiple disks for EC shard placement with diversity.
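// Example: with shardsNeeded=14 (a 10+4 layout) and disks spread across 5 racks,
// the first pass picks one disk per rack (5 shards) and the second pass fills the
// remaining 9 slots from the already-used racks.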
func selectBestECDestinations(disks []*topology.DiskInfo, sourceRack, sourceDC string, shardsNeeded int) []*topology.DiskInfo {
	if len(disks) == 0 {
		return nil
	}

	// Group disks by rack and DC for diversity
	rackGroups := make(map[string][]*topology.DiskInfo)
	for _, disk := range disks {
		rackKey := fmt.Sprintf("%s:%s", disk.DataCenter, disk.Rack)
		rackGroups[rackKey] = append(rackGroups[rackKey], disk)
	}

	var selected []*topology.DiskInfo
	usedRacks := make(map[string]bool)

	// First pass: select one disk from each rack for maximum diversity
	for rackKey, rackDisks := range rackGroups {
		if len(selected) >= shardsNeeded {
			break
		}
		// Select the best disk from this rack
		bestDisk := selectBestFromRack(rackDisks, sourceRack, sourceDC)
		if bestDisk != nil {
			selected = append(selected, bestDisk)
			usedRacks[rackKey] = true
		}
	}

	// Second pass: if more disks are needed, select additional disks from racks already used
	if len(selected) < shardsNeeded {
		for _, disk := range disks {
			if len(selected) >= shardsNeeded {
				break
			}
			// Skip if already selected
			alreadySelected := false
			for _, sel := range selected {
				if sel.NodeID == disk.NodeID && sel.DiskID == disk.DiskID {
					alreadySelected = true
					break
				}
			}
			if !alreadySelected && isDiskSuitableForEC(disk) {
				selected = append(selected, disk)
			}
		}
	}

	return selected
}

// selectBestFromRack selects the best disk from a rack for EC placement
func selectBestFromRack(disks []*topology.DiskInfo, sourceRack, sourceDC string) *topology.DiskInfo {
	if len(disks) == 0 {
		return nil
	}

	var bestDisk *topology.DiskInfo
	bestScore := -1.0
	for _, disk := range disks {
		if !isDiskSuitableForEC(disk) {
			continue
		}
		score := calculateECScore(disk, sourceRack, sourceDC)
		if score > bestScore {
			bestScore = score
			bestDisk = disk
		}
	}

	return bestDisk
}

// calculateECScore calculates the placement score for EC operations
func calculateECScore(disk *topology.DiskInfo, sourceRack, sourceDC string) float64 {
	if disk.DiskInfo == nil {
		return 0.0
	}

	score := 0.0

	// Prefer disks with available capacity
	if disk.DiskInfo.MaxVolumeCount > 0 {
		utilization := float64(disk.DiskInfo.VolumeCount) / float64(disk.DiskInfo.MaxVolumeCount)
		score += (1.0 - utilization) * 50.0 // Up to 50 points for available capacity
	}

	// Prefer different racks for better distribution
	if disk.Rack != sourceRack {
		score += 30.0
	}

	// Prefer different data centers for better distribution
	if disk.DataCenter != sourceDC {
		score += 20.0
	}

	// Consider current load
	score += (10.0 - float64(disk.LoadCount)) // Up to 10 points for low load
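	// Worked example: a disk at 40% utilization (VolumeCount=40, MaxVolumeCount=100)
	// on a different rack in the same DC with LoadCount=2 scores
	// (1-0.4)*50 + 30 + 0 + (10-2) = 68.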
	return score
}

// isDiskSuitableForEC checks whether a disk is suitable for EC placement
func isDiskSuitableForEC(disk *topology.DiskInfo) bool {
	if disk.DiskInfo == nil {
		return false
	}

	// Check that the disk is not overloaded with tasks
	if disk.LoadCount > topology.MaxTaskLoadForECPlacement {
		return false
	}

	return true
}

// checkECPlacementConflicts checks for placement rule conflicts in EC operations
func checkECPlacementConflicts(disk *topology.DiskInfo, sourceRack, sourceDC string) []string {
	var conflicts []string

	// For EC, being on the same rack as the source is often acceptable,
	// but we note it as a potential conflict for monitoring.
	if disk.Rack == sourceRack && disk.DataCenter == sourceDC {
		conflicts = append(conflicts, "same_rack_as_source")
	}

	return conflicts
}

// findVolumeReplicaLocations finds all replica locations (server + disk) for the specified volume.
// Uses O(1) indexed lookup for optimal performance on large clusters.
func findVolumeReplicaLocations(activeTopology *topology.ActiveTopology, volumeID uint32, collection string) []topology.VolumeReplica {
	if activeTopology == nil {
		return nil
	}
	return activeTopology.GetVolumeLocations(volumeID, collection)
}

// findExistingECShards finds existing EC shards for a volume (from previous failed EC attempts).
// Uses O(1) indexed lookup for optimal performance on large clusters.
func findExistingECShards(activeTopology *topology.ActiveTopology, volumeID uint32, collection string) []topology.VolumeReplica {
	if activeTopology == nil {
		return nil
	}
	return activeTopology.GetECShardLocations(volumeID, collection)
}

// findVolumeReplicas finds all servers that have replicas of the specified volume
func findVolumeReplicas(activeTopology *topology.ActiveTopology, volumeID uint32, collection string) []string {
	if activeTopology == nil {
		return []string{}
	}

	topologyInfo := activeTopology.GetTopologyInfo()
	if topologyInfo == nil {
		return []string{}
	}

	var replicaServers []string

	// Iterate through all nodes to find volume replicas
	for _, dc := range topologyInfo.DataCenterInfos {
		for _, rack := range dc.RackInfos {
			for _, nodeInfo := range rack.DataNodeInfos {
				for _, diskInfo := range nodeInfo.DiskInfos {
					for _, volumeInfo := range diskInfo.VolumeInfos {
						if volumeInfo.Id == volumeID && volumeInfo.Collection == collection {
							replicaServers = append(replicaServers, nodeInfo.Id)
							break // Found the volume on this disk, move to the next disk
						}
					}
				}
			}
		}
	}

	return replicaServers
}