mirror of
https://github.com/seaweedfs/seaweedfs.git
synced 2025-11-08 02:34:46 +08:00
The volume.check.disk command could get stuck in an infinite loop when syncing replicas that have persistent discrepancies that cannot be resolved. This happened because the sync loop had no maximum iteration limit and no way to detect that progress had stopped.

Issues fixed:
1. Infinite loop: added a maxIterations limit (5) to prevent endless looping.
2. Progress detection: detect when the hasChanges state doesn't change between iterations, which indicates that the sync is stuck.
3. Return value bug: fixed a naked return statement that was returning zero values instead of the actual hasChanges value, causing incorrect loop termination logic.

Changes:
- Added a maximum iteration limit with clear error messages
- Added progress detection to identify stuck sync situations
- Fixed the return statement to properly return hasChanges and the error
- Added verbose logging for sync iterations

The fix ensures that:
- Sync will terminate after at most 5 iterations
- Users get clear messages about why the sync stopped
- The hasChanges logic properly reflects deletion sync results

Fixes #7307
This commit is contained in:
@@ -183,11 +183,34 @@ func (c *commandVolumeCheckDisk) Do(args []string, commandEnv *CommandEnv, write
|
||||
|
||||
func (c *commandVolumeCheckDisk) syncTwoReplicas(a *VolumeReplica, b *VolumeReplica, applyChanges bool, doSyncDeletions bool, nonRepairThreshold float64, verbose bool) (err error) {
|
||||
aHasChanges, bHasChanges := true, true
|
||||
for aHasChanges || bHasChanges {
|
||||
const maxIterations = 5
|
||||
iteration := 0
|
||||
|
||||
for (aHasChanges || bHasChanges) && iteration < maxIterations {
|
||||
iteration++
|
||||
if verbose {
|
||||
fmt.Fprintf(c.writer, "sync iteration %d for volume %d\n", iteration, a.info.Id)
|
||||
}
|
||||
|
||||
prevAHasChanges, prevBHasChanges := aHasChanges, bHasChanges
|
||||
if aHasChanges, bHasChanges, err = c.checkBoth(a, b, applyChanges, doSyncDeletions, nonRepairThreshold, verbose); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// Detect if we're stuck in a loop with no progress
|
||||
if iteration > 1 && prevAHasChanges == aHasChanges && prevBHasChanges == bHasChanges && (aHasChanges || bHasChanges) {
|
||||
fmt.Fprintf(c.writer, "volume %d sync is not making progress between %s and %s after iteration %d, stopping to prevent infinite loop\n",
|
||||
a.info.Id, a.location.dataNode.Id, b.location.dataNode.Id, iteration)
|
||||
return fmt.Errorf("sync not making progress after %d iterations", iteration)
|
||||
}
|
||||
}
|
||||
|
||||
if iteration >= maxIterations && (aHasChanges || bHasChanges) {
|
||||
fmt.Fprintf(c.writer, "volume %d sync reached maximum iterations (%d) between %s and %s, may need manual intervention\n",
|
||||
a.info.Id, maxIterations, a.location.dataNode.Id, b.location.dataNode.Id)
|
||||
return fmt.Errorf("reached maximum sync iterations (%d)", maxIterations)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -307,11 +330,10 @@ func doVolumeCheckDisk(minuend, subtrahend *needle_map.MemDb, source, target *Vo
|
||||
for _, deleteResult := range deleteResults {
|
||||
if deleteResult.Status == http.StatusAccepted && deleteResult.Size > 0 {
|
||||
hasChanges = true
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
return
|
||||
return hasChanges, nil
|
||||
}
|
||||
|
||||
func readSourceNeedleBlob(grpcDialOption grpc.DialOption, sourceVolumeServer pb.ServerAddress, volumeId uint32, needleValue needle_map.NeedleValue) (needleBlob []byte, err error) {
|
||||
|
||||
Reference in New Issue
Block a user