Shell: support regular expression for collection selection (#7158)

* support regular expression for collection selection

* refactor

* ordering

* fix exact match

* Update command_volume_balance_test.go

* simplify

* Update command_volume_balance.go

* comment
This commit is contained in:
Chris Lu
2025-08-23 11:04:24 -07:00
committed by GitHub
parent a367c39967
commit 41aedaa687
7 changed files with 111 additions and 30 deletions

View File

@@ -5,6 +5,7 @@ import (
"errors" "errors"
"fmt" "fmt"
"math/rand/v2" "math/rand/v2"
"regexp"
"slices" "slices"
"sort" "sort"
"time" "time"
@@ -1054,3 +1055,13 @@ func EcBalance(commandEnv *CommandEnv, collections []string, dc string, ecReplic
return nil return nil
} }
// compileCollectionPattern compiles a regex pattern for collection matching.
// Empty patterns match empty collections only.
func compileCollectionPattern(pattern string) (*regexp.Regexp, error) {
if pattern == "" {
// empty pattern matches empty collection
return regexp.Compile("^$")
}
return regexp.Compile(pattern)
}

View File

@@ -34,6 +34,11 @@ func (c *commandEcDecode) Help() string {
ec.decode [-collection=""] [-volumeId=<volume_id>] ec.decode [-collection=""] [-volumeId=<volume_id>]
The -collection parameter supports regular expressions for pattern matching:
- Use exact match: ec.decode -collection="^mybucket$"
- Match multiple buckets: ec.decode -collection="bucket.*"
- Match all collections: ec.decode -collection=".*"
` `
} }
@@ -67,8 +72,11 @@ func (c *commandEcDecode) Do(args []string, commandEnv *CommandEnv, writer io.Wr
} }
// apply to all volumes in the collection // apply to all volumes in the collection
volumeIds := collectEcShardIds(topologyInfo, *collection) volumeIds, err := collectEcShardIds(topologyInfo, *collection)
fmt.Printf("ec encode volumes: %v\n", volumeIds) if err != nil {
return err
}
fmt.Printf("ec decode volumes: %v\n", volumeIds)
for _, vid := range volumeIds { for _, vid := range volumeIds {
if err = doEcDecode(commandEnv, topologyInfo, *collection, vid); err != nil { if err = doEcDecode(commandEnv, topologyInfo, *collection, vid); err != nil {
return err return err
@@ -240,13 +248,18 @@ func lookupVolumeIds(commandEnv *CommandEnv, volumeIds []string) (volumeIdLocati
return resp.VolumeIdLocations, nil return resp.VolumeIdLocations, nil
} }
func collectEcShardIds(topoInfo *master_pb.TopologyInfo, selectedCollection string) (vids []needle.VolumeId) { func collectEcShardIds(topoInfo *master_pb.TopologyInfo, collectionPattern string) (vids []needle.VolumeId, err error) {
// compile regex pattern for collection matching
collectionRegex, err := compileCollectionPattern(collectionPattern)
if err != nil {
return nil, fmt.Errorf("invalid collection pattern '%s': %v", collectionPattern, err)
}
vidMap := make(map[uint32]bool) vidMap := make(map[uint32]bool)
eachDataNode(topoInfo, func(dc DataCenterId, rack RackId, dn *master_pb.DataNodeInfo) { eachDataNode(topoInfo, func(dc DataCenterId, rack RackId, dn *master_pb.DataNodeInfo) {
if diskInfo, found := dn.DiskInfos[string(types.HardDriveType)]; found { if diskInfo, found := dn.DiskInfos[string(types.HardDriveType)]; found {
for _, v := range diskInfo.EcShardInfos { for _, v := range diskInfo.EcShardInfos {
if v.Collection == selectedCollection { if collectionRegex.MatchString(v.Collection) {
vidMap[v.Id] = true vidMap[v.Id] = true
} }
} }

View File

@@ -5,6 +5,7 @@ import (
"flag" "flag"
"fmt" "fmt"
"io" "io"
"sort"
"time" "time"
"github.com/seaweedfs/seaweedfs/weed/storage/types" "github.com/seaweedfs/seaweedfs/weed/storage/types"
@@ -53,6 +54,11 @@ func (c *commandEcEncode) Help() string {
If you only have less than 4 volume servers, with erasure coding, at least you can afford to If you only have less than 4 volume servers, with erasure coding, at least you can afford to
have 4 corrupted shard files. have 4 corrupted shard files.
The -collection parameter supports regular expressions for pattern matching:
- Use exact match: ec.encode -collection="^mybucket$"
- Match multiple buckets: ec.encode -collection="bucket.*"
- Match all collections: ec.encode -collection=".*"
Options: Options:
-verbose: show detailed reasons why volumes are not selected for encoding -verbose: show detailed reasons why volumes are not selected for encoding
@@ -112,12 +118,11 @@ func (c *commandEcEncode) Do(args []string, commandEnv *CommandEnv, writer io.Wr
volumeIds = append(volumeIds, vid) volumeIds = append(volumeIds, vid)
balanceCollections = collectCollectionsForVolumeIds(topologyInfo, volumeIds) balanceCollections = collectCollectionsForVolumeIds(topologyInfo, volumeIds)
} else { } else {
// apply to all volumes for the given collection // apply to all volumes for the given collection pattern (regex)
volumeIds, err = collectVolumeIdsForEcEncode(commandEnv, *collection, nil, *fullPercentage, *quietPeriod, *verbose) volumeIds, balanceCollections, err = collectVolumeIdsForEcEncode(commandEnv, *collection, nil, *fullPercentage, *quietPeriod, *verbose)
if err != nil { if err != nil {
return err return err
} }
balanceCollections = []string{*collection}
} }
// Collect volume locations BEFORE EC encoding starts to avoid race condition // Collect volume locations BEFORE EC encoding starts to avoid race condition
@@ -270,7 +275,13 @@ func generateEcShards(grpcDialOption grpc.DialOption, volumeId needle.VolumeId,
} }
func collectVolumeIdsForEcEncode(commandEnv *CommandEnv, selectedCollection string, sourceDiskType *types.DiskType, fullPercentage float64, quietPeriod time.Duration, verbose bool) (vids []needle.VolumeId, err error) { func collectVolumeIdsForEcEncode(commandEnv *CommandEnv, collectionPattern string, sourceDiskType *types.DiskType, fullPercentage float64, quietPeriod time.Duration, verbose bool) (vids []needle.VolumeId, matchedCollections []string, err error) {
// compile regex pattern for collection matching
collectionRegex, err := compileCollectionPattern(collectionPattern)
if err != nil {
return nil, nil, fmt.Errorf("invalid collection pattern '%s': %v", collectionPattern, err)
}
// collect topology information // collect topology information
topologyInfo, volumeSizeLimitMb, err := collectTopologyInfo(commandEnv, 0) topologyInfo, volumeSizeLimitMb, err := collectTopologyInfo(commandEnv, 0)
if err != nil { if err != nil {
@@ -280,7 +291,7 @@ func collectVolumeIdsForEcEncode(commandEnv *CommandEnv, selectedCollection stri
quietSeconds := int64(quietPeriod / time.Second) quietSeconds := int64(quietPeriod / time.Second)
nowUnixSeconds := time.Now().Unix() nowUnixSeconds := time.Now().Unix()
fmt.Printf("collect volumes quiet for: %d seconds and %.1f%% full\n", quietSeconds, fullPercentage) fmt.Printf("collect volumes with collection pattern '%s', quiet for: %d seconds and %.1f%% full\n", collectionPattern, quietSeconds, fullPercentage)
// Statistics for verbose mode // Statistics for verbose mode
var ( var (
@@ -294,6 +305,7 @@ func collectVolumeIdsForEcEncode(commandEnv *CommandEnv, selectedCollection stri
) )
vidMap := make(map[uint32]bool) vidMap := make(map[uint32]bool)
collectionSet := make(map[string]bool)
eachDataNode(topologyInfo, func(dc DataCenterId, rack RackId, dn *master_pb.DataNodeInfo) { eachDataNode(topologyInfo, func(dc DataCenterId, rack RackId, dn *master_pb.DataNodeInfo) {
for _, diskInfo := range dn.DiskInfos { for _, diskInfo := range dn.DiskInfos {
for _, v := range diskInfo.VolumeInfos { for _, v := range diskInfo.VolumeInfos {
@@ -309,16 +321,19 @@ func collectVolumeIdsForEcEncode(commandEnv *CommandEnv, selectedCollection stri
continue continue
} }
// check collection // check collection against regex pattern
if v.Collection != selectedCollection { if !collectionRegex.MatchString(v.Collection) {
wrongCollection++ wrongCollection++
if verbose { if verbose {
fmt.Printf("skip volume %d on %s: wrong collection (expected: %s, actual: %s)\n", fmt.Printf("skip volume %d on %s: collection doesn't match pattern (pattern: %s, actual: %s)\n",
v.Id, dn.Id, selectedCollection, v.Collection) v.Id, dn.Id, collectionPattern, v.Collection)
} }
continue continue
} }
// track matched collection
collectionSet[v.Collection] = true
// check disk type // check disk type
if sourceDiskType != nil && types.ToDiskType(v.DiskType) != *sourceDiskType { if sourceDiskType != nil && types.ToDiskType(v.DiskType) != *sourceDiskType {
wrongDiskType++ wrongDiskType++
@@ -393,11 +408,18 @@ func collectVolumeIdsForEcEncode(commandEnv *CommandEnv, selectedCollection stri
} }
} }
// Convert collection set to slice
for collection := range collectionSet {
matchedCollections = append(matchedCollections, collection)
}
sort.Strings(matchedCollections)
// Print summary statistics in verbose mode or when no volumes selected // Print summary statistics in verbose mode or when no volumes selected
if verbose || len(vids) == 0 { if verbose || len(vids) == 0 {
fmt.Printf("\nVolume selection summary:\n") fmt.Printf("\nVolume selection summary:\n")
fmt.Printf(" Total volumes examined: %d\n", totalVolumes) fmt.Printf(" Total volumes examined: %d\n", totalVolumes)
fmt.Printf(" Selected for encoding: %d\n", len(vids)) fmt.Printf(" Selected for encoding: %d\n", len(vids))
fmt.Printf(" Collections matched: %v\n", matchedCollections)
if totalVolumes > 0 { if totalVolumes > 0 {
fmt.Printf("\nReasons for exclusion:\n") fmt.Printf("\nReasons for exclusion:\n")
@@ -405,7 +427,7 @@ func collectVolumeIdsForEcEncode(commandEnv *CommandEnv, selectedCollection stri
fmt.Printf(" Remote volumes: %d\n", remoteVolumes) fmt.Printf(" Remote volumes: %d\n", remoteVolumes)
} }
if wrongCollection > 0 { if wrongCollection > 0 {
fmt.Printf(" Wrong collection: %d\n", wrongCollection) fmt.Printf(" Collection doesn't match pattern: %d\n", wrongCollection)
} }
if wrongDiskType > 0 { if wrongDiskType > 0 {
fmt.Printf(" Wrong disk type: %d\n", wrongDiskType) fmt.Printf(" Wrong disk type: %d\n", wrongDiskType)

View File

@@ -6,6 +6,7 @@ import (
"fmt" "fmt"
"io" "io"
"os" "os"
"regexp"
"strings" "strings"
"time" "time"
@@ -40,6 +41,14 @@ func (c *commandVolumeBalance) Help() string {
volume.balance [-collection ALL_COLLECTIONS|EACH_COLLECTION|<collection_name>] [-force] [-dataCenter=<data_center_name>] [-racks=rack_name_one,rack_name_two] [-nodes=192.168.0.1:8080,192.168.0.2:8080] volume.balance [-collection ALL_COLLECTIONS|EACH_COLLECTION|<collection_name>] [-force] [-dataCenter=<data_center_name>] [-racks=rack_name_one,rack_name_two] [-nodes=192.168.0.1:8080,192.168.0.2:8080]
The -collection parameter supports:
- ALL_COLLECTIONS: balance across all collections
- EACH_COLLECTION: balance each collection separately
- Regular expressions for pattern matching:
* Use exact match: volume.balance -collection="^mybucket$"
* Match multiple buckets: volume.balance -collection="bucket.*"
* Match all user collections: volume.balance -collection="user-.*"
Algorithm: Algorithm:
For each type of volume server (different max volume count limit){ For each type of volume server (different max volume count limit){
@@ -118,12 +127,23 @@ func (c *commandVolumeBalance) Do(args []string, commandEnv *CommandEnv, writer
return err return err
} }
for _, col := range collections { for _, col := range collections {
if err = c.balanceVolumeServers(diskTypes, volumeReplicas, volumeServers, col); err != nil { // Use direct string comparison for exact match (more efficient than regex)
if err = c.balanceVolumeServers(diskTypes, volumeReplicas, volumeServers, nil, col); err != nil {
return err return err
} }
} }
} else if *collection == "ALL_COLLECTIONS" {
// Pass nil pattern for all collections
if err = c.balanceVolumeServers(diskTypes, volumeReplicas, volumeServers, nil, *collection); err != nil {
return err
}
} else { } else {
if err = c.balanceVolumeServers(diskTypes, volumeReplicas, volumeServers, *collection); err != nil { // Compile user-provided pattern
collectionPattern, err := compileCollectionPattern(*collection)
if err != nil {
return fmt.Errorf("invalid collection pattern '%s': %v", *collection, err)
}
if err = c.balanceVolumeServers(diskTypes, volumeReplicas, volumeServers, collectionPattern, *collection); err != nil {
return err return err
} }
} }
@@ -131,24 +151,29 @@ func (c *commandVolumeBalance) Do(args []string, commandEnv *CommandEnv, writer
return nil return nil
} }
func (c *commandVolumeBalance) balanceVolumeServers(diskTypes []types.DiskType, volumeReplicas map[uint32][]*VolumeReplica, nodes []*Node, collection string) error { func (c *commandVolumeBalance) balanceVolumeServers(diskTypes []types.DiskType, volumeReplicas map[uint32][]*VolumeReplica, nodes []*Node, collectionPattern *regexp.Regexp, collectionName string) error {
for _, diskType := range diskTypes { for _, diskType := range diskTypes {
if err := c.balanceVolumeServersByDiskType(diskType, volumeReplicas, nodes, collection); err != nil { if err := c.balanceVolumeServersByDiskType(diskType, volumeReplicas, nodes, collectionPattern, collectionName); err != nil {
return err return err
} }
} }
return nil return nil
} }
func (c *commandVolumeBalance) balanceVolumeServersByDiskType(diskType types.DiskType, volumeReplicas map[uint32][]*VolumeReplica, nodes []*Node, collection string) error { func (c *commandVolumeBalance) balanceVolumeServersByDiskType(diskType types.DiskType, volumeReplicas map[uint32][]*VolumeReplica, nodes []*Node, collectionPattern *regexp.Regexp, collectionName string) error {
for _, n := range nodes { for _, n := range nodes {
n.selectVolumes(func(v *master_pb.VolumeInformationMessage) bool { n.selectVolumes(func(v *master_pb.VolumeInformationMessage) bool {
if collection != "ALL_COLLECTIONS" { if collectionName != "ALL_COLLECTIONS" {
if v.Collection != collection { if collectionPattern != nil {
return false // Use regex pattern matching
if !collectionPattern.MatchString(v.Collection) {
return false
}
} else {
// Use exact string matching (for EACH_COLLECTION)
if v.Collection != collectionName {
return false
}
} }
} }
if v.DiskType != string(diskType) { if v.DiskType != string(diskType) {

View File

@@ -256,7 +256,7 @@ func TestBalance(t *testing.T) {
volumeReplicas, _ := collectVolumeReplicaLocations(topologyInfo) volumeReplicas, _ := collectVolumeReplicaLocations(topologyInfo)
diskTypes := collectVolumeDiskTypes(topologyInfo) diskTypes := collectVolumeDiskTypes(topologyInfo)
c := &commandVolumeBalance{} c := &commandVolumeBalance{}
if err := c.balanceVolumeServers(diskTypes, volumeReplicas, volumeServers, "ALL_COLLECTIONS"); err != nil { if err := c.balanceVolumeServers(diskTypes, volumeReplicas, volumeServers, nil, "ALL_COLLECTIONS"); err != nil {
t.Errorf("balance: %v", err) t.Errorf("balance: %v", err)
} }

View File

@@ -33,6 +33,11 @@ func (c *commandVolumeTierDownload) Help() string {
volume.tier.download [-collection=""] volume.tier.download [-collection=""]
volume.tier.download [-collection=""] -volumeId=<volume_id> volume.tier.download [-collection=""] -volumeId=<volume_id>
The -collection parameter supports regular expressions for pattern matching:
- Use exact match: volume.tier.download -collection="^mybucket$"
- Match multiple buckets: volume.tier.download -collection="bucket.*"
- Match all collections: volume.tier.download -collection=".*"
e.g.: e.g.:
volume.tier.download -volumeId=7 volume.tier.download -volumeId=7
@@ -73,7 +78,7 @@ func (c *commandVolumeTierDownload) Do(args []string, commandEnv *CommandEnv, wr
// apply to all volumes in the collection // apply to all volumes in the collection
// reusing collectVolumeIdsForEcEncode for now // reusing collectVolumeIdsForEcEncode for now
volumeIds := collectRemoteVolumes(topologyInfo, *collection) volumeIds, err := collectRemoteVolumes(topologyInfo, *collection)
if err != nil { if err != nil {
return err return err
} }
@@ -87,13 +92,18 @@ func (c *commandVolumeTierDownload) Do(args []string, commandEnv *CommandEnv, wr
return nil return nil
} }
func collectRemoteVolumes(topoInfo *master_pb.TopologyInfo, selectedCollection string) (vids []needle.VolumeId) { func collectRemoteVolumes(topoInfo *master_pb.TopologyInfo, collectionPattern string) (vids []needle.VolumeId, err error) {
// compile regex pattern for collection matching
collectionRegex, err := compileCollectionPattern(collectionPattern)
if err != nil {
return nil, fmt.Errorf("invalid collection pattern '%s': %v", collectionPattern, err)
}
vidMap := make(map[uint32]bool) vidMap := make(map[uint32]bool)
eachDataNode(topoInfo, func(dc DataCenterId, rack RackId, dn *master_pb.DataNodeInfo) { eachDataNode(topoInfo, func(dc DataCenterId, rack RackId, dn *master_pb.DataNodeInfo) {
for _, diskInfo := range dn.DiskInfos { for _, diskInfo := range dn.DiskInfos {
for _, v := range diskInfo.VolumeInfos { for _, v := range diskInfo.VolumeInfos {
if v.Collection == selectedCollection && v.RemoteStorageKey != "" && v.RemoteStorageName != "" { if collectionRegex.MatchString(v.Collection) && v.RemoteStorageKey != "" && v.RemoteStorageName != "" {
vidMap[v.Id] = true vidMap[v.Id] = true
} }
} }

View File

@@ -98,7 +98,7 @@ func (c *commandVolumeTierUpload) Do(args []string, commandEnv *CommandEnv, writ
// apply to all volumes in the collection // apply to all volumes in the collection
// reusing collectVolumeIdsForEcEncode for now // reusing collectVolumeIdsForEcEncode for now
volumeIds, err := collectVolumeIdsForEcEncode(commandEnv, *collection, diskType, *fullPercentage, *quietPeriod, false) volumeIds, _, err := collectVolumeIdsForEcEncode(commandEnv, *collection, diskType, *fullPercentage, *quietPeriod, false)
if err != nil { if err != nil {
return err return err
} }