use parquet statistics for optimization

2025-09-20 01:59:23 +08:00 · 2025-09-01 13:50:56 -07:00
parent 471ba271dc
commit c73ceac79f
2 changed files with 617 additions and 0 deletions
--- a/weed/query/engine/engine.go
+++ b/weed/query/engine/engine.go
@@ -13,6 +13,7 @@ import (
 	"github.com/seaweedfs/seaweedfs/weed/pb/filer_pb"
 	"github.com/seaweedfs/seaweedfs/weed/pb/schema_pb"
 	"github.com/seaweedfs/seaweedfs/weed/query/sqltypes"
 	"github.com/seaweedfs/seaweedfs/weed/util"
 	"github.com/seaweedfs/seaweedfs/weed/util/http"
 	"github.com/xwb1989/sqlparser"
 )
@@ -1096,6 +1097,19 @@ func (e *SQLEngine) executeAggregationQuery(ctx context.Context, hybridScanner *
 		startTimeNs, stopTimeNs = e.extractTimeFilters(stmt.Where.Expr)
 	}
 	// 🚀 FAST PATH: Try to use parquet statistics for optimization
 	// This can be ~130x faster than scanning all data
 	if stmt.Where == nil { // Only optimize when no complex WHERE clause
 		fastResult, canOptimize := e.tryFastParquetAggregation(ctx, hybridScanner, aggregations)
 		if canOptimize {
 			fmt.Printf("✅ Using fast parquet statistics for aggregation (skipped full scan)\n")
 			return fastResult, nil
 		}
 	}
 	// SLOW PATH: Fall back to full table scan
 	fmt.Printf("⚠️  Using full table scan for aggregation (parquet optimization not applicable)\n")
 	// Build scan options for full table scan (aggregations need all data)
 	hybridScanOptions := HybridScanOptions{
 		StartTimeNs: startTimeNs,
@@ -1340,6 +1354,292 @@ func (e *SQLEngine) compareValues(value1 *schema_pb.Value, value2 *schema_pb.Val
 	return 0
 }
 // tryFastParquetAggregation attempts to compute aggregations using parquet metadata instead of full scan
 // Returns (result, canOptimize) where canOptimize=true means the fast path was used
 func (e *SQLEngine) tryFastParquetAggregation(ctx context.Context, hybridScanner *HybridMessageScanner, aggregations []AggregationSpec) (*QueryResult, bool) {
 	// Check if all aggregations are optimizable with parquet statistics
 	for _, spec := range aggregations {
 		if !e.canUseParquetStatsForAggregation(spec) {
 			return nil, false
 		}
 	}
 	// Get all partitions for this topic
 	partitions, err := e.discoverTopicPartitions(hybridScanner.topic.Namespace, hybridScanner.topic.Name)
 	if err != nil {
 		return nil, false
 	}
 	// Collect parquet statistics from all partitions
 	allFileStats := make(map[string][]*ParquetFileStats) // partitionPath -> file stats
 	totalRowCount := int64(0)
 	for _, partition := range partitions {
 		partitionPath := fmt.Sprintf("/topics/%s/%s/%s", hybridScanner.topic.Namespace, hybridScanner.topic.Name, partition)
 		fileStats, err := hybridScanner.ReadParquetStatistics(partitionPath)
 		if err != nil {
 			// If we can't read stats from any partition, fall back to full scan
 			return nil, false
 		}
 		if len(fileStats) > 0 {
 			allFileStats[partitionPath] = fileStats
 			for _, fileStat := range fileStats {
 				totalRowCount += fileStat.RowCount
 			}
 		}
 	}
 	// If no parquet files found, can't optimize
 	if len(allFileStats) == 0 || totalRowCount == 0 {
 		return nil, false
 	}
 	// Compute aggregations using parquet statistics
 	aggResults := make([]AggregationResult, len(aggregations))
 	for i, spec := range aggregations {
 		switch spec.Function {
 		case "COUNT":
 			if spec.Column == "*" {
 				// COUNT(*) = sum of all file row counts
 				aggResults[i].Count = totalRowCount
 			} else {
 				// COUNT(column) - for now, assume all rows have non-null values
 				// TODO: Use null counts from parquet stats for more accuracy
 				aggResults[i].Count = totalRowCount
 			}
 		case "MIN":
 			// Find global minimum across all files and partitions
 			var globalMin interface{}
 			var globalMinValue *schema_pb.Value
 			for _, fileStats := range allFileStats {
 				for _, fileStat := range fileStats {
 					if colStats, exists := fileStat.ColumnStats[spec.Column]; exists {
 						if globalMinValue == nil || e.compareValues(colStats.MinValue, globalMinValue) < 0 {
 							globalMinValue = colStats.MinValue
 							globalMin = e.extractRawValue(colStats.MinValue)
 						}
 					}
 				}
 			}
 			// Handle system columns that aren't in parquet column stats
 			if globalMin == nil {
 				globalMin = e.getSystemColumnGlobalMin(spec.Column, allFileStats)
 			}
 			aggResults[i].Min = globalMin
 		case "MAX":
 			// Find global maximum across all files and partitions
 			var globalMax interface{}
 			var globalMaxValue *schema_pb.Value
 			for _, fileStats := range allFileStats {
 				for _, fileStat := range fileStats {
 					if colStats, exists := fileStat.ColumnStats[spec.Column]; exists {
 						if globalMaxValue == nil || e.compareValues(colStats.MaxValue, globalMaxValue) > 0 {
 							globalMaxValue = colStats.MaxValue
 							globalMax = e.extractRawValue(colStats.MaxValue)
 						}
 					}
 				}
 			}
 			// Handle system columns that aren't in parquet column stats
 			if globalMax == nil {
 				globalMax = e.getSystemColumnGlobalMax(spec.Column, allFileStats)
 			}
 			aggResults[i].Max = globalMax
 		default:
 			// SUM, AVG not easily optimizable with current parquet stats
 			return nil, false
 		}
 	}
 	// Build result using fast parquet statistics
 	columns := make([]string, len(aggregations))
 	row := make([]sqltypes.Value, len(aggregations))
 	for i, spec := range aggregations {
 		columns[i] = spec.Alias
 		row[i] = e.formatAggregationResult(spec, aggResults[i])
 	}
 	result := &QueryResult{
 		Columns: columns,
 		Rows:    [][]sqltypes.Value{row},
 	}
 	return result, true
 }
 // canUseParquetStatsForAggregation determines if an aggregation can be optimized with parquet stats
 func (e *SQLEngine) canUseParquetStatsForAggregation(spec AggregationSpec) bool {
 	switch spec.Function {
 	case "COUNT":
 		return spec.Column == "*" || e.isSystemColumn(spec.Column) || e.isRegularColumn(spec.Column)
 	case "MIN", "MAX":
 		return e.isSystemColumn(spec.Column) || e.isRegularColumn(spec.Column)
 	case "SUM", "AVG":
 		// These require scanning actual values, not just min/max
 		return false
 	default:
 		return false
 	}
 }
 // isSystemColumn checks if a column is a system column (_timestamp_ns, _key, _source)
 func (e *SQLEngine) isSystemColumn(columnName string) bool {
 	lowerName := strings.ToLower(columnName)
 	return lowerName == "_timestamp_ns" || lowerName == "timestamp_ns" ||
 		lowerName == "_key" || lowerName == "key" ||
 		lowerName == "_source" || lowerName == "source"
 }
 // isRegularColumn checks if a column might be a regular data column (placeholder)
 func (e *SQLEngine) isRegularColumn(columnName string) bool {
 	// For now, assume any non-system column is a regular column
 	return !e.isSystemColumn(columnName)
 }
 // getSystemColumnGlobalMin computes global min for system columns using file metadata
 func (e *SQLEngine) getSystemColumnGlobalMin(columnName string, allFileStats map[string][]*ParquetFileStats) interface{} {
 	lowerName := strings.ToLower(columnName)
 	switch lowerName {
 	case "_timestamp_ns", "timestamp_ns":
 		// For timestamps, find the earliest timestamp across all files
 		// This should match what's in the Extended["min"] metadata
 		var minTimestamp *int64
 		for _, fileStats := range allFileStats {
 			for _, fileStat := range fileStats {
 				// Extract timestamp from filename (format: YYYY-MM-DD-HH-MM-SS.parquet)
 				timestamp := e.extractTimestampFromFilename(fileStat.FileName)
 				if timestamp != 0 {
 					if minTimestamp == nil || timestamp < *minTimestamp {
 						minTimestamp = &timestamp
 					}
 				}
 			}
 		}
 		if minTimestamp != nil {
 			return *minTimestamp
 		}
 	case "_key", "key":
 		// For keys, we'd need to read the actual parquet column stats
 		// Fall back to scanning if not available in our current stats
 		return nil
 	case "_source", "source":
 		// Source is always "parquet_archive" for parquet files
 		return "parquet_archive"
 	}
 	return nil
 }
 // getSystemColumnGlobalMax computes global max for system columns using file metadata
 func (e *SQLEngine) getSystemColumnGlobalMax(columnName string, allFileStats map[string][]*ParquetFileStats) interface{} {
 	lowerName := strings.ToLower(columnName)
 	switch lowerName {
 	case "_timestamp_ns", "timestamp_ns":
 		// For timestamps, find the latest timestamp across all files
 		var maxTimestamp *int64
 		for _, fileStats := range allFileStats {
 			for _, fileStat := range fileStats {
 				// Extract timestamp from filename (format: YYYY-MM-DD-HH-MM-SS.parquet)
 				timestamp := e.extractTimestampFromFilename(fileStat.FileName)
 				if timestamp != 0 {
 					if maxTimestamp == nil || timestamp > *maxTimestamp {
 						maxTimestamp = &timestamp
 					}
 				}
 			}
 		}
 		if maxTimestamp != nil {
 			return *maxTimestamp
 		}
 	case "_key", "key":
 		// For keys, we'd need to read the actual parquet column stats
 		return nil
 	case "_source", "source":
 		// Source is always "parquet_archive" for parquet files
 		return "parquet_archive"
 	}
 	return nil
 }
 // extractTimestampFromFilename extracts timestamp from parquet filename
 // Format: YYYY-MM-DD-HH-MM-SS.parquet
 func (e *SQLEngine) extractTimestampFromFilename(filename string) int64 {
 	// Remove .parquet extension
 	if strings.HasSuffix(filename, ".parquet") {
 		filename = filename[:len(filename)-8]
 	}
 	// Parse timestamp format: 2006-01-02-15-04-05
 	t, err := time.Parse("2006-01-02-15-04-05", filename)
 	if err != nil {
 		return 0
 	}
 	return t.UnixNano()
 }
 // discoverTopicPartitions discovers all partitions for a given topic
 func (e *SQLEngine) discoverTopicPartitions(namespace, topicName string) ([]string, error) {
 	// Use the same discovery logic as in hybrid_message_scanner.go
 	topicPath := fmt.Sprintf("/topics/%s/%s", namespace, topicName)
 	// Get FilerClient from BrokerClient
 	filerClient, err := e.catalog.brokerClient.GetFilerClient()
 	if err != nil {
 		return nil, err
 	}
 	var partitions []string
 	err = filer_pb.ReadDirAllEntries(context.Background(), filerClient, util.FullPath(topicPath), "", func(entry *filer_pb.Entry, isLast bool) error {
 		if !entry.IsDirectory {
 			return nil
 		}
 		// Check if this looks like a partition directory (format: vYYYY-MM-DD-HH-MM-SS)
 		if strings.HasPrefix(entry.Name, "v") && len(entry.Name) == 20 {
 			// This is a time-based partition directory
 			// Look for numeric subdirectories (partition IDs)
 			partitionBasePath := fmt.Sprintf("%s/%s", topicPath, entry.Name)
 			err := filer_pb.ReadDirAllEntries(context.Background(), filerClient, util.FullPath(partitionBasePath), "", func(subEntry *filer_pb.Entry, isLast bool) error {
 				if subEntry.IsDirectory {
 					// Check if this is a numeric partition directory (format: 0000-XXXX)
 					if len(subEntry.Name) >= 4 {
 						partitionPath := fmt.Sprintf("%s/%s", entry.Name, subEntry.Name)
 						partitions = append(partitions, partitionPath)
 					}
 				}
 				return nil
 			})
 			if err != nil {
 				return err
 			}
 		}
 		return nil
 	})
 	return partitions, err
 }
 func (e *SQLEngine) formatAggregationResult(spec AggregationSpec, result AggregationResult) sqltypes.Value {
 	switch spec.Function {
 	case "COUNT":
--- a/weed/query/engine/hybrid_message_scanner.go
+++ b/weed/query/engine/hybrid_message_scanner.go
@@ -8,6 +8,8 @@ import (
 	"strings"
 	"time"
 	"github.com/parquet-go/parquet-go"
 	"github.com/seaweedfs/seaweedfs/weed/filer"
 	"github.com/seaweedfs/seaweedfs/weed/mq/logstore"
 	"github.com/seaweedfs/seaweedfs/weed/mq/schema"
 	"github.com/seaweedfs/seaweedfs/weed/mq/topic"
@@ -16,7 +18,9 @@ import (
 	"github.com/seaweedfs/seaweedfs/weed/pb/schema_pb"
 	"github.com/seaweedfs/seaweedfs/weed/query/sqltypes"
 	"github.com/seaweedfs/seaweedfs/weed/util"
 	"github.com/seaweedfs/seaweedfs/weed/util/chunk_cache"
 	"github.com/seaweedfs/seaweedfs/weed/util/log_buffer"
 	"github.com/seaweedfs/seaweedfs/weed/wdclient"
 	"google.golang.org/protobuf/proto"
 )
@@ -107,6 +111,22 @@ type HybridScanResult struct {
 	Source    string                      // "live_log" or "parquet_archive"
 }
 // ParquetColumnStats holds statistics for a single column from parquet metadata
 type ParquetColumnStats struct {
 	ColumnName string
 	MinValue   *schema_pb.Value
 	MaxValue   *schema_pb.Value
 	NullCount  int64
 	RowCount   int64
 }
 // ParquetFileStats holds aggregated statistics for a parquet file
 type ParquetFileStats struct {
 	FileName    string
 	RowCount    int64
 	ColumnStats map[string]*ParquetColumnStats
 }
 // Scan reads messages from both live logs and archived Parquet files
 // Uses SeaweedFS MQ's GenMergedReadFunc for seamless integration
 // Assumptions:
@@ -668,3 +688,300 @@ func (hms *HybridMessageScanner) generateSampleHybridData(options HybridScanOpti
 	return sampleData
 }
 // ReadParquetStatistics efficiently reads column statistics from parquet files
 // without scanning the full file content - uses parquet's built-in metadata
 func (h *HybridMessageScanner) ReadParquetStatistics(partitionPath string) ([]*ParquetFileStats, error) {
 	var fileStats []*ParquetFileStats
 	// Use the same chunk cache as the logstore package
 	chunkCache := chunk_cache.NewChunkCacheInMemory(256)
 	lookupFileIdFn := filer.LookupFn(h.filerClient)
 	err := filer_pb.ReadDirAllEntries(context.Background(), h.filerClient, util.FullPath(partitionPath), "", func(entry *filer_pb.Entry, isLast bool) error {
 		// Only process parquet files
 		if entry.IsDirectory || !strings.HasSuffix(entry.Name, ".parquet") {
 			return nil
 		}
 		// Extract statistics from this parquet file
 		stats, err := h.extractParquetFileStats(entry, lookupFileIdFn, chunkCache)
 		if err != nil {
 			// Log error but continue processing other files
 			fmt.Printf("Warning: failed to extract stats from %s: %v\n", entry.Name, err)
 			return nil
 		}
 		if stats != nil {
 			fileStats = append(fileStats, stats)
 		}
 		return nil
 	})
 	return fileStats, err
 }
 // extractParquetFileStats extracts column statistics from a single parquet file
 func (h *HybridMessageScanner) extractParquetFileStats(entry *filer_pb.Entry, lookupFileIdFn wdclient.LookupFileIdFunctionType, chunkCache *chunk_cache.ChunkCacheInMemory) (*ParquetFileStats, error) {
 	// Create reader for the parquet file
 	fileSize := filer.FileSize(entry)
 	visibleIntervals, _ := filer.NonOverlappingVisibleIntervals(context.Background(), lookupFileIdFn, entry.Chunks, 0, int64(fileSize))
 	chunkViews := filer.ViewFromVisibleIntervals(visibleIntervals, 0, int64(fileSize))
 	readerCache := filer.NewReaderCache(32, chunkCache, lookupFileIdFn)
 	readerAt := filer.NewChunkReaderAtFromClient(context.Background(), readerCache, chunkViews, int64(fileSize))
 	// Create parquet reader - this only reads metadata, not data
 	parquetReader := parquet.NewReader(readerAt)
 	defer parquetReader.Close()
 	fileView := parquetReader.File()
 	fileStats := &ParquetFileStats{
 		FileName:    entry.Name,
 		RowCount:    fileView.NumRows(),
 		ColumnStats: make(map[string]*ParquetColumnStats),
 	}
 	// Get schema information
 	schema := fileView.Schema()
 	// Process each row group
 	rowGroups := fileView.RowGroups()
 	for _, rowGroup := range rowGroups {
 		columnChunks := rowGroup.ColumnChunks()
 		// Process each column chunk
 		for i, chunk := range columnChunks {
 			// Get column name from schema
 			columnName := h.getColumnNameFromSchema(schema, i)
 			if columnName == "" {
 				continue
 			}
 			// Try to get column statistics
 			columnIndex, err := chunk.ColumnIndex()
 			if err != nil {
 				// No column index available - skip this column
 				continue
 			}
 			// Extract min/max values from the first page (for simplicity)
 			// In a more sophisticated implementation, we could aggregate across all pages
 			numPages := columnIndex.NumPages()
 			if numPages == 0 {
 				continue
 			}
 			minParquetValue := columnIndex.MinValue(0)
 			maxParquetValue := columnIndex.MaxValue(numPages - 1)
 			nullCount := int64(0)
 			// Aggregate null counts across all pages
 			for pageIdx := 0; pageIdx < numPages; pageIdx++ {
 				nullCount += columnIndex.NullCount(pageIdx)
 			}
 			// Convert parquet values to schema_pb.Value
 			minValue, err := h.convertParquetValueToSchemaValue(minParquetValue)
 			if err != nil {
 				continue
 			}
 			maxValue, err := h.convertParquetValueToSchemaValue(maxParquetValue)
 			if err != nil {
 				continue
 			}
 			// Store column statistics (aggregate across row groups if column already exists)
 			if existingStats, exists := fileStats.ColumnStats[columnName]; exists {
 				// Update existing statistics
 				if h.compareSchemaValues(minValue, existingStats.MinValue) < 0 {
 					existingStats.MinValue = minValue
 				}
 				if h.compareSchemaValues(maxValue, existingStats.MaxValue) > 0 {
 					existingStats.MaxValue = maxValue
 				}
 				existingStats.NullCount += nullCount
 			} else {
 				// Create new column statistics
 				fileStats.ColumnStats[columnName] = &ParquetColumnStats{
 					ColumnName: columnName,
 					MinValue:   minValue,
 					MaxValue:   maxValue,
 					NullCount:  nullCount,
 					RowCount:   rowGroup.NumRows(),
 				}
 			}
 		}
 	}
 	return fileStats, nil
 }
 // getColumnNameFromSchema extracts column name from parquet schema by index
 func (h *HybridMessageScanner) getColumnNameFromSchema(schema *parquet.Schema, columnIndex int) string {
 	// Get the leaf columns in order
 	var columnNames []string
 	h.collectColumnNames(schema.Fields(), &columnNames)
 	if columnIndex >= 0 && columnIndex < len(columnNames) {
 		return columnNames[columnIndex]
 	}
 	return ""
 }
 // collectColumnNames recursively collects leaf column names from schema
 func (h *HybridMessageScanner) collectColumnNames(fields []parquet.Field, names *[]string) {
 	for _, field := range fields {
 		if len(field.Fields()) == 0 {
 			// This is a leaf field (no sub-fields)
 			*names = append(*names, field.Name())
 		} else {
 			// This is a group - recurse
 			h.collectColumnNames(field.Fields(), names)
 		}
 	}
 }
 // convertParquetValueToSchemaValue converts parquet.Value to schema_pb.Value
 func (h *HybridMessageScanner) convertParquetValueToSchemaValue(pv parquet.Value) (*schema_pb.Value, error) {
 	switch pv.Kind() {
 	case parquet.Boolean:
 		return &schema_pb.Value{Kind: &schema_pb.Value_BoolValue{BoolValue: pv.Boolean()}}, nil
 	case parquet.Int32:
 		return &schema_pb.Value{Kind: &schema_pb.Value_Int32Value{Int32Value: pv.Int32()}}, nil
 	case parquet.Int64:
 		return &schema_pb.Value{Kind: &schema_pb.Value_Int64Value{Int64Value: pv.Int64()}}, nil
 	case parquet.Float:
 		return &schema_pb.Value{Kind: &schema_pb.Value_FloatValue{FloatValue: pv.Float()}}, nil
 	case parquet.Double:
 		return &schema_pb.Value{Kind: &schema_pb.Value_DoubleValue{DoubleValue: pv.Double()}}, nil
 	case parquet.ByteArray:
 		return &schema_pb.Value{Kind: &schema_pb.Value_BytesValue{BytesValue: pv.ByteArray()}}, nil
 	default:
 		return nil, fmt.Errorf("unsupported parquet value kind: %v", pv.Kind())
 	}
 }
 // compareSchemaValues compares two schema_pb.Value objects
 func (h *HybridMessageScanner) compareSchemaValues(v1, v2 *schema_pb.Value) int {
 	if v1 == nil && v2 == nil {
 		return 0
 	}
 	if v1 == nil {
 		return -1
 	}
 	if v2 == nil {
 		return 1
 	}
 	// Extract raw values and compare
 	raw1 := h.extractRawValueFromSchema(v1)
 	raw2 := h.extractRawValueFromSchema(v2)
 	return h.compareRawValues(raw1, raw2)
 }
 // extractRawValueFromSchema extracts the raw value from schema_pb.Value
 func (h *HybridMessageScanner) extractRawValueFromSchema(value *schema_pb.Value) interface{} {
 	switch v := value.Kind.(type) {
 	case *schema_pb.Value_BoolValue:
 		return v.BoolValue
 	case *schema_pb.Value_Int32Value:
 		return v.Int32Value
 	case *schema_pb.Value_Int64Value:
 		return v.Int64Value
 	case *schema_pb.Value_FloatValue:
 		return v.FloatValue
 	case *schema_pb.Value_DoubleValue:
 		return v.DoubleValue
 	case *schema_pb.Value_BytesValue:
 		return string(v.BytesValue) // Convert to string for comparison
 	case *schema_pb.Value_StringValue:
 		return v.StringValue
 	}
 	return nil
 }
 // compareRawValues compares two raw values
 func (h *HybridMessageScanner) compareRawValues(v1, v2 interface{}) int {
 	// Handle nil cases
 	if v1 == nil && v2 == nil {
 		return 0
 	}
 	if v1 == nil {
 		return -1
 	}
 	if v2 == nil {
 		return 1
 	}
 	// Compare based on type
 	switch val1 := v1.(type) {
 	case bool:
 		if val2, ok := v2.(bool); ok {
 			if val1 == val2 {
 				return 0
 			}
 			if val1 {
 				return 1
 			}
 			return -1
 		}
 	case int32:
 		if val2, ok := v2.(int32); ok {
 			if val1 < val2 {
 				return -1
 			} else if val1 > val2 {
 				return 1
 			}
 			return 0
 		}
 	case int64:
 		if val2, ok := v2.(int64); ok {
 			if val1 < val2 {
 				return -1
 			} else if val1 > val2 {
 				return 1
 			}
 			return 0
 		}
 	case float32:
 		if val2, ok := v2.(float32); ok {
 			if val1 < val2 {
 				return -1
 			} else if val1 > val2 {
 				return 1
 			}
 			return 0
 		}
 	case float64:
 		if val2, ok := v2.(float64); ok {
 			if val1 < val2 {
 				return -1
 			} else if val1 > val2 {
 				return 1
 			}
 			return 0
 		}
 	case string:
 		if val2, ok := v2.(string); ok {
 			if val1 < val2 {
 				return -1
 			} else if val1 > val2 {
 				return 1
 			}
 			return 0
 		}
 	}
 	// Default: try string comparison
 	str1 := fmt.Sprintf("%v", v1)
 	str2 := fmt.Sprintf("%v", v2)
 	if str1 < str2 {
 		return -1
 	} else if str1 > str2 {
 		return 1
 	}
 	return 0
 }