mirror of
https://github.com/seaweedfs/seaweedfs.git
synced 2025-09-23 07:43:37 +08:00
171 lines
5.1 KiB
Go
171 lines
5.1 KiB
Go
package engine
|
|
|
|
import (
|
|
"regexp"
|
|
"strconv"
|
|
"strings"
|
|
"time"
|
|
|
|
"github.com/seaweedfs/seaweedfs/weed/pb/schema_pb"
|
|
)
|
|
|
|
// isSystemColumn checks if a column is a system column (_timestamp_ns, _key, _source)
|
|
func (e *SQLEngine) isSystemColumn(columnName string) bool {
|
|
lowerName := strings.ToLower(columnName)
|
|
return lowerName == "_timestamp_ns" || lowerName == "timestamp_ns" ||
|
|
lowerName == "_key" || lowerName == "key" ||
|
|
lowerName == "_source" || lowerName == "source"
|
|
}
|
|
|
|
// isRegularColumn checks if a column might be a regular data column (placeholder)
|
|
func (e *SQLEngine) isRegularColumn(columnName string) bool {
|
|
// For now, assume any non-system column is a regular column
|
|
return !e.isSystemColumn(columnName)
|
|
}
|
|
|
|
// getSystemColumnGlobalMin computes global min for system columns using file metadata
|
|
func (e *SQLEngine) getSystemColumnGlobalMin(columnName string, allFileStats map[string][]*ParquetFileStats) interface{} {
|
|
lowerName := strings.ToLower(columnName)
|
|
|
|
switch lowerName {
|
|
case "_timestamp_ns", "timestamp_ns":
|
|
// For timestamps, find the earliest timestamp across all files
|
|
// This should match what's in the Extended["min"] metadata
|
|
var minTimestamp *int64
|
|
for _, fileStats := range allFileStats {
|
|
for _, fileStat := range fileStats {
|
|
// Extract timestamp from filename (format: YYYY-MM-DD-HH-MM-SS.parquet)
|
|
timestamp := e.extractTimestampFromFilename(fileStat.FileName)
|
|
if timestamp != 0 {
|
|
if minTimestamp == nil || timestamp < *minTimestamp {
|
|
minTimestamp = ×tamp
|
|
}
|
|
}
|
|
}
|
|
}
|
|
if minTimestamp != nil {
|
|
return *minTimestamp
|
|
}
|
|
|
|
case "_key", "key":
|
|
// For keys, we'd need to read the actual parquet column stats
|
|
// Fall back to scanning if not available in our current stats
|
|
return nil
|
|
|
|
case "_source", "source":
|
|
// Source is always "parquet_archive" for parquet files
|
|
return "parquet_archive"
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// getSystemColumnGlobalMax computes global max for system columns using file metadata
|
|
func (e *SQLEngine) getSystemColumnGlobalMax(columnName string, allFileStats map[string][]*ParquetFileStats) interface{} {
|
|
lowerName := strings.ToLower(columnName)
|
|
|
|
switch lowerName {
|
|
case "_timestamp_ns", "timestamp_ns":
|
|
// For timestamps, find the latest timestamp across all files
|
|
// This should match what's in the Extended["max"] metadata
|
|
var maxTimestamp *int64
|
|
for _, fileStats := range allFileStats {
|
|
for _, fileStat := range fileStats {
|
|
// Extract timestamp from filename (format: YYYY-MM-DD-HH-MM-SS.parquet)
|
|
timestamp := e.extractTimestampFromFilename(fileStat.FileName)
|
|
if timestamp != 0 {
|
|
if maxTimestamp == nil || timestamp > *maxTimestamp {
|
|
maxTimestamp = ×tamp
|
|
}
|
|
}
|
|
}
|
|
}
|
|
if maxTimestamp != nil {
|
|
return *maxTimestamp
|
|
}
|
|
|
|
case "_key", "key":
|
|
// For keys, we'd need to read the actual parquet column stats
|
|
// Fall back to scanning if not available in our current stats
|
|
return nil
|
|
|
|
case "_source", "source":
|
|
// Source is always "parquet_archive" for parquet files
|
|
return "parquet_archive"
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// extractTimestampFromFilename extracts timestamp from parquet filename
|
|
func (e *SQLEngine) extractTimestampFromFilename(filename string) int64 {
|
|
// Expected format: YYYY-MM-DD-HH-MM-SS.parquet or similar
|
|
// Try to parse timestamp from filename
|
|
re := regexp.MustCompile(`(\d{4}-\d{2}-\d{2}-\d{2}-\d{2}-\d{2})`)
|
|
matches := re.FindStringSubmatch(filename)
|
|
if len(matches) > 1 {
|
|
timestampStr := matches[1]
|
|
// Convert to time and then to nanoseconds
|
|
t, err := time.Parse("2006-01-02-15-04-05", timestampStr)
|
|
if err == nil {
|
|
return t.UnixNano()
|
|
}
|
|
}
|
|
|
|
// Fallback: try to parse as unix timestamp if filename is numeric
|
|
if timestampStr := strings.TrimSuffix(filename, ".parquet"); timestampStr != filename {
|
|
if timestamp, err := strconv.ParseInt(timestampStr, 10, 64); err == nil {
|
|
// Assume it's already in nanoseconds
|
|
return timestamp
|
|
}
|
|
}
|
|
|
|
return 0
|
|
}
|
|
|
|
// findColumnValue performs case-insensitive lookup of column values
|
|
// Now includes support for system columns stored in HybridScanResult
|
|
func (e *SQLEngine) findColumnValue(result HybridScanResult, columnName string) *schema_pb.Value {
|
|
lowerName := strings.ToLower(columnName)
|
|
|
|
// Check system columns first
|
|
switch lowerName {
|
|
case "_timestamp_ns", "timestamp_ns":
|
|
return &schema_pb.Value{
|
|
Kind: &schema_pb.Value_Int64Value{Int64Value: result.Timestamp},
|
|
}
|
|
case "_key", "key":
|
|
return &schema_pb.Value{
|
|
Kind: &schema_pb.Value_BytesValue{BytesValue: result.Key},
|
|
}
|
|
case "_source", "source":
|
|
return &schema_pb.Value{
|
|
Kind: &schema_pb.Value_StringValue{StringValue: result.Source},
|
|
}
|
|
}
|
|
|
|
// Check regular columns in the record data
|
|
if result.RecordValue != nil {
|
|
recordValue, ok := result.RecordValue.Kind.(*schema_pb.Value_RecordValue)
|
|
if !ok {
|
|
return nil
|
|
}
|
|
|
|
if recordValue.RecordValue.Fields != nil {
|
|
// Try exact match first
|
|
if value, exists := recordValue.RecordValue.Fields[columnName]; exists {
|
|
return value
|
|
}
|
|
|
|
// Try case-insensitive match
|
|
for fieldName, value := range recordValue.RecordValue.Fields {
|
|
if strings.EqualFold(fieldName, columnName) {
|
|
return value
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
return nil
|
|
}
|