mirror of
https://github.com/seaweedfs/seaweedfs.git
synced 2025-09-23 11:23:35 +08:00
the parquet file should also remember the first buffer_start attribute from the sources
This commit is contained in:
@@ -287,13 +287,21 @@ func writeLogFilesToParquet(filerClient filer_pb.FilerClient, partitionDir strin
|
|||||||
// write to parquet file to partitionDir
|
// write to parquet file to partitionDir
|
||||||
parquetFileName := fmt.Sprintf("%s.parquet", time.Unix(0, startTsNs).UTC().Format("2006-01-02-15-04-05"))
|
parquetFileName := fmt.Sprintf("%s.parquet", time.Unix(0, startTsNs).UTC().Format("2006-01-02-15-04-05"))
|
||||||
|
|
||||||
// Collect source log file names for deduplication metadata
|
// Collect source log file names and buffer_start metadata for deduplication
|
||||||
var sourceLogFiles []string
|
var sourceLogFiles []string
|
||||||
|
var earliestBufferStart int64
|
||||||
for _, logFile := range logFileGroups {
|
for _, logFile := range logFileGroups {
|
||||||
sourceLogFiles = append(sourceLogFiles, logFile.Name)
|
sourceLogFiles = append(sourceLogFiles, logFile.Name)
|
||||||
|
|
||||||
|
// Extract buffer_start from log file metadata
|
||||||
|
if bufferStart := getBufferStartFromLogFile(logFile); bufferStart > 0 {
|
||||||
|
if earliestBufferStart == 0 || bufferStart < earliestBufferStart {
|
||||||
|
earliestBufferStart = bufferStart
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if err := saveParquetFileToPartitionDir(filerClient, tempFile, partitionDir, parquetFileName, preference, startTsNs, stopTsNs, sourceLogFiles); err != nil {
|
if err := saveParquetFileToPartitionDir(filerClient, tempFile, partitionDir, parquetFileName, preference, startTsNs, stopTsNs, sourceLogFiles, earliestBufferStart); err != nil {
|
||||||
return fmt.Errorf("save parquet file %s: %v", parquetFileName, err)
|
return fmt.Errorf("save parquet file %s: %v", parquetFileName, err)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -301,7 +309,7 @@ func writeLogFilesToParquet(filerClient filer_pb.FilerClient, partitionDir strin
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func saveParquetFileToPartitionDir(filerClient filer_pb.FilerClient, sourceFile *os.File, partitionDir, parquetFileName string, preference *operation.StoragePreference, startTsNs, stopTsNs int64, sourceLogFiles []string) error {
|
func saveParquetFileToPartitionDir(filerClient filer_pb.FilerClient, sourceFile *os.File, partitionDir, parquetFileName string, preference *operation.StoragePreference, startTsNs, stopTsNs int64, sourceLogFiles []string, earliestBufferStart int64) error {
|
||||||
uploader, err := operation.NewUploader()
|
uploader, err := operation.NewUploader()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("new uploader: %w", err)
|
return fmt.Errorf("new uploader: %w", err)
|
||||||
@@ -340,6 +348,13 @@ func saveParquetFileToPartitionDir(filerClient filer_pb.FilerClient, sourceFile
|
|||||||
entry.Extended["sources"] = sourceLogFilesJson
|
entry.Extended["sources"] = sourceLogFilesJson
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Store earliest buffer_start for precise broker deduplication
|
||||||
|
if earliestBufferStart > 0 {
|
||||||
|
bufferStartBytes := make([]byte, 8)
|
||||||
|
binary.BigEndian.PutUint64(bufferStartBytes, uint64(earliestBufferStart))
|
||||||
|
entry.Extended["buffer_start"] = bufferStartBytes
|
||||||
|
}
|
||||||
|
|
||||||
for i := int64(0); i < chunkCount; i++ {
|
for i := int64(0); i < chunkCount; i++ {
|
||||||
fileId, uploadResult, err, _ := uploader.UploadWithRetry(
|
fileId, uploadResult, err, _ := uploader.UploadWithRetry(
|
||||||
filerClient,
|
filerClient,
|
||||||
@@ -472,3 +487,24 @@ func eachChunk(buf []byte, eachLogEntryFn log_buffer.EachLogEntryFuncType) (proc
|
|||||||
|
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// getBufferStartFromLogFile extracts the buffer_start index from log file extended metadata
|
||||||
|
func getBufferStartFromLogFile(logFile *filer_pb.Entry) int64 {
|
||||||
|
if logFile.Extended == nil {
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
|
||||||
|
// Parse buffer_start format (same as used in query engine)
|
||||||
|
if startJson, exists := logFile.Extended["buffer_start"]; exists {
|
||||||
|
// LogBufferStart struct (JSON format)
|
||||||
|
type LogBufferStart struct {
|
||||||
|
StartIndex int64 `json:"start_index"`
|
||||||
|
}
|
||||||
|
var bufferStart LogBufferStart
|
||||||
|
if err := json.Unmarshal(startJson, &bufferStart); err == nil {
|
||||||
|
return bufferStart.StartIndex
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
@@ -2,6 +2,7 @@ package engine
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
|
"encoding/binary"
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"fmt"
|
"fmt"
|
||||||
"io"
|
"io"
|
||||||
@@ -517,7 +518,7 @@ func (c *BrokerClient) GetUnflushedMessages(ctx context.Context, namespace, topi
|
|||||||
}
|
}
|
||||||
|
|
||||||
// getEarliestBufferStart finds the earliest buffer_start index from disk files in the partition
|
// getEarliestBufferStart finds the earliest buffer_start index from disk files in the partition
|
||||||
// This is used for precise deduplication - any buffer index >= this value may still be in memory
|
// This checks both live log files and Parquet files for the most precise deduplication
|
||||||
func (c *BrokerClient) getEarliestBufferStart(ctx context.Context, partitionPath string) (int64, error) {
|
func (c *BrokerClient) getEarliestBufferStart(ctx context.Context, partitionPath string) (int64, error) {
|
||||||
filerClient, err := c.GetFilerClient()
|
filerClient, err := c.GetFilerClient()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@@ -527,12 +528,12 @@ func (c *BrokerClient) getEarliestBufferStart(ctx context.Context, partitionPath
|
|||||||
var earliestBufferIndex int64 = -1 // -1 means no buffer_start found
|
var earliestBufferIndex int64 = -1 // -1 means no buffer_start found
|
||||||
|
|
||||||
err = filer_pb.ReadDirAllEntries(ctx, filerClient, util.FullPath(partitionPath), "", func(entry *filer_pb.Entry, isLast bool) error {
|
err = filer_pb.ReadDirAllEntries(ctx, filerClient, util.FullPath(partitionPath), "", func(entry *filer_pb.Entry, isLast bool) error {
|
||||||
// Skip directories and parquet files
|
// Skip directories
|
||||||
if entry.IsDirectory || strings.HasSuffix(entry.Name, ".parquet") {
|
if entry.IsDirectory {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// Extract buffer_start from file extended attributes
|
// Extract buffer_start from file extended attributes (both log files and parquet files)
|
||||||
bufferStart := c.getBufferStartFromEntry(entry)
|
bufferStart := c.getBufferStartFromEntry(entry)
|
||||||
if bufferStart != nil && bufferStart.StartIndex > 0 {
|
if bufferStart != nil && bufferStart.StartIndex > 0 {
|
||||||
if earliestBufferIndex == -1 || bufferStart.StartIndex < earliestBufferIndex {
|
if earliestBufferIndex == -1 || bufferStart.StartIndex < earliestBufferIndex {
|
||||||
@@ -555,15 +556,24 @@ func (c *BrokerClient) getEarliestBufferStart(ctx context.Context, partitionPath
|
|||||||
}
|
}
|
||||||
|
|
||||||
// getBufferStartFromEntry extracts LogBufferStart from file entry metadata
|
// getBufferStartFromEntry extracts LogBufferStart from file entry metadata
|
||||||
|
// Handles both JSON format (log files) and binary format (Parquet files)
|
||||||
func (c *BrokerClient) getBufferStartFromEntry(entry *filer_pb.Entry) *LogBufferStart {
|
func (c *BrokerClient) getBufferStartFromEntry(entry *filer_pb.Entry) *LogBufferStart {
|
||||||
if entry.Extended == nil {
|
if entry.Extended == nil {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// Parse buffer_start format
|
if startData, exists := entry.Extended["buffer_start"]; exists {
|
||||||
if startJson, exists := entry.Extended["buffer_start"]; exists {
|
// Try binary format first (Parquet files)
|
||||||
|
if len(startData) == 8 {
|
||||||
|
startIndex := int64(binary.BigEndian.Uint64(startData))
|
||||||
|
if startIndex > 0 {
|
||||||
|
return &LogBufferStart{StartIndex: startIndex}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Try JSON format (log files)
|
||||||
var bufferStart LogBufferStart
|
var bufferStart LogBufferStart
|
||||||
if err := json.Unmarshal(startJson, &bufferStart); err == nil {
|
if err := json.Unmarshal(startData, &bufferStart); err == nil {
|
||||||
return &bufferStart
|
return &bufferStart
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Reference in New Issue
Block a user