add telemetry (#6926)

* add telemetry

* fix go mod

* add default telemetry server url

* Update README.md

* replace with broker count instead of s3 count

* Update telemetry.pb.go

* github action to deploy
Author: Chris Lu
Date: 2025-06-28 14:11:55 -07:00
Committed by: GitHub
Parent: 29892c43ff
Commit: a1aab8a083
23 changed files with 3657 additions and 0 deletions

.github/workflows/deploy_telemetry.yml

@@ -0,0 +1,157 @@
# This workflow will build and deploy the SeaweedFS telemetry server
# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-go
name: Deploy Telemetry Server
on:
push:
branches: [ "master" ]
paths:
- 'telemetry/**'
workflow_dispatch:
inputs:
setup:
description: 'Run first-time server setup'
required: true
type: boolean
default: false
deploy:
description: 'Deploy telemetry server to remote server'
required: true
type: boolean
default: false
jobs:
deploy:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Set up Go
uses: actions/setup-go@v4
with:
go-version: '1.24'
- name: Build Telemetry Server
run: |
go mod tidy
cd telemetry/server
GOOS=linux GOARCH=amd64 go build -o telemetry-server main.go
- name: First-time Server Setup
if: github.event_name == 'workflow_dispatch' && inputs.setup
env:
SSH_PRIVATE_KEY: ${{ secrets.TELEMETRY_SSH_PRIVATE_KEY }}
REMOTE_HOST: ${{ secrets.TELEMETRY_HOST }}
REMOTE_USER: ${{ secrets.TELEMETRY_USER }}
run: |
mkdir -p ~/.ssh
echo "$SSH_PRIVATE_KEY" > ~/.ssh/deploy_key
chmod 600 ~/.ssh/deploy_key
echo "Host *" > ~/.ssh/config
echo " StrictHostKeyChecking no" >> ~/.ssh/config
# Create all required directories with proper permissions
ssh -i ~/.ssh/deploy_key $REMOTE_USER@$REMOTE_HOST "
mkdir -p ~/seaweedfs-telemetry/bin ~/seaweedfs-telemetry/logs ~/seaweedfs-telemetry/data ~/seaweedfs-telemetry/tmp && \
chmod 755 ~/seaweedfs-telemetry/logs && \
chmod 755 ~/seaweedfs-telemetry/data && \
touch ~/seaweedfs-telemetry/logs/telemetry.log ~/seaweedfs-telemetry/logs/telemetry.error.log && \
chmod 644 ~/seaweedfs-telemetry/logs/*.log"
# Create systemd service file
echo "
[Unit]
Description=SeaweedFS Telemetry Server
After=network.target
[Service]
Type=simple
User=$REMOTE_USER
WorkingDirectory=/home/$REMOTE_USER/seaweedfs-telemetry
ExecStart=/home/$REMOTE_USER/seaweedfs-telemetry/bin/telemetry-server -port=8353
Restart=always
RestartSec=5
StandardOutput=append:/home/$REMOTE_USER/seaweedfs-telemetry/logs/telemetry.log
StandardError=append:/home/$REMOTE_USER/seaweedfs-telemetry/logs/telemetry.error.log
[Install]
WantedBy=multi-user.target" > telemetry.service
# Setup logrotate configuration
echo "# SeaweedFS Telemetry service log rotation
/home/$REMOTE_USER/seaweedfs-telemetry/logs/*.log {
daily
rotate 30
compress
delaycompress
missingok
notifempty
create 644 $REMOTE_USER $REMOTE_USER
postrotate
systemctl restart telemetry.service
endscript
}" > telemetry_logrotate
# Copy Grafana dashboard and Prometheus config
scp -i ~/.ssh/deploy_key telemetry/grafana-dashboard.json $REMOTE_USER@$REMOTE_HOST:~/seaweedfs-telemetry/
scp -i ~/.ssh/deploy_key telemetry/prometheus.yml $REMOTE_USER@$REMOTE_HOST:~/seaweedfs-telemetry/
# Copy and install service and logrotate files
scp -i ~/.ssh/deploy_key telemetry.service telemetry_logrotate $REMOTE_USER@$REMOTE_HOST:~/seaweedfs-telemetry/
ssh -i ~/.ssh/deploy_key $REMOTE_USER@$REMOTE_HOST "
sudo mv ~/seaweedfs-telemetry/telemetry.service /etc/systemd/system/ && \
sudo mv ~/seaweedfs-telemetry/telemetry_logrotate /etc/logrotate.d/seaweedfs-telemetry && \
sudo systemctl daemon-reload && \
sudo systemctl enable telemetry.service"
rm -f ~/.ssh/deploy_key
- name: Deploy Telemetry Server to Remote Server
if: (github.event_name == 'push' && contains(github.ref, 'refs/heads/master')) || (github.event_name == 'workflow_dispatch' && inputs.deploy)
env:
SSH_PRIVATE_KEY: ${{ secrets.TELEMETRY_SSH_PRIVATE_KEY }}
REMOTE_HOST: ${{ secrets.TELEMETRY_HOST }}
REMOTE_USER: ${{ secrets.TELEMETRY_USER }}
run: |
mkdir -p ~/.ssh
echo "$SSH_PRIVATE_KEY" > ~/.ssh/deploy_key
chmod 600 ~/.ssh/deploy_key
echo "Host *" > ~/.ssh/config
echo " StrictHostKeyChecking no" >> ~/.ssh/config
# Create temp directory and copy binary
ssh -i ~/.ssh/deploy_key $REMOTE_USER@$REMOTE_HOST "mkdir -p ~/seaweedfs-telemetry/tmp"
scp -i ~/.ssh/deploy_key telemetry/server/telemetry-server $REMOTE_USER@$REMOTE_HOST:~/seaweedfs-telemetry/tmp/
# Copy updated configuration files
scp -i ~/.ssh/deploy_key telemetry/grafana-dashboard.json $REMOTE_USER@$REMOTE_HOST:~/seaweedfs-telemetry/
scp -i ~/.ssh/deploy_key telemetry/prometheus.yml $REMOTE_USER@$REMOTE_HOST:~/seaweedfs-telemetry/
# Stop service, move binary, and restart
ssh -i ~/.ssh/deploy_key $REMOTE_USER@$REMOTE_HOST "
sudo systemctl stop telemetry.service || true && \
mkdir -p ~/seaweedfs-telemetry/bin && \
mv ~/seaweedfs-telemetry/tmp/telemetry-server ~/seaweedfs-telemetry/bin/ && \
chmod +x ~/seaweedfs-telemetry/bin/telemetry-server && \
sudo systemctl start telemetry.service && \
sudo systemctl status telemetry.service"
# Verify deployment
ssh -i ~/.ssh/deploy_key $REMOTE_USER@$REMOTE_HOST "
echo 'Waiting for service to start...'
sleep 5
curl -f http://localhost:8353/health || echo 'Health check failed'"
rm -f ~/.ssh/deploy_key
- name: Notify Deployment Status
if: always()
run: |
if [ "${{ job.status }}" == "success" ]; then
echo "✅ Telemetry server deployment successful"
echo "Dashboard: http://${{ secrets.TELEMETRY_HOST }}:8353"
echo "Metrics: http://${{ secrets.TELEMETRY_HOST }}:8353/metrics"
else
echo "❌ Telemetry server deployment failed"
fi

telemetry/DEPLOYMENT.md

@@ -0,0 +1,271 @@
# SeaweedFS Telemetry Server Deployment
This document describes how to deploy the SeaweedFS telemetry server to a remote server using GitHub Actions.
## Prerequisites
1. A remote Linux server with:
- SSH access
- systemd (for service management)
- Optional: Prometheus and Grafana (for monitoring)
2. GitHub repository secrets configured (see [Setup GitHub Secrets](#setup-github-secrets) below):
- `TELEMETRY_SSH_PRIVATE_KEY`: SSH private key for accessing the remote server
- `TELEMETRY_HOST`: Remote server hostname or IP address
- `TELEMETRY_USER`: Username for SSH access
## Setup GitHub Secrets
Before using the deployment workflow, you need to configure the required secrets in your GitHub repository.
### Step 1: Generate SSH Key Pair
On your local machine, generate a new SSH key pair specifically for deployment:
```bash
# Generate a new SSH key pair
ssh-keygen -t ed25519 -C "seaweedfs-telemetry-deploy" -f ~/.ssh/seaweedfs_telemetry_deploy
# This creates two files:
# ~/.ssh/seaweedfs_telemetry_deploy (private key)
# ~/.ssh/seaweedfs_telemetry_deploy.pub (public key)
```
### Step 2: Configure Remote Server
Copy the public key to your remote server:
```bash
# Copy public key to remote server
ssh-copy-id -i ~/.ssh/seaweedfs_telemetry_deploy.pub user@your-server.com
# Or manually append to authorized_keys
cat ~/.ssh/seaweedfs_telemetry_deploy.pub | ssh user@your-server.com "mkdir -p ~/.ssh && cat >> ~/.ssh/authorized_keys"
```
Test the SSH connection:
```bash
# Test SSH connection with the new key
ssh -i ~/.ssh/seaweedfs_telemetry_deploy user@your-server.com "echo 'SSH connection successful'"
```
### Step 3: Add Secrets to GitHub Repository
1. Go to your GitHub repository
2. Click on **Settings** tab
3. In the sidebar, click **Secrets and variables** → **Actions**
4. Click **New repository secret** for each of the following:
#### TELEMETRY_SSH_PRIVATE_KEY
```bash
# Display the private key content
cat ~/.ssh/seaweedfs_telemetry_deploy
```
- **Name**: `TELEMETRY_SSH_PRIVATE_KEY`
- **Value**: Copy the entire private key content, including the `-----BEGIN OPENSSH PRIVATE KEY-----` and `-----END OPENSSH PRIVATE KEY-----` lines
#### TELEMETRY_HOST
- **Name**: `TELEMETRY_HOST`
- **Value**: Your server's hostname or IP address (e.g., `telemetry.example.com` or `192.168.1.100`)
#### TELEMETRY_USER
- **Name**: `TELEMETRY_USER`
- **Value**: The username on the remote server (e.g., `ubuntu`, `deploy`, or your username)
### Step 4: Verify Configuration
Create a simple test workflow or manually trigger the deployment to verify the secrets are working correctly.
### Security Best Practices
1. **Dedicated SSH Key**: Use a separate SSH key only for deployment
2. **Limited Permissions**: Create a dedicated user on the remote server with minimal required permissions
3. **Key Rotation**: Regularly rotate SSH keys
4. **Server Access**: Restrict SSH access to specific IP ranges if possible
### Example Server Setup
If you're setting up a new server, here's a basic configuration:
```bash
# On the remote server, create a dedicated user for deployment
sudo useradd -m -s /bin/bash seaweedfs-deploy
sudo usermod -aG sudo seaweedfs-deploy # Only if sudo access is needed
# Switch to the deployment user
sudo su - seaweedfs-deploy
# Create SSH directory
mkdir -p ~/.ssh
chmod 700 ~/.ssh
# Add your public key (paste the content of seaweedfs_telemetry_deploy.pub)
nano ~/.ssh/authorized_keys
chmod 600 ~/.ssh/authorized_keys
```
### Troubleshooting
#### SSH Connection Issues
```bash
# Test SSH connection manually
ssh -i ~/.ssh/seaweedfs_telemetry_deploy -v user@your-server.com
# Check SSH key permissions
ls -la ~/.ssh/seaweedfs_telemetry_deploy*
# Should show: -rw------- for private key, -rw-r--r-- for public key
```
#### GitHub Actions Fails
1. **Check secrets**: Ensure all three secrets are properly set in GitHub
2. **Verify SSH key**: Make sure the entire private key (including headers/footers) is copied
3. **Test connectivity**: Manually SSH to the server from your local machine
4. **Check user permissions**: Ensure the remote user has necessary permissions
## GitHub Actions Workflow
The deployment workflow (`.github/workflows/deploy_telemetry.yml`) provides two main operations:
### 1. First-time Setup
Run this once to set up the remote server:
1. Go to GitHub Actions in your repository
2. Select "Deploy Telemetry Server" workflow
3. Click "Run workflow"
4. Check "Run first-time server setup"
5. Click "Run workflow"
This will:
- Create necessary directories on the remote server
- Set up systemd service configuration
- Configure log rotation
- Upload Grafana dashboard and Prometheus configuration
### 2. Deploy Updates
Deployments happen automatically when:
- Code is pushed to the `master` branch with changes in the `telemetry/` directory
Or manually trigger deployment:
1. Go to GitHub Actions in your repository
2. Select "Deploy Telemetry Server" workflow
3. Click "Run workflow"
4. Check "Deploy telemetry server to remote server"
5. Click "Run workflow"
## Server Directory Structure
After setup, the remote server will have:
```
~/seaweedfs-telemetry/
├── bin/
│ └── telemetry-server # Binary executable
├── logs/
│ ├── telemetry.log # Application logs
│ └── telemetry.error.log # Error logs
├── data/ # Data directory (if needed)
├── grafana-dashboard.json # Grafana dashboard configuration
└── prometheus.yml # Prometheus configuration
```
## Service Management
The telemetry server runs as a systemd service:
```bash
# Check service status
sudo systemctl status telemetry.service
# View logs
sudo journalctl -u telemetry.service -f
# Restart service
sudo systemctl restart telemetry.service
# Stop/start service
sudo systemctl stop telemetry.service
sudo systemctl start telemetry.service
```
## Accessing the Service
After deployment, the telemetry server will be available at:
- **Dashboard**: `http://your-server:8353`
- **API**: `http://your-server:8353/api/*`
- **Metrics**: `http://your-server:8353/metrics`
- **Health Check**: `http://your-server:8353/health`
## Optional: Prometheus and Grafana Integration
### Prometheus Setup
1. Install Prometheus on your server
2. Update `/etc/prometheus/prometheus.yml` to include:
```yaml
scrape_configs:
- job_name: 'seaweedfs-telemetry'
static_configs:
- targets: ['localhost:8353']
metrics_path: '/metrics'
```
### Grafana Setup
1. Install Grafana on your server
2. Import the dashboard from `~/seaweedfs-telemetry/grafana-dashboard.json`
3. Configure Prometheus as a data source pointing to your Prometheus instance
## Troubleshooting
### Deployment Fails
1. Check GitHub Actions logs for detailed error messages
2. Verify SSH connectivity: `ssh user@host`
3. Ensure all required secrets are configured in GitHub
### Service Won't Start
1. Check service logs: `sudo journalctl -u telemetry.service`
2. Verify binary permissions: `ls -la ~/seaweedfs-telemetry/bin/`
3. Test binary manually: `~/seaweedfs-telemetry/bin/telemetry-server -help`
### Port Conflicts
If port 8353 is already in use:
1. Edit the systemd service: `sudo systemctl edit telemetry.service`
2. Add override configuration:
```ini
[Service]
ExecStart=
ExecStart=/home/user/seaweedfs-telemetry/bin/telemetry-server -port=8354
```
3. Reload and restart: `sudo systemctl daemon-reload && sudo systemctl restart telemetry.service`
## Security Considerations
1. **Firewall**: Consider restricting access to telemetry ports
2. **SSH Keys**: Use dedicated SSH keys with minimal permissions
3. **User Permissions**: Run the service as a non-privileged user
4. **Network**: Consider running on internal networks only
## Monitoring
Monitor the deployment and service health:
- **GitHub Actions**: Check workflow runs for deployment status
- **System Logs**: `sudo journalctl -u telemetry.service`
- **Application Logs**: `tail -f ~/seaweedfs-telemetry/logs/telemetry.log`
- **Health Endpoint**: `curl http://localhost:8353/health`
- **Metrics**: `curl http://localhost:8353/metrics`

telemetry/README.md

@@ -0,0 +1,351 @@
# SeaweedFS Telemetry System
A privacy-respecting telemetry system for SeaweedFS that collects cluster-level usage statistics and provides visualization through Prometheus and Grafana.
## Features
- **Privacy-First Design**: Uses in-memory cluster IDs (regenerated on restart), no personal data collection
- **Prometheus Integration**: Native Prometheus metrics for monitoring and alerting
- **Grafana Dashboards**: Pre-built dashboards for data visualization
- **Protocol Buffers**: Efficient binary data transmission for optimal performance
- **Opt-in Only**: Disabled by default, requires explicit configuration
- **Docker Compose**: Complete monitoring stack deployment
- **Automatic Cleanup**: Configurable data retention policies
## Architecture
```
SeaweedFS Cluster → Telemetry Client → Telemetry Server → Prometheus → Grafana
(protobuf) (metrics) (queries)
```
## Data Transmission
The telemetry system uses **Protocol Buffers exclusively** for efficient binary data transmission:
- **Compact Format**: 30-50% smaller than JSON
- **Fast Serialization**: Better performance than text-based formats
- **Type Safety**: Strong typing with generated Go structs
- **Schema Evolution**: Built-in versioning support
### Protobuf Schema
```protobuf
message TelemetryData {
string cluster_id = 1; // In-memory generated UUID
string version = 2; // SeaweedFS version
string os = 3; // Operating system
repeated string features = 4; // Enabled features
string deployment = 5; // Deployment type
int32 volume_server_count = 6; // Number of volume servers
uint64 total_disk_bytes = 7; // Total disk usage
int32 total_volume_count = 8; // Total volume count
int32 filer_count = 9; // Number of filer servers
int32 broker_count = 10; // Number of broker servers
int64 timestamp = 11; // Collection timestamp
}
```
## Privacy Approach
- **No Personal Data**: No hostnames, IP addresses, or user information
- **In-Memory IDs**: Cluster IDs are generated in-memory and change on restart
- **Aggregated Data**: Only cluster-level statistics, no individual file/user data
- **Opt-in Only**: Telemetry is disabled by default
- **Transparent**: Open source implementation, clear data collection policy
## Collected Data
| Field | Description | Example |
|-------|-------------|---------|
| `cluster_id` | In-memory UUID (changes on restart) | `a1b2c3d4-...` |
| `version` | SeaweedFS version | `3.45` |
| `os` | Operating system and architecture | `linux/amd64` |
| `features` | Enabled components | `["filer", "s3api"]` |
| `deployment` | Deployment type | `cluster` |
| `volume_server_count` | Number of volume servers | `5` |
| `total_disk_bytes` | Total disk usage across cluster | `1073741824` |
| `total_volume_count` | Total number of volumes | `120` |
| `filer_count` | Number of filer servers | `2` |
| `broker_count` | Number of broker servers | `1` |
| `timestamp` | When data was collected | `1640995200` |
## Quick Start
### 1. Deploy Telemetry Server
```bash
# Clone and start the complete monitoring stack
git clone https://github.com/seaweedfs/seaweedfs.git
cd seaweedfs/telemetry
docker-compose up -d
# Or run the server directly
cd server
go run . -port=8080 -dashboard=true
```
### 2. Configure SeaweedFS
```bash
# Enable telemetry in SeaweedFS master (uses default telemetry.seaweedfs.com:3091)
weed master -telemetry=true
# Or in server mode
weed server -telemetry=true
# Or specify custom telemetry server
weed master -telemetry=true -telemetry.url=http://localhost:8080/api/collect
```
### 3. Access Dashboards
- **Telemetry Server**: http://localhost:8080
- **Prometheus**: http://localhost:9090
- **Grafana**: http://localhost:3000 (admin/admin)
## Configuration
### SeaweedFS Master/Server
```bash
# Enable telemetry
-telemetry=true
# Set custom telemetry server URL (optional, defaults to telemetry.seaweedfs.com:3091)
-telemetry.url=http://your-telemetry-server:8080/api/collect
```
### Telemetry Server
```bash
# Server configuration
-port=8080 # Server port
-dashboard=true # Enable built-in dashboard
-cleanup=24h # Cleanup interval
-max-age=720h # Maximum data retention (30 days)
# Example
./telemetry-server -port=8080 -dashboard=true -cleanup=24h -max-age=720h
```
## Prometheus Metrics
The telemetry server exposes these Prometheus metrics:
### Cluster Metrics
- `seaweedfs_telemetry_total_clusters`: Total unique clusters (30 days)
- `seaweedfs_telemetry_active_clusters`: Active clusters (7 days)
### Per-Cluster Metrics
- `seaweedfs_telemetry_volume_servers{cluster_id, version, os, deployment}`: Volume servers per cluster
- `seaweedfs_telemetry_disk_bytes{cluster_id, version, os, deployment}`: Disk usage per cluster
- `seaweedfs_telemetry_volume_count{cluster_id, version, os, deployment}`: Volume count per cluster
- `seaweedfs_telemetry_filer_count{cluster_id, version, os, deployment}`: Filer servers per cluster
- `seaweedfs_telemetry_broker_count{cluster_id, version, os, deployment}`: Broker servers per cluster
- `seaweedfs_telemetry_cluster_info{cluster_id, version, os, deployment, features}`: Cluster metadata
### Server Metrics
- `seaweedfs_telemetry_reports_received_total`: Total telemetry reports received
## API Endpoints
### Data Collection
```bash
# Submit telemetry data (protobuf only)
POST /api/collect
Content-Type: application/x-protobuf
[TelemetryRequest protobuf data]
```
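For reference, here is a minimal client sketch in Go (illustrative only, not the built-in SeaweedFS client). It assumes the generated `github.com/seaweedfs/seaweedfs/telemetry/proto` package is importable and that a telemetry server is listening on `localhost:8080`; the field values are made-up examples.
```go
package main

import (
	"bytes"
	"fmt"
	"net/http"
	"time"

	"github.com/seaweedfs/seaweedfs/telemetry/proto"
	protobuf "google.golang.org/protobuf/proto"
)

func main() {
	// Build a telemetry report containing only cluster-level statistics.
	req := &proto.TelemetryRequest{
		Data: &proto.TelemetryData{
			ClusterId:         "a1b2c3d4-example", // in-memory UUID, regenerated on restart
			Version:           "3.45",
			Os:                "linux/amd64",
			Features:          []string{"filer", "s3api"},
			Deployment:        "cluster",
			VolumeServerCount: 5,
			TotalDiskBytes:    1073741824,
			TotalVolumeCount:  120,
			FilerCount:        2,
			BrokerCount:       1,
			Timestamp:         time.Now().Unix(),
		},
	}

	// Serialize to protobuf and POST to the collection endpoint.
	payload, err := protobuf.Marshal(req)
	if err != nil {
		panic(err)
	}
	resp, err := http.Post("http://localhost:8080/api/collect",
		"application/x-protobuf", bytes.NewReader(payload))
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()
	fmt.Println("server response:", resp.Status)
}
```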
### Statistics (JSON for dashboard/debugging)
```bash
# Get aggregated statistics
GET /api/stats
# Get recent cluster instances
GET /api/instances?limit=100
# Get metrics over time
GET /api/metrics?days=30
```
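These JSON endpoints can be inspected with `curl` or from code. A minimal Go sketch (assuming a server on `localhost:8080`; the response is decoded generically rather than into a fixed schema):
```go
package main

import (
	"encoding/json"
	"fmt"
	"net/http"
)

func main() {
	// Fetch aggregated statistics from the telemetry server.
	resp, err := http.Get("http://localhost:8080/api/stats")
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	// Decode into a generic map for a quick look at the data.
	var stats map[string]interface{}
	if err := json.NewDecoder(resp.Body).Decode(&stats); err != nil {
		panic(err)
	}
	fmt.Printf("aggregated stats: %+v\n", stats)
}
```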
### Monitoring
```bash
# Prometheus metrics
GET /metrics
```
## Docker Deployment
### Complete Stack (Recommended)
```yaml
# docker-compose.yml
version: '3.8'
services:
telemetry-server:
build: ./server
ports:
- "8080:8080"
command: ["-port=8080", "-dashboard=true", "-cleanup=24h"]
prometheus:
image: prom/prometheus:latest
ports:
- "9090:9090"
volumes:
- ./prometheus.yml:/etc/prometheus/prometheus.yml
grafana:
image: grafana/grafana:latest
ports:
- "3000:3000"
environment:
- GF_SECURITY_ADMIN_PASSWORD=admin
volumes:
- ./grafana-provisioning:/etc/grafana/provisioning
- ./grafana-dashboard.json:/var/lib/grafana/dashboards/seaweedfs.json
```
```bash
# Deploy the stack
docker-compose up -d
# Scale telemetry server if needed
docker-compose up -d --scale telemetry-server=3
```
### Server Only
```bash
# Build and run telemetry server
cd server
docker build -t seaweedfs-telemetry .
docker run -p 8080:8080 seaweedfs-telemetry -port=8080 -dashboard=true
```
## Development
### Protocol Buffer Development
```bash
# Generate protobuf code
cd telemetry
protoc --go_out=. --go_opt=paths=source_relative proto/telemetry.proto
# The generated code is already included in the repository
```
### Build from Source
```bash
# Build telemetry server
cd telemetry/server
go build -o telemetry-server .
# Build SeaweedFS with telemetry support
cd ../..
go build -o weed ./weed
```
### Testing
```bash
# Test telemetry server
cd telemetry/server
go test ./...
# Test protobuf communication (requires protobuf tools)
# See telemetry client code for examples
```
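As a starting point, a protobuf round-trip unit test might look like the following sketch. It only exercises the generated types (not the HTTP handler), and the test package placement is an assumption:
```go
package proto_test

import (
	"testing"

	"github.com/seaweedfs/seaweedfs/telemetry/proto"
	protobuf "google.golang.org/protobuf/proto"
)

func TestTelemetryRequestRoundTrip(t *testing.T) {
	in := &proto.TelemetryRequest{
		Data: &proto.TelemetryData{
			ClusterId:   "test-cluster",
			Version:     "3.45",
			Os:          "linux/amd64",
			BrokerCount: 1,
		},
	}

	// Marshal to the wire format and back.
	raw, err := protobuf.Marshal(in)
	if err != nil {
		t.Fatalf("marshal: %v", err)
	}
	out := &proto.TelemetryRequest{}
	if err := protobuf.Unmarshal(raw, out); err != nil {
		t.Fatalf("unmarshal: %v", err)
	}

	// Compare messages semantically; protobuf messages should not be
	// compared with reflect.DeepEqual.
	if !protobuf.Equal(in, out) {
		t.Fatalf("round trip mismatch: got %v, want %v", out, in)
	}
}
```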
## Grafana Dashboard
The included Grafana dashboard provides:
- **Overview**: Total and active clusters, version distribution
- **Resource Usage**: Volume servers and disk usage over time
- **Deployments**: Deployment type and OS distribution
- **Growth Trends**: Historical growth patterns
### Custom Queries
```promql
# Total active clusters
seaweedfs_telemetry_active_clusters
# Disk usage by version
sum by (version) (seaweedfs_telemetry_disk_bytes)
# Volume servers by deployment type
sum by (deployment) (seaweedfs_telemetry_volume_servers)
# Filer servers by version
sum by (version) (seaweedfs_telemetry_filer_count)
# Broker servers across all clusters
sum(seaweedfs_telemetry_broker_count)
# Growth rate (weekly)
increase(seaweedfs_telemetry_total_clusters[7d])
```
## Security Considerations
- **Network Security**: Use HTTPS in production environments
- **Access Control**: Implement authentication for Grafana and Prometheus
- **Data Retention**: Configure appropriate retention policies
- **Monitoring**: Monitor the telemetry infrastructure itself
## Troubleshooting
### Common Issues
**SeaweedFS not sending data:**
```bash
# Check telemetry configuration
weed master -h | grep telemetry
# Verify connectivity
curl -v http://your-telemetry-server:8080/api/collect
```
**Server not receiving data:**
```bash
# Check server logs
docker-compose logs telemetry-server
# Verify metrics endpoint
curl http://localhost:8080/metrics
```
**Prometheus not scraping:**
```bash
# Check Prometheus targets
curl http://localhost:9090/api/v1/targets
# Verify configuration
docker-compose logs prometheus
```
### Debugging
```bash
# Enable verbose logging in SeaweedFS
weed master -v=2 -telemetry=true
# Check telemetry server metrics
curl http://localhost:8080/metrics | grep seaweedfs_telemetry
# Test data flow
curl http://localhost:8080/api/stats
```
## Contributing
1. Fork the repository
2. Create a feature branch
3. Make your changes
4. Add tests if applicable
5. Submit a pull request
## License
This telemetry system is part of SeaweedFS and follows the same Apache 2.0 license.


@@ -0,0 +1,55 @@
version: '3.8'
services:
telemetry-server:
build: ./server
ports:
- "8080:8080"
command: [
"./telemetry-server",
"-port=8080",
"-dashboard=false", # Disable built-in dashboard, use Grafana
"-log=true",
"-cors=true"
]
networks:
- telemetry
prometheus:
image: prom/prometheus:latest
ports:
- "9090:9090"
volumes:
- ./prometheus.yml:/etc/prometheus/prometheus.yml
- prometheus_data:/prometheus
command:
- '--config.file=/etc/prometheus/prometheus.yml'
- '--storage.tsdb.path=/prometheus'
- '--web.console.libraries=/etc/prometheus/console_libraries'
- '--web.console.templates=/etc/prometheus/consoles'
- '--storage.tsdb.retention.time=200h'
- '--web.enable-lifecycle'
networks:
- telemetry
grafana:
image: grafana/grafana:latest
ports:
- "3000:3000"
environment:
- GF_SECURITY_ADMIN_PASSWORD=admin
- GF_USERS_ALLOW_SIGN_UP=false
volumes:
- grafana_data:/var/lib/grafana
- ./grafana-dashboard.json:/var/lib/grafana/dashboards/seaweedfs-telemetry.json
- ./grafana-provisioning:/etc/grafana/provisioning
networks:
- telemetry
volumes:
prometheus_data:
grafana_data:
networks:
telemetry:
driver: bridge


@@ -0,0 +1,734 @@
{
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": {
"type": "grafana",
"uid": "-- Grafana --"
},
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 0,
"id": null,
"links": [],
"liveNow": false,
"panels": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"custom": {
"align": "auto",
"cellOptions": {
"type": "auto"
},
"inspect": false
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 0
},
"id": 1,
"options": {
"showHeader": true
},
"pluginVersion": "10.0.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"expr": "seaweedfs_telemetry_total_clusters",
"format": "time_series",
"refId": "A"
}
],
"title": "Total SeaweedFS Clusters",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"custom": {
"align": "auto",
"cellOptions": {
"type": "auto"
},
"inspect": false
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 0
},
"id": 2,
"options": {
"showHeader": true
},
"pluginVersion": "10.0.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"expr": "seaweedfs_telemetry_active_clusters",
"format": "time_series",
"refId": "A"
}
],
"title": "Active Clusters (7 days)",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"hideFrom": {
"legend": false,
"tooltip": false,
"vis": false
}
},
"mappings": []
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 8
},
"id": 3,
"options": {
"legend": {
"displayMode": "visible",
"placement": "bottom",
"showLegend": true
},
"pieType": "pie",
"reduceOptions": {
"values": false,
"calcs": [
"lastNotNull"
],
"fields": ""
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"expr": "count by (version) (seaweedfs_telemetry_cluster_info)",
"format": "time_series",
"legendFormat": "{{version}}",
"refId": "A"
}
],
"title": "SeaweedFS Version Distribution",
"type": "piechart"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"hideFrom": {
"legend": false,
"tooltip": false,
"vis": false
}
},
"mappings": []
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 8
},
"id": 4,
"options": {
"legend": {
"displayMode": "visible",
"placement": "bottom",
"showLegend": true
},
"pieType": "pie",
"reduceOptions": {
"values": false,
"calcs": [
"lastNotNull"
],
"fields": ""
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"expr": "count by (os) (seaweedfs_telemetry_cluster_info)",
"format": "time_series",
"legendFormat": "{{os}}",
"refId": "A"
}
],
"title": "Operating System Distribution",
"type": "piechart"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"vis": false
},
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 24,
"x": 0,
"y": 16
},
"id": 5,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"expr": "sum(seaweedfs_telemetry_volume_servers)",
"format": "time_series",
"legendFormat": "Total Volume Servers",
"refId": "A"
}
],
"title": "Total Volume Servers Over Time",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"vis": false
},
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "bytes"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 24
},
"id": 6,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"expr": "sum(seaweedfs_telemetry_disk_bytes)",
"format": "time_series",
"legendFormat": "Total Disk Usage",
"refId": "A"
}
],
"title": "Total Disk Usage Over Time",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"vis": false
},
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 24
},
"id": 7,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"expr": "sum(seaweedfs_telemetry_volume_count)",
"format": "time_series",
"legendFormat": "Total Volume Count",
"refId": "A"
}
],
"title": "Total Volume Count Over Time",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"vis": false
},
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 32
},
"id": 8,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"expr": "sum(seaweedfs_telemetry_filer_count)",
"format": "time_series",
"legendFormat": "Total Filer Count",
"refId": "A"
}
],
"title": "Total Filer Servers Over Time",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"vis": false
},
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 32
},
"id": 9,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"expr": "sum(seaweedfs_telemetry_broker_count)",
"format": "time_series",
"legendFormat": "Total Broker Count",
"refId": "A"
}
],
"title": "Total Broker Servers Over Time",
"type": "timeseries"
}
],
"refresh": "5m",
"schemaVersion": 38,
"style": "dark",
"tags": [
"seaweedfs",
"telemetry"
],
"templating": {
"list": []
},
"time": {
"from": "now-24h",
"to": "now"
},
"timepicker": {},
"timezone": "",
"title": "SeaweedFS Telemetry Dashboard",
"uid": "seaweedfs-telemetry",
"version": 1,
"weekStart": ""
}


@@ -0,0 +1,12 @@
apiVersion: 1
providers:
- name: 'seaweedfs'
orgId: 1
folder: ''
type: file
disableDeletion: false
updateIntervalSeconds: 10
allowUiUpdates: true
options:
path: /var/lib/grafana/dashboards


@@ -0,0 +1,9 @@
apiVersion: 1
datasources:
- name: Prometheus
type: prometheus
access: proxy
url: http://prometheus:9090
isDefault: true
editable: true

telemetry/prometheus.yml

@@ -0,0 +1,15 @@
global:
scrape_interval: 15s
evaluation_interval: 15s
rule_files:
# - "first_rules.yml"
# - "second_rules.yml"
scrape_configs:
- job_name: 'seaweedfs-telemetry'
static_configs:
- targets: ['telemetry-server:8080']
scrape_interval: 30s
metrics_path: '/metrics'
scrape_timeout: 10s


@@ -0,0 +1,398 @@
// Code generated by protoc-gen-go. DO NOT EDIT.
// versions:
// protoc-gen-go v1.34.2
// protoc v5.29.3
// source: proto/telemetry.proto
package proto
import (
protoreflect "google.golang.org/protobuf/reflect/protoreflect"
protoimpl "google.golang.org/protobuf/runtime/protoimpl"
reflect "reflect"
sync "sync"
)
const (
// Verify that this generated code is sufficiently up-to-date.
_ = protoimpl.EnforceVersion(20 - protoimpl.MinVersion)
// Verify that runtime/protoimpl is sufficiently up-to-date.
_ = protoimpl.EnforceVersion(protoimpl.MaxVersion - 20)
)
// TelemetryData represents cluster-level telemetry information
type TelemetryData struct {
state protoimpl.MessageState
sizeCache protoimpl.SizeCache
unknownFields protoimpl.UnknownFields
// Unique cluster identifier (generated in-memory)
ClusterId string `protobuf:"bytes,1,opt,name=cluster_id,json=clusterId,proto3" json:"cluster_id,omitempty"`
// SeaweedFS version
Version string `protobuf:"bytes,2,opt,name=version,proto3" json:"version,omitempty"`
// Operating system (e.g., "linux/amd64")
Os string `protobuf:"bytes,3,opt,name=os,proto3" json:"os,omitempty"`
// Enabled features (e.g., ["filer", "s3api", "mq"])
Features []string `protobuf:"bytes,4,rep,name=features,proto3" json:"features,omitempty"`
// Deployment type ("standalone", "cluster", "master-only", "volume-only")
Deployment string `protobuf:"bytes,5,opt,name=deployment,proto3" json:"deployment,omitempty"`
// Number of volume servers in the cluster
VolumeServerCount int32 `protobuf:"varint,6,opt,name=volume_server_count,json=volumeServerCount,proto3" json:"volume_server_count,omitempty"`
// Total disk usage across all volume servers (in bytes)
TotalDiskBytes uint64 `protobuf:"varint,7,opt,name=total_disk_bytes,json=totalDiskBytes,proto3" json:"total_disk_bytes,omitempty"`
// Total number of volumes in the cluster
TotalVolumeCount int32 `protobuf:"varint,8,opt,name=total_volume_count,json=totalVolumeCount,proto3" json:"total_volume_count,omitempty"`
// Number of filer servers in the cluster
FilerCount int32 `protobuf:"varint,9,opt,name=filer_count,json=filerCount,proto3" json:"filer_count,omitempty"`
// Number of broker servers in the cluster
BrokerCount int32 `protobuf:"varint,10,opt,name=broker_count,json=brokerCount,proto3" json:"broker_count,omitempty"`
// Unix timestamp when the data was collected
Timestamp int64 `protobuf:"varint,11,opt,name=timestamp,proto3" json:"timestamp,omitempty"`
}
func (x *TelemetryData) Reset() {
*x = TelemetryData{}
if protoimpl.UnsafeEnabled {
mi := &file_proto_telemetry_proto_msgTypes[0]
ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x))
ms.StoreMessageInfo(mi)
}
}
func (x *TelemetryData) String() string {
return protoimpl.X.MessageStringOf(x)
}
func (*TelemetryData) ProtoMessage() {}
func (x *TelemetryData) ProtoReflect() protoreflect.Message {
mi := &file_proto_telemetry_proto_msgTypes[0]
if protoimpl.UnsafeEnabled && x != nil {
ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x))
if ms.LoadMessageInfo() == nil {
ms.StoreMessageInfo(mi)
}
return ms
}
return mi.MessageOf(x)
}
// Deprecated: Use TelemetryData.ProtoReflect.Descriptor instead.
func (*TelemetryData) Descriptor() ([]byte, []int) {
return file_proto_telemetry_proto_rawDescGZIP(), []int{0}
}
func (x *TelemetryData) GetClusterId() string {
if x != nil {
return x.ClusterId
}
return ""
}
func (x *TelemetryData) GetVersion() string {
if x != nil {
return x.Version
}
return ""
}
func (x *TelemetryData) GetOs() string {
if x != nil {
return x.Os
}
return ""
}
func (x *TelemetryData) GetFeatures() []string {
if x != nil {
return x.Features
}
return nil
}
func (x *TelemetryData) GetDeployment() string {
if x != nil {
return x.Deployment
}
return ""
}
func (x *TelemetryData) GetVolumeServerCount() int32 {
if x != nil {
return x.VolumeServerCount
}
return 0
}
func (x *TelemetryData) GetTotalDiskBytes() uint64 {
if x != nil {
return x.TotalDiskBytes
}
return 0
}
func (x *TelemetryData) GetTotalVolumeCount() int32 {
if x != nil {
return x.TotalVolumeCount
}
return 0
}
func (x *TelemetryData) GetFilerCount() int32 {
if x != nil {
return x.FilerCount
}
return 0
}
func (x *TelemetryData) GetBrokerCount() int32 {
if x != nil {
return x.BrokerCount
}
return 0
}
func (x *TelemetryData) GetTimestamp() int64 {
if x != nil {
return x.Timestamp
}
return 0
}
// TelemetryRequest is sent from SeaweedFS clusters to the telemetry server
type TelemetryRequest struct {
state protoimpl.MessageState
sizeCache protoimpl.SizeCache
unknownFields protoimpl.UnknownFields
Data *TelemetryData `protobuf:"bytes,1,opt,name=data,proto3" json:"data,omitempty"`
}
func (x *TelemetryRequest) Reset() {
*x = TelemetryRequest{}
if protoimpl.UnsafeEnabled {
mi := &file_proto_telemetry_proto_msgTypes[1]
ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x))
ms.StoreMessageInfo(mi)
}
}
func (x *TelemetryRequest) String() string {
return protoimpl.X.MessageStringOf(x)
}
func (*TelemetryRequest) ProtoMessage() {}
func (x *TelemetryRequest) ProtoReflect() protoreflect.Message {
mi := &file_proto_telemetry_proto_msgTypes[1]
if protoimpl.UnsafeEnabled && x != nil {
ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x))
if ms.LoadMessageInfo() == nil {
ms.StoreMessageInfo(mi)
}
return ms
}
return mi.MessageOf(x)
}
// Deprecated: Use TelemetryRequest.ProtoReflect.Descriptor instead.
func (*TelemetryRequest) Descriptor() ([]byte, []int) {
return file_proto_telemetry_proto_rawDescGZIP(), []int{1}
}
func (x *TelemetryRequest) GetData() *TelemetryData {
if x != nil {
return x.Data
}
return nil
}
// TelemetryResponse is returned by the telemetry server
type TelemetryResponse struct {
state protoimpl.MessageState
sizeCache protoimpl.SizeCache
unknownFields protoimpl.UnknownFields
Success bool `protobuf:"varint,1,opt,name=success,proto3" json:"success,omitempty"`
Message string `protobuf:"bytes,2,opt,name=message,proto3" json:"message,omitempty"`
}
func (x *TelemetryResponse) Reset() {
*x = TelemetryResponse{}
if protoimpl.UnsafeEnabled {
mi := &file_proto_telemetry_proto_msgTypes[2]
ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x))
ms.StoreMessageInfo(mi)
}
}
func (x *TelemetryResponse) String() string {
return protoimpl.X.MessageStringOf(x)
}
func (*TelemetryResponse) ProtoMessage() {}
func (x *TelemetryResponse) ProtoReflect() protoreflect.Message {
mi := &file_proto_telemetry_proto_msgTypes[2]
if protoimpl.UnsafeEnabled && x != nil {
ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x))
if ms.LoadMessageInfo() == nil {
ms.StoreMessageInfo(mi)
}
return ms
}
return mi.MessageOf(x)
}
// Deprecated: Use TelemetryResponse.ProtoReflect.Descriptor instead.
func (*TelemetryResponse) Descriptor() ([]byte, []int) {
return file_proto_telemetry_proto_rawDescGZIP(), []int{2}
}
func (x *TelemetryResponse) GetSuccess() bool {
if x != nil {
return x.Success
}
return false
}
func (x *TelemetryResponse) GetMessage() string {
if x != nil {
return x.Message
}
return ""
}
var File_proto_telemetry_proto protoreflect.FileDescriptor
var file_proto_telemetry_proto_rawDesc = []byte{
0x0a, 0x15, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x2f, 0x74, 0x65, 0x6c, 0x65, 0x6d, 0x65, 0x74, 0x72,
0x79, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x12, 0x09, 0x74, 0x65, 0x6c, 0x65, 0x6d, 0x65, 0x74,
0x72, 0x79, 0x22, 0xfe, 0x02, 0x0a, 0x0d, 0x54, 0x65, 0x6c, 0x65, 0x6d, 0x65, 0x74, 0x72, 0x79,
0x44, 0x61, 0x74, 0x61, 0x12, 0x1d, 0x0a, 0x0a, 0x63, 0x6c, 0x75, 0x73, 0x74, 0x65, 0x72, 0x5f,
0x69, 0x64, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x09, 0x63, 0x6c, 0x75, 0x73, 0x74, 0x65,
0x72, 0x49, 0x64, 0x12, 0x18, 0x0a, 0x07, 0x76, 0x65, 0x72, 0x73, 0x69, 0x6f, 0x6e, 0x18, 0x02,
0x20, 0x01, 0x28, 0x09, 0x52, 0x07, 0x76, 0x65, 0x72, 0x73, 0x69, 0x6f, 0x6e, 0x12, 0x0e, 0x0a,
0x02, 0x6f, 0x73, 0x18, 0x03, 0x20, 0x01, 0x28, 0x09, 0x52, 0x02, 0x6f, 0x73, 0x12, 0x1a, 0x0a,
0x08, 0x66, 0x65, 0x61, 0x74, 0x75, 0x72, 0x65, 0x73, 0x18, 0x04, 0x20, 0x03, 0x28, 0x09, 0x52,
0x08, 0x66, 0x65, 0x61, 0x74, 0x75, 0x72, 0x65, 0x73, 0x12, 0x1e, 0x0a, 0x0a, 0x64, 0x65, 0x70,
0x6c, 0x6f, 0x79, 0x6d, 0x65, 0x6e, 0x74, 0x18, 0x05, 0x20, 0x01, 0x28, 0x09, 0x52, 0x0a, 0x64,
0x65, 0x70, 0x6c, 0x6f, 0x79, 0x6d, 0x65, 0x6e, 0x74, 0x12, 0x2e, 0x0a, 0x13, 0x76, 0x6f, 0x6c,
0x75, 0x6d, 0x65, 0x5f, 0x73, 0x65, 0x72, 0x76, 0x65, 0x72, 0x5f, 0x63, 0x6f, 0x75, 0x6e, 0x74,
0x18, 0x06, 0x20, 0x01, 0x28, 0x05, 0x52, 0x11, 0x76, 0x6f, 0x6c, 0x75, 0x6d, 0x65, 0x53, 0x65,
0x72, 0x76, 0x65, 0x72, 0x43, 0x6f, 0x75, 0x6e, 0x74, 0x12, 0x28, 0x0a, 0x10, 0x74, 0x6f, 0x74,
0x61, 0x6c, 0x5f, 0x64, 0x69, 0x73, 0x6b, 0x5f, 0x62, 0x79, 0x74, 0x65, 0x73, 0x18, 0x07, 0x20,
0x01, 0x28, 0x04, 0x52, 0x0e, 0x74, 0x6f, 0x74, 0x61, 0x6c, 0x44, 0x69, 0x73, 0x6b, 0x42, 0x79,
0x74, 0x65, 0x73, 0x12, 0x2c, 0x0a, 0x12, 0x74, 0x6f, 0x74, 0x61, 0x6c, 0x5f, 0x76, 0x6f, 0x6c,
0x75, 0x6d, 0x65, 0x5f, 0x63, 0x6f, 0x75, 0x6e, 0x74, 0x18, 0x08, 0x20, 0x01, 0x28, 0x05, 0x52,
0x10, 0x74, 0x6f, 0x74, 0x61, 0x6c, 0x56, 0x6f, 0x6c, 0x75, 0x6d, 0x65, 0x43, 0x6f, 0x75, 0x6e,
0x74, 0x12, 0x1f, 0x0a, 0x0b, 0x66, 0x69, 0x6c, 0x65, 0x72, 0x5f, 0x63, 0x6f, 0x75, 0x6e, 0x74,
0x18, 0x09, 0x20, 0x01, 0x28, 0x05, 0x52, 0x0a, 0x66, 0x69, 0x6c, 0x65, 0x72, 0x43, 0x6f, 0x75,
0x6e, 0x74, 0x12, 0x21, 0x0a, 0x0c, 0x62, 0x72, 0x6f, 0x6b, 0x65, 0x72, 0x5f, 0x63, 0x6f, 0x75,
0x6e, 0x74, 0x18, 0x0a, 0x20, 0x01, 0x28, 0x05, 0x52, 0x0b, 0x62, 0x72, 0x6f, 0x6b, 0x65, 0x72,
0x43, 0x6f, 0x75, 0x6e, 0x74, 0x12, 0x1c, 0x0a, 0x09, 0x74, 0x69, 0x6d, 0x65, 0x73, 0x74, 0x61,
0x6d, 0x70, 0x18, 0x0b, 0x20, 0x01, 0x28, 0x03, 0x52, 0x09, 0x74, 0x69, 0x6d, 0x65, 0x73, 0x74,
0x61, 0x6d, 0x70, 0x22, 0x40, 0x0a, 0x10, 0x54, 0x65, 0x6c, 0x65, 0x6d, 0x65, 0x74, 0x72, 0x79,
0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x12, 0x2c, 0x0a, 0x04, 0x64, 0x61, 0x74, 0x61, 0x18,
0x01, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x18, 0x2e, 0x74, 0x65, 0x6c, 0x65, 0x6d, 0x65, 0x74, 0x72,
0x79, 0x2e, 0x54, 0x65, 0x6c, 0x65, 0x6d, 0x65, 0x74, 0x72, 0x79, 0x44, 0x61, 0x74, 0x61, 0x52,
0x04, 0x64, 0x61, 0x74, 0x61, 0x22, 0x47, 0x0a, 0x11, 0x54, 0x65, 0x6c, 0x65, 0x6d, 0x65, 0x74,
0x72, 0x79, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x12, 0x18, 0x0a, 0x07, 0x73, 0x75,
0x63, 0x63, 0x65, 0x73, 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, 0x08, 0x52, 0x07, 0x73, 0x75, 0x63,
0x63, 0x65, 0x73, 0x73, 0x12, 0x18, 0x0a, 0x07, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x18,
0x02, 0x20, 0x01, 0x28, 0x09, 0x52, 0x07, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x42, 0x30,
0x5a, 0x2e, 0x67, 0x69, 0x74, 0x68, 0x75, 0x62, 0x2e, 0x63, 0x6f, 0x6d, 0x2f, 0x73, 0x65, 0x61,
0x77, 0x65, 0x65, 0x64, 0x66, 0x73, 0x2f, 0x73, 0x65, 0x61, 0x77, 0x65, 0x65, 0x64, 0x66, 0x73,
0x2f, 0x74, 0x65, 0x6c, 0x65, 0x6d, 0x65, 0x74, 0x72, 0x79, 0x2f, 0x70, 0x72, 0x6f, 0x74, 0x6f,
0x62, 0x06, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x33,
}
var (
file_proto_telemetry_proto_rawDescOnce sync.Once
file_proto_telemetry_proto_rawDescData = file_proto_telemetry_proto_rawDesc
)
func file_proto_telemetry_proto_rawDescGZIP() []byte {
file_proto_telemetry_proto_rawDescOnce.Do(func() {
file_proto_telemetry_proto_rawDescData = protoimpl.X.CompressGZIP(file_proto_telemetry_proto_rawDescData)
})
return file_proto_telemetry_proto_rawDescData
}
var file_proto_telemetry_proto_msgTypes = make([]protoimpl.MessageInfo, 3)
var file_proto_telemetry_proto_goTypes = []any{
(*TelemetryData)(nil), // 0: telemetry.TelemetryData
(*TelemetryRequest)(nil), // 1: telemetry.TelemetryRequest
(*TelemetryResponse)(nil), // 2: telemetry.TelemetryResponse
}
var file_proto_telemetry_proto_depIdxs = []int32{
0, // 0: telemetry.TelemetryRequest.data:type_name -> telemetry.TelemetryData
1, // [1:1] is the sub-list for method output_type
1, // [1:1] is the sub-list for method input_type
1, // [1:1] is the sub-list for extension type_name
1, // [1:1] is the sub-list for extension extendee
0, // [0:1] is the sub-list for field type_name
}
func init() { file_proto_telemetry_proto_init() }
func file_proto_telemetry_proto_init() {
if File_proto_telemetry_proto != nil {
return
}
if !protoimpl.UnsafeEnabled {
file_proto_telemetry_proto_msgTypes[0].Exporter = func(v any, i int) any {
switch v := v.(*TelemetryData); i {
case 0:
return &v.state
case 1:
return &v.sizeCache
case 2:
return &v.unknownFields
default:
return nil
}
}
file_proto_telemetry_proto_msgTypes[1].Exporter = func(v any, i int) any {
switch v := v.(*TelemetryRequest); i {
case 0:
return &v.state
case 1:
return &v.sizeCache
case 2:
return &v.unknownFields
default:
return nil
}
}
file_proto_telemetry_proto_msgTypes[2].Exporter = func(v any, i int) any {
switch v := v.(*TelemetryResponse); i {
case 0:
return &v.state
case 1:
return &v.sizeCache
case 2:
return &v.unknownFields
default:
return nil
}
}
}
type x struct{}
out := protoimpl.TypeBuilder{
File: protoimpl.DescBuilder{
GoPackagePath: reflect.TypeOf(x{}).PkgPath(),
RawDescriptor: file_proto_telemetry_proto_rawDesc,
NumEnums: 0,
NumMessages: 3,
NumExtensions: 0,
NumServices: 0,
},
GoTypes: file_proto_telemetry_proto_goTypes,
DependencyIndexes: file_proto_telemetry_proto_depIdxs,
MessageInfos: file_proto_telemetry_proto_msgTypes,
}.Build()
File_proto_telemetry_proto = out.File
file_proto_telemetry_proto_rawDesc = nil
file_proto_telemetry_proto_goTypes = nil
file_proto_telemetry_proto_depIdxs = nil
}


@@ -0,0 +1,52 @@
syntax = "proto3";
package telemetry;
option go_package = "github.com/seaweedfs/seaweedfs/telemetry/proto";
// TelemetryData represents cluster-level telemetry information
message TelemetryData {
// Unique cluster identifier (generated in-memory)
string cluster_id = 1;
// SeaweedFS version
string version = 2;
// Operating system (e.g., "linux/amd64")
string os = 3;
// Enabled features (e.g., ["filer", "s3api", "mq"])
repeated string features = 4;
// Deployment type ("standalone", "cluster", "master-only", "volume-only")
string deployment = 5;
// Number of volume servers in the cluster
int32 volume_server_count = 6;
// Total disk usage across all volume servers (in bytes)
uint64 total_disk_bytes = 7;
// Total number of volumes in the cluster
int32 total_volume_count = 8;
// Number of filer servers in the cluster
int32 filer_count = 9;
// Number of broker servers in the cluster
int32 broker_count = 10;
// Unix timestamp when the data was collected
int64 timestamp = 11;
}
// TelemetryRequest is sent from SeaweedFS clusters to the telemetry server
message TelemetryRequest {
TelemetryData data = 1;
}
// TelemetryResponse is returned by the telemetry server
message TelemetryResponse {
bool success = 1;
string message = 2;
}


@@ -0,0 +1,18 @@
FROM golang:1.21-alpine AS builder
WORKDIR /app
COPY go.mod go.sum ./
RUN go mod download
COPY . .
RUN CGO_ENABLED=0 GOOS=linux go build -a -installsuffix cgo -ldflags '-extldflags "-static"' -o telemetry-server .
FROM alpine:latest
RUN apk --no-cache add ca-certificates
WORKDIR /root/
COPY --from=builder /app/telemetry-server .
EXPOSE 8080
CMD ["./telemetry-server"]

telemetry/server/Makefile

@@ -0,0 +1,97 @@
.PHONY: build run clean test deps proto integration-test test-all
# Build the telemetry server
build:
go build -o telemetry-server .
# Run the server in development mode
run:
go run . -port=8080 -dashboard=true -cleanup=1h -max-age=24h
# Run the server in production mode
run-prod:
./telemetry-server -port=8080 -dashboard=true -cleanup=24h -max-age=720h
# Clean build artifacts
clean:
rm -f telemetry-server
rm -f ../test/telemetry-server-test.log
go clean
# Run unit tests
test:
go test ./...
# Run integration tests
integration-test:
@echo "🧪 Running telemetry integration tests..."
cd ../../ && go run telemetry/test/integration.go
# Run all tests (unit + integration)
test-all: test integration-test
# Install dependencies
deps:
go mod download
go mod tidy
# Generate protobuf code (requires protoc)
proto:
cd .. && protoc --go_out=. --go_opt=paths=source_relative proto/telemetry.proto
# Build Docker image
docker-build:
docker build -t seaweedfs-telemetry .
# Run with Docker
docker-run:
docker run -p 8080:8080 seaweedfs-telemetry -port=8080 -dashboard=true
# Development with auto-reload (requires air: go install github.com/cosmtrek/air@latest)
dev:
air
# Check if protoc is available
check-protoc:
@which protoc > /dev/null || (echo "protoc is required for proto generation. Install from https://grpc.io/docs/protoc-installation/" && exit 1)
# Full development setup
setup: check-protoc deps proto build
# Run a quick smoke test
smoke-test: build
@echo "🔥 Running smoke test..."
@timeout 10s ./telemetry-server -port=18081 > /dev/null 2>&1 & \
SERVER_PID=$$!; \
sleep 2; \
if curl -s http://localhost:18081/health > /dev/null; then \
echo "✅ Smoke test passed - server responds to health check"; \
else \
echo "❌ Smoke test failed - server not responding"; \
exit 1; \
fi; \
kill $$SERVER_PID 2>/dev/null || true
# Continuous integration target
ci: deps proto build test integration-test
@echo "🎉 All CI tests passed!"
# Help
help:
@echo "Available targets:"
@echo " build - Build the telemetry server binary"
@echo " run - Run server in development mode"
@echo " run-prod - Run server in production mode"
@echo " clean - Clean build artifacts"
@echo " test - Run unit tests"
@echo " integration-test- Run integration tests"
@echo " test-all - Run all tests (unit + integration)"
@echo " deps - Install Go dependencies"
@echo " proto - Generate protobuf code"
@echo " docker-build - Build Docker image"
@echo " docker-run - Run with Docker"
@echo " dev - Run with auto-reload (requires air)"
@echo " smoke-test - Quick server health check"
@echo " setup - Full development setup"
@echo " ci - Continuous integration (all tests)"
@echo " help - Show this help"


@@ -0,0 +1,152 @@
package api
import (
"encoding/json"
"io"
"net/http"
"strconv"
"time"
"github.com/seaweedfs/seaweedfs/telemetry/proto"
"github.com/seaweedfs/seaweedfs/telemetry/server/storage"
protobuf "google.golang.org/protobuf/proto"
)
type Handler struct {
storage *storage.PrometheusStorage
}
func NewHandler(storage *storage.PrometheusStorage) *Handler {
return &Handler{storage: storage}
}
func (h *Handler) CollectTelemetry(w http.ResponseWriter, r *http.Request) {
if r.Method != http.MethodPost {
http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
return
}
contentType := r.Header.Get("Content-Type")
// Only accept protobuf content type
if contentType != "application/x-protobuf" && contentType != "application/protobuf" {
http.Error(w, "Content-Type must be application/x-protobuf", http.StatusUnsupportedMediaType)
return
}
// Read protobuf request
body, err := io.ReadAll(r.Body)
if err != nil {
http.Error(w, "Failed to read request body", http.StatusBadRequest)
return
}
req := &proto.TelemetryRequest{}
if err := protobuf.Unmarshal(body, req); err != nil {
http.Error(w, "Invalid protobuf data", http.StatusBadRequest)
return
}
data := req.Data
if data == nil {
http.Error(w, "Missing telemetry data", http.StatusBadRequest)
return
}
// Validate required fields
if data.ClusterId == "" || data.Version == "" || data.Os == "" {
http.Error(w, "Missing required fields", http.StatusBadRequest)
return
}
// Set timestamp if not provided
if data.Timestamp == 0 {
data.Timestamp = time.Now().Unix()
}
// Store the telemetry data
if err := h.storage.StoreTelemetry(data); err != nil {
http.Error(w, "Failed to store data", http.StatusInternalServerError)
return
}
// Return protobuf response
resp := &proto.TelemetryResponse{
Success: true,
Message: "Telemetry data received",
}
respData, err := protobuf.Marshal(resp)
if err != nil {
http.Error(w, "Failed to marshal response", http.StatusInternalServerError)
return
}
w.Header().Set("Content-Type", "application/x-protobuf")
w.WriteHeader(http.StatusOK)
w.Write(respData)
}
func (h *Handler) GetStats(w http.ResponseWriter, r *http.Request) {
if r.Method != http.MethodGet {
http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
return
}
stats, err := h.storage.GetStats()
if err != nil {
http.Error(w, "Failed to get stats", http.StatusInternalServerError)
return
}
w.Header().Set("Content-Type", "application/json")
json.NewEncoder(w).Encode(stats)
}
func (h *Handler) GetInstances(w http.ResponseWriter, r *http.Request) {
if r.Method != http.MethodGet {
http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
return
}
limitStr := r.URL.Query().Get("limit")
limit := 100 // default
if limitStr != "" {
if l, err := strconv.Atoi(limitStr); err == nil && l > 0 && l <= 1000 {
limit = l
}
}
instances, err := h.storage.GetInstances(limit)
if err != nil {
http.Error(w, "Failed to get instances", http.StatusInternalServerError)
return
}
w.Header().Set("Content-Type", "application/json")
json.NewEncoder(w).Encode(instances)
}
func (h *Handler) GetMetrics(w http.ResponseWriter, r *http.Request) {
if r.Method != http.MethodGet {
http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
return
}
daysStr := r.URL.Query().Get("days")
days := 30 // default
if daysStr != "" {
if d, err := strconv.Atoi(daysStr); err == nil && d > 0 && d <= 365 {
days = d
}
}
metrics, err := h.storage.GetMetrics(days)
if err != nil {
http.Error(w, "Failed to get metrics", http.StatusInternalServerError)
return
}
w.Header().Set("Content-Type", "application/json")
json.NewEncoder(w).Encode(metrics)
}


@@ -0,0 +1,278 @@
package dashboard
import (
"net/http"
)
type Handler struct{}
func NewHandler() *Handler {
return &Handler{}
}
func (h *Handler) ServeIndex(w http.ResponseWriter, r *http.Request) {
html := `<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>SeaweedFS Telemetry Dashboard</title>
<script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
<style>
body {
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
margin: 0;
padding: 20px;
background-color: #f5f5f5;
}
.container {
max-width: 1200px;
margin: 0 auto;
}
.header {
background: white;
padding: 20px;
border-radius: 8px;
margin-bottom: 20px;
box-shadow: 0 2px 4px rgba(0,0,0,0.1);
}
.stats-grid {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(250px, 1fr));
gap: 20px;
margin-bottom: 20px;
}
.stat-card {
background: white;
padding: 20px;
border-radius: 8px;
box-shadow: 0 2px 4px rgba(0,0,0,0.1);
}
.stat-value {
font-size: 2em;
font-weight: bold;
color: #2196F3;
}
.stat-label {
color: #666;
margin-top: 5px;
}
.chart-container {
background: white;
padding: 20px;
border-radius: 8px;
margin-bottom: 20px;
box-shadow: 0 2px 4px rgba(0,0,0,0.1);
}
.chart-title {
font-size: 1.2em;
font-weight: bold;
margin-bottom: 15px;
}
.loading {
text-align: center;
padding: 40px;
color: #666;
}
.error {
background: #ffebee;
color: #c62828;
padding: 15px;
border-radius: 4px;
margin: 10px 0;
}
</style>
</head>
<body>
<div class="container">
<div class="header">
<h1>SeaweedFS Telemetry Dashboard</h1>
<p>Privacy-respecting usage analytics for SeaweedFS</p>
</div>
<div id="loading" class="loading">Loading telemetry data...</div>
<div id="error" class="error" style="display: none;"></div>
<div id="dashboard" style="display: none;">
<div class="stats-grid">
<div class="stat-card">
<div class="stat-value" id="totalInstances">-</div>
<div class="stat-label">Total Instances (30 days)</div>
</div>
<div class="stat-card">
<div class="stat-value" id="activeInstances">-</div>
<div class="stat-label">Active Instances (7 days)</div>
</div>
<div class="stat-card">
<div class="stat-value" id="totalVersions">-</div>
<div class="stat-label">Different Versions</div>
</div>
<div class="stat-card">
<div class="stat-value" id="totalOS">-</div>
<div class="stat-label">Operating Systems</div>
</div>
</div>
<div class="chart-container">
<div class="chart-title">Version Distribution</div>
<canvas id="versionChart" width="400" height="200"></canvas>
</div>
<div class="chart-container">
<div class="chart-title">Operating System Distribution</div>
<canvas id="osChart" width="400" height="200"></canvas>
</div>
<div class="chart-container">
<div class="chart-title">Deployment Types</div>
<canvas id="deploymentChart" width="400" height="200"></canvas>
</div>
<div class="chart-container">
<div class="chart-title">Volume Servers Over Time</div>
<canvas id="serverChart" width="400" height="200"></canvas>
</div>
<div class="chart-container">
<div class="chart-title">Total Disk Usage Over Time</div>
<canvas id="diskChart" width="400" height="200"></canvas>
</div>
</div>
</div>
<script>
let charts = {};
async function loadDashboard() {
try {
// Load stats
const statsResponse = await fetch('/api/stats');
const stats = await statsResponse.json();
// Load metrics
const metricsResponse = await fetch('/api/metrics?days=30');
const metrics = await metricsResponse.json();
updateStats(stats);
updateCharts(stats, metrics);
document.getElementById('loading').style.display = 'none';
document.getElementById('dashboard').style.display = 'block';
} catch (error) {
console.error('Error loading dashboard:', error);
showError('Failed to load telemetry data: ' + error.message);
}
}
function updateStats(stats) {
document.getElementById('totalInstances').textContent = stats.total_instances || 0;
document.getElementById('activeInstances').textContent = stats.active_instances || 0;
document.getElementById('totalVersions').textContent = Object.keys(stats.versions || {}).length;
document.getElementById('totalOS').textContent = Object.keys(stats.os_distribution || {}).length;
}
function updateCharts(stats, metrics) {
// Version chart
createPieChart('versionChart', 'Version Distribution', stats.versions || {});
// OS chart
createPieChart('osChart', 'Operating System Distribution', stats.os_distribution || {});
// Deployment chart
createPieChart('deploymentChart', 'Deployment Types', stats.deployments || {});
// Server count over time (/api/metrics returns arrays of {date, value} points)
if (metrics.volume_servers && metrics.volume_servers.length) {
const dates = metrics.volume_servers.map(p => p.date);
const counts = metrics.volume_servers.map(p => p.value);
createLineChart('serverChart', 'Volume Servers', dates, counts, '#2196F3');
}
// Disk usage over time, converted from bytes to GB
if (metrics.disk_usage && metrics.disk_usage.length) {
const dates = metrics.disk_usage.map(p => p.date);
const diskUsageGB = metrics.disk_usage.map(p => Math.round(p.value / (1024 * 1024 * 1024)));
createLineChart('diskChart', 'Disk Usage (GB)', dates, diskUsageGB, '#4CAF50');
}
}
function createPieChart(canvasId, title, data) {
const ctx = document.getElementById(canvasId).getContext('2d');
if (charts[canvasId]) {
charts[canvasId].destroy();
}
const labels = Object.keys(data);
const values = Object.values(data);
charts[canvasId] = new Chart(ctx, {
type: 'pie',
data: {
labels: labels,
datasets: [{
data: values,
backgroundColor: [
'#FF6384', '#36A2EB', '#FFCE56', '#4BC0C0',
'#9966FF', '#FF9F40', '#FF6384', '#C9CBCF'
]
}]
},
options: {
responsive: true,
plugins: {
legend: {
position: 'bottom'
}
}
}
});
}
function createLineChart(canvasId, label, labels, data, color) {
const ctx = document.getElementById(canvasId).getContext('2d');
if (charts[canvasId]) {
charts[canvasId].destroy();
}
charts[canvasId] = new Chart(ctx, {
type: 'line',
data: {
labels: labels,
datasets: [{
label: label,
data: data,
borderColor: color,
backgroundColor: color + '20',
fill: true,
tension: 0.1
}]
},
options: {
responsive: true,
scales: {
y: {
beginAtZero: true
}
}
}
});
}
function showError(message) {
document.getElementById('loading').style.display = 'none';
document.getElementById('error').style.display = 'block';
document.getElementById('error').textContent = message;
}
// Load dashboard on page load
loadDashboard();
// Refresh every 5 minutes
setInterval(loadDashboard, 5 * 60 * 1000);
</script>
</body>
</html>`
w.Header().Set("Content-Type", "text/html")
w.WriteHeader(http.StatusOK)
w.Write([]byte(html))
}

31
telemetry/server/go.sum Normal file
View File

@@ -0,0 +1,31 @@
github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw=
github.com/cespare/xxhash/v2 v2.2.0 h1:DC2CZ1Ep5Y4k3ZQ899DldepgrayRUGE6BBZ/cd9Cj44=
github.com/cespare/xxhash/v2 v2.2.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk=
github.com/golang/protobuf v1.5.3 h1:KhyjKVUg7Usr/dYsdSqoFveMYd5ko72D+zANwlG1mmg=
github.com/golang/protobuf v1.5.3/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY=
github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
github.com/google/go-cmp v0.5.9 h1:O2Tfq5qg4qc4AmwVlvv0oLiVAGB7enBSJ2x2DqQFi38=
github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
github.com/matttproud/golang_protobuf_extensions v1.0.4 h1:mmDVorXM7PCGKw94cs5zkfA9PSy5pEvNWRP0ET0TIVo=
github.com/matttproud/golang_protobuf_extensions v1.0.4/go.mod h1:BSXmuO+STAnVfrANrmjBb36TMTDstsz7MSK+HVaYKv4=
github.com/prometheus/client_golang v1.17.0 h1:rl2sfwZMtSthVU752MqfjQozy7blglC+1SOtjMAMh+Q=
github.com/prometheus/client_golang v1.17.0/go.mod h1:VeL+gMmOAxkS2IqfCq0ZmHSL+LjWfWDUmp1mBz9JgUY=
github.com/prometheus/client_model v0.4.1-0.20230718164431-9a2bf3000d16 h1:v7DLqVdK4VrYkVD5diGdl4sxJurKJEMnODWRJlxV9oM=
github.com/prometheus/client_model v0.4.1-0.20230718164431-9a2bf3000d16/go.mod h1:oMQmHW1/JoDwqLtg57MGgP/Fb1CJEYF2imWWhWtMkYU=
github.com/prometheus/common v0.44.0 h1:+5BrQJwiBB9xsMygAB3TNvpQKOwlkc25LbISbrdOOfY=
github.com/prometheus/common v0.44.0/go.mod h1:ofAIvZbQ1e/nugmZGz4/qCb9Ap1VoSTIO7x0VV9VvuY=
github.com/prometheus/procfs v0.11.1 h1:xRC8Iq1yyca5ypa9n1EZnWZkt7dwcoRPQwX/5gwaUuI=
github.com/prometheus/procfs v0.11.1/go.mod h1:eesXgaPo1q7lBpVMoMy0ZOFTth9hBn4W/y0/p/ScXhY=
golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sys v0.11.0 h1:eG7RXZHdqOJ1i+0lgLgCpSXAp6M3LYlAo6osgSi0xOM=
golang.org/x/sys v0.11.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw=
google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc=
google.golang.org/protobuf v1.31.0 h1:g0LDEJHgrBl9N9r17Ru3sqWhkIx2NB67okBHPwC7hs8=
google.golang.org/protobuf v1.31.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I=

111
telemetry/server/main.go Normal file
View File

@@ -0,0 +1,111 @@
package main
import (
"encoding/json"
"flag"
"fmt"
"log"
"net/http"
"time"
"github.com/prometheus/client_golang/prometheus/promhttp"
"github.com/seaweedfs/seaweedfs/telemetry/server/api"
"github.com/seaweedfs/seaweedfs/telemetry/server/dashboard"
"github.com/seaweedfs/seaweedfs/telemetry/server/storage"
)
var (
port = flag.Int("port", 8080, "HTTP server port")
enableCORS = flag.Bool("cors", true, "Enable CORS for dashboard")
logRequests = flag.Bool("log", true, "Log incoming requests")
enableDashboard = flag.Bool("dashboard", true, "Enable built-in dashboard (optional when using Grafana)")
cleanupInterval = flag.Duration("cleanup", 24*time.Hour, "Cleanup interval for old instances")
maxInstanceAge = flag.Duration("max-age", 30*24*time.Hour, "Maximum age for instances before cleanup")
)
func main() {
flag.Parse()
// Create Prometheus storage instance
store := storage.NewPrometheusStorage()
// Start cleanup routine
go func() {
ticker := time.NewTicker(*cleanupInterval)
defer ticker.Stop()
for range ticker.C {
store.CleanupOldInstances(*maxInstanceAge)
}
}()
// Setup HTTP handlers
mux := http.NewServeMux()
// Prometheus metrics endpoint
mux.Handle("/metrics", promhttp.Handler())
// API endpoints
apiHandler := api.NewHandler(store)
mux.HandleFunc("/api/collect", corsMiddleware(logMiddleware(apiHandler.CollectTelemetry)))
mux.HandleFunc("/api/stats", corsMiddleware(logMiddleware(apiHandler.GetStats)))
mux.HandleFunc("/api/instances", corsMiddleware(logMiddleware(apiHandler.GetInstances)))
mux.HandleFunc("/api/metrics", corsMiddleware(logMiddleware(apiHandler.GetMetrics)))
// Dashboard (optional)
if *enableDashboard {
dashboardHandler := dashboard.NewHandler()
mux.HandleFunc("/", corsMiddleware(dashboardHandler.ServeIndex))
mux.HandleFunc("/dashboard", corsMiddleware(dashboardHandler.ServeIndex))
mux.Handle("/static/", http.StripPrefix("/static/", http.FileServer(http.Dir("./static"))))
}
// Health check
mux.HandleFunc("/health", func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "application/json")
json.NewEncoder(w).Encode(map[string]string{
"status": "ok",
"time": time.Now().UTC().Format(time.RFC3339),
})
})
addr := fmt.Sprintf(":%d", *port)
log.Printf("Starting telemetry server on %s", addr)
log.Printf("Prometheus metrics: http://localhost%s/metrics", addr)
if *enableDashboard {
log.Printf("Dashboard: http://localhost%s/dashboard", addr)
}
log.Printf("Cleanup interval: %v, Max instance age: %v", *cleanupInterval, *maxInstanceAge)
if err := http.ListenAndServe(addr, mux); err != nil {
log.Fatalf("Server failed: %v", err)
}
}
func corsMiddleware(next http.HandlerFunc) http.HandlerFunc {
return func(w http.ResponseWriter, r *http.Request) {
if *enableCORS {
w.Header().Set("Access-Control-Allow-Origin", "*")
w.Header().Set("Access-Control-Allow-Methods", "GET, POST, OPTIONS")
w.Header().Set("Access-Control-Allow-Headers", "Content-Type, Authorization")
}
if r.Method == "OPTIONS" {
w.WriteHeader(http.StatusOK)
return
}
next(w, r)
}
}
func logMiddleware(next http.HandlerFunc) http.HandlerFunc {
return func(w http.ResponseWriter, r *http.Request) {
if *logRequests {
start := time.Now()
next(w, r)
log.Printf("%s %s %s %v", r.Method, r.URL.Path, r.RemoteAddr, time.Since(start))
} else {
next(w, r)
}
}
}
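Both middlewares take and return an http.HandlerFunc, so additional routes can reuse the same chain. A minimal sketch of a line that could sit inside main(); the /api/ping route is hypothetical and only illustrates the wrapping order (CORS headers applied first, then request logging, then the handler itself):
mux.HandleFunc("/api/ping", corsMiddleware(logMiddleware(func(w http.ResponseWriter, r *http.Request) {
	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(map[string]string{"status": "pong"})
})))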

View File

@@ -0,0 +1,245 @@
package storage
import (
"encoding/json"
"sync"
"time"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"
"github.com/seaweedfs/seaweedfs/telemetry/proto"
)
type PrometheusStorage struct {
// Prometheus metrics
totalClusters prometheus.Gauge
activeClusters prometheus.Gauge
volumeServerCount *prometheus.GaugeVec
totalDiskBytes *prometheus.GaugeVec
totalVolumeCount *prometheus.GaugeVec
filerCount *prometheus.GaugeVec
brokerCount *prometheus.GaugeVec
clusterInfo *prometheus.GaugeVec
telemetryReceived prometheus.Counter
// In-memory storage for API endpoints (if needed)
mu sync.RWMutex
instances map[string]*telemetryData
stats map[string]interface{}
}
// telemetryData is an internal struct that includes the received timestamp
type telemetryData struct {
*proto.TelemetryData
ReceivedAt time.Time `json:"received_at"`
}
func NewPrometheusStorage() *PrometheusStorage {
return &PrometheusStorage{
totalClusters: promauto.NewGauge(prometheus.GaugeOpts{
Name: "seaweedfs_telemetry_total_clusters",
Help: "Total number of unique SeaweedFS clusters (last 30 days)",
}),
activeClusters: promauto.NewGauge(prometheus.GaugeOpts{
Name: "seaweedfs_telemetry_active_clusters",
Help: "Number of active SeaweedFS clusters (last 7 days)",
}),
volumeServerCount: promauto.NewGaugeVec(prometheus.GaugeOpts{
Name: "seaweedfs_telemetry_volume_servers",
Help: "Number of volume servers per cluster",
}, []string{"cluster_id", "version", "os", "deployment"}),
totalDiskBytes: promauto.NewGaugeVec(prometheus.GaugeOpts{
Name: "seaweedfs_telemetry_disk_bytes",
Help: "Total disk usage in bytes per cluster",
}, []string{"cluster_id", "version", "os", "deployment"}),
totalVolumeCount: promauto.NewGaugeVec(prometheus.GaugeOpts{
Name: "seaweedfs_telemetry_volume_count",
Help: "Total number of volumes per cluster",
}, []string{"cluster_id", "version", "os", "deployment"}),
filerCount: promauto.NewGaugeVec(prometheus.GaugeOpts{
Name: "seaweedfs_telemetry_filer_count",
Help: "Number of filer servers per cluster",
}, []string{"cluster_id", "version", "os", "deployment"}),
brokerCount: promauto.NewGaugeVec(prometheus.GaugeOpts{
Name: "seaweedfs_telemetry_broker_count",
Help: "Number of broker servers per cluster",
}, []string{"cluster_id", "version", "os", "deployment"}),
clusterInfo: promauto.NewGaugeVec(prometheus.GaugeOpts{
Name: "seaweedfs_telemetry_cluster_info",
Help: "Cluster information (always 1, labels contain metadata)",
}, []string{"cluster_id", "version", "os", "deployment", "features"}),
telemetryReceived: promauto.NewCounter(prometheus.CounterOpts{
Name: "seaweedfs_telemetry_reports_received_total",
Help: "Total number of telemetry reports received",
}),
instances: make(map[string]*telemetryData),
stats: make(map[string]interface{}),
}
}
func (s *PrometheusStorage) StoreTelemetry(data *proto.TelemetryData) error {
s.mu.Lock()
defer s.mu.Unlock()
// Update Prometheus metrics
labels := prometheus.Labels{
"cluster_id": data.ClusterId,
"version": data.Version,
"os": data.Os,
"deployment": data.Deployment,
}
s.volumeServerCount.With(labels).Set(float64(data.VolumeServerCount))
s.totalDiskBytes.With(labels).Set(float64(data.TotalDiskBytes))
s.totalVolumeCount.With(labels).Set(float64(data.TotalVolumeCount))
s.filerCount.With(labels).Set(float64(data.FilerCount))
s.brokerCount.With(labels).Set(float64(data.BrokerCount))
// Features as JSON string for the label
featuresJSON, _ := json.Marshal(data.Features)
infoLabels := prometheus.Labels{
"cluster_id": data.ClusterId,
"version": data.Version,
"os": data.Os,
"deployment": data.Deployment,
"features": string(featuresJSON),
}
s.clusterInfo.With(infoLabels).Set(1)
s.telemetryReceived.Inc()
// Store in memory for API endpoints
s.instances[data.ClusterId] = &telemetryData{
TelemetryData: data,
ReceivedAt: time.Now().UTC(),
}
// Update aggregated stats
s.updateStats()
return nil
}
func (s *PrometheusStorage) GetStats() (map[string]interface{}, error) {
s.mu.RLock()
defer s.mu.RUnlock()
// Return cached stats
result := make(map[string]interface{})
for k, v := range s.stats {
result[k] = v
}
return result, nil
}
func (s *PrometheusStorage) GetInstances(limit int) ([]*telemetryData, error) {
s.mu.RLock()
defer s.mu.RUnlock()
var instances []*telemetryData
count := 0
for _, instance := range s.instances {
if count >= limit {
break
}
instances = append(instances, instance)
count++
}
return instances, nil
}
func (s *PrometheusStorage) GetMetrics(days int) (map[string]interface{}, error) {
s.mu.RLock()
defer s.mu.RUnlock()
// Return current metrics from in-memory storage
// Historical data should be queried from Prometheus directly
cutoff := time.Now().AddDate(0, 0, -days)
var volumeServers []map[string]interface{}
var diskUsage []map[string]interface{}
for _, instance := range s.instances {
if instance.ReceivedAt.After(cutoff) {
volumeServers = append(volumeServers, map[string]interface{}{
"date": instance.ReceivedAt.Format("2006-01-02"),
"value": instance.TelemetryData.VolumeServerCount,
})
diskUsage = append(diskUsage, map[string]interface{}{
"date": instance.ReceivedAt.Format("2006-01-02"),
"value": instance.TelemetryData.TotalDiskBytes,
})
}
}
return map[string]interface{}{
"volume_servers": volumeServers,
"disk_usage": diskUsage,
}, nil
}
func (s *PrometheusStorage) updateStats() {
now := time.Now()
last7Days := now.AddDate(0, 0, -7)
last30Days := now.AddDate(0, 0, -30)
totalInstances := 0
activeInstances := 0
versions := make(map[string]int)
osDistribution := make(map[string]int)
deployments := make(map[string]int)
for _, instance := range s.instances {
if instance.ReceivedAt.After(last30Days) {
totalInstances++
}
if instance.ReceivedAt.After(last7Days) {
activeInstances++
versions[instance.TelemetryData.Version]++
osDistribution[instance.TelemetryData.Os]++
deployments[instance.TelemetryData.Deployment]++
}
}
// Update Prometheus gauges
s.totalClusters.Set(float64(totalInstances))
s.activeClusters.Set(float64(activeInstances))
// Update cached stats for API
s.stats = map[string]interface{}{
"total_instances": totalInstances,
"active_instances": activeInstances,
"versions": versions,
"os_distribution": osDistribution,
"deployments": deployments,
}
}
// CleanupOldInstances removes instances older than the specified duration
func (s *PrometheusStorage) CleanupOldInstances(maxAge time.Duration) {
s.mu.Lock()
defer s.mu.Unlock()
cutoff := time.Now().Add(-maxAge)
for instanceID, instance := range s.instances {
if instance.ReceivedAt.Before(cutoff) {
delete(s.instances, instanceID)
// Remove from Prometheus metrics
labels := prometheus.Labels{
"cluster_id": instance.TelemetryData.ClusterId,
"version": instance.TelemetryData.Version,
"os": instance.TelemetryData.Os,
"deployment": instance.TelemetryData.Deployment,
}
s.volumeServerCount.Delete(labels)
s.totalDiskBytes.Delete(labels)
s.totalVolumeCount.Delete(labels)
s.filerCount.Delete(labels)
s.brokerCount.Delete(labels)
// clusterInfo carries an extra "features" label, so delete its series by partial match
s.clusterInfo.DeletePartialMatch(labels)
}
}
s.updateStats()
}
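For reference, a minimal sketch (values are made up) of driving the storage layer directly: StoreTelemetry updates the Prometheus gauges and the in-memory cache in one call, GetStats returns the aggregates computed by updateStats, and CleanupOldInstances drops series for clusters that have stopped reporting.
package main
import (
	"log"
	"time"

	"github.com/seaweedfs/seaweedfs/telemetry/proto"
	"github.com/seaweedfs/seaweedfs/telemetry/server/storage"
)
func main() {
	store := storage.NewPrometheusStorage()
	if err := store.StoreTelemetry(&proto.TelemetryData{
		ClusterId:         "example-cluster", // made-up values for illustration
		Version:           "3.45",
		Os:                "linux/amd64",
		Deployment:        "standalone",
		VolumeServerCount: 2,
		TotalDiskBytes:    1 << 30,
		TotalVolumeCount:  8,
		Timestamp:         time.Now().Unix(),
	}); err != nil {
		log.Printf("store failed: %v", err)
	}
	stats, _ := store.GetStats()
	log.Printf("stats: %v", stats)
	store.CleanupOldInstances(30 * 24 * time.Hour)
}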

View File

@@ -0,0 +1,315 @@
package main
import (
"context"
"fmt"
"io"
"log"
"net/http"
"os"
"os/exec"
"path/filepath"
"strings"
"syscall"
"time"
"github.com/seaweedfs/seaweedfs/telemetry/proto"
"github.com/seaweedfs/seaweedfs/weed/telemetry"
protobuf "google.golang.org/protobuf/proto"
)
const (
serverPort = "18080" // Use different port to avoid conflicts
serverURL = "http://localhost:" + serverPort
)
func main() {
fmt.Println("🧪 Starting SeaweedFS Telemetry Integration Test")
// Start telemetry server
fmt.Println("📡 Starting telemetry server...")
serverCmd, err := startTelemetryServer()
if err != nil {
log.Fatalf("❌ Failed to start telemetry server: %v", err)
}
defer stopServer(serverCmd)
// Wait for server to start
if !waitForServer(serverURL+"/health", 15*time.Second) {
log.Fatal("❌ Telemetry server failed to start")
}
fmt.Println("✅ Telemetry server started successfully")
// Test protobuf marshaling first
fmt.Println("🔧 Testing protobuf marshaling...")
if err := testProtobufMarshaling(); err != nil {
log.Fatalf("❌ Protobuf marshaling test failed: %v", err)
}
fmt.Println("✅ Protobuf marshaling test passed")
// Test protobuf client
fmt.Println("🔄 Testing protobuf telemetry client...")
if err := testTelemetryClient(); err != nil {
log.Fatalf("❌ Telemetry client test failed: %v", err)
}
fmt.Println("✅ Telemetry client test passed")
// Test server metrics endpoint
fmt.Println("📊 Testing Prometheus metrics endpoint...")
if err := testMetricsEndpoint(); err != nil {
log.Fatalf("❌ Metrics endpoint test failed: %v", err)
}
fmt.Println("✅ Metrics endpoint test passed")
// Test stats API
fmt.Println("📈 Testing stats API...")
if err := testStatsAPI(); err != nil {
log.Fatalf("❌ Stats API test failed: %v", err)
}
fmt.Println("✅ Stats API test passed")
// Test instances API
fmt.Println("📋 Testing instances API...")
if err := testInstancesAPI(); err != nil {
log.Fatalf("❌ Instances API test failed: %v", err)
}
fmt.Println("✅ Instances API test passed")
fmt.Println("🎉 All telemetry integration tests passed!")
}
func startTelemetryServer() (*exec.Cmd, error) {
// Get the directory where this test is running
testDir, err := os.Getwd()
if err != nil {
return nil, fmt.Errorf("failed to get working directory: %v", err)
}
// Navigate to the server directory (from main seaweedfs directory)
serverDir := filepath.Join(testDir, "telemetry", "server")
cmd := exec.Command("go", "run", ".",
"-port="+serverPort,
"-dashboard=false",
"-cleanup=1m",
"-max-age=1h")
cmd.Dir = serverDir
// Create log files for server output
logFile, err := os.Create("telemetry-server-test.log")
if err != nil {
return nil, fmt.Errorf("failed to create log file: %v", err)
}
cmd.Stdout = logFile
cmd.Stderr = logFile
if err := cmd.Start(); err != nil {
return nil, fmt.Errorf("failed to start server: %v", err)
}
return cmd, nil
}
func stopServer(cmd *exec.Cmd) {
if cmd != nil && cmd.Process != nil {
cmd.Process.Signal(syscall.SIGTERM)
cmd.Wait()
// Clean up log file
os.Remove("telemetry-server-test.log")
}
}
func waitForServer(url string, timeout time.Duration) bool {
ctx, cancel := context.WithTimeout(context.Background(), timeout)
defer cancel()
fmt.Printf("⏳ Waiting for server at %s...\n", url)
for {
select {
case <-ctx.Done():
return false
default:
resp, err := http.Get(url)
if err == nil {
resp.Body.Close()
if resp.StatusCode == http.StatusOK {
return true
}
}
time.Sleep(500 * time.Millisecond)
}
}
}
func testProtobufMarshaling() error {
// Test protobuf marshaling/unmarshaling
testData := &proto.TelemetryData{
ClusterId: "test-cluster-12345",
Version: "test-3.45",
Os: "linux/amd64",
Features: []string{"filer", "s3api"},
Deployment: "test",
VolumeServerCount: 2,
TotalDiskBytes: 1000000,
TotalVolumeCount: 10,
FilerCount: 1,
BrokerCount: 1,
Timestamp: time.Now().Unix(),
}
// Marshal
data, err := protobuf.Marshal(testData)
if err != nil {
return fmt.Errorf("failed to marshal protobuf: %v", err)
}
fmt.Printf(" Protobuf size: %d bytes\n", len(data))
// Unmarshal
testData2 := &proto.TelemetryData{}
if err := protobuf.Unmarshal(data, testData2); err != nil {
return fmt.Errorf("failed to unmarshal protobuf: %v", err)
}
// Verify data
if testData2.ClusterId != testData.ClusterId {
return fmt.Errorf("protobuf data mismatch: expected %s, got %s",
testData.ClusterId, testData2.ClusterId)
}
if testData2.VolumeServerCount != testData.VolumeServerCount {
return fmt.Errorf("volume server count mismatch: expected %d, got %d",
testData.VolumeServerCount, testData2.VolumeServerCount)
}
return nil
}
func testTelemetryClient() error {
// Create telemetry client
client := telemetry.NewClient(serverURL+"/api/collect", true)
// Create test data using protobuf format
testData := &proto.TelemetryData{
Version: "test-3.45",
Os: "linux/amd64",
Features: []string{"filer", "s3api", "mq"},
Deployment: "integration-test",
VolumeServerCount: 3,
TotalDiskBytes: 1073741824, // 1GB
TotalVolumeCount: 50,
FilerCount: 2,
BrokerCount: 1,
Timestamp: time.Now().Unix(),
}
// Send telemetry data
if err := client.SendTelemetry(testData); err != nil {
return fmt.Errorf("failed to send telemetry: %v", err)
}
fmt.Printf(" Sent telemetry for cluster: %s\n", client.GetInstanceID())
// Wait a bit for processing
time.Sleep(2 * time.Second)
return nil
}
func testMetricsEndpoint() error {
resp, err := http.Get(serverURL + "/metrics")
if err != nil {
return fmt.Errorf("failed to get metrics: %v", err)
}
defer resp.Body.Close()
if resp.StatusCode != http.StatusOK {
return fmt.Errorf("metrics endpoint returned status %d", resp.StatusCode)
}
// Read response and check for expected metrics
content, err := io.ReadAll(resp.Body)
if err != nil {
return fmt.Errorf("failed to read metrics response: %v", err)
}
contentStr := string(content)
expectedMetrics := []string{
"seaweedfs_telemetry_total_clusters",
"seaweedfs_telemetry_active_clusters",
"seaweedfs_telemetry_reports_received_total",
"seaweedfs_telemetry_volume_servers",
"seaweedfs_telemetry_disk_bytes",
"seaweedfs_telemetry_volume_count",
"seaweedfs_telemetry_filer_count",
"seaweedfs_telemetry_broker_count",
}
for _, metric := range expectedMetrics {
if !strings.Contains(contentStr, metric) {
return fmt.Errorf("missing expected metric: %s", metric)
}
}
// Check that we have at least one report received
if !strings.Contains(contentStr, "seaweedfs_telemetry_reports_received_total 1") {
fmt.Printf(" Warning: Expected at least 1 report received, metrics content:\n%s\n", contentStr)
}
fmt.Printf(" Found %d expected metrics\n", len(expectedMetrics))
return nil
}
func testStatsAPI() error {
resp, err := http.Get(serverURL + "/api/stats")
if err != nil {
return fmt.Errorf("failed to get stats: %v", err)
}
defer resp.Body.Close()
if resp.StatusCode != http.StatusOK {
return fmt.Errorf("stats API returned status %d", resp.StatusCode)
}
// Read and verify JSON response
content, err := io.ReadAll(resp.Body)
if err != nil {
return fmt.Errorf("failed to read stats response: %v", err)
}
contentStr := string(content)
if !strings.Contains(contentStr, "total_instances") {
return fmt.Errorf("stats response missing total_instances field")
}
fmt.Printf(" Stats response: %s\n", contentStr)
return nil
}
func testInstancesAPI() error {
resp, err := http.Get(serverURL + "/api/instances?limit=10")
if err != nil {
return fmt.Errorf("failed to get instances: %v", err)
}
defer resp.Body.Close()
if resp.StatusCode != http.StatusOK {
return fmt.Errorf("instances API returned status %d", resp.StatusCode)
}
// Read response
content, err := io.ReadAll(resp.Body)
if err != nil {
return fmt.Errorf("failed to read instances response: %v", err)
}
fmt.Printf(" Instances response length: %d bytes\n", len(content))
return nil
}

View File

@@ -61,6 +61,8 @@ type MasterOptions struct {
electionTimeout *time.Duration
raftHashicorp *bool
raftBootstrap *bool
telemetryUrl *string
telemetryEnabled *bool
}
func init() {
@@ -88,6 +90,8 @@ func init() {
m.electionTimeout = cmdMaster.Flag.Duration("electionTimeout", 10*time.Second, "election timeout of master servers")
m.raftHashicorp = cmdMaster.Flag.Bool("raftHashicorp", false, "use hashicorp raft")
m.raftBootstrap = cmdMaster.Flag.Bool("raftBootstrap", false, "Whether to bootstrap the Raft cluster")
m.telemetryUrl = cmdMaster.Flag.String("telemetry.url", "https://telemetry.seaweedfs.com:3091/api/collect", "telemetry server URL to send usage statistics")
m.telemetryEnabled = cmdMaster.Flag.Bool("telemetry", false, "enable telemetry reporting")
}
var cmdMaster = &Command{
@@ -332,5 +336,7 @@ func (m *MasterOptions) toMasterOption(whiteList []string) *weed_server.MasterOp
DisableHttp: *m.disableHttp,
MetricsAddress: *m.metricsAddress,
MetricsIntervalSec: *m.metricsIntervalSec,
TelemetryUrl: *m.telemetryUrl,
TelemetryEnabled: *m.telemetryEnabled,
}
}
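Telemetry stays opt-in with these flags: -telemetry defaults to false, and -telemetry.url already points at the public collection endpoint. A minimal example invocation (other master flags omitted):
weed master -telemetry
When running the combined server, the same options are exposed with the master. prefix, e.g. weed server -master.telemetry (see the following change).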

View File

@@ -104,6 +104,8 @@ func init() {
masterOptions.raftBootstrap = cmdServer.Flag.Bool("master.raftBootstrap", false, "Whether to bootstrap the Raft cluster")
masterOptions.heartbeatInterval = cmdServer.Flag.Duration("master.heartbeatInterval", 300*time.Millisecond, "heartbeat interval of master servers, and will be randomly multiplied by [1, 1.25)")
masterOptions.electionTimeout = cmdServer.Flag.Duration("master.electionTimeout", 10*time.Second, "election timeout of master servers")
masterOptions.telemetryUrl = cmdServer.Flag.String("master.telemetry.url", "https://telemetry.seaweedfs.com:3091/api/collect", "telemetry server URL to send usage statistics")
masterOptions.telemetryEnabled = cmdServer.Flag.Bool("master.telemetry", false, "enable telemetry reporting")
filerOptions.filerGroup = cmdServer.Flag.String("filer.filerGroup", "", "share metadata with other filers in the same filerGroup")
filerOptions.collection = cmdServer.Flag.String("filer.collection", "", "all data will be stored in this collection")

View File

@@ -8,11 +8,13 @@ import (
"net/url"
"os"
"regexp"
"runtime"
"strings"
"sync"
"time"
"github.com/seaweedfs/seaweedfs/weed/stats"
"github.com/seaweedfs/seaweedfs/weed/telemetry"
"github.com/seaweedfs/seaweedfs/weed/cluster"
"github.com/seaweedfs/seaweedfs/weed/pb"
@@ -30,6 +32,7 @@ import (
"github.com/seaweedfs/seaweedfs/weed/topology"
"github.com/seaweedfs/seaweedfs/weed/util"
util_http "github.com/seaweedfs/seaweedfs/weed/util/http"
"github.com/seaweedfs/seaweedfs/weed/util/version"
"github.com/seaweedfs/seaweedfs/weed/wdclient"
)
@@ -52,6 +55,8 @@ type MasterOption struct {
MetricsAddress string
MetricsIntervalSec int
IsFollower bool
TelemetryUrl string
TelemetryEnabled bool
}
type MasterServer struct {
@@ -76,6 +81,9 @@ type MasterServer struct {
adminLocks *AdminLocks
Cluster *cluster.Cluster
// telemetry
telemetryCollector *telemetry.Collector
}
func NewMasterServer(r *mux.Router, option *MasterOption, peers map[string]pb.ServerAddress) *MasterServer {
@@ -131,6 +139,28 @@ func NewMasterServer(r *mux.Router, option *MasterOption, peers map[string]pb.Se
ms.vg = topology.NewDefaultVolumeGrowth()
glog.V(0).Infoln("Volume Size Limit is", ms.option.VolumeSizeLimitMB, "MB")
// Initialize telemetry after topology is created
if option.TelemetryEnabled && option.TelemetryUrl != "" {
telemetryClient := telemetry.NewClient(option.TelemetryUrl, option.TelemetryEnabled)
ms.telemetryCollector = telemetry.NewCollector(telemetryClient, ms.Topo, ms.Cluster)
ms.telemetryCollector.SetMasterServer(ms)
// Set version and OS information
ms.telemetryCollector.SetVersion(version.VERSION_NUMBER)
ms.telemetryCollector.SetOS(runtime.GOOS + "/" + runtime.GOARCH)
// Determine features and deployment type
features := []string{"master"}
if len(peers) > 1 {
features = append(features, "cluster")
}
ms.telemetryCollector.SetFeatures(features)
ms.telemetryCollector.SetDeployment(telemetry.DetermineDeployment(true, false, len(peers)))
// Start periodic telemetry collection (every 24 hours)
ms.telemetryCollector.StartPeriodicCollection(24 * time.Hour)
}
ms.guard = security.NewGuard(append(ms.option.WhiteList, whiteList...), signingKey, expiresAfterSec, readSigningKey, readExpiresAfterSec)
handleStaticResources2(r)

100
weed/telemetry/client.go Normal file
View File

@@ -0,0 +1,100 @@
package telemetry
import (
"bytes"
"fmt"
"net/http"
"time"
"github.com/google/uuid"
"github.com/seaweedfs/seaweedfs/telemetry/proto"
"github.com/seaweedfs/seaweedfs/weed/glog"
protobuf "google.golang.org/protobuf/proto"
)
type Client struct {
url string
enabled bool
instanceID string
httpClient *http.Client
}
// NewClient creates a new telemetry client
func NewClient(url string, enabled bool) *Client {
return &Client{
url: url,
enabled: enabled,
instanceID: uuid.New().String(), // Generate UUID in memory only
httpClient: &http.Client{
Timeout: 10 * time.Second,
},
}
}
// IsEnabled returns whether telemetry is enabled
func (c *Client) IsEnabled() bool {
return c.enabled && c.url != ""
}
// SendTelemetry sends telemetry data synchronously using protobuf format
func (c *Client) SendTelemetry(data *proto.TelemetryData) error {
if !c.IsEnabled() {
return nil
}
// Set the cluster ID
data.ClusterId = c.instanceID
return c.sendProtobuf(data)
}
// SendTelemetryAsync sends telemetry data asynchronously
func (c *Client) SendTelemetryAsync(data *proto.TelemetryData) {
if !c.IsEnabled() {
return
}
go func() {
if err := c.SendTelemetry(data); err != nil {
glog.V(1).Infof("Failed to send telemetry: %v", err)
}
}()
}
// sendProtobuf sends data using protobuf format
func (c *Client) sendProtobuf(data *proto.TelemetryData) error {
req := &proto.TelemetryRequest{
Data: data,
}
body, err := protobuf.Marshal(req)
if err != nil {
return fmt.Errorf("failed to marshal protobuf: %v", err)
}
httpReq, err := http.NewRequest("POST", c.url, bytes.NewBuffer(body))
if err != nil {
return fmt.Errorf("failed to create request: %v", err)
}
httpReq.Header.Set("Content-Type", "application/x-protobuf")
httpReq.Header.Set("User-Agent", fmt.Sprintf("SeaweedFS/%s", data.Version))
resp, err := c.httpClient.Do(httpReq)
if err != nil {
return fmt.Errorf("failed to send request: %v", err)
}
defer resp.Body.Close()
if resp.StatusCode != http.StatusOK {
return fmt.Errorf("server returned status %d", resp.StatusCode)
}
glog.V(2).Infof("Telemetry sent successfully via protobuf")
return nil
}
// GetInstanceID returns the current instance ID
func (c *Client) GetInstanceID() string {
return c.instanceID
}

218
weed/telemetry/collector.go Normal file
View File

@@ -0,0 +1,218 @@
package telemetry
import (
"time"
"github.com/seaweedfs/seaweedfs/telemetry/proto"
"github.com/seaweedfs/seaweedfs/weed/cluster"
"github.com/seaweedfs/seaweedfs/weed/glog"
"github.com/seaweedfs/seaweedfs/weed/topology"
)
type Collector struct {
client *Client
topo *topology.Topology
cluster *cluster.Cluster
masterServer interface{} // Will be set to *weed_server.MasterServer to access client tracking
features []string
deployment string
version string
os string
}
// NewCollector creates a new telemetry collector
func NewCollector(client *Client, topo *topology.Topology, cluster *cluster.Cluster) *Collector {
return &Collector{
client: client,
topo: topo,
cluster: cluster,
masterServer: nil,
features: []string{},
deployment: "unknown",
version: "unknown",
os: "unknown",
}
}
// SetFeatures sets the list of enabled features
func (c *Collector) SetFeatures(features []string) {
c.features = features
}
// SetDeployment sets the deployment type (standalone, cluster, etc.)
func (c *Collector) SetDeployment(deployment string) {
c.deployment = deployment
}
// SetVersion sets the SeaweedFS version
func (c *Collector) SetVersion(version string) {
c.version = version
}
// SetOS sets the operating system information
func (c *Collector) SetOS(os string) {
c.os = os
}
// SetMasterServer sets a reference to the master server for client tracking
func (c *Collector) SetMasterServer(masterServer interface{}) {
c.masterServer = masterServer
}
// CollectAndSendAsync collects telemetry data and sends it asynchronously
func (c *Collector) CollectAndSendAsync() {
if !c.client.IsEnabled() {
return
}
go func() {
data := c.collectData()
c.client.SendTelemetryAsync(data)
}()
}
// StartPeriodicCollection starts sending telemetry data periodically
func (c *Collector) StartPeriodicCollection(interval time.Duration) {
if !c.client.IsEnabled() {
glog.V(1).Infof("Telemetry is disabled, skipping periodic collection")
return
}
glog.V(0).Infof("Starting telemetry collection every %v", interval)
// Send initial telemetry after a short delay
go func() {
time.Sleep(30 * time.Second) // Wait for cluster to stabilize
c.CollectAndSendAsync()
}()
// Start periodic collection
ticker := time.NewTicker(interval)
go func() {
defer ticker.Stop()
for range ticker.C {
c.CollectAndSendAsync()
}
}()
}
// collectData gathers telemetry data from the topology
func (c *Collector) collectData() *proto.TelemetryData {
data := &proto.TelemetryData{
Version: c.version,
Os: c.os,
Features: c.features,
Deployment: c.deployment,
Timestamp: time.Now().Unix(),
}
if c.topo != nil {
// Collect volume server count
data.VolumeServerCount = int32(c.countVolumeServers())
// Collect total disk usage and volume count
diskBytes, volumeCount := c.collectVolumeStats()
data.TotalDiskBytes = diskBytes
data.TotalVolumeCount = int32(volumeCount)
}
if c.cluster != nil {
// Collect filer and broker counts
data.FilerCount = int32(c.countFilers())
data.BrokerCount = int32(c.countBrokers())
}
return data
}
// countVolumeServers counts the number of active volume servers
func (c *Collector) countVolumeServers() int {
count := 0
for _, dcNode := range c.topo.Children() {
dc := dcNode.(*topology.DataCenter)
for _, rackNode := range dc.Children() {
rack := rackNode.(*topology.Rack)
for range rack.Children() {
count++
}
}
}
return count
}
// collectVolumeStats collects total disk usage and volume count
func (c *Collector) collectVolumeStats() (uint64, int) {
var totalDiskBytes uint64
var totalVolumeCount int
for _, dcNode := range c.topo.Children() {
dc := dcNode.(*topology.DataCenter)
for _, rackNode := range dc.Children() {
rack := rackNode.(*topology.Rack)
for _, dnNode := range rack.Children() {
dn := dnNode.(*topology.DataNode)
volumes := dn.GetVolumes()
for _, volumeInfo := range volumes {
totalVolumeCount++
totalDiskBytes += volumeInfo.Size
}
}
}
}
return totalDiskBytes, totalVolumeCount
}
// countFilers counts the number of active filer servers across all groups
func (c *Collector) countFilers() int {
// Count all filer-type nodes in the cluster
// This includes both pure filer servers and S3 servers (which register as filers)
count := 0
for _, groupName := range c.getAllFilerGroups() {
nodes := c.cluster.ListClusterNode(cluster.FilerGroupName(groupName), cluster.FilerType)
count += len(nodes)
}
return count
}
// countBrokers counts the number of active broker servers
func (c *Collector) countBrokers() int {
// Count brokers across all broker groups
count := 0
for _, groupName := range c.getAllBrokerGroups() {
nodes := c.cluster.ListClusterNode(cluster.FilerGroupName(groupName), cluster.BrokerType)
count += len(nodes)
}
return count
}
// getAllFilerGroups returns all filer group names
func (c *Collector) getAllFilerGroups() []string {
// For simplicity, we check the default group
// In a more sophisticated implementation, we could enumerate all groups
return []string{""}
}
// getAllBrokerGroups returns all broker group names
func (c *Collector) getAllBrokerGroups() []string {
// For simplicity, we check the default group
// In a more sophisticated implementation, we could enumerate all groups
return []string{""}
}
// DetermineDeployment determines the deployment type based on configuration
func DetermineDeployment(isMasterEnabled, isVolumeEnabled bool, peerCount int) string {
if isMasterEnabled && isVolumeEnabled {
if peerCount > 1 {
return "cluster"
}
return "standalone"
}
if isMasterEnabled {
return "master-only"
}
if isVolumeEnabled {
return "volume-only"
}
return "unknown"
}
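A small sketch showing the mapping DetermineDeployment implements, useful when checking what the deployment label will report:
package main
import (
	"fmt"

	"github.com/seaweedfs/seaweedfs/weed/telemetry"
)
func main() {
	// Mirrors the branches in DetermineDeployment.
	fmt.Println(telemetry.DetermineDeployment(true, true, 1))  // "standalone"
	fmt.Println(telemetry.DetermineDeployment(true, true, 3))  // "cluster"
	fmt.Println(telemetry.DetermineDeployment(true, false, 1)) // "master-only"
	fmt.Println(telemetry.DetermineDeployment(false, true, 1)) // "volume-only"
}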