diff --git a/.github/workflows/deploy_telemetry.yml b/.github/workflows/deploy_telemetry.yml new file mode 100644 index 000000000..8f10af0ce --- /dev/null +++ b/.github/workflows/deploy_telemetry.yml @@ -0,0 +1,157 @@ +# This workflow will build and deploy the SeaweedFS telemetry server +# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-go + +name: Deploy Telemetry Server + +on: + push: + branches: [ "master" ] + paths: + - 'telemetry/**' + workflow_dispatch: + inputs: + setup: + description: 'Run first-time server setup' + required: true + type: boolean + default: false + deploy: + description: 'Deploy telemetry server to remote server' + required: true + type: boolean + default: false + +jobs: + deploy: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Set up Go + uses: actions/setup-go@v4 + with: + go-version: '1.24' + + - name: Build Telemetry Server + run: | + go mod tidy + cd telemetry/server + GOOS=linux GOARCH=amd64 go build -o telemetry-server main.go + + - name: First-time Server Setup + if: github.event_name == 'workflow_dispatch' && inputs.setup + env: + SSH_PRIVATE_KEY: ${{ secrets.TELEMETRY_SSH_PRIVATE_KEY }} + REMOTE_HOST: ${{ secrets.TELEMETRY_HOST }} + REMOTE_USER: ${{ secrets.TELEMETRY_USER }} + run: | + mkdir -p ~/.ssh + echo "$SSH_PRIVATE_KEY" > ~/.ssh/deploy_key + chmod 600 ~/.ssh/deploy_key + echo "Host *" > ~/.ssh/config + echo " StrictHostKeyChecking no" >> ~/.ssh/config + + # Create all required directories with proper permissions + ssh -i ~/.ssh/deploy_key $REMOTE_USER@$REMOTE_HOST " + mkdir -p ~/seaweedfs-telemetry/bin ~/seaweedfs-telemetry/logs ~/seaweedfs-telemetry/data ~/seaweedfs-telemetry/tmp && \ + chmod 755 ~/seaweedfs-telemetry/logs && \ + chmod 755 ~/seaweedfs-telemetry/data && \ + touch ~/seaweedfs-telemetry/logs/telemetry.log ~/seaweedfs-telemetry/logs/telemetry.error.log && \ + chmod 644 ~/seaweedfs-telemetry/logs/*.log" + + # Create systemd service file + echo " + [Unit] + Description=SeaweedFS Telemetry Server + After=network.target + + [Service] + Type=simple + User=$REMOTE_USER + WorkingDirectory=/home/$REMOTE_USER/seaweedfs-telemetry + ExecStart=/home/$REMOTE_USER/seaweedfs-telemetry/bin/telemetry-server -port=8353 + Restart=always + RestartSec=5 + StandardOutput=append:/home/$REMOTE_USER/seaweedfs-telemetry/logs/telemetry.log + StandardError=append:/home/$REMOTE_USER/seaweedfs-telemetry/logs/telemetry.error.log + + [Install] + WantedBy=multi-user.target" > telemetry.service + + # Setup logrotate configuration + echo "# SeaweedFS Telemetry service log rotation + /home/$REMOTE_USER/seaweedfs-telemetry/logs/*.log { + daily + rotate 30 + compress + delaycompress + missingok + notifempty + create 644 $REMOTE_USER $REMOTE_USER + postrotate + systemctl restart telemetry.service + endscript + }" > telemetry_logrotate + + # Copy Grafana dashboard and Prometheus config + scp -i ~/.ssh/deploy_key telemetry/grafana-dashboard.json $REMOTE_USER@$REMOTE_HOST:~/seaweedfs-telemetry/ + scp -i ~/.ssh/deploy_key telemetry/prometheus.yml $REMOTE_USER@$REMOTE_HOST:~/seaweedfs-telemetry/ + + # Copy and install service and logrotate files + scp -i ~/.ssh/deploy_key telemetry.service telemetry_logrotate $REMOTE_USER@$REMOTE_HOST:~/seaweedfs-telemetry/ + ssh -i ~/.ssh/deploy_key $REMOTE_USER@$REMOTE_HOST " + sudo mv ~/seaweedfs-telemetry/telemetry.service /etc/systemd/system/ && \ + sudo mv ~/seaweedfs-telemetry/telemetry_logrotate /etc/logrotate.d/seaweedfs-telemetry && \ + sudo 
systemctl daemon-reload && \ + sudo systemctl enable telemetry.service" + + rm -f ~/.ssh/deploy_key + + - name: Deploy Telemetry Server to Remote Server + if: (github.event_name == 'push' && contains(github.ref, 'refs/heads/master')) || (github.event_name == 'workflow_dispatch' && inputs.deploy) + env: + SSH_PRIVATE_KEY: ${{ secrets.TELEMETRY_SSH_PRIVATE_KEY }} + REMOTE_HOST: ${{ secrets.TELEMETRY_HOST }} + REMOTE_USER: ${{ secrets.TELEMETRY_USER }} + run: | + mkdir -p ~/.ssh + echo "$SSH_PRIVATE_KEY" > ~/.ssh/deploy_key + chmod 600 ~/.ssh/deploy_key + echo "Host *" > ~/.ssh/config + echo " StrictHostKeyChecking no" >> ~/.ssh/config + + # Create temp directory and copy binary + ssh -i ~/.ssh/deploy_key $REMOTE_USER@$REMOTE_HOST "mkdir -p ~/seaweedfs-telemetry/tmp" + scp -i ~/.ssh/deploy_key telemetry/server/telemetry-server $REMOTE_USER@$REMOTE_HOST:~/seaweedfs-telemetry/tmp/ + + # Copy updated configuration files + scp -i ~/.ssh/deploy_key telemetry/grafana-dashboard.json $REMOTE_USER@$REMOTE_HOST:~/seaweedfs-telemetry/ + scp -i ~/.ssh/deploy_key telemetry/prometheus.yml $REMOTE_USER@$REMOTE_HOST:~/seaweedfs-telemetry/ + + # Stop service, move binary, and restart + ssh -i ~/.ssh/deploy_key $REMOTE_USER@$REMOTE_HOST " + sudo systemctl stop telemetry.service || true && \ + mkdir -p ~/seaweedfs-telemetry/bin && \ + mv ~/seaweedfs-telemetry/tmp/telemetry-server ~/seaweedfs-telemetry/bin/ && \ + chmod +x ~/seaweedfs-telemetry/bin/telemetry-server && \ + sudo systemctl start telemetry.service && \ + sudo systemctl status telemetry.service" + + # Verify deployment + ssh -i ~/.ssh/deploy_key $REMOTE_USER@$REMOTE_HOST " + echo 'Waiting for service to start...' + sleep 5 + curl -f http://localhost:8353/health || echo 'Health check failed'" + + rm -f ~/.ssh/deploy_key + + - name: Notify Deployment Status + if: always() + run: | + if [ "${{ job.status }}" == "success" ]; then + echo "✅ Telemetry server deployment successful" + echo "Dashboard: http://${{ secrets.TELEMETRY_HOST }}:8353" + echo "Metrics: http://${{ secrets.TELEMETRY_HOST }}:8353/metrics" + else + echo "❌ Telemetry server deployment failed" + fi \ No newline at end of file diff --git a/telemetry/DEPLOYMENT.md b/telemetry/DEPLOYMENT.md new file mode 100644 index 000000000..d5cd69154 --- /dev/null +++ b/telemetry/DEPLOYMENT.md @@ -0,0 +1,271 @@ +# SeaweedFS Telemetry Server Deployment + +This document describes how to deploy the SeaweedFS telemetry server to a remote server using GitHub Actions. + +## Prerequisites + +1. A remote Linux server with: + - SSH access + - systemd (for service management) + - Optional: Prometheus and Grafana (for monitoring) + +2. GitHub repository secrets configured (see [Setup GitHub Secrets](#setup-github-secrets) below): + - `TELEMETRY_SSH_PRIVATE_KEY`: SSH private key for accessing the remote server + - `TELEMETRY_HOST`: Remote server hostname or IP address + - `TELEMETRY_USER`: Username for SSH access + +## Setup GitHub Secrets + +Before using the deployment workflow, you need to configure the required secrets in your GitHub repository. 
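+The steps below use the GitHub web UI. The same three secrets can also be created from the command line with the GitHub CLI, once the deploy key from Step 1 exists (this is only a sketch and assumes `gh` is installed and authenticated for this repository):
+
+```bash
+# Illustrative only: adjust the key path, hostname, and username to your environment
+gh secret set TELEMETRY_SSH_PRIVATE_KEY < ~/.ssh/seaweedfs_telemetry_deploy
+gh secret set TELEMETRY_HOST --body "telemetry.example.com"
+gh secret set TELEMETRY_USER --body "seaweedfs-deploy"
+```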
+ +### Step 1: Generate SSH Key Pair + +On your local machine, generate a new SSH key pair specifically for deployment: + +```bash +# Generate a new SSH key pair +ssh-keygen -t ed25519 -C "seaweedfs-telemetry-deploy" -f ~/.ssh/seaweedfs_telemetry_deploy + +# This creates two files: +# ~/.ssh/seaweedfs_telemetry_deploy (private key) +# ~/.ssh/seaweedfs_telemetry_deploy.pub (public key) +``` + +### Step 2: Configure Remote Server + +Copy the public key to your remote server: + +```bash +# Copy public key to remote server +ssh-copy-id -i ~/.ssh/seaweedfs_telemetry_deploy.pub user@your-server.com + +# Or manually append to authorized_keys +cat ~/.ssh/seaweedfs_telemetry_deploy.pub | ssh user@your-server.com "mkdir -p ~/.ssh && cat >> ~/.ssh/authorized_keys" +``` + +Test the SSH connection: + +```bash +# Test SSH connection with the new key +ssh -i ~/.ssh/seaweedfs_telemetry_deploy user@your-server.com "echo 'SSH connection successful'" +``` + +### Step 3: Add Secrets to GitHub Repository + +1. Go to your GitHub repository +2. Click on **Settings** tab +3. In the sidebar, click **Secrets and variables** → **Actions** +4. Click **New repository secret** for each of the following: + +#### TELEMETRY_SSH_PRIVATE_KEY + +```bash +# Display the private key content +cat ~/.ssh/seaweedfs_telemetry_deploy +``` + +- **Name**: `TELEMETRY_SSH_PRIVATE_KEY` +- **Value**: Copy the entire private key content, including the `-----BEGIN OPENSSH PRIVATE KEY-----` and `-----END OPENSSH PRIVATE KEY-----` lines + +#### TELEMETRY_HOST + +- **Name**: `TELEMETRY_HOST` +- **Value**: Your server's hostname or IP address (e.g., `telemetry.example.com` or `192.168.1.100`) + +#### TELEMETRY_USER + +- **Name**: `TELEMETRY_USER` +- **Value**: The username on the remote server (e.g., `ubuntu`, `deploy`, or your username) + +### Step 4: Verify Configuration + +Create a simple test workflow or manually trigger the deployment to verify the secrets are working correctly. + +### Security Best Practices + +1. **Dedicated SSH Key**: Use a separate SSH key only for deployment +2. **Limited Permissions**: Create a dedicated user on the remote server with minimal required permissions +3. **Key Rotation**: Regularly rotate SSH keys +4. **Server Access**: Restrict SSH access to specific IP ranges if possible + +### Example Server Setup + +If you're setting up a new server, here's a basic configuration: + +```bash +# On the remote server, create a dedicated user for deployment +sudo useradd -m -s /bin/bash seaweedfs-deploy +sudo usermod -aG sudo seaweedfs-deploy # Only if sudo access is needed + +# Switch to the deployment user +sudo su - seaweedfs-deploy + +# Create SSH directory +mkdir -p ~/.ssh +chmod 700 ~/.ssh + +# Add your public key (paste the content of seaweedfs_telemetry_deploy.pub) +nano ~/.ssh/authorized_keys +chmod 600 ~/.ssh/authorized_keys +``` + +### Troubleshooting + +#### SSH Connection Issues + +```bash +# Test SSH connection manually +ssh -i ~/.ssh/seaweedfs_telemetry_deploy -v user@your-server.com + +# Check SSH key permissions +ls -la ~/.ssh/seaweedfs_telemetry_deploy* +# Should show: -rw------- for private key, -rw-r--r-- for public key +``` + +#### GitHub Actions Fails + +1. **Check secrets**: Ensure all three secrets are properly set in GitHub +2. **Verify SSH key**: Make sure the entire private key (including headers/footers) is copied +3. **Test connectivity**: Manually SSH to the server from your local machine +4. 
**Check user permissions**: Ensure the remote user has necessary permissions + +## GitHub Actions Workflow + +The deployment workflow (`.github/workflows/deploy_telemetry.yml`) provides two main operations: + +### 1. First-time Setup + +Run this once to set up the remote server: + +1. Go to GitHub Actions in your repository +2. Select "Deploy Telemetry Server" workflow +3. Click "Run workflow" +4. Check "Run first-time server setup" +5. Click "Run workflow" + +This will: +- Create necessary directories on the remote server +- Set up systemd service configuration +- Configure log rotation +- Upload Grafana dashboard and Prometheus configuration + + +### 2. Deploy Updates + +Deployments happen automatically when: +- Code is pushed to the `master` branch with changes in the `telemetry/` directory + +Or manually trigger deployment: +1. Go to GitHub Actions in your repository +2. Select "Deploy Telemetry Server" workflow +3. Click "Run workflow" +4. Check "Deploy telemetry server to remote server" +5. Click "Run workflow" + +## Server Directory Structure + +After setup, the remote server will have: + +``` +~/seaweedfs-telemetry/ +├── bin/ +│ └── telemetry-server # Binary executable +├── logs/ +│ ├── telemetry.log # Application logs +│ └── telemetry.error.log # Error logs +├── data/ # Data directory (if needed) +├── grafana-dashboard.json # Grafana dashboard configuration +└── prometheus.yml # Prometheus configuration +``` + +## Service Management + +The telemetry server runs as a systemd service: + +```bash +# Check service status +sudo systemctl status telemetry.service + +# View logs +sudo journalctl -u telemetry.service -f + +# Restart service +sudo systemctl restart telemetry.service + +# Stop/start service +sudo systemctl stop telemetry.service +sudo systemctl start telemetry.service +``` + +## Accessing the Service + +After deployment, the telemetry server will be available at: + +- **Dashboard**: `http://your-server:8353` +- **API**: `http://your-server:8353/api/*` +- **Metrics**: `http://your-server:8353/metrics` +- **Health Check**: `http://your-server:8353/health` + +## Optional: Prometheus and Grafana Integration + +### Prometheus Setup + +1. Install Prometheus on your server +2. Update `/etc/prometheus/prometheus.yml` to include: + ```yaml + scrape_configs: + - job_name: 'seaweedfs-telemetry' + static_configs: + - targets: ['localhost:8353'] + metrics_path: '/metrics' + ``` + +### Grafana Setup + +1. Install Grafana on your server +2. Import the dashboard from `~/seaweedfs-telemetry/grafana-dashboard.json` +3. Configure Prometheus as a data source pointing to your Prometheus instance + +## Troubleshooting + +### Deployment Fails + +1. Check GitHub Actions logs for detailed error messages +2. Verify SSH connectivity: `ssh user@host` +3. Ensure all required secrets are configured in GitHub + +### Service Won't Start + +1. Check service logs: `sudo journalctl -u telemetry.service` +2. Verify binary permissions: `ls -la ~/seaweedfs-telemetry/bin/` +3. Test binary manually: `~/seaweedfs-telemetry/bin/telemetry-server -help` + +### Port Conflicts + +If port 8353 is already in use: + +1. Edit the systemd service: `sudo systemctl edit telemetry.service` +2. Add override configuration: + ```ini + [Service] + ExecStart= + ExecStart=/home/user/seaweedfs-telemetry/bin/telemetry-server -port=8354 + ``` +3. Reload and restart: `sudo systemctl daemon-reload && sudo systemctl restart telemetry.service` + +## Security Considerations + +1. 
**Firewall**: Consider restricting access to telemetry ports +2. **SSH Keys**: Use dedicated SSH keys with minimal permissions +3. **User Permissions**: Run the service as a non-privileged user +4. **Network**: Consider running on internal networks only + +## Monitoring + +Monitor the deployment and service health: + +- **GitHub Actions**: Check workflow runs for deployment status +- **System Logs**: `sudo journalctl -u telemetry.service` +- **Application Logs**: `tail -f ~/seaweedfs-telemetry/logs/telemetry.log` +- **Health Endpoint**: `curl http://localhost:8353/health` +- **Metrics**: `curl http://localhost:8353/metrics` \ No newline at end of file diff --git a/telemetry/README.md b/telemetry/README.md new file mode 100644 index 000000000..aee050943 --- /dev/null +++ b/telemetry/README.md @@ -0,0 +1,351 @@ +# SeaweedFS Telemetry System + +A privacy-respecting telemetry system for SeaweedFS that collects cluster-level usage statistics and provides visualization through Prometheus and Grafana. + +## Features + +- **Privacy-First Design**: Uses in-memory cluster IDs (regenerated on restart), no personal data collection +- **Prometheus Integration**: Native Prometheus metrics for monitoring and alerting +- **Grafana Dashboards**: Pre-built dashboards for data visualization +- **Protocol Buffers**: Efficient binary data transmission for optimal performance +- **Opt-in Only**: Disabled by default, requires explicit configuration +- **Docker Compose**: Complete monitoring stack deployment +- **Automatic Cleanup**: Configurable data retention policies + +## Architecture + +``` +SeaweedFS Cluster → Telemetry Client → Telemetry Server → Prometheus → Grafana + (protobuf) (metrics) (queries) +``` + +## Data Transmission + +The telemetry system uses **Protocol Buffers exclusively** for efficient binary data transmission: + +- **Compact Format**: 30-50% smaller than JSON +- **Fast Serialization**: Better performance than text-based formats +- **Type Safety**: Strong typing with generated Go structs +- **Schema Evolution**: Built-in versioning support + +### Protobuf Schema + +```protobuf +message TelemetryData { + string cluster_id = 1; // In-memory generated UUID + string version = 2; // SeaweedFS version + string os = 3; // Operating system + repeated string features = 4; // Enabled features + string deployment = 5; // Deployment type + int32 volume_server_count = 6; // Number of volume servers + uint64 total_disk_bytes = 7; // Total disk usage + int32 total_volume_count = 8; // Total volume count + int64 timestamp = 9; // Collection timestamp +} +``` + +## Privacy Approach + +- **No Personal Data**: No hostnames, IP addresses, or user information +- **In-Memory IDs**: Cluster IDs are generated in-memory and change on restart +- **Aggregated Data**: Only cluster-level statistics, no individual file/user data +- **Opt-in Only**: Telemetry is disabled by default +- **Transparent**: Open source implementation, clear data collection policy + +## Collected Data + +| Field | Description | Example | +|-------|-------------|---------| +| `cluster_id` | In-memory UUID (changes on restart) | `a1b2c3d4-...` | +| `version` | SeaweedFS version | `3.45` | +| `os` | Operating system and architecture | `linux/amd64` | +| `features` | Enabled components | `["filer", "s3api"]` | +| `deployment` | Deployment type | `cluster` | +| `volume_server_count` | Number of volume servers | `5` | +| `total_disk_bytes` | Total disk usage across cluster | `1073741824` | +| `total_volume_count` | Total number of volumes | `120` | 
+| `timestamp` | When data was collected | `1640995200` | + +## Quick Start + +### 1. Deploy Telemetry Server + +```bash +# Clone and start the complete monitoring stack +git clone https://github.com/seaweedfs/seaweedfs.git +cd seaweedfs/telemetry +docker-compose up -d + +# Or run the server directly +cd server +go run . -port=8080 -dashboard=true +``` + +### 2. Configure SeaweedFS + +```bash +# Enable telemetry in SeaweedFS master (uses default telemetry.seaweedfs.com:3091) +weed master -telemetry=true + +# Or in server mode +weed server -telemetry=true + +# Or specify custom telemetry server +weed master -telemetry=true -telemetry.url=http://localhost:8080/api/collect +``` + +### 3. Access Dashboards + +- **Telemetry Server**: http://localhost:8080 +- **Prometheus**: http://localhost:9090 +- **Grafana**: http://localhost:3000 (admin/admin) + +## Configuration + +### SeaweedFS Master/Server + +```bash +# Enable telemetry +-telemetry=true + +# Set custom telemetry server URL (optional, defaults to telemetry.seaweedfs.com:3091) +-telemetry.url=http://your-telemetry-server:8080/api/collect +``` + +### Telemetry Server + +```bash +# Server configuration +-port=8080 # Server port +-dashboard=true # Enable built-in dashboard +-cleanup=24h # Cleanup interval +-max-age=720h # Maximum data retention (30 days) + +# Example +./telemetry-server -port=8080 -dashboard=true -cleanup=24h -max-age=720h +``` + +## Prometheus Metrics + +The telemetry server exposes these Prometheus metrics: + +### Cluster Metrics +- `seaweedfs_telemetry_total_clusters`: Total unique clusters (30 days) +- `seaweedfs_telemetry_active_clusters`: Active clusters (7 days) + +### Per-Cluster Metrics +- `seaweedfs_telemetry_volume_servers{cluster_id, version, os, deployment}`: Volume servers per cluster +- `seaweedfs_telemetry_disk_bytes{cluster_id, version, os, deployment}`: Disk usage per cluster +- `seaweedfs_telemetry_volume_count{cluster_id, version, os, deployment}`: Volume count per cluster +- `seaweedfs_telemetry_filer_count{cluster_id, version, os, deployment}`: Filer servers per cluster +- `seaweedfs_telemetry_broker_count{cluster_id, version, os, deployment}`: Broker servers per cluster +- `seaweedfs_telemetry_cluster_info{cluster_id, version, os, deployment, features}`: Cluster metadata + +### Server Metrics +- `seaweedfs_telemetry_reports_received_total`: Total telemetry reports received + +## API Endpoints + +### Data Collection +```bash +# Submit telemetry data (protobuf only) +POST /api/collect +Content-Type: application/x-protobuf +[TelemetryRequest protobuf data] +``` + +### Statistics (JSON for dashboard/debugging) +```bash +# Get aggregated statistics +GET /api/stats + +# Get recent cluster instances +GET /api/instances?limit=100 + +# Get metrics over time +GET /api/metrics?days=30 +``` + +### Monitoring +```bash +# Prometheus metrics +GET /metrics +``` + +## Docker Deployment + +### Complete Stack (Recommended) + +```yaml +# docker-compose.yml +version: '3.8' +services: + telemetry-server: + build: ./server + ports: + - "8080:8080" + command: ["-port=8080", "-dashboard=true", "-cleanup=24h"] + + prometheus: + image: prom/prometheus:latest + ports: + - "9090:9090" + volumes: + - ./prometheus.yml:/etc/prometheus/prometheus.yml + + grafana: + image: grafana/grafana:latest + ports: + - "3000:3000" + environment: + - GF_SECURITY_ADMIN_PASSWORD=admin + volumes: + - ./grafana-provisioning:/etc/grafana/provisioning + - ./grafana-dashboard.json:/var/lib/grafana/dashboards/seaweedfs.json +``` + +```bash +# Deploy the 
stack +docker-compose up -d + +# Scale telemetry server if needed +docker-compose up -d --scale telemetry-server=3 +``` + +### Server Only + +```bash +# Build and run telemetry server +cd server +docker build -t seaweedfs-telemetry . +docker run -p 8080:8080 seaweedfs-telemetry -port=8080 -dashboard=true +``` + +## Development + +### Protocol Buffer Development + +```bash +# Generate protobuf code +cd telemetry +protoc --go_out=. --go_opt=paths=source_relative proto/telemetry.proto + +# The generated code is already included in the repository +``` + +### Build from Source + +```bash +# Build telemetry server +cd telemetry/server +go build -o telemetry-server . + +# Build SeaweedFS with telemetry support +cd ../.. +go build -o weed ./weed +``` + +### Testing + +```bash +# Test telemetry server +cd telemetry/server +go test ./... + +# Test protobuf communication (requires protobuf tools) +# See telemetry client code for examples +``` + +## Grafana Dashboard + +The included Grafana dashboard provides: + +- **Overview**: Total and active clusters, version distribution +- **Resource Usage**: Volume servers and disk usage over time +- **Deployments**: Deployment type and OS distribution +- **Growth Trends**: Historical growth patterns + +### Custom Queries + +```promql +# Total active clusters +seaweedfs_telemetry_active_clusters + +# Disk usage by version +sum by (version) (seaweedfs_telemetry_disk_bytes) + +# Volume servers by deployment type +sum by (deployment) (seaweedfs_telemetry_volume_servers) + +# Filer servers by version +sum by (version) (seaweedfs_telemetry_filer_count) + +# Broker servers across all clusters +sum(seaweedfs_telemetry_broker_count) + +# Growth rate (weekly) +increase(seaweedfs_telemetry_total_clusters[7d]) +``` + +## Security Considerations + +- **Network Security**: Use HTTPS in production environments +- **Access Control**: Implement authentication for Grafana and Prometheus +- **Data Retention**: Configure appropriate retention policies +- **Monitoring**: Monitor the telemetry infrastructure itself + +## Troubleshooting + +### Common Issues + +**SeaweedFS not sending data:** +```bash +# Check telemetry configuration +weed master -h | grep telemetry + +# Verify connectivity +curl -v http://your-telemetry-server:8080/api/collect +``` + +**Server not receiving data:** +```bash +# Check server logs +docker-compose logs telemetry-server + +# Verify metrics endpoint +curl http://localhost:8080/metrics +``` + +**Prometheus not scraping:** +```bash +# Check Prometheus targets +curl http://localhost:9090/api/v1/targets + +# Verify configuration +docker-compose logs prometheus +``` + +### Debugging + +```bash +# Enable verbose logging in SeaweedFS +weed master -v=2 -telemetry=true + +# Check telemetry server metrics +curl http://localhost:8080/metrics | grep seaweedfs_telemetry + +# Test data flow +curl http://localhost:8080/api/stats +``` + +## Contributing + +1. Fork the repository +2. Create a feature branch +3. Make your changes +4. Add tests if applicable +5. Submit a pull request + +## License + +This telemetry system is part of SeaweedFS and follows the same Apache 2.0 license. 
\ No newline at end of file diff --git a/telemetry/docker-compose.yml b/telemetry/docker-compose.yml new file mode 100644 index 000000000..73f0e8f70 --- /dev/null +++ b/telemetry/docker-compose.yml @@ -0,0 +1,55 @@ +version: '3.8' + +services: + telemetry-server: + build: ./server + ports: + - "8080:8080" + command: [ + "./telemetry-server", + "-port=8080", + "-dashboard=false", # Disable built-in dashboard, use Grafana + "-log=true", + "-cors=true" + ] + networks: + - telemetry + + prometheus: + image: prom/prometheus:latest + ports: + - "9090:9090" + volumes: + - ./prometheus.yml:/etc/prometheus/prometheus.yml + - prometheus_data:/prometheus + command: + - '--config.file=/etc/prometheus/prometheus.yml' + - '--storage.tsdb.path=/prometheus' + - '--web.console.libraries=/etc/prometheus/console_libraries' + - '--web.console.templates=/etc/prometheus/consoles' + - '--storage.tsdb.retention.time=200h' + - '--web.enable-lifecycle' + networks: + - telemetry + + grafana: + image: grafana/grafana:latest + ports: + - "3000:3000" + environment: + - GF_SECURITY_ADMIN_PASSWORD=admin + - GF_USERS_ALLOW_SIGN_UP=false + volumes: + - grafana_data:/var/lib/grafana + - ./grafana-dashboard.json:/var/lib/grafana/dashboards/seaweedfs-telemetry.json + - ./grafana-provisioning:/etc/grafana/provisioning + networks: + - telemetry + +volumes: + prometheus_data: + grafana_data: + +networks: + telemetry: + driver: bridge \ No newline at end of file diff --git a/telemetry/grafana-dashboard.json b/telemetry/grafana-dashboard.json new file mode 100644 index 000000000..c33896dab --- /dev/null +++ b/telemetry/grafana-dashboard.json @@ -0,0 +1,734 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "showHeader": true + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "expr": "seaweedfs_telemetry_total_clusters", + "format": "time_series", + "refId": "A" + } + ], + "title": "Total SeaweedFS Clusters", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 2, + "options": { + "showHeader": true + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": 
"${DS_PROMETHEUS}" + }, + "expr": "seaweedfs_telemetry_active_clusters", + "format": "time_series", + "refId": "A" + } + ], + "title": "Active Clusters (7 days)", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "vis": false + } + }, + "mappings": [] + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, + "id": 3, + "options": { + "legend": { + "displayMode": "visible", + "placement": "bottom", + "showLegend": true + }, + "pieType": "pie", + "reduceOptions": { + "values": false, + "calcs": [ + "lastNotNull" + ], + "fields": "" + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "expr": "count by (version) (seaweedfs_telemetry_cluster_info)", + "format": "time_series", + "legendFormat": "{{version}}", + "refId": "A" + } + ], + "title": "SeaweedFS Version Distribution", + "type": "piechart" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "vis": false + } + }, + "mappings": [] + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 8 + }, + "id": 4, + "options": { + "legend": { + "displayMode": "visible", + "placement": "bottom", + "showLegend": true + }, + "pieType": "pie", + "reduceOptions": { + "values": false, + "calcs": [ + "lastNotNull" + ], + "fields": "" + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "expr": "count by (os) (seaweedfs_telemetry_cluster_info)", + "format": "time_series", + "legendFormat": "{{os}}", + "refId": "A" + } + ], + "title": "Operating System Distribution", + "type": "piechart" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "vis": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 16 + }, + "id": 5, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "expr": "sum(seaweedfs_telemetry_volume_servers)", + "format": "time_series", + "legendFormat": "Total Volume Servers", + "refId": "A" + } + ], + "title": "Total Volume Servers Over Time", + "type": "timeseries" + }, + { + "datasource": { + "type": 
"prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "vis": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 24 + }, + "id": 6, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "expr": "sum(seaweedfs_telemetry_disk_bytes)", + "format": "time_series", + "legendFormat": "Total Disk Usage", + "refId": "A" + } + ], + "title": "Total Disk Usage Over Time", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "vis": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 24 + }, + "id": 7, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "expr": "sum(seaweedfs_telemetry_volume_count)", + "format": "time_series", + "legendFormat": "Total Volume Count", + "refId": "A" + } + ], + "title": "Total Volume Count Over Time", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "vis": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, 
+ { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 32 + }, + "id": 8, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "expr": "sum(seaweedfs_telemetry_filer_count)", + "format": "time_series", + "legendFormat": "Total Filer Count", + "refId": "A" + } + ], + "title": "Total Filer Servers Over Time", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "vis": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 32 + }, + "id": 9, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "expr": "sum(seaweedfs_telemetry_broker_count)", + "format": "time_series", + "legendFormat": "Total Broker Count", + "refId": "A" + } + ], + "title": "Total Broker Servers Over Time", + "type": "timeseries" + } + ], + "refresh": "5m", + "schemaVersion": 38, + "style": "dark", + "tags": [ + "seaweedfs", + "telemetry" + ], + "templating": { + "list": [] + }, + "time": { + "from": "now-24h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "SeaweedFS Telemetry Dashboard", + "uid": "seaweedfs-telemetry", + "version": 1, + "weekStart": "" +} \ No newline at end of file diff --git a/telemetry/grafana-provisioning/dashboards/dashboards.yml b/telemetry/grafana-provisioning/dashboards/dashboards.yml new file mode 100644 index 000000000..82fd18a7a --- /dev/null +++ b/telemetry/grafana-provisioning/dashboards/dashboards.yml @@ -0,0 +1,12 @@ +apiVersion: 1 + +providers: + - name: 'seaweedfs' + orgId: 1 + folder: '' + type: file + disableDeletion: false + updateIntervalSeconds: 10 + allowUiUpdates: true + options: + path: /var/lib/grafana/dashboards \ No newline at end of file diff --git a/telemetry/grafana-provisioning/datasources/prometheus.yml b/telemetry/grafana-provisioning/datasources/prometheus.yml new file mode 100644 index 000000000..38fb02c68 --- /dev/null +++ b/telemetry/grafana-provisioning/datasources/prometheus.yml @@ -0,0 +1,9 @@ +apiVersion: 1 + +datasources: + - name: Prometheus + type: prometheus + access: proxy + url: http://prometheus:9090 + isDefault: true + editable: true \ No newline at end of file diff --git a/telemetry/prometheus.yml b/telemetry/prometheus.yml new file mode 100644 index 000000000..e33d518e7 --- /dev/null +++ b/telemetry/prometheus.yml @@ -0,0 +1,15 
@@ +global: + scrape_interval: 15s + evaluation_interval: 15s + +rule_files: + # - "first_rules.yml" + # - "second_rules.yml" + +scrape_configs: + - job_name: 'seaweedfs-telemetry' + static_configs: + - targets: ['telemetry-server:8080'] + scrape_interval: 30s + metrics_path: '/metrics' + scrape_timeout: 10s \ No newline at end of file diff --git a/telemetry/proto/telemetry.pb.go b/telemetry/proto/telemetry.pb.go new file mode 100644 index 000000000..b1bf44db7 --- /dev/null +++ b/telemetry/proto/telemetry.pb.go @@ -0,0 +1,398 @@ +// Code generated by protoc-gen-go. DO NOT EDIT. +// versions: +// protoc-gen-go v1.34.2 +// protoc v5.29.3 +// source: proto/telemetry.proto + +package proto + +import ( + protoreflect "google.golang.org/protobuf/reflect/protoreflect" + protoimpl "google.golang.org/protobuf/runtime/protoimpl" + reflect "reflect" + sync "sync" +) + +const ( + // Verify that this generated code is sufficiently up-to-date. + _ = protoimpl.EnforceVersion(20 - protoimpl.MinVersion) + // Verify that runtime/protoimpl is sufficiently up-to-date. + _ = protoimpl.EnforceVersion(protoimpl.MaxVersion - 20) +) + +// TelemetryData represents cluster-level telemetry information +type TelemetryData struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + // Unique cluster identifier (generated in-memory) + ClusterId string `protobuf:"bytes,1,opt,name=cluster_id,json=clusterId,proto3" json:"cluster_id,omitempty"` + // SeaweedFS version + Version string `protobuf:"bytes,2,opt,name=version,proto3" json:"version,omitempty"` + // Operating system (e.g., "linux/amd64") + Os string `protobuf:"bytes,3,opt,name=os,proto3" json:"os,omitempty"` + // Enabled features (e.g., ["filer", "s3api", "mq"]) + Features []string `protobuf:"bytes,4,rep,name=features,proto3" json:"features,omitempty"` + // Deployment type ("standalone", "cluster", "master-only", "volume-only") + Deployment string `protobuf:"bytes,5,opt,name=deployment,proto3" json:"deployment,omitempty"` + // Number of volume servers in the cluster + VolumeServerCount int32 `protobuf:"varint,6,opt,name=volume_server_count,json=volumeServerCount,proto3" json:"volume_server_count,omitempty"` + // Total disk usage across all volume servers (in bytes) + TotalDiskBytes uint64 `protobuf:"varint,7,opt,name=total_disk_bytes,json=totalDiskBytes,proto3" json:"total_disk_bytes,omitempty"` + // Total number of volumes in the cluster + TotalVolumeCount int32 `protobuf:"varint,8,opt,name=total_volume_count,json=totalVolumeCount,proto3" json:"total_volume_count,omitempty"` + // Number of filer servers in the cluster + FilerCount int32 `protobuf:"varint,9,opt,name=filer_count,json=filerCount,proto3" json:"filer_count,omitempty"` + // Number of broker servers in the cluster + BrokerCount int32 `protobuf:"varint,10,opt,name=broker_count,json=brokerCount,proto3" json:"broker_count,omitempty"` + // Unix timestamp when the data was collected + Timestamp int64 `protobuf:"varint,11,opt,name=timestamp,proto3" json:"timestamp,omitempty"` +} + +func (x *TelemetryData) Reset() { + *x = TelemetryData{} + if protoimpl.UnsafeEnabled { + mi := &file_proto_telemetry_proto_msgTypes[0] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *TelemetryData) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*TelemetryData) ProtoMessage() {} + +func (x *TelemetryData) ProtoReflect() protoreflect.Message { + mi := &file_proto_telemetry_proto_msgTypes[0] + if 
protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use TelemetryData.ProtoReflect.Descriptor instead. +func (*TelemetryData) Descriptor() ([]byte, []int) { + return file_proto_telemetry_proto_rawDescGZIP(), []int{0} +} + +func (x *TelemetryData) GetClusterId() string { + if x != nil { + return x.ClusterId + } + return "" +} + +func (x *TelemetryData) GetVersion() string { + if x != nil { + return x.Version + } + return "" +} + +func (x *TelemetryData) GetOs() string { + if x != nil { + return x.Os + } + return "" +} + +func (x *TelemetryData) GetFeatures() []string { + if x != nil { + return x.Features + } + return nil +} + +func (x *TelemetryData) GetDeployment() string { + if x != nil { + return x.Deployment + } + return "" +} + +func (x *TelemetryData) GetVolumeServerCount() int32 { + if x != nil { + return x.VolumeServerCount + } + return 0 +} + +func (x *TelemetryData) GetTotalDiskBytes() uint64 { + if x != nil { + return x.TotalDiskBytes + } + return 0 +} + +func (x *TelemetryData) GetTotalVolumeCount() int32 { + if x != nil { + return x.TotalVolumeCount + } + return 0 +} + +func (x *TelemetryData) GetFilerCount() int32 { + if x != nil { + return x.FilerCount + } + return 0 +} + +func (x *TelemetryData) GetBrokerCount() int32 { + if x != nil { + return x.BrokerCount + } + return 0 +} + +func (x *TelemetryData) GetTimestamp() int64 { + if x != nil { + return x.Timestamp + } + return 0 +} + +// TelemetryRequest is sent from SeaweedFS clusters to the telemetry server +type TelemetryRequest struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + Data *TelemetryData `protobuf:"bytes,1,opt,name=data,proto3" json:"data,omitempty"` +} + +func (x *TelemetryRequest) Reset() { + *x = TelemetryRequest{} + if protoimpl.UnsafeEnabled { + mi := &file_proto_telemetry_proto_msgTypes[1] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *TelemetryRequest) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*TelemetryRequest) ProtoMessage() {} + +func (x *TelemetryRequest) ProtoReflect() protoreflect.Message { + mi := &file_proto_telemetry_proto_msgTypes[1] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use TelemetryRequest.ProtoReflect.Descriptor instead. 
+func (*TelemetryRequest) Descriptor() ([]byte, []int) { + return file_proto_telemetry_proto_rawDescGZIP(), []int{1} +} + +func (x *TelemetryRequest) GetData() *TelemetryData { + if x != nil { + return x.Data + } + return nil +} + +// TelemetryResponse is returned by the telemetry server +type TelemetryResponse struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + Success bool `protobuf:"varint,1,opt,name=success,proto3" json:"success,omitempty"` + Message string `protobuf:"bytes,2,opt,name=message,proto3" json:"message,omitempty"` +} + +func (x *TelemetryResponse) Reset() { + *x = TelemetryResponse{} + if protoimpl.UnsafeEnabled { + mi := &file_proto_telemetry_proto_msgTypes[2] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *TelemetryResponse) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*TelemetryResponse) ProtoMessage() {} + +func (x *TelemetryResponse) ProtoReflect() protoreflect.Message { + mi := &file_proto_telemetry_proto_msgTypes[2] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use TelemetryResponse.ProtoReflect.Descriptor instead. +func (*TelemetryResponse) Descriptor() ([]byte, []int) { + return file_proto_telemetry_proto_rawDescGZIP(), []int{2} +} + +func (x *TelemetryResponse) GetSuccess() bool { + if x != nil { + return x.Success + } + return false +} + +func (x *TelemetryResponse) GetMessage() string { + if x != nil { + return x.Message + } + return "" +} + +var File_proto_telemetry_proto protoreflect.FileDescriptor + +var file_proto_telemetry_proto_rawDesc = []byte{ + 0x0a, 0x15, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x2f, 0x74, 0x65, 0x6c, 0x65, 0x6d, 0x65, 0x74, 0x72, + 0x79, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x12, 0x09, 0x74, 0x65, 0x6c, 0x65, 0x6d, 0x65, 0x74, + 0x72, 0x79, 0x22, 0xfe, 0x02, 0x0a, 0x0d, 0x54, 0x65, 0x6c, 0x65, 0x6d, 0x65, 0x74, 0x72, 0x79, + 0x44, 0x61, 0x74, 0x61, 0x12, 0x1d, 0x0a, 0x0a, 0x63, 0x6c, 0x75, 0x73, 0x74, 0x65, 0x72, 0x5f, + 0x69, 0x64, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x09, 0x63, 0x6c, 0x75, 0x73, 0x74, 0x65, + 0x72, 0x49, 0x64, 0x12, 0x18, 0x0a, 0x07, 0x76, 0x65, 0x72, 0x73, 0x69, 0x6f, 0x6e, 0x18, 0x02, + 0x20, 0x01, 0x28, 0x09, 0x52, 0x07, 0x76, 0x65, 0x72, 0x73, 0x69, 0x6f, 0x6e, 0x12, 0x0e, 0x0a, + 0x02, 0x6f, 0x73, 0x18, 0x03, 0x20, 0x01, 0x28, 0x09, 0x52, 0x02, 0x6f, 0x73, 0x12, 0x1a, 0x0a, + 0x08, 0x66, 0x65, 0x61, 0x74, 0x75, 0x72, 0x65, 0x73, 0x18, 0x04, 0x20, 0x03, 0x28, 0x09, 0x52, + 0x08, 0x66, 0x65, 0x61, 0x74, 0x75, 0x72, 0x65, 0x73, 0x12, 0x1e, 0x0a, 0x0a, 0x64, 0x65, 0x70, + 0x6c, 0x6f, 0x79, 0x6d, 0x65, 0x6e, 0x74, 0x18, 0x05, 0x20, 0x01, 0x28, 0x09, 0x52, 0x0a, 0x64, + 0x65, 0x70, 0x6c, 0x6f, 0x79, 0x6d, 0x65, 0x6e, 0x74, 0x12, 0x2e, 0x0a, 0x13, 0x76, 0x6f, 0x6c, + 0x75, 0x6d, 0x65, 0x5f, 0x73, 0x65, 0x72, 0x76, 0x65, 0x72, 0x5f, 0x63, 0x6f, 0x75, 0x6e, 0x74, + 0x18, 0x06, 0x20, 0x01, 0x28, 0x05, 0x52, 0x11, 0x76, 0x6f, 0x6c, 0x75, 0x6d, 0x65, 0x53, 0x65, + 0x72, 0x76, 0x65, 0x72, 0x43, 0x6f, 0x75, 0x6e, 0x74, 0x12, 0x28, 0x0a, 0x10, 0x74, 0x6f, 0x74, + 0x61, 0x6c, 0x5f, 0x64, 0x69, 0x73, 0x6b, 0x5f, 0x62, 0x79, 0x74, 0x65, 0x73, 0x18, 0x07, 0x20, + 0x01, 0x28, 0x04, 0x52, 0x0e, 0x74, 0x6f, 0x74, 0x61, 0x6c, 0x44, 0x69, 0x73, 0x6b, 0x42, 0x79, + 0x74, 0x65, 0x73, 0x12, 0x2c, 0x0a, 0x12, 0x74, 0x6f, 
0x74, 0x61, 0x6c, 0x5f, 0x76, 0x6f, 0x6c, + 0x75, 0x6d, 0x65, 0x5f, 0x63, 0x6f, 0x75, 0x6e, 0x74, 0x18, 0x08, 0x20, 0x01, 0x28, 0x05, 0x52, + 0x10, 0x74, 0x6f, 0x74, 0x61, 0x6c, 0x56, 0x6f, 0x6c, 0x75, 0x6d, 0x65, 0x43, 0x6f, 0x75, 0x6e, + 0x74, 0x12, 0x1f, 0x0a, 0x0b, 0x66, 0x69, 0x6c, 0x65, 0x72, 0x5f, 0x63, 0x6f, 0x75, 0x6e, 0x74, + 0x18, 0x09, 0x20, 0x01, 0x28, 0x05, 0x52, 0x0a, 0x66, 0x69, 0x6c, 0x65, 0x72, 0x43, 0x6f, 0x75, + 0x6e, 0x74, 0x12, 0x21, 0x0a, 0x0c, 0x62, 0x72, 0x6f, 0x6b, 0x65, 0x72, 0x5f, 0x63, 0x6f, 0x75, + 0x6e, 0x74, 0x18, 0x0a, 0x20, 0x01, 0x28, 0x05, 0x52, 0x0b, 0x62, 0x72, 0x6f, 0x6b, 0x65, 0x72, + 0x43, 0x6f, 0x75, 0x6e, 0x74, 0x12, 0x1c, 0x0a, 0x09, 0x74, 0x69, 0x6d, 0x65, 0x73, 0x74, 0x61, + 0x6d, 0x70, 0x18, 0x0b, 0x20, 0x01, 0x28, 0x03, 0x52, 0x09, 0x74, 0x69, 0x6d, 0x65, 0x73, 0x74, + 0x61, 0x6d, 0x70, 0x22, 0x40, 0x0a, 0x10, 0x54, 0x65, 0x6c, 0x65, 0x6d, 0x65, 0x74, 0x72, 0x79, + 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x12, 0x2c, 0x0a, 0x04, 0x64, 0x61, 0x74, 0x61, 0x18, + 0x01, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x18, 0x2e, 0x74, 0x65, 0x6c, 0x65, 0x6d, 0x65, 0x74, 0x72, + 0x79, 0x2e, 0x54, 0x65, 0x6c, 0x65, 0x6d, 0x65, 0x74, 0x72, 0x79, 0x44, 0x61, 0x74, 0x61, 0x52, + 0x04, 0x64, 0x61, 0x74, 0x61, 0x22, 0x47, 0x0a, 0x11, 0x54, 0x65, 0x6c, 0x65, 0x6d, 0x65, 0x74, + 0x72, 0x79, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x12, 0x18, 0x0a, 0x07, 0x73, 0x75, + 0x63, 0x63, 0x65, 0x73, 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, 0x08, 0x52, 0x07, 0x73, 0x75, 0x63, + 0x63, 0x65, 0x73, 0x73, 0x12, 0x18, 0x0a, 0x07, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x18, + 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, 0x07, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x42, 0x30, + 0x5a, 0x2e, 0x67, 0x69, 0x74, 0x68, 0x75, 0x62, 0x2e, 0x63, 0x6f, 0x6d, 0x2f, 0x73, 0x65, 0x61, + 0x77, 0x65, 0x65, 0x64, 0x66, 0x73, 0x2f, 0x73, 0x65, 0x61, 0x77, 0x65, 0x65, 0x64, 0x66, 0x73, + 0x2f, 0x74, 0x65, 0x6c, 0x65, 0x6d, 0x65, 0x74, 0x72, 0x79, 0x2f, 0x70, 0x72, 0x6f, 0x74, 0x6f, + 0x62, 0x06, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x33, +} + +var ( + file_proto_telemetry_proto_rawDescOnce sync.Once + file_proto_telemetry_proto_rawDescData = file_proto_telemetry_proto_rawDesc +) + +func file_proto_telemetry_proto_rawDescGZIP() []byte { + file_proto_telemetry_proto_rawDescOnce.Do(func() { + file_proto_telemetry_proto_rawDescData = protoimpl.X.CompressGZIP(file_proto_telemetry_proto_rawDescData) + }) + return file_proto_telemetry_proto_rawDescData +} + +var file_proto_telemetry_proto_msgTypes = make([]protoimpl.MessageInfo, 3) +var file_proto_telemetry_proto_goTypes = []any{ + (*TelemetryData)(nil), // 0: telemetry.TelemetryData + (*TelemetryRequest)(nil), // 1: telemetry.TelemetryRequest + (*TelemetryResponse)(nil), // 2: telemetry.TelemetryResponse +} +var file_proto_telemetry_proto_depIdxs = []int32{ + 0, // 0: telemetry.TelemetryRequest.data:type_name -> telemetry.TelemetryData + 1, // [1:1] is the sub-list for method output_type + 1, // [1:1] is the sub-list for method input_type + 1, // [1:1] is the sub-list for extension type_name + 1, // [1:1] is the sub-list for extension extendee + 0, // [0:1] is the sub-list for field type_name +} + +func init() { file_proto_telemetry_proto_init() } +func file_proto_telemetry_proto_init() { + if File_proto_telemetry_proto != nil { + return + } + if !protoimpl.UnsafeEnabled { + file_proto_telemetry_proto_msgTypes[0].Exporter = func(v any, i int) any { + switch v := v.(*TelemetryData); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return 
&v.unknownFields + default: + return nil + } + } + file_proto_telemetry_proto_msgTypes[1].Exporter = func(v any, i int) any { + switch v := v.(*TelemetryRequest); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_proto_telemetry_proto_msgTypes[2].Exporter = func(v any, i int) any { + switch v := v.(*TelemetryResponse); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + } + type x struct{} + out := protoimpl.TypeBuilder{ + File: protoimpl.DescBuilder{ + GoPackagePath: reflect.TypeOf(x{}).PkgPath(), + RawDescriptor: file_proto_telemetry_proto_rawDesc, + NumEnums: 0, + NumMessages: 3, + NumExtensions: 0, + NumServices: 0, + }, + GoTypes: file_proto_telemetry_proto_goTypes, + DependencyIndexes: file_proto_telemetry_proto_depIdxs, + MessageInfos: file_proto_telemetry_proto_msgTypes, + }.Build() + File_proto_telemetry_proto = out.File + file_proto_telemetry_proto_rawDesc = nil + file_proto_telemetry_proto_goTypes = nil + file_proto_telemetry_proto_depIdxs = nil +} diff --git a/telemetry/proto/telemetry.proto b/telemetry/proto/telemetry.proto new file mode 100644 index 000000000..07bc79446 --- /dev/null +++ b/telemetry/proto/telemetry.proto @@ -0,0 +1,52 @@ +syntax = "proto3"; + +package telemetry; + +option go_package = "github.com/seaweedfs/seaweedfs/telemetry/proto"; + +// TelemetryData represents cluster-level telemetry information +message TelemetryData { + // Unique cluster identifier (generated in-memory) + string cluster_id = 1; + + // SeaweedFS version + string version = 2; + + // Operating system (e.g., "linux/amd64") + string os = 3; + + // Enabled features (e.g., ["filer", "s3api", "mq"]) + repeated string features = 4; + + // Deployment type ("standalone", "cluster", "master-only", "volume-only") + string deployment = 5; + + // Number of volume servers in the cluster + int32 volume_server_count = 6; + + // Total disk usage across all volume servers (in bytes) + uint64 total_disk_bytes = 7; + + // Total number of volumes in the cluster + int32 total_volume_count = 8; + + // Number of filer servers in the cluster + int32 filer_count = 9; + + // Number of broker servers in the cluster + int32 broker_count = 10; + + // Unix timestamp when the data was collected + int64 timestamp = 11; +} + +// TelemetryRequest is sent from SeaweedFS clusters to the telemetry server +message TelemetryRequest { + TelemetryData data = 1; +} + +// TelemetryResponse is returned by the telemetry server +message TelemetryResponse { + bool success = 1; + string message = 2; +} \ No newline at end of file diff --git a/telemetry/server/Dockerfile b/telemetry/server/Dockerfile new file mode 100644 index 000000000..8f3782fcf --- /dev/null +++ b/telemetry/server/Dockerfile @@ -0,0 +1,18 @@ +FROM golang:1.21-alpine AS builder + +WORKDIR /app +COPY go.mod go.sum ./ +RUN go mod download + +COPY . . +RUN CGO_ENABLED=0 GOOS=linux go build -a -installsuffix cgo -ldflags '-extldflags "-static"' -o telemetry-server . + +FROM alpine:latest +RUN apk --no-cache add ca-certificates +WORKDIR /root/ + +COPY --from=builder /app/telemetry-server . 
+ +EXPOSE 8080 + +CMD ["./telemetry-server"] \ No newline at end of file diff --git a/telemetry/server/Makefile b/telemetry/server/Makefile new file mode 100644 index 000000000..cf57f1777 --- /dev/null +++ b/telemetry/server/Makefile @@ -0,0 +1,97 @@ +.PHONY: build run clean test deps proto integration-test test-all + +# Build the telemetry server +build: + go build -o telemetry-server . + +# Run the server in development mode +run: + go run . -port=8080 -dashboard=true -cleanup=1h -max-age=24h + +# Run the server in production mode +run-prod: + ./telemetry-server -port=8080 -dashboard=true -cleanup=24h -max-age=720h + +# Clean build artifacts +clean: + rm -f telemetry-server + rm -f ../test/telemetry-server-test.log + go clean + +# Run unit tests +test: + go test ./... + +# Run integration tests +integration-test: + @echo "🧪 Running telemetry integration tests..." + cd ../../ && go run telemetry/test/integration.go + +# Run all tests (unit + integration) +test-all: test integration-test + +# Install dependencies +deps: + go mod download + go mod tidy + +# Generate protobuf code (requires protoc) +proto: + cd .. && protoc --go_out=. --go_opt=paths=source_relative proto/telemetry.proto + +# Build Docker image +docker-build: + docker build -t seaweedfs-telemetry . + +# Run with Docker +docker-run: + docker run -p 8080:8080 seaweedfs-telemetry -port=8080 -dashboard=true + +# Development with auto-reload (requires air: go install github.com/cosmtrek/air@latest) +dev: + air + +# Check if protoc is available +check-protoc: + @which protoc > /dev/null || (echo "protoc is required for proto generation. Install from https://grpc.io/docs/protoc-installation/" && exit 1) + +# Full development setup +setup: check-protoc deps proto build + +# Run a quick smoke test +smoke-test: build + @echo "🔥 Running smoke test..." + @timeout 10s ./telemetry-server -port=18081 > /dev/null 2>&1 & \ + SERVER_PID=$$!; \ + sleep 2; \ + if curl -s http://localhost:18081/health > /dev/null; then \ + echo "✅ Smoke test passed - server responds to health check"; \ + else \ + echo "❌ Smoke test failed - server not responding"; \ + exit 1; \ + fi; \ + kill $$SERVER_PID 2>/dev/null || true + +# Continuous integration target +ci: deps proto build test integration-test + @echo "🎉 All CI tests passed!" 
+ +# Help +help: + @echo "Available targets:" + @echo " build - Build the telemetry server binary" + @echo " run - Run server in development mode" + @echo " run-prod - Run server in production mode" + @echo " clean - Clean build artifacts" + @echo " test - Run unit tests" + @echo " integration-test- Run integration tests" + @echo " test-all - Run all tests (unit + integration)" + @echo " deps - Install Go dependencies" + @echo " proto - Generate protobuf code" + @echo " docker-build - Build Docker image" + @echo " docker-run - Run with Docker" + @echo " dev - Run with auto-reload (requires air)" + @echo " smoke-test - Quick server health check" + @echo " setup - Full development setup" + @echo " ci - Continuous integration (all tests)" + @echo " help - Show this help" \ No newline at end of file diff --git a/telemetry/server/api/handlers.go b/telemetry/server/api/handlers.go new file mode 100644 index 000000000..0ff00330b --- /dev/null +++ b/telemetry/server/api/handlers.go @@ -0,0 +1,152 @@ +package api + +import ( + "encoding/json" + "io" + "net/http" + "strconv" + "time" + + "github.com/seaweedfs/seaweedfs/telemetry/proto" + "github.com/seaweedfs/seaweedfs/telemetry/server/storage" + protobuf "google.golang.org/protobuf/proto" +) + +type Handler struct { + storage *storage.PrometheusStorage +} + +func NewHandler(storage *storage.PrometheusStorage) *Handler { + return &Handler{storage: storage} +} + +func (h *Handler) CollectTelemetry(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodPost { + http.Error(w, "Method not allowed", http.StatusMethodNotAllowed) + return + } + + contentType := r.Header.Get("Content-Type") + + // Only accept protobuf content type + if contentType != "application/x-protobuf" && contentType != "application/protobuf" { + http.Error(w, "Content-Type must be application/x-protobuf", http.StatusUnsupportedMediaType) + return + } + + // Read protobuf request + body, err := io.ReadAll(r.Body) + if err != nil { + http.Error(w, "Failed to read request body", http.StatusBadRequest) + return + } + + req := &proto.TelemetryRequest{} + if err := protobuf.Unmarshal(body, req); err != nil { + http.Error(w, "Invalid protobuf data", http.StatusBadRequest) + return + } + + data := req.Data + if data == nil { + http.Error(w, "Missing telemetry data", http.StatusBadRequest) + return + } + + // Validate required fields + if data.ClusterId == "" || data.Version == "" || data.Os == "" { + http.Error(w, "Missing required fields", http.StatusBadRequest) + return + } + + // Set timestamp if not provided + if data.Timestamp == 0 { + data.Timestamp = time.Now().Unix() + } + + // Store the telemetry data + if err := h.storage.StoreTelemetry(data); err != nil { + http.Error(w, "Failed to store data", http.StatusInternalServerError) + return + } + + // Return protobuf response + resp := &proto.TelemetryResponse{ + Success: true, + Message: "Telemetry data received", + } + + respData, err := protobuf.Marshal(resp) + if err != nil { + http.Error(w, "Failed to marshal response", http.StatusInternalServerError) + return + } + + w.Header().Set("Content-Type", "application/x-protobuf") + w.WriteHeader(http.StatusOK) + w.Write(respData) +} + +func (h *Handler) GetStats(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodGet { + http.Error(w, "Method not allowed", http.StatusMethodNotAllowed) + return + } + + stats, err := h.storage.GetStats() + if err != nil { + http.Error(w, "Failed to get stats", http.StatusInternalServerError) + return + } + + 
w.Header().Set("Content-Type", "application/json") + json.NewEncoder(w).Encode(stats) +} + +func (h *Handler) GetInstances(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodGet { + http.Error(w, "Method not allowed", http.StatusMethodNotAllowed) + return + } + + limitStr := r.URL.Query().Get("limit") + limit := 100 // default + if limitStr != "" { + if l, err := strconv.Atoi(limitStr); err == nil && l > 0 && l <= 1000 { + limit = l + } + } + + instances, err := h.storage.GetInstances(limit) + if err != nil { + http.Error(w, "Failed to get instances", http.StatusInternalServerError) + return + } + + w.Header().Set("Content-Type", "application/json") + json.NewEncoder(w).Encode(instances) +} + +func (h *Handler) GetMetrics(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodGet { + http.Error(w, "Method not allowed", http.StatusMethodNotAllowed) + return + } + + daysStr := r.URL.Query().Get("days") + days := 30 // default + if daysStr != "" { + if d, err := strconv.Atoi(daysStr); err == nil && d > 0 && d <= 365 { + days = d + } + } + + metrics, err := h.storage.GetMetrics(days) + if err != nil { + http.Error(w, "Failed to get metrics", http.StatusInternalServerError) + return + } + + w.Header().Set("Content-Type", "application/json") + json.NewEncoder(w).Encode(metrics) +} diff --git a/telemetry/server/dashboard/dashboard.go b/telemetry/server/dashboard/dashboard.go new file mode 100644 index 000000000..9a56c7f1b --- /dev/null +++ b/telemetry/server/dashboard/dashboard.go @@ -0,0 +1,278 @@ +package dashboard + +import ( + "net/http" +) + +type Handler struct{} + +func NewHandler() *Handler { + return &Handler{} +} + +func (h *Handler) ServeIndex(w http.ResponseWriter, r *http.Request) { + html := ` + + + + + SeaweedFS Telemetry Dashboard + + + + +
+    <div class="header">
+        <h1>SeaweedFS Telemetry Dashboard</h1>
+        <p>Privacy-respecting usage analytics for SeaweedFS</p>
+    </div>
+    <div id="content">Loading telemetry data...</div>
+ + + +` + + w.Header().Set("Content-Type", "text/html") + w.WriteHeader(http.StatusOK) + w.Write([]byte(html)) +} diff --git a/telemetry/server/go.sum b/telemetry/server/go.sum new file mode 100644 index 000000000..0aec189da --- /dev/null +++ b/telemetry/server/go.sum @@ -0,0 +1,31 @@ +github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= +github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= +github.com/cespare/xxhash/v2 v2.2.0 h1:DC2CZ1Ep5Y4k3ZQ899DldepgrayRUGE6BBZ/cd9Cj44= +github.com/cespare/xxhash/v2 v2.2.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= +github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk= +github.com/golang/protobuf v1.5.3 h1:KhyjKVUg7Usr/dYsdSqoFveMYd5ko72D+zANwlG1mmg= +github.com/golang/protobuf v1.5.3/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY= +github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.5.9 h1:O2Tfq5qg4qc4AmwVlvv0oLiVAGB7enBSJ2x2DqQFi38= +github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= +github.com/matttproud/golang_protobuf_extensions v1.0.4 h1:mmDVorXM7PCGKw94cs5zkfA9PSy5pEvNWRP0ET0TIVo= +github.com/matttproud/golang_protobuf_extensions v1.0.4/go.mod h1:BSXmuO+STAnVfrANrmjBb36TMTDstsz7MSK+HVaYKv4= +github.com/prometheus/client_golang v1.17.0 h1:rl2sfwZMtSthVU752MqfjQozy7blglC+1SOtjMAMh+Q= +github.com/prometheus/client_golang v1.17.0/go.mod h1:VeL+gMmOAxkS2IqfCq0ZmHSL+LjWfWDUmp1mBz9JgUY= +github.com/prometheus/client_model v0.4.1-0.20230718164431-9a2bf3000d16 h1:v7DLqVdK4VrYkVD5diGdl4sxJurKJEMnODWRJlxV9oM= +github.com/prometheus/client_model v0.4.1-0.20230718164431-9a2bf3000d16/go.mod h1:oMQmHW1/JoDwqLtg57MGgP/Fb1CJEYF2imWWhWtMkYU= +github.com/prometheus/common v0.44.0 h1:+5BrQJwiBB9xsMygAB3TNvpQKOwlkc25LbISbrdOOfY= +github.com/prometheus/common v0.44.0/go.mod h1:ofAIvZbQ1e/nugmZGz4/qCb9Ap1VoSTIO7x0VV9VvuY= +github.com/prometheus/procfs v0.11.1 h1:xRC8Iq1yyca5ypa9n1EZnWZkt7dwcoRPQwX/5gwaUuI= +github.com/prometheus/procfs v0.11.1/go.mod h1:eesXgaPo1q7lBpVMoMy0ZOFTth9hBn4W/y0/p/ScXhY= +golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sys v0.11.0 h1:eG7RXZHdqOJ1i+0lgLgCpSXAp6M3LYlAo6osgSi0xOM= +golang.org/x/sys v0.11.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw= +google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc= +google.golang.org/protobuf v1.31.0 h1:g0LDEJHgrBl9N9r17Ru3sqWhkIx2NB67okBHPwC7hs8= +google.golang.org/protobuf v1.31.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I= diff --git a/telemetry/server/main.go b/telemetry/server/main.go new file mode 100644 index 000000000..6cbae05c7 --- /dev/null +++ b/telemetry/server/main.go @@ -0,0 +1,111 @@ +package main + +import ( + "encoding/json" + "flag" + "fmt" + "log" + "net/http" + "time" + + "github.com/prometheus/client_golang/prometheus/promhttp" + "github.com/seaweedfs/seaweedfs/telemetry/server/api" 
+ "github.com/seaweedfs/seaweedfs/telemetry/server/dashboard" + "github.com/seaweedfs/seaweedfs/telemetry/server/storage" +) + +var ( + port = flag.Int("port", 8080, "HTTP server port") + enableCORS = flag.Bool("cors", true, "Enable CORS for dashboard") + logRequests = flag.Bool("log", true, "Log incoming requests") + enableDashboard = flag.Bool("dashboard", true, "Enable built-in dashboard (optional when using Grafana)") + cleanupInterval = flag.Duration("cleanup", 24*time.Hour, "Cleanup interval for old instances") + maxInstanceAge = flag.Duration("max-age", 30*24*time.Hour, "Maximum age for instances before cleanup") +) + +func main() { + flag.Parse() + + // Create Prometheus storage instance + store := storage.NewPrometheusStorage() + + // Start cleanup routine + go func() { + ticker := time.NewTicker(*cleanupInterval) + defer ticker.Stop() + for range ticker.C { + store.CleanupOldInstances(*maxInstanceAge) + } + }() + + // Setup HTTP handlers + mux := http.NewServeMux() + + // Prometheus metrics endpoint + mux.Handle("/metrics", promhttp.Handler()) + + // API endpoints + apiHandler := api.NewHandler(store) + mux.HandleFunc("/api/collect", corsMiddleware(logMiddleware(apiHandler.CollectTelemetry))) + mux.HandleFunc("/api/stats", corsMiddleware(logMiddleware(apiHandler.GetStats))) + mux.HandleFunc("/api/instances", corsMiddleware(logMiddleware(apiHandler.GetInstances))) + mux.HandleFunc("/api/metrics", corsMiddleware(logMiddleware(apiHandler.GetMetrics))) + + // Dashboard (optional) + if *enableDashboard { + dashboardHandler := dashboard.NewHandler() + mux.HandleFunc("/", corsMiddleware(dashboardHandler.ServeIndex)) + mux.HandleFunc("/dashboard", corsMiddleware(dashboardHandler.ServeIndex)) + mux.Handle("/static/", http.StripPrefix("/static/", http.FileServer(http.Dir("./static")))) + } + + // Health check + mux.HandleFunc("/health", func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + json.NewEncoder(w).Encode(map[string]string{ + "status": "ok", + "time": time.Now().UTC().Format(time.RFC3339), + }) + }) + + addr := fmt.Sprintf(":%d", *port) + log.Printf("Starting telemetry server on %s", addr) + log.Printf("Prometheus metrics: http://localhost%s/metrics", addr) + if *enableDashboard { + log.Printf("Dashboard: http://localhost%s/dashboard", addr) + } + log.Printf("Cleanup interval: %v, Max instance age: %v", *cleanupInterval, *maxInstanceAge) + + if err := http.ListenAndServe(addr, mux); err != nil { + log.Fatalf("Server failed: %v", err) + } +} + +func corsMiddleware(next http.HandlerFunc) http.HandlerFunc { + return func(w http.ResponseWriter, r *http.Request) { + if *enableCORS { + w.Header().Set("Access-Control-Allow-Origin", "*") + w.Header().Set("Access-Control-Allow-Methods", "GET, POST, OPTIONS") + w.Header().Set("Access-Control-Allow-Headers", "Content-Type, Authorization") + } + + if r.Method == "OPTIONS" { + w.WriteHeader(http.StatusOK) + return + } + + next(w, r) + } +} + +func logMiddleware(next http.HandlerFunc) http.HandlerFunc { + return func(w http.ResponseWriter, r *http.Request) { + if *logRequests { + start := time.Now() + next(w, r) + log.Printf("%s %s %s %v", r.Method, r.URL.Path, r.RemoteAddr, time.Since(start)) + } else { + next(w, r) + } + } +} diff --git a/telemetry/server/storage/prometheus.go b/telemetry/server/storage/prometheus.go new file mode 100644 index 000000000..d25dd669a --- /dev/null +++ b/telemetry/server/storage/prometheus.go @@ -0,0 +1,245 @@ +package storage + +import ( + "encoding/json" + "sync" 
+ "time" + + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promauto" + "github.com/seaweedfs/seaweedfs/telemetry/proto" +) + +type PrometheusStorage struct { + // Prometheus metrics + totalClusters prometheus.Gauge + activeClusters prometheus.Gauge + volumeServerCount *prometheus.GaugeVec + totalDiskBytes *prometheus.GaugeVec + totalVolumeCount *prometheus.GaugeVec + filerCount *prometheus.GaugeVec + brokerCount *prometheus.GaugeVec + clusterInfo *prometheus.GaugeVec + telemetryReceived prometheus.Counter + + // In-memory storage for API endpoints (if needed) + mu sync.RWMutex + instances map[string]*telemetryData + stats map[string]interface{} +} + +// telemetryData is an internal struct that includes the received timestamp +type telemetryData struct { + *proto.TelemetryData + ReceivedAt time.Time `json:"received_at"` +} + +func NewPrometheusStorage() *PrometheusStorage { + return &PrometheusStorage{ + totalClusters: promauto.NewGauge(prometheus.GaugeOpts{ + Name: "seaweedfs_telemetry_total_clusters", + Help: "Total number of unique SeaweedFS clusters (last 30 days)", + }), + activeClusters: promauto.NewGauge(prometheus.GaugeOpts{ + Name: "seaweedfs_telemetry_active_clusters", + Help: "Number of active SeaweedFS clusters (last 7 days)", + }), + volumeServerCount: promauto.NewGaugeVec(prometheus.GaugeOpts{ + Name: "seaweedfs_telemetry_volume_servers", + Help: "Number of volume servers per cluster", + }, []string{"cluster_id", "version", "os", "deployment"}), + totalDiskBytes: promauto.NewGaugeVec(prometheus.GaugeOpts{ + Name: "seaweedfs_telemetry_disk_bytes", + Help: "Total disk usage in bytes per cluster", + }, []string{"cluster_id", "version", "os", "deployment"}), + totalVolumeCount: promauto.NewGaugeVec(prometheus.GaugeOpts{ + Name: "seaweedfs_telemetry_volume_count", + Help: "Total number of volumes per cluster", + }, []string{"cluster_id", "version", "os", "deployment"}), + filerCount: promauto.NewGaugeVec(prometheus.GaugeOpts{ + Name: "seaweedfs_telemetry_filer_count", + Help: "Number of filer servers per cluster", + }, []string{"cluster_id", "version", "os", "deployment"}), + brokerCount: promauto.NewGaugeVec(prometheus.GaugeOpts{ + Name: "seaweedfs_telemetry_broker_count", + Help: "Number of broker servers per cluster", + }, []string{"cluster_id", "version", "os", "deployment"}), + clusterInfo: promauto.NewGaugeVec(prometheus.GaugeOpts{ + Name: "seaweedfs_telemetry_cluster_info", + Help: "Cluster information (always 1, labels contain metadata)", + }, []string{"cluster_id", "version", "os", "deployment", "features"}), + telemetryReceived: promauto.NewCounter(prometheus.CounterOpts{ + Name: "seaweedfs_telemetry_reports_received_total", + Help: "Total number of telemetry reports received", + }), + instances: make(map[string]*telemetryData), + stats: make(map[string]interface{}), + } +} + +func (s *PrometheusStorage) StoreTelemetry(data *proto.TelemetryData) error { + s.mu.Lock() + defer s.mu.Unlock() + + // Update Prometheus metrics + labels := prometheus.Labels{ + "cluster_id": data.ClusterId, + "version": data.Version, + "os": data.Os, + "deployment": data.Deployment, + } + + s.volumeServerCount.With(labels).Set(float64(data.VolumeServerCount)) + s.totalDiskBytes.With(labels).Set(float64(data.TotalDiskBytes)) + s.totalVolumeCount.With(labels).Set(float64(data.TotalVolumeCount)) + s.filerCount.With(labels).Set(float64(data.FilerCount)) + s.brokerCount.With(labels).Set(float64(data.BrokerCount)) + + // Features as JSON string for 
the label + featuresJSON, _ := json.Marshal(data.Features) + infoLabels := prometheus.Labels{ + "cluster_id": data.ClusterId, + "version": data.Version, + "os": data.Os, + "deployment": data.Deployment, + "features": string(featuresJSON), + } + s.clusterInfo.With(infoLabels).Set(1) + + s.telemetryReceived.Inc() + + // Store in memory for API endpoints + s.instances[data.ClusterId] = &telemetryData{ + TelemetryData: data, + ReceivedAt: time.Now().UTC(), + } + + // Update aggregated stats + s.updateStats() + + return nil +} + +func (s *PrometheusStorage) GetStats() (map[string]interface{}, error) { + s.mu.RLock() + defer s.mu.RUnlock() + + // Return cached stats + result := make(map[string]interface{}) + for k, v := range s.stats { + result[k] = v + } + return result, nil +} + +func (s *PrometheusStorage) GetInstances(limit int) ([]*telemetryData, error) { + s.mu.RLock() + defer s.mu.RUnlock() + + var instances []*telemetryData + count := 0 + for _, instance := range s.instances { + if count >= limit { + break + } + instances = append(instances, instance) + count++ + } + + return instances, nil +} + +func (s *PrometheusStorage) GetMetrics(days int) (map[string]interface{}, error) { + s.mu.RLock() + defer s.mu.RUnlock() + + // Return current metrics from in-memory storage + // Historical data should be queried from Prometheus directly + cutoff := time.Now().AddDate(0, 0, -days) + + var volumeServers []map[string]interface{} + var diskUsage []map[string]interface{} + + for _, instance := range s.instances { + if instance.ReceivedAt.After(cutoff) { + volumeServers = append(volumeServers, map[string]interface{}{ + "date": instance.ReceivedAt.Format("2006-01-02"), + "value": instance.TelemetryData.VolumeServerCount, + }) + diskUsage = append(diskUsage, map[string]interface{}{ + "date": instance.ReceivedAt.Format("2006-01-02"), + "value": instance.TelemetryData.TotalDiskBytes, + }) + } + } + + return map[string]interface{}{ + "volume_servers": volumeServers, + "disk_usage": diskUsage, + }, nil +} + +func (s *PrometheusStorage) updateStats() { + now := time.Now() + last7Days := now.AddDate(0, 0, -7) + last30Days := now.AddDate(0, 0, -30) + + totalInstances := 0 + activeInstances := 0 + versions := make(map[string]int) + osDistribution := make(map[string]int) + deployments := make(map[string]int) + + for _, instance := range s.instances { + if instance.ReceivedAt.After(last30Days) { + totalInstances++ + } + if instance.ReceivedAt.After(last7Days) { + activeInstances++ + versions[instance.TelemetryData.Version]++ + osDistribution[instance.TelemetryData.Os]++ + deployments[instance.TelemetryData.Deployment]++ + } + } + + // Update Prometheus gauges + s.totalClusters.Set(float64(totalInstances)) + s.activeClusters.Set(float64(activeInstances)) + + // Update cached stats for API + s.stats = map[string]interface{}{ + "total_instances": totalInstances, + "active_instances": activeInstances, + "versions": versions, + "os_distribution": osDistribution, + "deployments": deployments, + } +} + +// CleanupOldInstances removes instances older than the specified duration +func (s *PrometheusStorage) CleanupOldInstances(maxAge time.Duration) { + s.mu.Lock() + defer s.mu.Unlock() + + cutoff := time.Now().Add(-maxAge) + for instanceID, instance := range s.instances { + if instance.ReceivedAt.Before(cutoff) { + delete(s.instances, instanceID) + + // Remove from Prometheus metrics + labels := prometheus.Labels{ + "cluster_id": instance.TelemetryData.ClusterId, + "version": instance.TelemetryData.Version, + "os": 
instance.TelemetryData.Os, + "deployment": instance.TelemetryData.Deployment, + } + s.volumeServerCount.Delete(labels) + s.totalDiskBytes.Delete(labels) + s.totalVolumeCount.Delete(labels) + s.filerCount.Delete(labels) + s.brokerCount.Delete(labels) + } + } + + s.updateStats() +} diff --git a/telemetry/test/integration.go b/telemetry/test/integration.go new file mode 100644 index 000000000..d0eea4777 --- /dev/null +++ b/telemetry/test/integration.go @@ -0,0 +1,315 @@ +package main + +import ( + "context" + "fmt" + "io" + "log" + "net/http" + "os" + "os/exec" + "path/filepath" + "strings" + "syscall" + "time" + + "github.com/seaweedfs/seaweedfs/telemetry/proto" + "github.com/seaweedfs/seaweedfs/weed/telemetry" + protobuf "google.golang.org/protobuf/proto" +) + +const ( + serverPort = "18080" // Use different port to avoid conflicts + serverURL = "http://localhost:" + serverPort +) + +func main() { + fmt.Println("🧪 Starting SeaweedFS Telemetry Integration Test") + + // Start telemetry server + fmt.Println("📡 Starting telemetry server...") + serverCmd, err := startTelemetryServer() + if err != nil { + log.Fatalf("❌ Failed to start telemetry server: %v", err) + } + defer stopServer(serverCmd) + + // Wait for server to start + if !waitForServer(serverURL+"/health", 15*time.Second) { + log.Fatal("❌ Telemetry server failed to start") + } + fmt.Println("✅ Telemetry server started successfully") + + // Test protobuf marshaling first + fmt.Println("🔧 Testing protobuf marshaling...") + if err := testProtobufMarshaling(); err != nil { + log.Fatalf("❌ Protobuf marshaling test failed: %v", err) + } + fmt.Println("✅ Protobuf marshaling test passed") + + // Test protobuf client + fmt.Println("🔄 Testing protobuf telemetry client...") + if err := testTelemetryClient(); err != nil { + log.Fatalf("❌ Telemetry client test failed: %v", err) + } + fmt.Println("✅ Telemetry client test passed") + + // Test server metrics endpoint + fmt.Println("📊 Testing Prometheus metrics endpoint...") + if err := testMetricsEndpoint(); err != nil { + log.Fatalf("❌ Metrics endpoint test failed: %v", err) + } + fmt.Println("✅ Metrics endpoint test passed") + + // Test stats API + fmt.Println("📈 Testing stats API...") + if err := testStatsAPI(); err != nil { + log.Fatalf("❌ Stats API test failed: %v", err) + } + fmt.Println("✅ Stats API test passed") + + // Test instances API + fmt.Println("📋 Testing instances API...") + if err := testInstancesAPI(); err != nil { + log.Fatalf("❌ Instances API test failed: %v", err) + } + fmt.Println("✅ Instances API test passed") + + fmt.Println("🎉 All telemetry integration tests passed!") +} + +func startTelemetryServer() (*exec.Cmd, error) { + // Get the directory where this test is running + testDir, err := os.Getwd() + if err != nil { + return nil, fmt.Errorf("failed to get working directory: %v", err) + } + + // Navigate to the server directory (from main seaweedfs directory) + serverDir := filepath.Join(testDir, "telemetry", "server") + + cmd := exec.Command("go", "run", ".", + "-port="+serverPort, + "-dashboard=false", + "-cleanup=1m", + "-max-age=1h") + + cmd.Dir = serverDir + + // Create log files for server output + logFile, err := os.Create("telemetry-server-test.log") + if err != nil { + return nil, fmt.Errorf("failed to create log file: %v", err) + } + + cmd.Stdout = logFile + cmd.Stderr = logFile + + if err := cmd.Start(); err != nil { + return nil, fmt.Errorf("failed to start server: %v", err) + } + + return cmd, nil +} + +func stopServer(cmd *exec.Cmd) { + if cmd != nil && 
cmd.Process != nil { + cmd.Process.Signal(syscall.SIGTERM) + cmd.Wait() + + // Clean up log file + os.Remove("telemetry-server-test.log") + } +} + +func waitForServer(url string, timeout time.Duration) bool { + ctx, cancel := context.WithTimeout(context.Background(), timeout) + defer cancel() + + fmt.Printf("⏳ Waiting for server at %s...\n", url) + + for { + select { + case <-ctx.Done(): + return false + default: + resp, err := http.Get(url) + if err == nil { + resp.Body.Close() + if resp.StatusCode == http.StatusOK { + return true + } + } + time.Sleep(500 * time.Millisecond) + } + } +} + +func testProtobufMarshaling() error { + // Test protobuf marshaling/unmarshaling + testData := &proto.TelemetryData{ + ClusterId: "test-cluster-12345", + Version: "test-3.45", + Os: "linux/amd64", + Features: []string{"filer", "s3api"}, + Deployment: "test", + VolumeServerCount: 2, + TotalDiskBytes: 1000000, + TotalVolumeCount: 10, + FilerCount: 1, + BrokerCount: 1, + Timestamp: time.Now().Unix(), + } + + // Marshal + data, err := protobuf.Marshal(testData) + if err != nil { + return fmt.Errorf("failed to marshal protobuf: %v", err) + } + + fmt.Printf(" Protobuf size: %d bytes\n", len(data)) + + // Unmarshal + testData2 := &proto.TelemetryData{} + if err := protobuf.Unmarshal(data, testData2); err != nil { + return fmt.Errorf("failed to unmarshal protobuf: %v", err) + } + + // Verify data + if testData2.ClusterId != testData.ClusterId { + return fmt.Errorf("protobuf data mismatch: expected %s, got %s", + testData.ClusterId, testData2.ClusterId) + } + + if testData2.VolumeServerCount != testData.VolumeServerCount { + return fmt.Errorf("volume server count mismatch: expected %d, got %d", + testData.VolumeServerCount, testData2.VolumeServerCount) + } + + return nil +} + +func testTelemetryClient() error { + // Create telemetry client + client := telemetry.NewClient(serverURL+"/api/collect", true) + + // Create test data using protobuf format + testData := &proto.TelemetryData{ + Version: "test-3.45", + Os: "linux/amd64", + Features: []string{"filer", "s3api", "mq"}, + Deployment: "integration-test", + VolumeServerCount: 3, + TotalDiskBytes: 1073741824, // 1GB + TotalVolumeCount: 50, + FilerCount: 2, + BrokerCount: 1, + Timestamp: time.Now().Unix(), + } + + // Send telemetry data + if err := client.SendTelemetry(testData); err != nil { + return fmt.Errorf("failed to send telemetry: %v", err) + } + + fmt.Printf(" Sent telemetry for cluster: %s\n", client.GetInstanceID()) + + // Wait a bit for processing + time.Sleep(2 * time.Second) + + return nil +} + +func testMetricsEndpoint() error { + resp, err := http.Get(serverURL + "/metrics") + if err != nil { + return fmt.Errorf("failed to get metrics: %v", err) + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + return fmt.Errorf("metrics endpoint returned status %d", resp.StatusCode) + } + + // Read response and check for expected metrics + content, err := io.ReadAll(resp.Body) + if err != nil { + return fmt.Errorf("failed to read metrics response: %v", err) + } + + contentStr := string(content) + expectedMetrics := []string{ + "seaweedfs_telemetry_total_clusters", + "seaweedfs_telemetry_active_clusters", + "seaweedfs_telemetry_reports_received_total", + "seaweedfs_telemetry_volume_servers", + "seaweedfs_telemetry_disk_bytes", + "seaweedfs_telemetry_volume_count", + "seaweedfs_telemetry_filer_count", + "seaweedfs_telemetry_broker_count", + } + + for _, metric := range expectedMetrics { + if !strings.Contains(contentStr, metric) { + return 
fmt.Errorf("missing expected metric: %s", metric) + } + } + + // Check that we have at least one report received + if !strings.Contains(contentStr, "seaweedfs_telemetry_reports_received_total 1") { + fmt.Printf(" Warning: Expected at least 1 report received, metrics content:\n%s\n", contentStr) + } + + fmt.Printf(" Found %d expected metrics\n", len(expectedMetrics)) + + return nil +} + +func testStatsAPI() error { + resp, err := http.Get(serverURL + "/api/stats") + if err != nil { + return fmt.Errorf("failed to get stats: %v", err) + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + return fmt.Errorf("stats API returned status %d", resp.StatusCode) + } + + // Read and verify JSON response + content, err := io.ReadAll(resp.Body) + if err != nil { + return fmt.Errorf("failed to read stats response: %v", err) + } + + contentStr := string(content) + if !strings.Contains(contentStr, "total_instances") { + return fmt.Errorf("stats response missing total_instances field") + } + + fmt.Printf(" Stats response: %s\n", contentStr) + + return nil +} + +func testInstancesAPI() error { + resp, err := http.Get(serverURL + "/api/instances?limit=10") + if err != nil { + return fmt.Errorf("failed to get instances: %v", err) + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + return fmt.Errorf("instances API returned status %d", resp.StatusCode) + } + + // Read response + content, err := io.ReadAll(resp.Body) + if err != nil { + return fmt.Errorf("failed to read instances response: %v", err) + } + + fmt.Printf(" Instances response length: %d bytes\n", len(content)) + + return nil +} diff --git a/weed/command/master.go b/weed/command/master.go index a8cdf76c6..0761687d9 100644 --- a/weed/command/master.go +++ b/weed/command/master.go @@ -61,6 +61,8 @@ type MasterOptions struct { electionTimeout *time.Duration raftHashicorp *bool raftBootstrap *bool + telemetryUrl *string + telemetryEnabled *bool } func init() { @@ -88,6 +90,8 @@ func init() { m.electionTimeout = cmdMaster.Flag.Duration("electionTimeout", 10*time.Second, "election timeout of master servers") m.raftHashicorp = cmdMaster.Flag.Bool("raftHashicorp", false, "use hashicorp raft") m.raftBootstrap = cmdMaster.Flag.Bool("raftBootstrap", false, "Whether to bootstrap the Raft cluster") + m.telemetryUrl = cmdMaster.Flag.String("telemetry.url", "https://telemetry.seaweedfs.com:3091/api/collect", "telemetry server URL to send usage statistics") + m.telemetryEnabled = cmdMaster.Flag.Bool("telemetry", false, "enable telemetry reporting") } var cmdMaster = &Command{ @@ -332,5 +336,7 @@ func (m *MasterOptions) toMasterOption(whiteList []string) *weed_server.MasterOp DisableHttp: *m.disableHttp, MetricsAddress: *m.metricsAddress, MetricsIntervalSec: *m.metricsIntervalSec, + TelemetryUrl: *m.telemetryUrl, + TelemetryEnabled: *m.telemetryEnabled, } } diff --git a/weed/command/server.go b/weed/command/server.go index dd3b0c8b4..c1f80b325 100644 --- a/weed/command/server.go +++ b/weed/command/server.go @@ -104,6 +104,8 @@ func init() { masterOptions.raftBootstrap = cmdServer.Flag.Bool("master.raftBootstrap", false, "Whether to bootstrap the Raft cluster") masterOptions.heartbeatInterval = cmdServer.Flag.Duration("master.heartbeatInterval", 300*time.Millisecond, "heartbeat interval of master servers, and will be randomly multiplied by [1, 1.25)") masterOptions.electionTimeout = cmdServer.Flag.Duration("master.electionTimeout", 10*time.Second, "election timeout of master servers") + masterOptions.telemetryUrl = 
cmdServer.Flag.String("master.telemetry.url", "https://telemetry.seaweedfs.com:3091/api/collect", "telemetry server URL to send usage statistics") + masterOptions.telemetryEnabled = cmdServer.Flag.Bool("master.telemetry", false, "enable telemetry reporting") filerOptions.filerGroup = cmdServer.Flag.String("filer.filerGroup", "", "share metadata with other filers in the same filerGroup") filerOptions.collection = cmdServer.Flag.String("filer.collection", "", "all data will be stored in this collection") diff --git a/weed/server/master_server.go b/weed/server/master_server.go index 6569fdbd4..48576adf4 100644 --- a/weed/server/master_server.go +++ b/weed/server/master_server.go @@ -8,11 +8,13 @@ import ( "net/url" "os" "regexp" + "runtime" "strings" "sync" "time" "github.com/seaweedfs/seaweedfs/weed/stats" + "github.com/seaweedfs/seaweedfs/weed/telemetry" "github.com/seaweedfs/seaweedfs/weed/cluster" "github.com/seaweedfs/seaweedfs/weed/pb" @@ -30,6 +32,7 @@ import ( "github.com/seaweedfs/seaweedfs/weed/topology" "github.com/seaweedfs/seaweedfs/weed/util" util_http "github.com/seaweedfs/seaweedfs/weed/util/http" + "github.com/seaweedfs/seaweedfs/weed/util/version" "github.com/seaweedfs/seaweedfs/weed/wdclient" ) @@ -52,6 +55,8 @@ type MasterOption struct { MetricsAddress string MetricsIntervalSec int IsFollower bool + TelemetryUrl string + TelemetryEnabled bool } type MasterServer struct { @@ -76,6 +81,9 @@ type MasterServer struct { adminLocks *AdminLocks Cluster *cluster.Cluster + + // telemetry + telemetryCollector *telemetry.Collector } func NewMasterServer(r *mux.Router, option *MasterOption, peers map[string]pb.ServerAddress) *MasterServer { @@ -131,6 +139,28 @@ func NewMasterServer(r *mux.Router, option *MasterOption, peers map[string]pb.Se ms.vg = topology.NewDefaultVolumeGrowth() glog.V(0).Infoln("Volume Size Limit is", ms.option.VolumeSizeLimitMB, "MB") + // Initialize telemetry after topology is created + if option.TelemetryEnabled && option.TelemetryUrl != "" { + telemetryClient := telemetry.NewClient(option.TelemetryUrl, option.TelemetryEnabled) + ms.telemetryCollector = telemetry.NewCollector(telemetryClient, ms.Topo, ms.Cluster) + ms.telemetryCollector.SetMasterServer(ms) + + // Set version and OS information + ms.telemetryCollector.SetVersion(version.VERSION_NUMBER) + ms.telemetryCollector.SetOS(runtime.GOOS + "/" + runtime.GOARCH) + + // Determine features and deployment type + features := []string{"master"} + if len(peers) > 1 { + features = append(features, "cluster") + } + ms.telemetryCollector.SetFeatures(features) + ms.telemetryCollector.SetDeployment(telemetry.DetermineDeployment(true, false, len(peers))) + + // Start periodic telemetry collection (every 24 hours) + ms.telemetryCollector.StartPeriodicCollection(24 * time.Hour) + } + ms.guard = security.NewGuard(append(ms.option.WhiteList, whiteList...), signingKey, expiresAfterSec, readSigningKey, readExpiresAfterSec) handleStaticResources2(r) diff --git a/weed/telemetry/client.go b/weed/telemetry/client.go new file mode 100644 index 000000000..528984d4d --- /dev/null +++ b/weed/telemetry/client.go @@ -0,0 +1,100 @@ +package telemetry + +import ( + "bytes" + "fmt" + "net/http" + "time" + + "github.com/google/uuid" + "github.com/seaweedfs/seaweedfs/telemetry/proto" + "github.com/seaweedfs/seaweedfs/weed/glog" + protobuf "google.golang.org/protobuf/proto" +) + +type Client struct { + url string + enabled bool + instanceID string + httpClient *http.Client +} + +// NewClient creates a new telemetry client +func 
NewClient(url string, enabled bool) *Client { + return &Client{ + url: url, + enabled: enabled, + instanceID: uuid.New().String(), // Generate UUID in memory only + httpClient: &http.Client{ + Timeout: 10 * time.Second, + }, + } +} + +// IsEnabled returns whether telemetry is enabled +func (c *Client) IsEnabled() bool { + return c.enabled && c.url != "" +} + +// SendTelemetry sends telemetry data synchronously using protobuf format +func (c *Client) SendTelemetry(data *proto.TelemetryData) error { + if !c.IsEnabled() { + return nil + } + + // Set the cluster ID + data.ClusterId = c.instanceID + + return c.sendProtobuf(data) +} + +// SendTelemetryAsync sends telemetry data asynchronously +func (c *Client) SendTelemetryAsync(data *proto.TelemetryData) { + if !c.IsEnabled() { + return + } + + go func() { + if err := c.SendTelemetry(data); err != nil { + glog.V(1).Infof("Failed to send telemetry: %v", err) + } + }() +} + +// sendProtobuf sends data using protobuf format +func (c *Client) sendProtobuf(data *proto.TelemetryData) error { + req := &proto.TelemetryRequest{ + Data: data, + } + + body, err := protobuf.Marshal(req) + if err != nil { + return fmt.Errorf("failed to marshal protobuf: %v", err) + } + + httpReq, err := http.NewRequest("POST", c.url, bytes.NewBuffer(body)) + if err != nil { + return fmt.Errorf("failed to create request: %v", err) + } + + httpReq.Header.Set("Content-Type", "application/x-protobuf") + httpReq.Header.Set("User-Agent", fmt.Sprintf("SeaweedFS/%s", data.Version)) + + resp, err := c.httpClient.Do(httpReq) + if err != nil { + return fmt.Errorf("failed to send request: %v", err) + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + return fmt.Errorf("server returned status %d", resp.StatusCode) + } + + glog.V(2).Infof("Telemetry sent successfully via protobuf") + return nil +} + +// GetInstanceID returns the current instance ID +func (c *Client) GetInstanceID() string { + return c.instanceID +} diff --git a/weed/telemetry/collector.go b/weed/telemetry/collector.go new file mode 100644 index 000000000..7991d92c8 --- /dev/null +++ b/weed/telemetry/collector.go @@ -0,0 +1,218 @@ +package telemetry + +import ( + "time" + + "github.com/seaweedfs/seaweedfs/telemetry/proto" + "github.com/seaweedfs/seaweedfs/weed/cluster" + "github.com/seaweedfs/seaweedfs/weed/glog" + "github.com/seaweedfs/seaweedfs/weed/topology" +) + +type Collector struct { + client *Client + topo *topology.Topology + cluster *cluster.Cluster + masterServer interface{} // Will be set to *weed_server.MasterServer to access client tracking + features []string + deployment string + version string + os string +} + +// NewCollector creates a new telemetry collector +func NewCollector(client *Client, topo *topology.Topology, cluster *cluster.Cluster) *Collector { + return &Collector{ + client: client, + topo: topo, + cluster: cluster, + masterServer: nil, + features: []string{}, + deployment: "unknown", + version: "unknown", + os: "unknown", + } +} + +// SetFeatures sets the list of enabled features +func (c *Collector) SetFeatures(features []string) { + c.features = features +} + +// SetDeployment sets the deployment type (standalone, cluster, etc.) 
+func (c *Collector) SetDeployment(deployment string) { + c.deployment = deployment +} + +// SetVersion sets the SeaweedFS version +func (c *Collector) SetVersion(version string) { + c.version = version +} + +// SetOS sets the operating system information +func (c *Collector) SetOS(os string) { + c.os = os +} + +// SetMasterServer sets a reference to the master server for client tracking +func (c *Collector) SetMasterServer(masterServer interface{}) { + c.masterServer = masterServer +} + +// CollectAndSendAsync collects telemetry data and sends it asynchronously +func (c *Collector) CollectAndSendAsync() { + if !c.client.IsEnabled() { + return + } + + go func() { + data := c.collectData() + c.client.SendTelemetryAsync(data) + }() +} + +// StartPeriodicCollection starts sending telemetry data periodically +func (c *Collector) StartPeriodicCollection(interval time.Duration) { + if !c.client.IsEnabled() { + glog.V(1).Infof("Telemetry is disabled, skipping periodic collection") + return + } + + glog.V(0).Infof("Starting telemetry collection every %v", interval) + + // Send initial telemetry after a short delay + go func() { + time.Sleep(30 * time.Second) // Wait for cluster to stabilize + c.CollectAndSendAsync() + }() + + // Start periodic collection + ticker := time.NewTicker(interval) + go func() { + defer ticker.Stop() + for range ticker.C { + c.CollectAndSendAsync() + } + }() +} + +// collectData gathers telemetry data from the topology +func (c *Collector) collectData() *proto.TelemetryData { + data := &proto.TelemetryData{ + Version: c.version, + Os: c.os, + Features: c.features, + Deployment: c.deployment, + Timestamp: time.Now().Unix(), + } + + if c.topo != nil { + // Collect volume server count + data.VolumeServerCount = int32(c.countVolumeServers()) + + // Collect total disk usage and volume count + diskBytes, volumeCount := c.collectVolumeStats() + data.TotalDiskBytes = diskBytes + data.TotalVolumeCount = int32(volumeCount) + } + + if c.cluster != nil { + // Collect filer and broker counts + data.FilerCount = int32(c.countFilers()) + data.BrokerCount = int32(c.countBrokers()) + } + + return data +} + +// countVolumeServers counts the number of active volume servers +func (c *Collector) countVolumeServers() int { + count := 0 + for _, dcNode := range c.topo.Children() { + dc := dcNode.(*topology.DataCenter) + for _, rackNode := range dc.Children() { + rack := rackNode.(*topology.Rack) + for range rack.Children() { + count++ + } + } + } + return count +} + +// collectVolumeStats collects total disk usage and volume count +func (c *Collector) collectVolumeStats() (uint64, int) { + var totalDiskBytes uint64 + var totalVolumeCount int + + for _, dcNode := range c.topo.Children() { + dc := dcNode.(*topology.DataCenter) + for _, rackNode := range dc.Children() { + rack := rackNode.(*topology.Rack) + for _, dnNode := range rack.Children() { + dn := dnNode.(*topology.DataNode) + volumes := dn.GetVolumes() + for _, volumeInfo := range volumes { + totalVolumeCount++ + totalDiskBytes += volumeInfo.Size + } + } + } + } + + return totalDiskBytes, totalVolumeCount +} + +// countFilers counts the number of active filer servers across all groups +func (c *Collector) countFilers() int { + // Count all filer-type nodes in the cluster + // This includes both pure filer servers and S3 servers (which register as filers) + count := 0 + for _, groupName := range c.getAllFilerGroups() { + nodes := c.cluster.ListClusterNode(cluster.FilerGroupName(groupName), cluster.FilerType) + count += len(nodes) + } + 
return count +} + +// countBrokers counts the number of active broker servers +func (c *Collector) countBrokers() int { + // Count brokers across all broker groups + count := 0 + for _, groupName := range c.getAllBrokerGroups() { + nodes := c.cluster.ListClusterNode(cluster.FilerGroupName(groupName), cluster.BrokerType) + count += len(nodes) + } + return count +} + +// getAllFilerGroups returns all filer group names +func (c *Collector) getAllFilerGroups() []string { + // For simplicity, we check the default group + // In a more sophisticated implementation, we could enumerate all groups + return []string{""} +} + +// getAllBrokerGroups returns all broker group names +func (c *Collector) getAllBrokerGroups() []string { + // For simplicity, we check the default group + // In a more sophisticated implementation, we could enumerate all groups + return []string{""} +} + +// DetermineDeployment determines the deployment type based on configuration +func DetermineDeployment(isMasterEnabled, isVolumeEnabled bool, peerCount int) string { + if isMasterEnabled && isVolumeEnabled { + if peerCount > 1 { + return "cluster" + } + return "standalone" + } + if isMasterEnabled { + return "master-only" + } + if isVolumeEnabled { + return "volume-only" + } + return "unknown" +}