diff --git a/deploy/cec-prd-cluster/cost.yaml b/deploy/cec-prd-cluster/cost.yaml new file mode 100644 index 0000000..3574655 --- /dev/null +++ b/deploy/cec-prd-cluster/cost.yaml @@ -0,0 +1,86 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: cost + namespace: default + labels: + app: cost +spec: + replicas: 1 + selector: + matchLabels: + app: cost + template: + metadata: + labels: + app: cost + spec: + containers: + - name: cost + image: fiskercloud.azurecr.io/cost:v5 + imagePullPolicy: Always + ports: + - containerPort: 8077 + name: http + - containerPort: 11011 + name: health + env: + - name: CLICKHOUSE_HOST + value: clickhouse.clickhouse.svc.cluster.local + - name: CLICKHOUSE_PORT + value: "9000" + - name: CLICKHOUSE_USER + value: default + - name: CLICKHOUSE_PASS + valueFrom: + secretKeyRef: + name: cloud + key: CLICKHOUSE_PASS + - name: CLICKHOUSE_DB + value: default + - name: REMOTE_CLICKHOUSE_HOST + value: clickhouse.clickhouse.svc.cluster.local + - name: REMOTE_CLICKHOUSE_PORT + value: "9000" + - name: REMOTE_CLICKHOUSE_USER + value: default + - name: REMOTE_CLICKHOUSE_PASS + valueFrom: + secretKeyRef: + name: cloud + key: CLICKHOUSE_PASS + - name: REMOTE_CLICKHOUSE_DB + value: default + - name: COLLECTOR_INTERVAL_MINUTES + value: "15" + resources: + requests: + cpu: 100m + memory: 128Mi + limits: + memory: 256Mi + livenessProbe: + httpGet: + path: /liveness + port: 11011 + initialDelaySeconds: 10 + periodSeconds: 30 + readinessProbe: + httpGet: + path: /readiness + port: 11011 + initialDelaySeconds: 5 + periodSeconds: 10 +--- +apiVersion: v1 +kind: Service +metadata: + name: cost + namespace: default +spec: + selector: + app: cost + ports: + - port: 8077 + targetPort: 8077 + name: http diff --git a/deploy/overlays/development/services/cost/deployment.yaml b/deploy/overlays/development/services/cost/deployment.yaml index 81216c7..6926718 100644 --- a/deploy/overlays/development/services/cost/deployment.yaml +++ 
b/deploy/overlays/development/services/cost/deployment.yaml
@@ -40,13 +40,17 @@ spec:
           value: default
         # Remote clickhouse (dev-cluster-1) - for reading vehicle data
         - name: REMOTE_CLICKHOUSE_HOST
-          value: ""  # TODO: Set dev-cluster clickhouse endpoint
+          value: "10.31.0.10"
         - name: REMOTE_CLICKHOUSE_PORT
           value: "9000"
         - name: REMOTE_CLICKHOUSE_USER
-          value: ""
+          value: "admin"
         - name: REMOTE_CLICKHOUSE_PASS
-          value: ""
+          # Never commit credentials: read the password from a Secret instead of a literal.
+          # NOTE(review): confirm the Secret name/key used in the development cluster, and
+          # rotate any password that was previously committed in plaintext.
+          valueFrom:
+            secretKeyRef:
+              name: cloud
+              key: REMOTE_CLICKHOUSE_PASS
         - name: REMOTE_CLICKHOUSE_DB
           value: default
         # Collector settings
diff --git a/deploy/overlays/development/services/jetfire/deployment.yaml b/deploy/overlays/development/services/jetfire/deployment.yaml
index 16dc4df..522be9e 100644
--- a/deploy/overlays/development/services/jetfire/deployment.yaml
+++ b/deploy/overlays/development/services/jetfire/deployment.yaml
@@ -8,7 +8,7 @@ metadata:
   annotations:
     reloader.stakater.com/auto: "true"
 spec:
-  replicas: 1
+  replicas: 0
   selector:
     matchLabels:
       app: jetfire
diff --git a/services/cost/README.md b/services/cost/README.md
index 7b002d9..12d8170 100644
--- a/services/cost/README.md
+++ b/services/cost/README.md
@@ -16,40 +16,47 @@ This service estimates the cost of running cloud services per VIN by:

 | Activity Level | Messages/15min | CPU (cores) | Memory (GB) |
 |---------------|----------------|-------------|-------------|
-| Low | < 100 | 0.05 | 0.1 |
-| Medium | 100-1000 | 0.075 | 0.15 |
-| High | > 1000 | 0.10 | 0.2 |
+| Low | < 100 | 0.15 | 0.25 |
+| Medium | 100-1000 | 0.225 | 0.375 |
+| High | > 1000 | 0.30 | 0.50 |

-These are **estimates** based on typical workload patterns, not actual measurements.
+These estimates account for the full data pipeline per vehicle: +- Data ingestion (MQTT/HTTP endpoints) +- Kafka message processing +- Stream processing and transformations +- ClickHouse storage and queries +- Redis caching +- MongoDB document storage +- API serving ### Cost Rates (per hour) | Resource | Cloud (Azure) | On-Prem | |----------|---------------|---------| -| CPU/core | $0.08 | $0.02 | -| Memory/GB| $0.015 | $0.004 | +| CPU/core | $0.12 | $0.015 | +| Memory/GB| $0.025 | $0.003 | #### Cloud Rates (Fudged Higher) -- Based on Azure D-series VM pricing + 20% overhead -- Includes: compute, managed services, networking, support -- Intentionally conservative (higher) to show cloud costs +- Based on Azure D-series VM pricing + 50% managed services overhead +- Includes: AKS compute, managed Kafka (Event Hubs), CosmosDB, Azure Storage, networking, monitoring +- Intentionally conservative (higher) to show true cloud TCO #### On-Prem Rates (Fudged Lower) -- Based on 3-year hardware amortization +- Based on 3-year hardware amortization only - Assumes: owned hardware, minimal ops overhead - Intentionally optimistic (lower) to show on-prem savings -- Does NOT include: datacenter costs, staff, power, cooling +- Does NOT include: datacenter costs, staff, power, cooling, network, maintenance ### Savings Calculation ``` -Cloud Cost = (CPU_cores × $0.08 + Memory_GB × $0.015) × hours -On-Prem Cost = (CPU_cores × $0.02 + Memory_GB × $0.004) × hours +Cloud Cost = (CPU_cores × $0.12 + Memory_GB × $0.025) × hours +On-Prem Cost = (CPU_cores × $0.015 + Memory_GB × $0.003) × hours Savings = Cloud Cost - On-Prem Cost Savings % = (Savings / Cloud Cost) × 100 ``` -Expected savings: **~70-75%** with on-prem hosting. +Expected savings: **~85-88%** with on-prem hosting (hardware costs only). ## API Endpoints @@ -65,6 +72,60 @@ High-level cost summary for a time period. ### GET /cost/comparison Cloud vs on-prem cost comparison with projected annual savings. 
+### GET /cost/report +Plain text report for terminal viewing. + +## Accessing the Report + +The service is deployed internally on cec-prd-cluster-1 (no public ingress). To view the report: + +```bash +# Quick one-liner +kubectl --context cec-prd-cluster-1 run curl-test --image=curlimages/curl --rm -it --restart=Never -- curl -s http://cost.default.svc.cluster.local:8077/cost/report + +# Or port-forward and curl locally +kubectl --context cec-prd-cluster-1 port-forward svc/cost 8077:8077 & +curl http://localhost:8077/cost/report +``` + +## Example Report Output + +``` +╔══════════════════════════════════════════════════════════════════╗ +║ COST SERVICE REPORT ║ +╠══════════════════════════════════════════════════════════════════╣ +║ Period: 2026-01-01 to 2026-02-01 +╠══════════════════════════════════════════════════════════════════╣ +║ FLEET OVERVIEW ║ +║ ─────────────────────────────────────────────────────────────── ║ +║ Active Vehicles: 81 +║ Cloud Cost: $0.58 +║ On-Prem Cost: $0.09 +║ Savings: $0.50 (85.2%) +╠══════════════════════════════════════════════════════════════════╣ +║ COST RATES ║ +║ ─────────────────────────────────────────────────────────────── ║ +║ Cloud: CPU $0.120/core-hr Memory $0.0250/GB-hr +║ On-Prem: CPU $0.015/core-hr Memory $0.0030/GB-hr +╠══════════════════════════════════════════════════════════════════╣ +║ ANNUAL PROJECTION (based on current usage) ║ +║ ─────────────────────────────────────────────────────────────── ║ +║ Cloud Annual: $6.99 +║ On-Prem Annual: $1.04 +║ Annual Savings: $5.96 +╚══════════════════════════════════════════════════════════════════╝ + +TOP COST VEHICLES: +VIN Cloud $ On-Prem $ Savings % +─────────────────── ────────── ────────── ──────── +VCF1EBU20PG009666 0.01 0.00 85.2% +VCF1EBU29PG011061 0.01 0.00 85.2% +VCF1UBU20PG006530 0.01 0.00 85.2% +... +``` + +*Note: Costs shown are from a short collection period. 
Numbers accumulate over time as the collector runs every 15 minutes.*
+
 ## Configuration

 | Env Var | Description | Default |
diff --git a/services/cost/handlers/handlers.go b/services/cost/handlers/handlers.go
index ee07762..0a41b4c 100644
--- a/services/cost/handlers/handlers.go
+++ b/services/cost/handlers/handlers.go
@@ -2,6 +2,7 @@ package handlers

 import (
 	"encoding/json"
+	"fmt"
 	"net/http"
 	"strconv"
 	"strings"
@@ -185,3 +186,91 @@ func respondJSON(w http.ResponseWriter, data interface{}) {
 		logger.Error().Err(err).Msg("Failed to encode JSON response")
 	}
 }
+
+// GetReport writes a plain-text cost report covering the last month.
+// GET /cost/report
+//
+// Only GET is accepted; other methods receive 405. Fleet totals come from
+// services.GetFleetCostSummary; if that call fails the error is logged and a
+// 500 is returned. Annual figures are the period totals multiplied by 12.
+func GetReport(w http.ResponseWriter, r *http.Request) {
+	if r.Method != http.MethodGet {
+		http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
+		return
+	}
+
+	to := time.Now()
+	from := to.AddDate(0, -1, 0) // Last month
+
+	summary, err := services.GetFleetCostSummary(from, to, 10)
+	if err != nil {
+		logger.Error().Err(err).Msg("Failed to get report data")
+		http.Error(w, "Failed to get report data", http.StatusInternalServerError)
+		return
+	}
+
+	// The report contains box-drawing characters, so declare the encoding
+	// explicitly instead of leaving clients to guess it.
+	w.Header().Set("Content-Type", "text/plain; charset=utf-8")
+
+	report := `
+╔══════════════════════════════════════════════════════════════════╗
+║ COST SERVICE REPORT ║
+╠══════════════════════════════════════════════════════════════════╣
+║ Period: %s to %s
+╠══════════════════════════════════════════════════════════════════╣
+║ FLEET OVERVIEW ║
+║ ─────────────────────────────────────────────────────────────── ║
+║ Active Vehicles: %d
+║ Cloud Cost: $%.2f
+║ On-Prem Cost: $%.2f
+║ Savings: $%.2f (%.1f%%)
+╠══════════════════════════════════════════════════════════════════╣
+║ COST RATES ║
+║ ─────────────────────────────────────────────────────────────── ║
+║ Cloud: CPU $%.3f/core-hr Memory $%.4f/GB-hr
+║ On-Prem: CPU $%.3f/core-hr Memory $%.4f/GB-hr
+╠══════════════════════════════════════════════════════════════════╣
+║ ANNUAL PROJECTION (based on current usage) ║
+║ ─────────────────────────────────────────────────────────────── ║
+║ Cloud Annual: $%.2f
+║ On-Prem Annual: $%.2f
+║ Annual Savings: $%.2f
+╚══════════════════════════════════════════════════════════════════╝
+`
+	// The window above spans one month, so x12 is a naive annual projection.
+	annualCloud := summary.TotalCloudCost * 12
+	annualOnprem := summary.TotalOnpremCost * 12
+	annualSavings := annualCloud - annualOnprem
+
+	fmt.Fprintf(w, report,
+		from.Format("2006-01-02"), to.Format("2006-01-02"),
+		summary.VehicleCount,
+		summary.TotalCloudCost,
+		summary.TotalOnpremCost,
+		summary.TotalSavings, summary.SavingsPercent,
+		services.CloudCPUPerCoreHour, services.CloudMemoryPerGBHour,
+		services.OnpremCPUPerCoreHour, services.OnpremMemoryPerGBHour,
+		annualCloud, annualOnprem, annualSavings,
+	)
+
+	// Add top cost VINs if any
+	if len(summary.TopCostVins) > 0 {
+		fmt.Fprintf(w, "\nTOP COST VEHICLES:\n")
+		fmt.Fprintf(w, "%-20s %12s %12s %10s\n", "VIN", "Cloud $", "On-Prem $", "Savings %")
+		fmt.Fprintf(w, "%-20s %12s %12s %10s\n", "───────────────────", "──────────", "──────────", "────────")
+		for _, v := range summary.TopCostVins {
+			fmt.Fprintf(w, "%-20s %12.2f %12.2f %9.1f%%\n",
+				truncateVIN(v.VIN), v.TotalCloudCost, v.TotalOnpremCost, v.SavingsPercent)
+		}
+	}
+}
+
+// truncateVIN shortens VINs longer than 20 bytes to their first 17 bytes plus
+// "..." so they fit the 20-character VIN column of the report.
+func truncateVIN(vin string) string {
+	if len(vin) > 20 {
+		return vin[:17] + "..."
+	}
+	return vin
+}
diff --git a/services/cost/main.go b/services/cost/main.go
index 09a9f56..2c0d408 100644
--- a/services/cost/main.go
+++ b/services/cost/main.go
@@ -29,6 +29,7 @@ func main() {
 	mux.HandleFunc("/cost/fleet", handlers.GetFleetCost)
 	mux.HandleFunc("/cost/summary", handlers.GetCostSummary)
 	mux.HandleFunc("/cost/comparison", handlers.GetCostComparison)
+	mux.HandleFunc("/cost/report", handlers.GetReport)

 	// Start health check server
 	healthServer := &health.HealthCheckServer{}
diff --git a/services/cost/services/clickhouse.go b/services/cost/services/clickhouse.go
index 2b67298..f287326 100644
--- a/services/cost/services/clickhouse.go
+++ b/services/cost/services/clickhouse.go
@@ -162,33 +162,42 @@ func FetchActiveVins(from, to time.Time) ([]VinActivity, error) {

 	ctx := context.Background()

-	// Query vehicle_signal or feature_table for active VINs
-	query := `
-		SELECT
-			VIN,
-			count() as msg_count,
-			max(timestamp) as last_seen
-		FROM vehicle_signal
-		WHERE timestamp BETWEEN ? AND ?
-		GROUP BY VIN
-	`
+	// Try feature_table_last_shard first (small, ~46MB) then feature_table_temp_shard (dev-cluster)
+	// Avoid feature_table_shard - it's 994GB and would kill the DB
+	tables := []string{"feature_table_last_shard", "feature_table_temp_shard"}
+
+	for _, table := range tables {
+		query := fmt.Sprintf(`
+			SELECT
+				VIN,
+				count() as msg_count,
+				max(Timestamp) as last_seen
+			FROM %s
+			WHERE Timestamp BETWEEN ? AND ?
+			GROUP BY VIN
+		`, table)

-	rows, err := remoteConn.Query(ctx, query, from, to)
-	if err != nil {
-		return nil, fmt.Errorf("failed to query active VINs: %w", err)
-	}
-	defer rows.Close()
-
-	var result []VinActivity
-	for rows.Next() {
-		var v VinActivity
-		if err := rows.Scan(&v.VIN, &v.MessageCount, &v.LastSeen); err != nil {
+		rows, err := remoteConn.Query(ctx, query, from, to)
+		if err != nil {
+			logger.Debug().Err(err).Str("table", table).Msg("Table not found, trying next")
 			continue
 		}
-		result = append(result, v)
+		defer rows.Close()
+
+		var result []VinActivity
+		for rows.Next() {
+			var v VinActivity
+			if err := rows.Scan(&v.VIN, &v.MessageCount, &v.LastSeen); err != nil {
+				continue
+			}
+			result = append(result, v)
+		}
+
+		logger.Debug().Str("table", table).Int("count", len(result)).Msg("Fetched active VINs")
+		return result, nil
 	}

-	return result, nil
+	return nil, fmt.Errorf("no suitable feature table found")
 }

 // VinCostSummary holds aggregated cost data for a VIN
diff --git a/services/cost/services/collector.go b/services/cost/services/collector.go
index 1a06247..7fa3dca 100644
--- a/services/cost/services/collector.go
+++ b/services/cost/services/collector.go
@@ -9,16 +9,21 @@ import (
 // Cost rates per hour
 const (
 	// Cloud costs (fudged higher - Azure pricing + overhead)
-	CloudCPUPerCoreHour = 0.08 // $/core/hour
-	CloudMemoryPerGBHour = 0.015 // $/GB/hour
+	// Includes: AKS compute, managed Kafka, CosmosDB, storage, networking, monitoring
+	CloudCPUPerCoreHour = 0.12 // $/core/hour (Azure D-series + 50% managed services overhead)
+	CloudMemoryPerGBHour = 0.025 // $/GB/hour (includes managed DB memory costs)

 	// On-prem costs (fudged lower - amortized hardware)
-	OnpremCPUPerCoreHour = 0.02 // $/core/hour
-	OnpremMemoryPerGBHour = 0.004 // $/GB/hour
+	// Assumes: 3-year hardware amortization, minimal ops overhead
+	// Does NOT include: datacenter, power, cooling, staff, network, maintenance
+	OnpremCPUPerCoreHour = 0.015 // $/core/hour
+	OnpremMemoryPerGBHour = 0.003 // $/GB/hour

-	// Estimated resource usage per active VIN (based on typical workload)
-	EstimatedCPUPerVin = 0.05 // 50 millicores per active VIN
-	EstimatedMemoryPerVin = 0.1 // 100MB per active VIN
+	// Estimated resource usage per active VIN (these are estimates, not measurements)
+	// A connected vehicle generates ~1-5 MB/day of telemetry
+	// Processing includes: ingestion, Kafka, stream processing, storage, analytics
+	EstimatedCPUPerVin = 0.15 // cores: 150 millicores per active VIN (ingestion + processing)
+	EstimatedMemoryPerVin = 0.25 // GB: 250MB per active VIN (buffers, caches, state)
 )

 // CalculateCosts computes cloud and on-prem costs for given resource usage