Files
cloud-services/pkg/health/health.go

299 lines
7.2 KiB
Go

package health
import (
"context"
"encoding/json"
"fmt"
"net/http"
"runtime"
"sync"
"time"
"github.com/fiskerinc/cloud-services/pkg/logger"
"github.com/gomodule/redigo/redis"
"github.com/pkg/errors"
)
// Based on https://github.com/hellofresh/health-go
// Status is the textual health state. It is used both for the overall result
// of a measurement (Check.Status) and, converted to a string, as the recorded
// result of an individual check.
type Status string
// Possible health statuses.
const (
// StatusOK is the overall status when no check failed; it is also the value
// recorded for each individual check that succeeded.
StatusOK Status = "OK"
// StatusPartiallyAvailable is the overall status when a check with SkipOnErr
// set failed or timed out and the status was not already StatusUnavailable.
StatusPartiallyAvailable Status = "partially available"
// StatusUnavailable is the overall status once a check without SkipOnErr
// failed or timed out; the HTTP handlers answer 503 for it.
StatusUnavailable Status = "unavailable"
// StatusTimeout is the value recorded for an individual check that did not
// finish within its configured Timeout.
StatusTimeout Status = "timeout"
)
type (
// CheckFunc executes a single health check. A non-nil error marks the
// check as failed; its message is reported in the measurement result.
CheckFunc func(context.Context) error
// InfoFunc is a callback that receives the System metrics of the current
// measurement so it can add information to them.
// NOTE(review): presumably used to fill fields such as RedisStats — confirm against callers.
InfoFunc func(system *System)
// Config carries the parameters to run the check.
Config struct {
// Name is the name of the resource to be checked. Register rejects an
// empty name and a name that is already registered.
Name string
// Timeout is the maximum time Measure waits for this check. Register
// replaces a zero value with one second.
Timeout time.Duration
// SkipOnErr, if set to true, makes a failure or timeout of this check
// degrade the overall status to StatusPartiallyAvailable instead of
// StatusUnavailable (unless the status is already StatusUnavailable).
SkipOnErr bool
// Check is the func which executes the check.
Check CheckFunc
// Info, when non-nil, is called after the checks have run to add
// information to the System metrics.
Info InfoFunc
// If Vital is set to true, it means that the service won't work without this resource.
// Only vital checks are executed by the liveness handler.
Vital bool
}
// Check represents the health check response.
Check struct {
// Status is the aggregated status over all executed checks.
Status Status `json:"status"`
// Timestamp is the time at which the response was built.
Timestamp time.Time `json:"timestamp"`
// Failures maps each executed check name to its result: "OK", "timeout",
// or the error message returned by the check. Despite the name, Measure
// also records successful checks here.
Failures map[string]string `json:"failures,omitempty"`
// System holds information of the go process.
System *System `json:"system"`
}
// System runtime variables about the go process.
System struct {
// Version is the go version.
Version string `json:"version"`
// GoroutinesCount is the number of the current goroutines.
GoroutinesCount int `json:"goroutines_count"`
// TotalAllocBytes is the total bytes allocated (runtime.MemStats.TotalAlloc).
TotalAllocBytes int `json:"total_alloc_bytes"`
// HeapObjectsCount is the number of objects in the go heap (runtime.MemStats.HeapObjects).
HeapObjectsCount int `json:"heap_objects_count"`
// AllocBytes is the bytes allocated and not yet freed (runtime.MemStats.Alloc).
AllocBytes int `json:"alloc_bytes"`
// RedisStats holds Redis connection pool statistics; it is omitted from
// the JSON output when nil. It is not populated by newSystemMetrics.
// NOTE(review): presumably set by a registered InfoFunc — confirm.
RedisStats *redis.PoolStats `json:"redis_stats,omitempty"`
}
// Health is the health-checks container
Health struct {
// mu guards checks; it is taken by Register and held by Measure for the
// whole duration of a measurement.
mu sync.Mutex
// checks holds the registered configurations keyed by Config.Name.
checks map[string]Config
}
// checkResponse pairs a check configuration with the error (nil on success)
// produced by running it.
checkResponse struct {
config Config
err error
}
// filterChecks selects, from the registered checks, the ones a measurement should run.
filterChecks func(checks map[string]Config) map[string]Config
)
// New builds an empty health check container and applies the given options
// to it in order. If an option reports an error, construction stops and that
// error is returned together with a nil container.
func New(opts ...Option) (*Health, error) {
	container := &Health{checks: map[string]Config{}}

	for i := range opts {
		err := opts[i](container)
		if err != nil {
			return nil, err
		}
	}

	return container, nil
}
// Register adds a check config to the container.
//
// A zero Timeout is replaced by one second. An error is returned when the
// config has no name or when a check with the same name already exists; in
// both cases the container is left unchanged.
func (h *Health) Register(c Config) error {
	if len(c.Name) == 0 {
		return errors.New("health check must have a name to be registered")
	}
	if c.Timeout == 0 {
		c.Timeout = time.Second
	}

	h.mu.Lock()
	defer h.mu.Unlock()

	_, exists := h.checks[c.Name]
	if exists {
		return fmt.Errorf("health check %q is already registered", c.Name)
	}

	h.checks[c.Name] = c
	return nil
}
// ReadinessHandler exposes ReadinessFunc as an http.Handler (backed by
// http.HandlerFunc).
func (h *Health) ReadinessHandler() http.Handler {
	var handler http.HandlerFunc = h.ReadinessFunc
	return handler
}
// LivenessHandler exposes LivenessFunc as an http.Handler (backed by
// http.HandlerFunc).
func (h *Health) LivenessHandler() http.Handler {
	var handler http.HandlerFunc = h.LivenessFunc
	return handler
}
// LivenessFunc is the HTTP handler function for the liveness probe.
//
// It measures only the checks marked Vital and writes the resulting Check as
// JSON. The response code is 503 when the overall status is StatusUnavailable
// and 200 otherwise. If the result cannot be serialized, a plain-text 500
// response carrying the error message is sent instead.
func (h *Health) LivenessFunc(w http.ResponseWriter, r *http.Request) {
	c := h.Measure(r.Context(), getLivenessCheck)

	data, err := json.Marshal(c)
	if err != nil {
		// http.Error sets the content type and status code itself; calling
		// WriteHeader beforehand would make its own WriteHeader superfluous
		// and send the plain-text body under a JSON content type.
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}

	code := http.StatusOK
	if c.Status == StatusUnavailable {
		code = http.StatusServiceUnavailable
	}

	w.Header().Set("Content-Type", "application/json")
	w.WriteHeader(code)
	// The status line is already sent, so a failed body write (typically a
	// disconnected client) cannot be reported back; ignore it deliberately.
	_, _ = w.Write(data)
}
// ReadinessFunc is the HTTP handler function for the readiness probe.
//
// It measures every registered check and writes the resulting Check as JSON.
// The response code is 503 when the overall status is StatusUnavailable and
// 200 otherwise. If the result cannot be serialized, a plain-text 500
// response carrying the error message is sent instead.
func (h *Health) ReadinessFunc(w http.ResponseWriter, r *http.Request) {
	c := h.Measure(r.Context(), getReadinessCheck)

	data, err := json.Marshal(c)
	if err != nil {
		// http.Error sets the content type and status code itself; calling
		// WriteHeader beforehand would make its own WriteHeader superfluous
		// and send the plain-text body under a JSON content type.
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}

	code := http.StatusOK
	if c.Status == StatusUnavailable {
		code = http.StatusServiceUnavailable
	}

	w.Header().Set("Content-Type", "application/json")
	w.WriteHeader(code)
	// The status line is already sent, so a failed body write (typically a
	// disconnected client) cannot be reported back; ignore it deliberately.
	_, _ = w.Write(data)
}
// info hands the System metrics to the Info callback of every registered
// check that defines one. It reads h.checks without locking, so the caller
// is expected to hold h.mu (Measure does).
func (h *Health) info(system *System) {
	for _, cfg := range h.checks {
		fill := cfg.Info
		if fill == nil {
			continue
		}
		fill(system)
	}
}
// Measure runs the health checks selected by getChecks and returns the
// summary status.
//
// Every selected check runs concurrently with the supplied ctx and is given
// at most its Config.Timeout to finish. The returned Check contains:
//   - the aggregated Status: StatusOK unless a check failed or timed out, in
//     which case getAvailability decides between StatusPartiallyAvailable and
//     StatusUnavailable;
//   - one entry per executed check holding "OK", "timeout" or the error text;
//   - fresh runtime metrics, extended by the Info callbacks of all registered
//     checks (not only the selected ones).
//
// Failed checks are logged; timeouts are not. h.mu is held for the whole
// measurement, so concurrent Measure and Register calls are serialized.
func (h *Health) Measure(ctx context.Context, getChecks filterChecks) Check {
	// Sentinel local to this call so a check cannot return it by accident.
	errTimeout := errors.New("timeout error")

	h.mu.Lock()
	defer h.mu.Unlock()

	checksList := getChecks(h.checks)
	total := len(checksList)

	// Buffered for every check so workers never block on delivering a result.
	checkRespChan := make(chan checkResponse, total)
	var wgRes sync.WaitGroup
	wgRes.Add(total)
	go func() {
		// Closing after all workers are done ends the collection loop below.
		wgRes.Wait()
		close(checkRespChan)
	}()

	for _, c := range checksList {
		go func(ctx context.Context, c Config, respChan chan<- checkResponse) {
			defer wgRes.Done()

			// Buffer of one: if the timeout wins, nobody receives from this
			// channel any more, and an unbuffered send would block the check
			// goroutine forever. With the buffer it can finish and exit.
			locResp := make(chan error, 1)
			go func() {
				locResp <- c.Check(ctx)
			}()

			// A stoppable timer releases its resources as soon as the check
			// completes instead of lingering for the full timeout.
			timer := time.NewTimer(c.Timeout)
			defer timer.Stop()

			select {
			case <-timer.C:
				respChan <- checkResponse{config: c, err: errTimeout}
			case err := <-locResp:
				respChan <- checkResponse{config: c, err: err}
			}
		}(ctx, c, checkRespChan)
	}

	status := StatusOK
	checks := make(map[string]string, total)
	for resp := range checkRespChan {
		if resp.err == errTimeout {
			checks[resp.config.Name] = string(StatusTimeout)
			status = getAvailability(status, resp.config)
			continue
		}
		if resp.err != nil {
			checks[resp.config.Name] = resp.err.Error()
			status = getAvailability(status, resp.config)
			logger.Error().Err(errors.WithMessage(resp.err, resp.config.Name)).Send()
			continue
		}
		checks[resp.config.Name] = string(StatusOK)
	}

	system := newSystemMetrics()
	h.info(&system)

	return newCheck(status, checks, &system)
}
// getReadinessCheck is the filterChecks used for readiness: it selects every
// registered check by returning the given map itself (no copy is made).
func getReadinessCheck(checks map[string]Config) map[string]Config {
return checks
}
// getLivenessCheck is the filterChecks used for liveness: it returns a new
// map containing only the checks whose Vital flag is set.
func getLivenessCheck(checks map[string]Config) map[string]Config {
	vital := map[string]Config{}
	for name, cfg := range checks {
		if !cfg.Vital {
			continue
		}
		vital[name] = cfg
	}
	return vital
}
// newCheck assembles a Check from the given status, per-check results and
// system metrics, stamping it with the current time.
func newCheck(s Status, failures map[string]string, system *System) Check {
	var result Check
	result.Status = s
	result.Timestamp = time.Now()
	result.Failures = failures
	result.System = system
	return result
}
// newSystemMetrics captures a snapshot of the Go runtime: version, goroutine
// count and the memory statistics TotalAlloc, HeapObjects and Alloc, each
// converted to int. RedisStats is left unset.
func newSystemMetrics() System {
	var mem runtime.MemStats
	runtime.ReadMemStats(&mem)

	var snapshot System
	snapshot.Version = runtime.Version()
	snapshot.GoroutinesCount = runtime.NumGoroutine()
	snapshot.TotalAllocBytes = int(mem.TotalAlloc)
	snapshot.HeapObjectsCount = int(mem.HeapObjects)
	snapshot.AllocBytes = int(mem.Alloc)
	return snapshot
}
// getAvailability computes the overall status after check c has failed,
// given the status s accumulated so far. The result is StatusUnavailable if
// c does not have SkipOnErr set or if s is already StatusUnavailable;
// otherwise it is StatusPartiallyAvailable.
func getAvailability(s Status, c Config) Status {
	if !c.SkipOnErr || s == StatusUnavailable {
		return StatusUnavailable
	}
	return StatusPartiallyAvailable
}