Add Metrics

This commit is contained in:
Michael 2019-07-18 21:36:05 +02:00 committed by Traefiker Bot
parent 4dc448056c
commit 8e97af8dc3
121 changed files with 8364 additions and 3811 deletions

View file

@ -2,6 +2,7 @@ package metrics
import (
"context"
"fmt"
"net/http"
"sort"
"strings"
@ -28,43 +29,45 @@ const (
configLastReloadSuccessName = metricConfigPrefix + "last_reload_success"
configLastReloadFailureName = metricConfigPrefix + "last_reload_failure"
// entrypoint
// entry point
metricEntryPointPrefix = MetricNamePrefix + "entrypoint_"
entrypointReqsTotalName = metricEntryPointPrefix + "requests_total"
entrypointReqDurationName = metricEntryPointPrefix + "request_duration_seconds"
entrypointOpenConnsName = metricEntryPointPrefix + "open_connections"
entryPointReqsTotalName = metricEntryPointPrefix + "requests_total"
entryPointReqDurationName = metricEntryPointPrefix + "request_duration_seconds"
entryPointOpenConnsName = metricEntryPointPrefix + "open_connections"
// backend level.
// service level.
// MetricBackendPrefix prefix of all backend metric names
MetricBackendPrefix = MetricNamePrefix + "backend_"
backendReqsTotalName = MetricBackendPrefix + "requests_total"
backendReqDurationName = MetricBackendPrefix + "request_duration_seconds"
backendOpenConnsName = MetricBackendPrefix + "open_connections"
backendRetriesTotalName = MetricBackendPrefix + "retries_total"
backendServerUpName = MetricBackendPrefix + "server_up"
// MetricServicePrefix prefix of all service metric names
MetricServicePrefix = MetricNamePrefix + "service_"
serviceReqsTotalName = MetricServicePrefix + "requests_total"
serviceReqDurationName = MetricServicePrefix + "request_duration_seconds"
serviceOpenConnsName = MetricServicePrefix + "open_connections"
serviceRetriesTotalName = MetricServicePrefix + "retries_total"
serviceServerUpName = MetricServicePrefix + "server_up"
)
// promState holds all metric state internally and acts as the only Collector we register for Prometheus.
//
// This enables control to remove metrics that belong to outdated configuration.
// As an example why this is required, consider Traefik learns about a new service.
// It populates the 'traefik_server_backend_up' metric for it with a value of 1 (alive).
// When the backend is undeployed now the metric is still there in the client library
// It populates the 'traefik_server_service_up' metric for it with a value of 1 (alive).
// When the service is undeployed now the metric is still there in the client library
// and will be returned on the metrics endpoint until Traefik would be restarted.
//
// To solve this problem promState keeps track of Traefik's dynamic configuration.
// Metrics that "belong" to a dynamic configuration part like backends or entrypoints
// Metrics that "belong" to a dynamic configuration part like services or entryPoints
// are removed after they were scraped at least once when the corresponding object
// doesn't exist anymore.
var promState = newPrometheusState()
var promRegistry = stdprometheus.NewRegistry()
// PrometheusHandler exposes Prometheus routes.
type PrometheusHandler struct{}
// Append adds Prometheus routes on a router.
func (h PrometheusHandler) Append(router *mux.Router) {
router.Methods(http.MethodGet).Path("/metrics").Handler(promhttp.Handler())
router.Methods(http.MethodGet).Path("/metrics").Handler(promhttp.HandlerFor(promRegistry, promhttp.HandlerOpts{}))
}
// RegisterPrometheus registers all Prometheus metrics.
@ -72,6 +75,17 @@ func (h PrometheusHandler) Append(router *mux.Router) {
func RegisterPrometheus(ctx context.Context, config *types.Prometheus) Registry {
standardRegistry := initStandardRegistry(config)
if err := promRegistry.Register(stdprometheus.NewProcessCollector(stdprometheus.ProcessCollectorOpts{})); err != nil {
if _, ok := err.(stdprometheus.AlreadyRegisteredError); !ok {
log.FromContext(ctx).Warn("ProcessCollector is already registered")
}
}
if err := promRegistry.Register(stdprometheus.NewGoCollector()); err != nil {
if _, ok := err.(stdprometheus.AlreadyRegisteredError); !ok {
log.FromContext(ctx).Warn("GoCollector is already registered")
}
}
if !registerPromState(ctx) {
return nil
}
@ -106,76 +120,89 @@ func initStandardRegistry(config *types.Prometheus) Registry {
Help: "Last config reload failure",
}, []string{})
entrypointReqs := newCounterFrom(promState.collectors, stdprometheus.CounterOpts{
Name: entrypointReqsTotalName,
Help: "How many HTTP requests processed on an entrypoint, partitioned by status code, protocol, and method.",
}, []string{"code", "method", "protocol", "entrypoint"})
entrypointReqDurations := newHistogramFrom(promState.collectors, stdprometheus.HistogramOpts{
Name: entrypointReqDurationName,
Help: "How long it took to process the request on an entrypoint, partitioned by status code, protocol, and method.",
Buckets: buckets,
}, []string{"code", "method", "protocol", "entrypoint"})
entrypointOpenConns := newGaugeFrom(promState.collectors, stdprometheus.GaugeOpts{
Name: entrypointOpenConnsName,
Help: "How many open connections exist on an entrypoint, partitioned by method and protocol.",
}, []string{"method", "protocol", "entrypoint"})
backendReqs := newCounterFrom(promState.collectors, stdprometheus.CounterOpts{
Name: backendReqsTotalName,
Help: "How many HTTP requests processed on a backend, partitioned by status code, protocol, and method.",
}, []string{"code", "method", "protocol", "backend"})
backendReqDurations := newHistogramFrom(promState.collectors, stdprometheus.HistogramOpts{
Name: backendReqDurationName,
Help: "How long it took to process the request on a backend, partitioned by status code, protocol, and method.",
Buckets: buckets,
}, []string{"code", "method", "protocol", "backend"})
backendOpenConns := newGaugeFrom(promState.collectors, stdprometheus.GaugeOpts{
Name: backendOpenConnsName,
Help: "How many open connections exist on a backend, partitioned by method and protocol.",
}, []string{"method", "protocol", "backend"})
backendRetries := newCounterFrom(promState.collectors, stdprometheus.CounterOpts{
Name: backendRetriesTotalName,
Help: "How many request retries happened on a backend.",
}, []string{"backend"})
backendServerUp := newGaugeFrom(promState.collectors, stdprometheus.GaugeOpts{
Name: backendServerUpName,
Help: "Backend server is up, described by gauge value of 0 or 1.",
}, []string{"backend", "url"})
promState.describers = []func(chan<- *stdprometheus.Desc){
configReloads.cv.Describe,
configReloadsFailures.cv.Describe,
lastConfigReloadSuccess.gv.Describe,
lastConfigReloadFailure.gv.Describe,
entrypointReqs.cv.Describe,
entrypointReqDurations.hv.Describe,
entrypointOpenConns.gv.Describe,
backendReqs.cv.Describe,
backendReqDurations.hv.Describe,
backendOpenConns.gv.Describe,
backendRetries.cv.Describe,
backendServerUp.gv.Describe,
}
return &standardRegistry{
enabled: true,
configReloadsCounter: configReloads,
configReloadsFailureCounter: configReloadsFailures,
lastConfigReloadSuccessGauge: lastConfigReloadSuccess,
lastConfigReloadFailureGauge: lastConfigReloadFailure,
entrypointReqsCounter: entrypointReqs,
entrypointReqDurationHistogram: entrypointReqDurations,
entrypointOpenConnsGauge: entrypointOpenConns,
backendReqsCounter: backendReqs,
backendReqDurationHistogram: backendReqDurations,
backendOpenConnsGauge: backendOpenConns,
backendRetriesCounter: backendRetries,
backendServerUpGauge: backendServerUp,
reg := &standardRegistry{
epEnabled: config.AddEntryPointsLabels,
svcEnabled: config.AddServicesLabels,
configReloadsCounter: configReloads,
configReloadsFailureCounter: configReloadsFailures,
lastConfigReloadSuccessGauge: lastConfigReloadSuccess,
lastConfigReloadFailureGauge: lastConfigReloadFailure,
}
if config.AddEntryPointsLabels {
entryPointReqs := newCounterFrom(promState.collectors, stdprometheus.CounterOpts{
Name: entryPointReqsTotalName,
Help: "How many HTTP requests processed on an entrypoint, partitioned by status code, protocol, and method.",
}, []string{"code", "method", "protocol", "entrypoint"})
entryPointReqDurations := newHistogramFrom(promState.collectors, stdprometheus.HistogramOpts{
Name: entryPointReqDurationName,
Help: "How long it took to process the request on an entrypoint, partitioned by status code, protocol, and method.",
Buckets: buckets,
}, []string{"code", "method", "protocol", "entrypoint"})
entryPointOpenConns := newGaugeFrom(promState.collectors, stdprometheus.GaugeOpts{
Name: entryPointOpenConnsName,
Help: "How many open connections exist on an entrypoint, partitioned by method and protocol.",
}, []string{"method", "protocol", "entrypoint"})
promState.describers = append(promState.describers, []func(chan<- *stdprometheus.Desc){
entryPointReqs.cv.Describe,
entryPointReqDurations.hv.Describe,
entryPointOpenConns.gv.Describe,
}...)
reg.entryPointReqsCounter = entryPointReqs
reg.entryPointReqDurationHistogram = entryPointReqDurations
reg.entryPointOpenConnsGauge = entryPointOpenConns
}
if config.AddServicesLabels {
serviceReqs := newCounterFrom(promState.collectors, stdprometheus.CounterOpts{
Name: serviceReqsTotalName,
Help: "How many HTTP requests processed on a service, partitioned by status code, protocol, and method.",
}, []string{"code", "method", "protocol", "service"})
serviceReqDurations := newHistogramFrom(promState.collectors, stdprometheus.HistogramOpts{
Name: serviceReqDurationName,
Help: "How long it took to process the request on a service, partitioned by status code, protocol, and method.",
Buckets: buckets,
}, []string{"code", "method", "protocol", "service"})
serviceOpenConns := newGaugeFrom(promState.collectors, stdprometheus.GaugeOpts{
Name: serviceOpenConnsName,
Help: "How many open connections exist on a service, partitioned by method and protocol.",
}, []string{"method", "protocol", "service"})
serviceRetries := newCounterFrom(promState.collectors, stdprometheus.CounterOpts{
Name: serviceRetriesTotalName,
Help: "How many request retries happened on a service.",
}, []string{"service"})
serviceServerUp := newGaugeFrom(promState.collectors, stdprometheus.GaugeOpts{
Name: serviceServerUpName,
Help: "service server is up, described by gauge value of 0 or 1.",
}, []string{"service", "url"})
promState.describers = append(promState.describers, []func(chan<- *stdprometheus.Desc){
serviceReqs.cv.Describe,
serviceReqDurations.hv.Describe,
serviceOpenConns.gv.Describe,
serviceRetries.cv.Describe,
serviceServerUp.gv.Describe,
}...)
reg.serviceReqsCounter = serviceReqs
reg.serviceReqDurationHistogram = serviceReqDurations
reg.serviceOpenConnsGauge = serviceOpenConns
reg.serviceRetriesCounter = serviceRetries
reg.serviceServerUpGauge = serviceServerUp
}
return reg
}
func registerPromState(ctx context.Context) bool {
if err := stdprometheus.Register(promState); err != nil {
if err := promRegistry.Register(promState); err != nil {
logger := log.FromContext(ctx)
if _, ok := err.(stdprometheus.AlreadyRegisteredError); !ok {
logger.Errorf("Unable to register Traefik to Prometheus: %v", err)
@ -189,24 +216,24 @@ func registerPromState(ctx context.Context) bool {
// OnConfigurationUpdate receives the current configuration from Traefik.
// It then converts the configuration to the optimized package internal format
// and sets it to the promState.
func OnConfigurationUpdate(configurations dynamic.Configurations) {
func OnConfigurationUpdate(dynConf dynamic.Configurations, entryPoints []string) {
dynamicConfig := newDynamicConfig()
// FIXME metrics
// for _, config := range configurations {
// for _, frontend := range config.Frontends {
// for _, entrypointName := range frontend.EntryPoints {
// dynamicConfig.entrypoints[entrypointName] = true
// }
// }
//
// for backendName, backend := range config.Backends {
// dynamicConfig.backends[backendName] = make(map[string]bool)
// for _, server := range backend.Servers {
// dynamicConfig.backends[backendName][server.URL] = true
// }
// }
// }
for _, value := range entryPoints {
dynamicConfig.entryPoints[value] = true
}
for key, config := range dynConf {
for name := range config.HTTP.Routers {
dynamicConfig.routers[fmt.Sprintf("%s@%s", name, key)] = true
}
for serviceName, service := range config.HTTP.Services {
dynamicConfig.services[fmt.Sprintf("%s@%s", serviceName, key)] = make(map[string]bool)
for _, server := range service.LoadBalancer.Servers {
dynamicConfig.services[fmt.Sprintf("%s@%s", serviceName, key)][server.URL] = true
}
}
}
promState.SetDynamicConfig(dynamicConfig)
}
@ -279,15 +306,15 @@ func (ps *prometheusState) Collect(ch chan<- stdprometheus.Metric) {
func (ps *prometheusState) isOutdated(collector *collector) bool {
labels := collector.labels
if entrypointName, ok := labels["entrypoint"]; ok && !ps.dynamicConfig.hasEntrypoint(entrypointName) {
if entrypointName, ok := labels["entrypoint"]; ok && !ps.dynamicConfig.hasEntryPoint(entrypointName) {
return true
}
if backendName, ok := labels["backend"]; ok {
if !ps.dynamicConfig.hasBackend(backendName) {
if serviceName, ok := labels["service"]; ok {
if !ps.dynamicConfig.hasService(serviceName) {
return true
}
if url, ok := labels["url"]; ok && !ps.dynamicConfig.hasServerURL(backendName, url) {
if url, ok := labels["url"]; ok && !ps.dynamicConfig.hasServerURL(serviceName, url) {
return true
}
}
@ -297,33 +324,35 @@ func (ps *prometheusState) isOutdated(collector *collector) bool {
func newDynamicConfig() *dynamicConfig {
return &dynamicConfig{
entrypoints: make(map[string]bool),
backends: make(map[string]map[string]bool),
entryPoints: make(map[string]bool),
routers: make(map[string]bool),
services: make(map[string]map[string]bool),
}
}
// dynamicConfig holds the current configuration for entrypoints, backends,
// dynamicConfig holds the current configuration for entryPoints, services,
// and server URLs in an optimized way to check for existence. This provides
// a performant way to check whether the collected metrics belong to the
// current configuration or to an outdated one.
type dynamicConfig struct {
entrypoints map[string]bool
backends map[string]map[string]bool
entryPoints map[string]bool
routers map[string]bool
services map[string]map[string]bool
}
func (d *dynamicConfig) hasEntrypoint(entrypointName string) bool {
_, ok := d.entrypoints[entrypointName]
func (d *dynamicConfig) hasEntryPoint(entrypointName string) bool {
_, ok := d.entryPoints[entrypointName]
return ok
}
func (d *dynamicConfig) hasBackend(backendName string) bool {
_, ok := d.backends[backendName]
func (d *dynamicConfig) hasService(serviceName string) bool {
_, ok := d.services[serviceName]
return ok
}
func (d *dynamicConfig) hasServerURL(backendName, serverURL string) bool {
if backend, hasBackend := d.backends[backendName]; hasBackend {
_, ok := backend[serverURL]
func (d *dynamicConfig) hasServerURL(serviceName, serverURL string) bool {
if service, hasService := d.services[serviceName]; hasService {
_, ok := service[serverURL]
return ok
}
return false
@ -479,7 +508,7 @@ func (h *histogram) Observe(value float64) {
labels := h.labelNamesValues.ToLabels()
collector := h.hv.With(labels)
collector.Observe(value)
h.collectors <- newCollector(h.name, labels, collector, func() {
h.collectors <- newCollector(h.name, labels, h.hv, func() {
h.hv.Delete(labels)
})
}