Add Metrics

parent 4dc448056c
commit 8e97af8dc3

121 changed files with 8364 additions and 3811 deletions
@@ -2,6 +2,7 @@ package metrics
 
 import (
 	"context"
+	"fmt"
 	"net/http"
 	"sort"
 	"strings"
@@ -28,43 +29,45 @@ const (
 	configLastReloadSuccessName = metricConfigPrefix + "last_reload_success"
 	configLastReloadFailureName = metricConfigPrefix + "last_reload_failure"
 
-	// entrypoint
+	// entry point
 	metricEntryPointPrefix    = MetricNamePrefix + "entrypoint_"
-	entrypointReqsTotalName   = metricEntryPointPrefix + "requests_total"
-	entrypointReqDurationName = metricEntryPointPrefix + "request_duration_seconds"
-	entrypointOpenConnsName   = metricEntryPointPrefix + "open_connections"
+	entryPointReqsTotalName   = metricEntryPointPrefix + "requests_total"
+	entryPointReqDurationName = metricEntryPointPrefix + "request_duration_seconds"
+	entryPointOpenConnsName   = metricEntryPointPrefix + "open_connections"
 
-	// backend level.
+	// service level.
 
-	// MetricBackendPrefix prefix of all backend metric names
-	MetricBackendPrefix     = MetricNamePrefix + "backend_"
-	backendReqsTotalName    = MetricBackendPrefix + "requests_total"
-	backendReqDurationName  = MetricBackendPrefix + "request_duration_seconds"
-	backendOpenConnsName    = MetricBackendPrefix + "open_connections"
-	backendRetriesTotalName = MetricBackendPrefix + "retries_total"
-	backendServerUpName     = MetricBackendPrefix + "server_up"
+	// MetricServicePrefix prefix of all service metric names
+	MetricServicePrefix     = MetricNamePrefix + "service_"
+	serviceReqsTotalName    = MetricServicePrefix + "requests_total"
+	serviceReqDurationName  = MetricServicePrefix + "request_duration_seconds"
+	serviceOpenConnsName    = MetricServicePrefix + "open_connections"
+	serviceRetriesTotalName = MetricServicePrefix + "retries_total"
+	serviceServerUpName     = MetricServicePrefix + "server_up"
 )
 
 // promState holds all metric state internally and acts as the only Collector we register for Prometheus.
 //
 // This enables control to remove metrics that belong to outdated configuration.
 // As an example why this is required, consider Traefik learns about a new service.
-// It populates the 'traefik_server_backend_up' metric for it with a value of 1 (alive).
-// When the backend is undeployed now the metric is still there in the client library
+// It populates the 'traefik_server_service_up' metric for it with a value of 1 (alive).
+// When the service is undeployed now the metric is still there in the client library
 // and will be returned on the metrics endpoint until Traefik would be restarted.
 //
 // To solve this problem promState keeps track of Traefik's dynamic configuration.
-// Metrics that "belong" to a dynamic configuration part like backends or entrypoints
+// Metrics that "belong" to a dynamic configuration part like services or entryPoints
 // are removed after they were scraped at least once when the corresponding object
 // doesn't exist anymore.
 var promState = newPrometheusState()
 
+var promRegistry = stdprometheus.NewRegistry()
+
 // PrometheusHandler exposes Prometheus routes.
 type PrometheusHandler struct{}
 
 // Append adds Prometheus routes on a router.
 func (h PrometheusHandler) Append(router *mux.Router) {
-	router.Methods(http.MethodGet).Path("/metrics").Handler(promhttp.Handler())
+	router.Methods(http.MethodGet).Path("/metrics").Handler(promhttp.HandlerFor(promRegistry, promhttp.HandlerOpts{}))
 }
 
 // RegisterPrometheus registers all Prometheus metrics.
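The hunk above replaces the default promhttp.Handler(), which serves the package-global prometheus.DefaultGatherer, with promhttp.HandlerFor over the new package-level promRegistry, so the /metrics endpoint only exposes collectors Traefik registers itself. A minimal, self-contained sketch of that pattern; the names demoRegistry and demoGauge are illustrative, not from this commit:

package main

import (
	"log"
	"net/http"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promhttp"
)

func main() {
	// A private registry: only collectors registered on it are exported.
	demoRegistry := prometheus.NewRegistry()

	demoGauge := prometheus.NewGauge(prometheus.GaugeOpts{
		Name: "demo_up",
		Help: "Always 1 while the process runs.",
	})
	demoRegistry.MustRegister(demoGauge)
	demoGauge.Set(1)

	// HandlerFor serves exactly this registry; promhttp.Handler() would
	// instead serve the package-global prometheus.DefaultGatherer.
	http.Handle("/metrics", promhttp.HandlerFor(demoRegistry, promhttp.HandlerOpts{}))
	log.Fatal(http.ListenAndServe(":8080", nil))
}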
@@ -72,6 +75,17 @@ func (h PrometheusHandler) Append(router *mux.Router) {
 func RegisterPrometheus(ctx context.Context, config *types.Prometheus) Registry {
 	standardRegistry := initStandardRegistry(config)
 
+	if err := promRegistry.Register(stdprometheus.NewProcessCollector(stdprometheus.ProcessCollectorOpts{})); err != nil {
+		if _, ok := err.(stdprometheus.AlreadyRegisteredError); !ok {
+			log.FromContext(ctx).Warn("ProcessCollector is already registered")
+		}
+	}
+	if err := promRegistry.Register(stdprometheus.NewGoCollector()); err != nil {
+		if _, ok := err.(stdprometheus.AlreadyRegisteredError); !ok {
+			log.FromContext(ctx).Warn("GoCollector is already registered")
+		}
+	}
+
 	if !registerPromState(ctx) {
 		return nil
 	}
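Both Register calls above are meant to tolerate double registration; note, though, that as committed the "already registered" warning fires only when the error is not an AlreadyRegisteredError. The conventional shape of the idiom is sketched below; the helper name registerOnce is illustrative, not part of the commit:

package main

import (
	"log"

	"github.com/prometheus/client_golang/prometheus"
)

// registerOnce registers c on reg, treating "already registered" as benign.
func registerOnce(reg *prometheus.Registry, c prometheus.Collector) {
	if err := reg.Register(c); err != nil {
		if _, ok := err.(prometheus.AlreadyRegisteredError); ok {
			return // an equivalent collector exists; nothing to do
		}
		log.Printf("collector registration failed: %v", err)
	}
}

func main() {
	reg := prometheus.NewRegistry()
	registerOnce(reg, prometheus.NewGoCollector())
	registerOnce(reg, prometheus.NewGoCollector()) // no-op on the second call
}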
@@ -106,76 +120,89 @@ func initStandardRegistry(config *types.Prometheus) Registry {
 		Help: "Last config reload failure",
 	}, []string{})
 
-	entrypointReqs := newCounterFrom(promState.collectors, stdprometheus.CounterOpts{
-		Name: entrypointReqsTotalName,
-		Help: "How many HTTP requests processed on an entrypoint, partitioned by status code, protocol, and method.",
-	}, []string{"code", "method", "protocol", "entrypoint"})
-	entrypointReqDurations := newHistogramFrom(promState.collectors, stdprometheus.HistogramOpts{
-		Name:    entrypointReqDurationName,
-		Help:    "How long it took to process the request on an entrypoint, partitioned by status code, protocol, and method.",
-		Buckets: buckets,
-	}, []string{"code", "method", "protocol", "entrypoint"})
-	entrypointOpenConns := newGaugeFrom(promState.collectors, stdprometheus.GaugeOpts{
-		Name: entrypointOpenConnsName,
-		Help: "How many open connections exist on an entrypoint, partitioned by method and protocol.",
-	}, []string{"method", "protocol", "entrypoint"})
-
-	backendReqs := newCounterFrom(promState.collectors, stdprometheus.CounterOpts{
-		Name: backendReqsTotalName,
-		Help: "How many HTTP requests processed on a backend, partitioned by status code, protocol, and method.",
-	}, []string{"code", "method", "protocol", "backend"})
-	backendReqDurations := newHistogramFrom(promState.collectors, stdprometheus.HistogramOpts{
-		Name:    backendReqDurationName,
-		Help:    "How long it took to process the request on a backend, partitioned by status code, protocol, and method.",
-		Buckets: buckets,
-	}, []string{"code", "method", "protocol", "backend"})
-	backendOpenConns := newGaugeFrom(promState.collectors, stdprometheus.GaugeOpts{
-		Name: backendOpenConnsName,
-		Help: "How many open connections exist on a backend, partitioned by method and protocol.",
-	}, []string{"method", "protocol", "backend"})
-	backendRetries := newCounterFrom(promState.collectors, stdprometheus.CounterOpts{
-		Name: backendRetriesTotalName,
-		Help: "How many request retries happened on a backend.",
-	}, []string{"backend"})
-	backendServerUp := newGaugeFrom(promState.collectors, stdprometheus.GaugeOpts{
-		Name: backendServerUpName,
-		Help: "Backend server is up, described by gauge value of 0 or 1.",
-	}, []string{"backend", "url"})
-
 	promState.describers = []func(chan<- *stdprometheus.Desc){
 		configReloads.cv.Describe,
 		configReloadsFailures.cv.Describe,
 		lastConfigReloadSuccess.gv.Describe,
 		lastConfigReloadFailure.gv.Describe,
-		entrypointReqs.cv.Describe,
-		entrypointReqDurations.hv.Describe,
-		entrypointOpenConns.gv.Describe,
-		backendReqs.cv.Describe,
-		backendReqDurations.hv.Describe,
-		backendOpenConns.gv.Describe,
-		backendRetries.cv.Describe,
-		backendServerUp.gv.Describe,
 	}
 
-	return &standardRegistry{
-		enabled:                        true,
-		configReloadsCounter:           configReloads,
-		configReloadsFailureCounter:    configReloadsFailures,
-		lastConfigReloadSuccessGauge:   lastConfigReloadSuccess,
-		lastConfigReloadFailureGauge:   lastConfigReloadFailure,
-		entrypointReqsCounter:          entrypointReqs,
-		entrypointReqDurationHistogram: entrypointReqDurations,
-		entrypointOpenConnsGauge:       entrypointOpenConns,
-		backendReqsCounter:             backendReqs,
-		backendReqDurationHistogram:    backendReqDurations,
-		backendOpenConnsGauge:          backendOpenConns,
-		backendRetriesCounter:          backendRetries,
-		backendServerUpGauge:           backendServerUp,
+	reg := &standardRegistry{
+		epEnabled:                    config.AddEntryPointsLabels,
+		svcEnabled:                   config.AddServicesLabels,
+		configReloadsCounter:         configReloads,
+		configReloadsFailureCounter:  configReloadsFailures,
+		lastConfigReloadSuccessGauge: lastConfigReloadSuccess,
+		lastConfigReloadFailureGauge: lastConfigReloadFailure,
 	}
+
+	if config.AddEntryPointsLabels {
+		entryPointReqs := newCounterFrom(promState.collectors, stdprometheus.CounterOpts{
+			Name: entryPointReqsTotalName,
+			Help: "How many HTTP requests processed on an entrypoint, partitioned by status code, protocol, and method.",
+		}, []string{"code", "method", "protocol", "entrypoint"})
+		entryPointReqDurations := newHistogramFrom(promState.collectors, stdprometheus.HistogramOpts{
+			Name:    entryPointReqDurationName,
+			Help:    "How long it took to process the request on an entrypoint, partitioned by status code, protocol, and method.",
+			Buckets: buckets,
+		}, []string{"code", "method", "protocol", "entrypoint"})
+		entryPointOpenConns := newGaugeFrom(promState.collectors, stdprometheus.GaugeOpts{
+			Name: entryPointOpenConnsName,
+			Help: "How many open connections exist on an entrypoint, partitioned by method and protocol.",
+		}, []string{"method", "protocol", "entrypoint"})
+
+		promState.describers = append(promState.describers, []func(chan<- *stdprometheus.Desc){
+			entryPointReqs.cv.Describe,
+			entryPointReqDurations.hv.Describe,
+			entryPointOpenConns.gv.Describe,
+		}...)
+		reg.entryPointReqsCounter = entryPointReqs
+		reg.entryPointReqDurationHistogram = entryPointReqDurations
+		reg.entryPointOpenConnsGauge = entryPointOpenConns
+	}
+	if config.AddServicesLabels {
+		serviceReqs := newCounterFrom(promState.collectors, stdprometheus.CounterOpts{
+			Name: serviceReqsTotalName,
+			Help: "How many HTTP requests processed on a service, partitioned by status code, protocol, and method.",
+		}, []string{"code", "method", "protocol", "service"})
+		serviceReqDurations := newHistogramFrom(promState.collectors, stdprometheus.HistogramOpts{
+			Name:    serviceReqDurationName,
+			Help:    "How long it took to process the request on a service, partitioned by status code, protocol, and method.",
+			Buckets: buckets,
+		}, []string{"code", "method", "protocol", "service"})
+		serviceOpenConns := newGaugeFrom(promState.collectors, stdprometheus.GaugeOpts{
+			Name: serviceOpenConnsName,
+			Help: "How many open connections exist on a service, partitioned by method and protocol.",
+		}, []string{"method", "protocol", "service"})
+		serviceRetries := newCounterFrom(promState.collectors, stdprometheus.CounterOpts{
+			Name: serviceRetriesTotalName,
+			Help: "How many request retries happened on a service.",
+		}, []string{"service"})
+		serviceServerUp := newGaugeFrom(promState.collectors, stdprometheus.GaugeOpts{
+			Name: serviceServerUpName,
+			Help: "service server is up, described by gauge value of 0 or 1.",
+		}, []string{"service", "url"})
+
+		promState.describers = append(promState.describers, []func(chan<- *stdprometheus.Desc){
+			serviceReqs.cv.Describe,
+			serviceReqDurations.hv.Describe,
+			serviceOpenConns.gv.Describe,
+			serviceRetries.cv.Describe,
+			serviceServerUp.gv.Describe,
+		}...)
+
+		reg.serviceReqsCounter = serviceReqs
+		reg.serviceReqDurationHistogram = serviceReqDurations
+		reg.serviceOpenConnsGauge = serviceOpenConns
+		reg.serviceRetriesCounter = serviceRetries
+		reg.serviceServerUpGauge = serviceServerUp
+	}
+
+	return reg
 }
 
 func registerPromState(ctx context.Context) bool {
-	if err := stdprometheus.Register(promState); err != nil {
+	if err := promRegistry.Register(promState); err != nil {
 		logger := log.FromContext(ctx)
 		if _, ok := err.(stdprometheus.AlreadyRegisteredError); !ok {
 			logger.Errorf("Unable to register Traefik to Prometheus: %v", err)
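With initStandardRegistry rewritten as above, entry point and service collectors exist only when the corresponding flags are set in the static configuration. A hedged usage sketch, assuming the containous-era import paths and the types.Prometheus fields shown in this diff:

package main

import (
	"context"

	"github.com/containous/traefik/pkg/metrics"
	"github.com/containous/traefik/pkg/types"
)

func main() {
	// Only the label sets switched on here get collectors created for them
	// in initStandardRegistry; RegisterPrometheus returns nil when
	// registerPromState fails.
	registry := metrics.RegisterPrometheus(context.Background(), &types.Prometheus{
		AddEntryPointsLabels: true, // emit traefik_entrypoint_* series
		AddServicesLabels:    true, // emit traefik_service_* series
	})
	if registry == nil {
		panic("Prometheus metrics could not be registered")
	}
	_ = registry
}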
@@ -189,24 +216,24 @@ func registerPromState(ctx context.Context) bool {
 // OnConfigurationUpdate receives the current configuration from Traefik.
 // It then converts the configuration to the optimized package internal format
 // and sets it to the promState.
-func OnConfigurationUpdate(configurations dynamic.Configurations) {
+func OnConfigurationUpdate(dynConf dynamic.Configurations, entryPoints []string) {
 	dynamicConfig := newDynamicConfig()
 
-	// FIXME metrics
-	// for _, config := range configurations {
-	// 	for _, frontend := range config.Frontends {
-	// 		for _, entrypointName := range frontend.EntryPoints {
-	// 			dynamicConfig.entrypoints[entrypointName] = true
-	// 		}
-	// 	}
-	//
-	// 	for backendName, backend := range config.Backends {
-	// 		dynamicConfig.backends[backendName] = make(map[string]bool)
-	// 		for _, server := range backend.Servers {
-	// 			dynamicConfig.backends[backendName][server.URL] = true
-	// 		}
-	// 	}
-	// }
+	for _, value := range entryPoints {
+		dynamicConfig.entryPoints[value] = true
+	}
+	for key, config := range dynConf {
+		for name := range config.HTTP.Routers {
+			dynamicConfig.routers[fmt.Sprintf("%s@%s", name, key)] = true
+		}
+
+		for serviceName, service := range config.HTTP.Services {
+			dynamicConfig.services[fmt.Sprintf("%s@%s", serviceName, key)] = make(map[string]bool)
+			for _, server := range service.LoadBalancer.Servers {
+				dynamicConfig.services[fmt.Sprintf("%s@%s", serviceName, key)][server.URL] = true
+			}
+		}
+	}
 
 	promState.SetDynamicConfig(dynamicConfig)
 }
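OnConfigurationUpdate now receives the configurations keyed by provider plus the static entry point names, and indexes routers and services under their fully-qualified "name@provider" form, so metric labels match regardless of which provider supplied the object. A tiny illustrative sketch of the key format (the provider and service names are made up):

package main

import "fmt"

func main() {
	provider := "docker" // a key of dynamic.Configurations, illustrative
	service := "whoami"  // a service defined by that provider, illustrative

	// The same composition used in the loop above:
	fmt.Printf("%s@%s\n", service, provider) // prints "whoami@docker"
}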
@@ -279,15 +306,15 @@ func (ps *prometheusState) Collect(ch chan<- stdprometheus.Metric) {
 func (ps *prometheusState) isOutdated(collector *collector) bool {
 	labels := collector.labels
 
-	if entrypointName, ok := labels["entrypoint"]; ok && !ps.dynamicConfig.hasEntrypoint(entrypointName) {
+	if entrypointName, ok := labels["entrypoint"]; ok && !ps.dynamicConfig.hasEntryPoint(entrypointName) {
 		return true
 	}
 
-	if backendName, ok := labels["backend"]; ok {
-		if !ps.dynamicConfig.hasBackend(backendName) {
+	if serviceName, ok := labels["service"]; ok {
+		if !ps.dynamicConfig.hasService(serviceName) {
 			return true
 		}
-		if url, ok := labels["url"]; ok && !ps.dynamicConfig.hasServerURL(backendName, url) {
+		if url, ok := labels["url"]; ok && !ps.dynamicConfig.hasServerURL(serviceName, url) {
 			return true
 		}
 	}
@@ -297,33 +324,35 @@ func (ps *prometheusState) isOutdated(collector *collector) bool {
 
 func newDynamicConfig() *dynamicConfig {
 	return &dynamicConfig{
-		entrypoints: make(map[string]bool),
-		backends:    make(map[string]map[string]bool),
+		entryPoints: make(map[string]bool),
+		routers:     make(map[string]bool),
+		services:    make(map[string]map[string]bool),
 	}
 }
 
-// dynamicConfig holds the current configuration for entrypoints, backends,
+// dynamicConfig holds the current configuration for entryPoints, services,
 // and server URLs in an optimized way to check for existence. This provides
 // a performant way to check whether the collected metrics belong to the
 // current configuration or to an outdated one.
 type dynamicConfig struct {
-	entrypoints map[string]bool
-	backends    map[string]map[string]bool
+	entryPoints map[string]bool
+	routers     map[string]bool
+	services    map[string]map[string]bool
 }
 
-func (d *dynamicConfig) hasEntrypoint(entrypointName string) bool {
-	_, ok := d.entrypoints[entrypointName]
+func (d *dynamicConfig) hasEntryPoint(entrypointName string) bool {
+	_, ok := d.entryPoints[entrypointName]
 	return ok
 }
 
-func (d *dynamicConfig) hasBackend(backendName string) bool {
-	_, ok := d.backends[backendName]
+func (d *dynamicConfig) hasService(serviceName string) bool {
+	_, ok := d.services[serviceName]
 	return ok
 }
 
-func (d *dynamicConfig) hasServerURL(backendName, serverURL string) bool {
-	if backend, hasBackend := d.backends[backendName]; hasBackend {
-		_, ok := backend[serverURL]
+func (d *dynamicConfig) hasServerURL(serviceName, serverURL string) bool {
+	if service, hasService := d.services[serviceName]; hasService {
+		_, ok := service[serverURL]
 		return ok
 	}
 	return false
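Taken together with isOutdated above, these lookup tables decide whether a scraped series still belongs to the live configuration. A hedged in-package sketch of how they behave once populated; the function and all example values are illustrative, not part of the commit:

package metrics

// exampleDynamicConfigUsage is illustrative only; it is not part of the commit.
func exampleDynamicConfigUsage() (current bool, outdated bool) {
	cfg := newDynamicConfig()
	cfg.entryPoints["web"] = true
	cfg.services["whoami@docker"] = map[string]bool{"http://10.0.0.5:80": true}

	// A series labeled with a known entry point, service, and server URL is kept.
	current = cfg.hasEntryPoint("web") && cfg.hasServerURL("whoami@docker", "http://10.0.0.5:80")

	// A series pointing at a server that left the configuration is outdated
	// and will be dropped by promState after its next scrape.
	outdated = !cfg.hasServerURL("whoami@docker", "http://10.0.0.9:80")
	return current, outdated
}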
@@ -479,7 +508,7 @@ func (h *histogram) Observe(value float64) {
 	labels := h.labelNamesValues.ToLabels()
 	collector := h.hv.With(labels)
 	collector.Observe(value)
-	h.collectors <- newCollector(h.name, labels, collector, func() {
+	h.collectors <- newCollector(h.name, labels, h.hv, func() {
 		h.hv.Delete(labels)
 	})
 }
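This last hunk hands newCollector the whole histogram vector (h.hv) rather than the labeled child returned by With, presumably because the child satisfies only prometheus.Observer while promState needs a prometheus.Collector it can gather during scrapes; removing the stale labeled series still goes through the vector, as the cleanup closure shows. A standalone sketch of that vector/child distinction (names and values are illustrative):

package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
)

func main() {
	hv := prometheus.NewHistogramVec(prometheus.HistogramOpts{
		Name:    "demo_request_duration_seconds",
		Help:    "Illustrative histogram vector.",
		Buckets: prometheus.DefBuckets,
	}, []string{"service"})

	labels := prometheus.Labels{"service": "whoami@docker"}
	hv.With(labels).Observe(0.42) // the child is an Observer, not a Collector

	// The vector itself is a Collector, so it is what gets handed to a
	// registry (or, above, to promState); stale children are dropped with
	// Delete, exactly what the cleanup closure in the hunk does.
	fmt.Println(hv.Delete(labels)) // true: the labeled series existed
}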
|
Loading…
Add table
Add a link
Reference in a new issue