320 lines
10 KiB
Go
320 lines
10 KiB
Go
|
|
/*
|
||
|
|
Copyright 2020
|
||
|
|
|
||
|
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||
|
|
you may not use this file except in compliance with the License.
|
||
|
|
You may obtain a copy of the License at
|
||
|
|
|
||
|
|
http://www.apache.org/licenses/LICENSE-2.0
|
||
|
|
|
||
|
|
Unless required by applicable law or agreed to in writing, software
|
||
|
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||
|
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||
|
|
See the License for the specific language governing permissions and
|
||
|
|
limitations under the License.
|
||
|
|
*/
|
||
|
|
|
||
|
|
package metricsprovider
|
||
|
|
|
||
|
|
import (
|
||
|
|
"context"
|
||
|
|
"crypto/tls"
|
||
|
|
"crypto/x509"
|
||
|
|
"fmt"
|
||
|
|
"io/ioutil"
|
||
|
|
"net"
|
||
|
|
"net/http"
|
||
|
|
"net/url"
|
||
|
|
"os"
|
||
|
|
"time"
|
||
|
|
|
||
|
|
"k8s.io/client-go/transport"
|
||
|
|
|
||
|
|
"github.com/paypal/load-watcher/pkg/watcher"
|
||
|
|
"github.com/prometheus/client_golang/api"
|
||
|
|
v1 "github.com/prometheus/client_golang/api/prometheus/v1"
|
||
|
|
"github.com/prometheus/common/config"
|
||
|
|
"github.com/prometheus/common/model"
|
||
|
|
log "github.com/sirupsen/logrus"
|
||
|
|
|
||
|
|
_ "k8s.io/client-go/plugin/pkg/client/auth/oidc"
|
||
|
|
)
|
||
|
|
|
||
|
|
const (
	// EnableOpenShiftAuth is the name of the environment variable whose mere
	// presence (its value is not inspected) makes NewPromClient build a
	// bearer-authenticated transport that trusts the CA at K8sPodCAFilePath.
	EnableOpenShiftAuth = "ENABLE_OPENSHIFT_AUTH"

	// K8sPodCAFilePath is the CA certificate file loaded by NewPromClient
	// when EnableOpenShiftAuth is set.
	K8sPodCAFilePath = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt"

	// DefaultPromAddress is the Prometheus address used when the provider
	// options do not specify one.
	DefaultPromAddress = "http://prometheus-k8s:9090"

	// PromQL functions applied over the rollup window of every metric.
	promStd = "stddev_over_time"
	promAvg = "avg_over_time"

	// CPU, memory and disk I/O series.
	promCpuMetric    = "instance:node_cpu:ratio"
	promMemMetric    = "instance:node_memory_utilisation:ratio"
	promDiskIOMetric = "instance_device:node_disk_io_time_seconds:rate5m"

	// Network bandwidth series (transmit / receive, plus drop counters).
	promTransBandMetric     = "instance:node_network_transmit_bytes:rate:sum"
	promTransBandDropMetric = "instance:node_network_transmit_drop_excluding_lo:rate5m"
	promRecBandMetric       = "instance:node_network_receive_bytes:rate:sum"
	promRecBandDropMetric   = "instance:node_network_receive_drop_excluding_lo:rate5m"

	// Energy series exported by Scaphandre (scaph_*) and Kepler (kepler_*).
	promScaphHostPower           = "scaph_host_power_microwatts"
	promScaphHostJoules          = "scaph_host_energy_microjoules"
	promKeplerHostCoreJoules     = "kepler_node_core_joules_total"
	promKeplerHostUncoreJoules   = "kepler_node_uncore_joules_total"
	promKeplerHostDRAMJoules     = "kepler_node_dram_joules_total"
	promKeplerHostPackageJoules  = "kepler_node_package_joules_total"
	promKeplerHostOtherJoules    = "kepler_node_other_joules_total"
	promKeplerHostGPUJoules      = "kepler_node_gpu_joules_total"
	promKeplerHostPlatformJoules = "kepler_node_platform_joules_total"
	promKeplerHostEnergyStat     = "kepler_node_energy_stat"

	// allHosts is the sentinel host value that makes buildPromQuery omit the
	// per-host label filter; hostMetricKey is the label that carries the host
	// name in both queries and results.
	allHosts      = "all"
	hostMetricKey = "node"
)
|
||
|
|
|
||
|
|
type promClient struct {
|
||
|
|
client api.Client
|
||
|
|
promAddress string
|
||
|
|
}
|
||
|
|
|
||
|
|
// loadCAFile reads the PEM file at filepath and returns a fresh certificate
// pool containing the certificates found in it.
//
// The read error is returned unchanged when the file cannot be read; an error
// is also returned when no certificate could be appended from its contents.
func loadCAFile(filepath string) (*x509.CertPool, error) {
	pemBytes, readErr := ioutil.ReadFile(filepath)
	if readErr != nil {
		return nil, readErr
	}

	pool := x509.NewCertPool()
	if !pool.AppendCertsFromPEM(pemBytes) {
		return nil, fmt.Errorf("failed to append CA certificate to the pool")
	}
	return pool, nil
}
|
||
|
|
|
||
|
|
func NewPromClient(opts watcher.MetricsProviderOpts) (watcher.MetricsProviderClient, error) {
|
||
|
|
if opts.Name != watcher.PromClientName {
|
||
|
|
return nil, fmt.Errorf("metric provider name should be %v, found %v", watcher.PromClientName, opts.Name)
|
||
|
|
}
|
||
|
|
|
||
|
|
var client api.Client
|
||
|
|
var err error
|
||
|
|
var promToken, promAddress = "", DefaultPromAddress
|
||
|
|
if opts.AuthToken != "" {
|
||
|
|
promToken = opts.AuthToken
|
||
|
|
}
|
||
|
|
if opts.Address != "" {
|
||
|
|
promAddress = opts.Address
|
||
|
|
}
|
||
|
|
|
||
|
|
// Ignore TLS verify errors if InsecureSkipVerify is set
|
||
|
|
roundTripper := api.DefaultRoundTripper
|
||
|
|
|
||
|
|
// Check if EnableOpenShiftAuth is set.
|
||
|
|
_, enableOpenShiftAuth := os.LookupEnv(EnableOpenShiftAuth)
|
||
|
|
if enableOpenShiftAuth {
|
||
|
|
// Retrieve Pod CA cert
|
||
|
|
caCertPool, err := loadCAFile(K8sPodCAFilePath)
|
||
|
|
if err != nil {
|
||
|
|
return nil, fmt.Errorf("Error loading CA file: %v", err)
|
||
|
|
}
|
||
|
|
|
||
|
|
// Get Prometheus Host
|
||
|
|
u, _ := url.Parse(opts.Address)
|
||
|
|
roundTripper = transport.NewBearerAuthRoundTripper(
|
||
|
|
opts.AuthToken,
|
||
|
|
&http.Transport{
|
||
|
|
Proxy: http.ProxyFromEnvironment,
|
||
|
|
DialContext: (&net.Dialer{
|
||
|
|
Timeout: 30 * time.Second,
|
||
|
|
KeepAlive: 30 * time.Second,
|
||
|
|
}).DialContext,
|
||
|
|
TLSHandshakeTimeout: 10 * time.Second,
|
||
|
|
TLSClientConfig: &tls.Config{
|
||
|
|
RootCAs: caCertPool,
|
||
|
|
ServerName: u.Host,
|
||
|
|
},
|
||
|
|
},
|
||
|
|
)
|
||
|
|
} else if opts.InsecureSkipVerify {
|
||
|
|
roundTripper = &http.Transport{
|
||
|
|
Proxy: http.ProxyFromEnvironment,
|
||
|
|
DialContext: (&net.Dialer{
|
||
|
|
Timeout: 30 * time.Second,
|
||
|
|
KeepAlive: 30 * time.Second,
|
||
|
|
}).DialContext,
|
||
|
|
TLSHandshakeTimeout: 10 * time.Second,
|
||
|
|
TLSClientConfig: &tls.Config{InsecureSkipVerify: true},
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
if promToken != "" {
|
||
|
|
client, err = api.NewClient(api.Config{
|
||
|
|
Address: promAddress,
|
||
|
|
RoundTripper: config.NewAuthorizationCredentialsRoundTripper("Bearer", config.NewInlineSecret(opts.AuthToken), roundTripper),
|
||
|
|
})
|
||
|
|
} else {
|
||
|
|
client, err = api.NewClient(api.Config{
|
||
|
|
Address: promAddress,
|
||
|
|
})
|
||
|
|
}
|
||
|
|
|
||
|
|
if err != nil {
|
||
|
|
log.Errorf("error creating prometheus client: %v", err)
|
||
|
|
return nil, err
|
||
|
|
}
|
||
|
|
|
||
|
|
return promClient{
|
||
|
|
client: client,
|
||
|
|
promAddress: promAddress,
|
||
|
|
}, err
|
||
|
|
}
|
||
|
|
|
||
|
|
func (s promClient) Name() string {
|
||
|
|
return watcher.PromClientName
|
||
|
|
}
|
||
|
|
|
||
|
|
func (s promClient) FetchHostMetrics(host string, window *watcher.Window) ([]watcher.Metric, error) {
|
||
|
|
var metricList []watcher.Metric
|
||
|
|
var anyerr error
|
||
|
|
|
||
|
|
for _, method := range []string{promAvg, promStd} {
|
||
|
|
for _, metric := range []string{promCpuMetric, promMemMetric, promTransBandMetric, promTransBandDropMetric, promRecBandMetric, promRecBandDropMetric,
|
||
|
|
promDiskIOMetric, promScaphHostPower, promScaphHostJoules, promKeplerHostCoreJoules, promKeplerHostUncoreJoules, promKeplerHostDRAMJoules,
|
||
|
|
promKeplerHostPackageJoules, promKeplerHostOtherJoules, promKeplerHostGPUJoules, promKeplerHostPlatformJoules, promKeplerHostEnergyStat} {
|
||
|
|
promQuery := s.buildPromQuery(host, metric, method, window.Duration)
|
||
|
|
promResults, err := s.getPromResults(promQuery)
|
||
|
|
|
||
|
|
if err != nil {
|
||
|
|
log.Errorf("error querying Prometheus for query %v: %v\n", promQuery, err)
|
||
|
|
anyerr = err
|
||
|
|
continue
|
||
|
|
}
|
||
|
|
|
||
|
|
curMetricMap := s.promResults2MetricMap(promResults, metric, method, window.Duration)
|
||
|
|
metricList = append(metricList, curMetricMap[host]...)
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
return metricList, anyerr
|
||
|
|
}
|
||
|
|
|
||
|
|
// FetchAllHostsMetrics Fetch all host metrics with different operators (avg_over_time, stddev_over_time) and different resource types (CPU, Memory)
|
||
|
|
func (s promClient) FetchAllHostsMetrics(window *watcher.Window) (map[string][]watcher.Metric, error) {
|
||
|
|
hostMetrics := make(map[string][]watcher.Metric)
|
||
|
|
var anyerr error
|
||
|
|
|
||
|
|
for _, method := range []string{promAvg, promStd} {
|
||
|
|
for _, metric := range []string{promCpuMetric, promMemMetric, promTransBandMetric, promTransBandDropMetric, promRecBandMetric, promRecBandDropMetric,
|
||
|
|
promDiskIOMetric, promScaphHostPower, promScaphHostJoules, promKeplerHostCoreJoules, promKeplerHostUncoreJoules, promKeplerHostDRAMJoules,
|
||
|
|
promKeplerHostPackageJoules, promKeplerHostOtherJoules, promKeplerHostGPUJoules, promKeplerHostPlatformJoules, promKeplerHostEnergyStat} {
|
||
|
|
promQuery := s.buildPromQuery(allHosts, metric, method, window.Duration)
|
||
|
|
promResults, err := s.getPromResults(promQuery)
|
||
|
|
|
||
|
|
if err != nil {
|
||
|
|
log.Errorf("error querying Prometheus for query %v: %v\n", promQuery, err)
|
||
|
|
anyerr = err
|
||
|
|
continue
|
||
|
|
}
|
||
|
|
|
||
|
|
curMetricMap := s.promResults2MetricMap(promResults, metric, method, window.Duration)
|
||
|
|
|
||
|
|
for k, v := range curMetricMap {
|
||
|
|
// skip empty keys
|
||
|
|
if k == "" {
|
||
|
|
continue
|
||
|
|
}
|
||
|
|
hostMetrics[k] = append(hostMetrics[k], v...)
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
return hostMetrics, anyerr
|
||
|
|
}
|
||
|
|
|
||
|
|
func (s promClient) Health() (int, error) {
|
||
|
|
req, err := http.NewRequest("HEAD", s.promAddress, nil)
|
||
|
|
if err != nil {
|
||
|
|
return -1, err
|
||
|
|
}
|
||
|
|
resp, _, err := s.client.Do(context.Background(), req)
|
||
|
|
if err != nil {
|
||
|
|
return -1, err
|
||
|
|
}
|
||
|
|
if resp.StatusCode != http.StatusOK {
|
||
|
|
return -1, fmt.Errorf("received response status code: %v", resp.StatusCode)
|
||
|
|
}
|
||
|
|
return 0, nil
|
||
|
|
}
|
||
|
|
|
||
|
|
func (s promClient) buildPromQuery(host string, metric string, method string, rollup string) string {
|
||
|
|
var promQuery string
|
||
|
|
|
||
|
|
if host == allHosts {
|
||
|
|
promQuery = fmt.Sprintf("%s(%s[%s])", method, metric, rollup)
|
||
|
|
} else {
|
||
|
|
promQuery = fmt.Sprintf("%s(%s{%s=\"%s\"}[%s])", method, metric, hostMetricKey, host, rollup)
|
||
|
|
}
|
||
|
|
|
||
|
|
return promQuery
|
||
|
|
}
|
||
|
|
|
||
|
|
func (s promClient) getPromResults(promQuery string) (model.Value, error) {
|
||
|
|
v1api := v1.NewAPI(s.client)
|
||
|
|
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
|
||
|
|
defer cancel()
|
||
|
|
|
||
|
|
results, warnings, err := v1api.Query(ctx, promQuery, time.Now())
|
||
|
|
if err != nil {
|
||
|
|
return nil, err
|
||
|
|
}
|
||
|
|
if len(warnings) > 0 {
|
||
|
|
log.Warnf("Warnings: %v\n", warnings)
|
||
|
|
}
|
||
|
|
log.Debugf("result:\n%v\n", results)
|
||
|
|
|
||
|
|
return results, nil
|
||
|
|
}
|
||
|
|
|
||
|
|
func (s promClient) promResults2MetricMap(promresults model.Value, metric string, method string, rollup string) map[string][]watcher.Metric {
|
||
|
|
var metricType string
|
||
|
|
var operator string
|
||
|
|
|
||
|
|
curMetrics := make(map[string][]watcher.Metric)
|
||
|
|
|
||
|
|
switch metric {
|
||
|
|
case promCpuMetric: // CPU metrics
|
||
|
|
metricType = watcher.CPU
|
||
|
|
case promMemMetric: // Memory metrics
|
||
|
|
metricType = watcher.Memory
|
||
|
|
case promDiskIOMetric: // Storage metrics
|
||
|
|
metricType = watcher.Storage
|
||
|
|
case promScaphHostPower, promScaphHostJoules, // Energy-related metrics
|
||
|
|
promKeplerHostCoreJoules, promKeplerHostUncoreJoules,
|
||
|
|
promKeplerHostDRAMJoules, promKeplerHostPackageJoules,
|
||
|
|
promKeplerHostOtherJoules, promKeplerHostGPUJoules,
|
||
|
|
promKeplerHostPlatformJoules, promKeplerHostEnergyStat:
|
||
|
|
metricType = watcher.Energy
|
||
|
|
case promTransBandMetric, promTransBandDropMetric, // Bandwidth-related metrics
|
||
|
|
promRecBandMetric, promRecBandDropMetric:
|
||
|
|
metricType = watcher.Bandwidth
|
||
|
|
default:
|
||
|
|
metricType = watcher.Unknown
|
||
|
|
}
|
||
|
|
|
||
|
|
if method == promAvg {
|
||
|
|
operator = watcher.Average
|
||
|
|
} else if method == promStd {
|
||
|
|
operator = watcher.Std
|
||
|
|
} else {
|
||
|
|
operator = watcher.UnknownOperator
|
||
|
|
}
|
||
|
|
|
||
|
|
switch promresults.(type) {
|
||
|
|
case model.Vector:
|
||
|
|
for _, result := range promresults.(model.Vector) {
|
||
|
|
curMetric := watcher.Metric{Name: metric, Type: metricType, Operator: operator, Rollup: rollup, Value: float64(result.Value * 100)}
|
||
|
|
curHost := string(result.Metric[hostMetricKey])
|
||
|
|
curMetrics[curHost] = append(curMetrics[curHost], curMetric)
|
||
|
|
}
|
||
|
|
default:
|
||
|
|
log.Errorf("error: The Prometheus results should not be type: %v.\n", promresults.Type())
|
||
|
|
}
|
||
|
|
|
||
|
|
return curMetrics
|
||
|
|
}
|