Skip to content

Commit 29e305b

Browse files
authored
cmd,collector: add --scrape.timeout to time-bound scrapes (#1)
Currently the exporter runs queries without any time bounds. This means a poorly behaved query will just run for ever. A connection limit on the exporter role should prevent queries from piling up, but query pile up is still something to be concerned about. Validated by creating this file: ```bash ❯ cat timeout_test_queries.yaml pg_long_query: query: "SELECT 5 as value FROM (SELECT pg_sleep(5)) as t" metrics: - value: usage: "GAUGE" description: "5 second query" ``` Running with a `--scrape.timeout` of 10s: ```bash ❯ ./postgres_exporter \ --extend.query-path=timeout_test_queries.yaml \ --scrape.timeout=10s \ --log.level=info ``` `cURL`-ing: ```bash ❯ curl http://localhost:9187/metrics | grep long_query % Total % Received % Xferd Average Speed Time Time Time Current Dload Upload Total Spent Left Speed 100 28414 0 28414 0 0 5645 0 --:--:-- 0:00:05 --:--:-- 5646# HELP pg_long_query_value 5 second query # TYPE pg_long_query_value gauge pg_long_query_value{server="127.0.0.1:5432"} 5 100 97364 0 97364 0 0 19341 0 --:--:-- 0:00:05 --:--:-- 25461 ``` Repeating with a `--scrape.timeout` of 2s does not include the `pg_long_query_value` ```bash ❯ curl http://localhost:9187/metrics | grep long_query % Total % Received % Xferd Average Speed Time Time Time Current Dload Upload Total Spent Left Speed 100 93091 0 93091 0 0 46352 0 --:--:-- 0:00:02 --:--:-- 46360 ``` But does include partial results: ```bash ❯ curl http://localhost:9187/metrics | grep pg_settings | wc -l % Total % Received % Xferd Average Speed Time Time Time Current Dload Upload Total Spent Left Speed 100 93192 0 93192 0 0 46313 0 --:--:-- 0:00:02 --:--:-- 46318 810 ``` --------- Signed-off-by: Max Englander <[email protected]>
1 parent 198454c commit 29e305b

File tree

8 files changed

+65
-23
lines changed

8 files changed

+65
-23
lines changed

cmd/postgres_exporter/datasource.go

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
package main
1515

1616
import (
17+
"context"
1718
"fmt"
1819
"net/url"
1920
"os"
@@ -95,7 +96,7 @@ func (e *Exporter) discoverDatabaseDSNs() []string {
9596
return result
9697
}
9798

98-
func (e *Exporter) scrapeDSN(ch chan<- prometheus.Metric, dsn string) error {
99+
func (e *Exporter) scrapeDSN(ctx context.Context, ch chan<- prometheus.Metric, dsn string) error {
99100
server, err := e.servers.GetServer(dsn)
100101

101102
if err != nil {
@@ -108,11 +109,11 @@ func (e *Exporter) scrapeDSN(ch chan<- prometheus.Metric, dsn string) error {
108109
}
109110

110111
// Check if map versions need to be updated
111-
if err := e.checkMapVersions(ch, server); err != nil {
112+
if err := e.checkMapVersions(ctx, ch, server); err != nil {
112113
logger.Warn("Proceeding with outdated query maps, as the Postgres version could not be determined", "err", err)
113114
}
114115

115-
return server.Scrape(ch, e.disableSettingsMetrics)
116+
return server.Scrape(ctx, ch, e.disableSettingsMetrics)
116117
}
117118

118119
// try to get the DataSource

cmd/postgres_exporter/main.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@ var (
4949
excludeDatabases = kingpin.Flag("exclude-databases", "A list of databases to remove when autoDiscoverDatabases is enabled (DEPRECATED)").Default("").Envar("PG_EXPORTER_EXCLUDE_DATABASES").String()
5050
includeDatabases = kingpin.Flag("include-databases", "A list of databases to include when autoDiscoverDatabases is enabled (DEPRECATED)").Default("").Envar("PG_EXPORTER_INCLUDE_DATABASES").String()
5151
metricPrefix = kingpin.Flag("metric-prefix", "A metric prefix can be used to have non-default (not \"pg\") prefixes for each of the metrics").Default("pg").Envar("PG_EXPORTER_METRIC_PREFIX").String()
52+
scrapeTimeout = kingpin.Flag("scrape-timeout", "Maximum time for a scrape to complete before timing out (0 = no timeout)").Default("0").Envar("PG_EXPORTER_SCRAPE_TIMEOUT").Duration()
5253
logger = promslog.NewNopLogger()
5354
)
5455

@@ -114,6 +115,7 @@ func main() {
114115
WithConstantLabels(*constantLabelsList),
115116
ExcludeDatabases(excludedDatabases),
116117
IncludeDatabases(*includeDatabases),
118+
WithTimeout(*scrapeTimeout),
117119
}
118120

119121
exporter := NewExporter(dsns, opts...)
@@ -136,6 +138,7 @@ func main() {
136138
excludedDatabases,
137139
dsn,
138140
[]string{},
141+
collector.WithTimeout(*scrapeTimeout),
139142
)
140143
if err != nil {
141144
logger.Warn("Failed to create PostgresCollector", "err", err.Error())

cmd/postgres_exporter/namespace.go

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
package main
1515

1616
import (
17+
"context"
1718
"database/sql"
1819
"errors"
1920
"fmt"
@@ -26,7 +27,7 @@ import (
2627

2728
// Query within a namespace mapping and emit metrics. Returns fatal errors if
2829
// the scrape fails, and a slice of errors if they were non-fatal.
29-
func queryNamespaceMapping(server *Server, namespace string, mapping MetricMapNamespace) ([]prometheus.Metric, []error, error) {
30+
func queryNamespaceMapping(ctx context.Context, server *Server, namespace string, mapping MetricMapNamespace) ([]prometheus.Metric, []error, error) {
3031
// Check for a query override for this namespace
3132
query, found := server.queryOverrides[namespace]
3233

@@ -44,9 +45,9 @@ func queryNamespaceMapping(server *Server, namespace string, mapping MetricMapNa
4445
if !found {
4546
// I've no idea how to avoid this properly at the moment, but this is
4647
// an admin tool so you're not injecting SQL right?
47-
rows, err = server.db.Query(fmt.Sprintf("SELECT * FROM %s;", namespace)) // nolint: gas
48+
rows, err = server.db.QueryContext(ctx, fmt.Sprintf("SELECT * FROM %s;", namespace)) // nolint: gas
4849
} else {
49-
rows, err = server.db.Query(query)
50+
rows, err = server.db.QueryContext(ctx, query)
5051
}
5152
if err != nil {
5253
return []prometheus.Metric{}, []error{}, fmt.Errorf("Error running query on database %q: %s %v", server, namespace, err)
@@ -182,7 +183,7 @@ func queryNamespaceMapping(server *Server, namespace string, mapping MetricMapNa
182183

183184
// Iterate through all the namespace mappings in the exporter and run their
184185
// queries.
185-
func queryNamespaceMappings(ch chan<- prometheus.Metric, server *Server) map[string]error {
186+
func queryNamespaceMappings(ctx context.Context, ch chan<- prometheus.Metric, server *Server) map[string]error {
186187
// Return a map of namespace -> errors
187188
namespaceErrors := make(map[string]error)
188189

@@ -224,7 +225,7 @@ func queryNamespaceMappings(ch chan<- prometheus.Metric, server *Server) map[str
224225
var nonFatalErrors []error
225226
var err error
226227
if scrapeMetric {
227-
metrics, nonFatalErrors, err = queryNamespaceMapping(server, namespace, mapping)
228+
metrics, nonFatalErrors, err = queryNamespaceMapping(ctx, server, namespace, mapping)
228229
} else {
229230
metrics = cachedMetric.metrics
230231
}

cmd/postgres_exporter/pg_setting.go

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
package main
1515

1616
import (
17+
"context"
1718
"fmt"
1819
"math"
1920
"strconv"
@@ -30,7 +31,7 @@ var (
3031
)
3132

3233
// Query the pg_settings view containing runtime variables
33-
func querySettings(ch chan<- prometheus.Metric, server *Server) error {
34+
func querySettings(ctx context.Context, ch chan<- prometheus.Metric, server *Server) error {
3435
logger.Debug("Querying pg_setting view", "server", server)
3536

3637
// pg_settings docs: https://www.postgresql.org/docs/current/static/view-pg-settings.html
@@ -39,7 +40,7 @@ func querySettings(ch chan<- prometheus.Metric, server *Server) error {
3940
// types in normaliseUnit() below
4041
query := "SELECT name, setting, COALESCE(unit, ''), short_desc, vartype FROM pg_settings WHERE vartype IN ('bool', 'integer', 'real') AND name != 'sync_commit_cancel_wait';"
4142

42-
rows, err := server.db.Query(query)
43+
rows, err := server.db.QueryContext(ctx, query)
4344
if err != nil {
4445
return fmt.Errorf("Error running query on database %q: %s %v", server, namespace, err)
4546
}

cmd/postgres_exporter/postgres_exporter.go

Lines changed: 25 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
package main
1515

1616
import (
17+
"context"
1718
"crypto/sha256"
1819
"database/sql"
1920
"errors"
@@ -421,6 +422,7 @@ type Exporter struct {
421422
psqlUp prometheus.Gauge
422423
userQueriesError *prometheus.GaugeVec
423424
totalScrapes prometheus.Counter
425+
scrapeTimeout time.Duration
424426

425427
// servers are used to allow re-using the DB connection between scrapes.
426428
// servers contains metrics map and query overrides.
@@ -507,6 +509,13 @@ func parseConstLabels(s string) prometheus.Labels {
507509
return labels
508510
}
509511

512+
// WithTimeout configures the scrape timeout.
513+
func WithTimeout(timeout time.Duration) ExporterOpt {
514+
return func(e *Exporter) {
515+
e.scrapeTimeout = timeout
516+
}
517+
}
518+
510519
// NewExporter returns a new PostgreSQL exporter for the provided DSN.
511520
func NewExporter(dsn []string, opts ...ExporterOpt) *Exporter {
512521
e := &Exporter{
@@ -567,7 +576,16 @@ func (e *Exporter) Describe(ch chan<- *prometheus.Desc) {
567576

568577
// Collect implements prometheus.Collector.
569578
func (e *Exporter) Collect(ch chan<- prometheus.Metric) {
570-
e.scrape(ch)
579+
var ctx context.Context
580+
if e.scrapeTimeout > 0 {
581+
var cancel context.CancelFunc
582+
ctx, cancel = context.WithTimeout(context.Background(), e.scrapeTimeout)
583+
defer cancel()
584+
} else {
585+
ctx = context.Background()
586+
}
587+
588+
e.scrape(ctx, ch)
571589

572590
ch <- e.duration
573591
ch <- e.totalScrapes
@@ -583,9 +601,9 @@ func newDesc(subsystem, name, help string, labels prometheus.Labels) *prometheus
583601
)
584602
}
585603

586-
func checkPostgresVersion(db *sql.DB, server string) (semver.Version, string, error) {
604+
func checkPostgresVersion(ctx context.Context, db *sql.DB, server string) (semver.Version, string, error) {
587605
logger.Debug("Querying PostgreSQL version", "server", server)
588-
versionRow := db.QueryRow("SELECT version();")
606+
versionRow := db.QueryRowContext(ctx, "SELECT version();")
589607
var versionString string
590608
err := versionRow.Scan(&versionString)
591609
if err != nil {
@@ -600,8 +618,8 @@ func checkPostgresVersion(db *sql.DB, server string) (semver.Version, string, er
600618
}
601619

602620
// Check and update the exporters query maps if the version has changed.
603-
func (e *Exporter) checkMapVersions(ch chan<- prometheus.Metric, server *Server) error {
604-
semanticVersion, versionString, err := checkPostgresVersion(server.db, server.String())
621+
func (e *Exporter) checkMapVersions(ctx context.Context, ch chan<- prometheus.Metric, server *Server) error {
622+
semanticVersion, versionString, err := checkPostgresVersion(ctx, server.db, server.String())
605623
if err != nil {
606624
return fmt.Errorf("Error fetching version string on %q: %v", server, err)
607625
}
@@ -662,7 +680,7 @@ func (e *Exporter) checkMapVersions(ch chan<- prometheus.Metric, server *Server)
662680
return nil
663681
}
664682

665-
func (e *Exporter) scrape(ch chan<- prometheus.Metric) {
683+
func (e *Exporter) scrape(ctx context.Context, ch chan<- prometheus.Metric) {
666684
defer func(begun time.Time) {
667685
e.duration.Set(time.Since(begun).Seconds())
668686
}(time.Now())
@@ -678,7 +696,7 @@ func (e *Exporter) scrape(ch chan<- prometheus.Metric) {
678696
var connectionErrorsCount int
679697

680698
for _, dsn := range dsns {
681-
if err := e.scrapeDSN(ch, dsn); err != nil {
699+
if err := e.scrapeDSN(ctx, ch, dsn); err != nil {
682700
errorsCount++
683701

684702
logger.Error("error scraping dsn", "err", err, "dsn", loggableDSN(dsn))

cmd/postgres_exporter/probe.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,7 @@ func handleProbe(logger *slog.Logger, excludeDatabases []string) http.HandlerFun
7272
WithConstantLabels(*constantLabelsList),
7373
ExcludeDatabases(excludeDatabases),
7474
IncludeDatabases(*includeDatabases),
75+
WithTimeout(*scrapeTimeout),
7576
}
7677

7778
dsns := []string{dsn.GetConnectionString()}

cmd/postgres_exporter/server.go

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
package main
1515

1616
import (
17+
"context"
1718
"database/sql"
1819
"fmt"
1920
"sync"
@@ -110,20 +111,20 @@ func (s *Server) String() string {
110111
}
111112

112113
// Scrape loads metrics.
113-
func (s *Server) Scrape(ch chan<- prometheus.Metric, disableSettingsMetrics bool) error {
114+
func (s *Server) Scrape(ctx context.Context, ch chan<- prometheus.Metric, disableSettingsMetrics bool) error {
114115
s.mappingMtx.RLock()
115116
defer s.mappingMtx.RUnlock()
116117

117118
var err error
118119

119120
if !disableSettingsMetrics && s.master {
120-
if err = querySettings(ch, s); err != nil {
121+
if err = querySettings(ctx, ch, s); err != nil {
121122
err = fmt.Errorf("error retrieving settings: %s", err)
122123
return err
123124
}
124125
}
125126

126-
errMap := queryNamespaceMappings(ch, s)
127+
errMap := queryNamespaceMappings(ctx, ch, s)
127128
if len(errMap) == 0 {
128129
return nil
129130
}

collector/collector.go

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -89,14 +89,23 @@ func registerCollector(name string, isDefaultEnabled bool, createFunc func(colle
8989

9090
// PostgresCollector implements the prometheus.Collector interface.
9191
type PostgresCollector struct {
92-
Collectors map[string]Collector
93-
logger *slog.Logger
92+
Collectors map[string]Collector
93+
logger *slog.Logger
94+
scrapeTimeout time.Duration
9495

9596
instance *instance
9697
}
9798

9899
type Option func(*PostgresCollector) error
99100

101+
// WithTimeout configures the scrape timeout.
102+
func WithTimeout(timeout time.Duration) Option {
103+
return func(p *PostgresCollector) error {
104+
p.scrapeTimeout = timeout
105+
return nil
106+
}
107+
}
108+
100109
// NewPostgresCollector creates a new PostgresCollector.
101110
func NewPostgresCollector(logger *slog.Logger, excludeDatabases []string, dsn string, filters []string, options ...Option) (*PostgresCollector, error) {
102111
p := &PostgresCollector{
@@ -166,7 +175,14 @@ func (p PostgresCollector) Describe(ch chan<- *prometheus.Desc) {
166175

167176
// Collect implements the prometheus.Collector interface.
168177
func (p PostgresCollector) Collect(ch chan<- prometheus.Metric) {
169-
ctx := context.TODO()
178+
var ctx context.Context
179+
if p.scrapeTimeout > 0 {
180+
var cancel context.CancelFunc
181+
ctx, cancel = context.WithTimeout(context.Background(), p.scrapeTimeout)
182+
defer cancel()
183+
} else {
184+
ctx = context.Background()
185+
}
170186

171187
// copy the instance so that concurrent scrapes have independent instances
172188
inst := p.instance.copy()

0 commit comments

Comments
 (0)