From dc6cefa0f4debe09da557b6e8061f6e45d58bf3d Mon Sep 17 00:00:00 2001 From: sam Date: Wed, 13 May 2026 12:42:13 +0200 Subject: [PATCH] fix(monitoring): tighten grafana log panel regex MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace substring patterns like (?i)(error|exception|fail|fatal|critical|panic|warn|warning) with level-anchored \b(ERROR|FATAL|CRITICAL|Exception|Traceback)\b |\[(error|fatal|critical)\] across all per-service Error Logs panels, the Echo Application Error Logs panel, and the logfilter variable. Same shape for warning panels with WARN|WARNING. Underscore is a word character in RE2, so \berror\b does not match inside transcript_error / latest_error / etc. — drops the directus column noise without exclusion lists. Case-sensitive uppercase catches python logging level prefixes; bracketed lowercase catches nginx-style. Exception/Traceback covers python tracebacks. Real fix (promtail label extraction) supersedes this — separate follow-up. Refs: ECHO-815 Co-authored-by: Sameer --- .../configmap-grafana-dashboards.yaml | 28 +++++++++---------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/helm/monitoring/templates/configmap-grafana-dashboards.yaml b/helm/monitoring/templates/configmap-grafana-dashboards.yaml index eff458f..16f45d1 100644 --- a/helm/monitoring/templates/configmap-grafana-dashboards.yaml +++ b/helm/monitoring/templates/configmap-grafana-dashboards.yaml @@ -1036,7 +1036,7 @@ data: }, "targets": [ { - "expr": "{namespace=~\"echo-.*\"} |~ \"error|ERROR|Error|Exception|exception|EXCEPTION\"" + "expr": "{namespace=~\"echo-.*\"} |~ \"\\\\b(ERROR|FATAL|CRITICAL|Exception|Traceback)\\\\b|\\\\[(error|fatal|critical)\\\\]\"" } ], "options": { @@ -1233,13 +1233,13 @@ data: "fieldConfig": {"defaults": {"unit": "short"}}, "datasource": {"type": "loki", "uid": "loki"}, "targets": [ - {"refId": "A", "expr": "sum(count_over_time(({namespace=\"echo-prod\", component=\"api\"} |~ \"(?i)(error|exception|fail|fatal|critical|panic|warn|warning)\")[5m]))"} + {"refId": "A", "expr": "sum(count_over_time(({namespace=\"echo-prod\", component=\"api\"} |~ \"\\\\b(ERROR|FATAL|CRITICAL|Exception|Traceback)\\\\b|\\\\[(error|fatal|critical)\\\\]\")[5m]))"} ] }, {"type": "logs", "title": "API Error Logs", "id": 202, "gridPos": {"h": 8, "w": 18, "x": 6, "y": 36}, "datasource": {"type": "loki", "uid": "loki"}, "targets": [ - {"refId": "A", "expr": "{namespace=\"echo-prod\", component=\"api\"} |~ \"(?i)(error|exception|fail|fatal|critical|panic|warn|warning)\""} + {"refId": "A", "expr": "{namespace=\"echo-prod\", component=\"api\"} |~ \"\\\\b(ERROR|FATAL|CRITICAL|Exception|Traceback)\\\\b|\\\\[(error|fatal|critical)\\\\]\""} ], "options": {"dedupStrategy": "none", "enableLogDetails": true, "prettifyLogMessage": false, "showCommonLabels": false, "showLabels": false, "showTime": true, "sortOrder": "Descending", "wrapLogMessage": true} }, @@ -1247,13 +1247,13 @@ data: "fieldConfig": {"defaults": {"unit": "short"}}, "datasource": {"type": "loki", "uid": "loki"}, "targets": [ - {"refId": "A", "expr": "sum(count_over_time(({namespace=\"echo-prod\", component=\"directus\"} |~ \"(?i)(error|exception|fail|fatal|critical|panic|warn|warning)\")[5m]))"} + {"refId": "A", "expr": "sum(count_over_time(({namespace=\"echo-prod\", component=\"directus\"} |~ \"\\\\b(ERROR|FATAL|CRITICAL|Exception|Traceback)\\\\b|\\\\[(error|fatal|critical)\\\\]\")[5m]))"} ] }, {"type": "logs", "title": "Directus Error Logs", "id": 204, "gridPos": {"h": 8, "w": 18, "x": 6, "y": 44}, "datasource": {"type": "loki", "uid": "loki"}, "targets": [ - {"refId": "A", "expr": "{namespace=\"echo-prod\", component=\"directus\"} |~ \"(?i)(error|exception|fail|fatal|critical|panic|warn|warning)\""} + {"refId": "A", "expr": "{namespace=\"echo-prod\", component=\"directus\"} |~ \"\\\\b(ERROR|FATAL|CRITICAL|Exception|Traceback)\\\\b|\\\\[(error|fatal|critical)\\\\]\""} ], "options": {"dedupStrategy": "none", "enableLogDetails": true, "prettifyLogMessage": false, "showCommonLabels": false, "showLabels": false, "showTime": true, "sortOrder": "Descending", "wrapLogMessage": true} }, @@ -1261,13 +1261,13 @@ data: "fieldConfig": {"defaults": {"unit": "short"}}, "datasource": {"type": "loki", "uid": "loki"}, "targets": [ - {"refId": "A", "expr": "sum(count_over_time(({namespace=\"echo-prod\", component=\"worker\"} |~ \"(?i)(error|exception|fail|fatal|critical|panic|warn|warning)\")[5m]))"} + {"refId": "A", "expr": "sum(count_over_time(({namespace=\"echo-prod\", component=\"worker\"} |~ \"\\\\b(ERROR|FATAL|CRITICAL|Exception|Traceback)\\\\b|\\\\[(error|fatal|critical)\\\\]\")[5m]))"} ] }, {"type": "logs", "title": "Worker Error Logs", "id": 206, "gridPos": {"h": 8, "w": 18, "x": 6, "y": 52}, "datasource": {"type": "loki", "uid": "loki"}, "targets": [ - {"refId": "A", "expr": "{namespace=\"echo-prod\", component=\"worker\"} |~ \"(?i)(error|exception|fail|fatal|critical|panic|warn|warning)\""} + {"refId": "A", "expr": "{namespace=\"echo-prod\", component=\"worker\"} |~ \"\\\\b(ERROR|FATAL|CRITICAL|Exception|Traceback)\\\\b|\\\\[(error|fatal|critical)\\\\]\""} ], "options": {"dedupStrategy": "none", "enableLogDetails": true, "prettifyLogMessage": false, "showCommonLabels": false, "showLabels": false, "showTime": true, "sortOrder": "Descending", "wrapLogMessage": true} }, @@ -1275,13 +1275,13 @@ data: "fieldConfig": {"defaults": {"unit": "short"}}, "datasource": {"type": "loki", "uid": "loki"}, "targets": [ - {"refId": "A", "expr": "sum(count_over_time(({namespace=\"echo-prod\", component=\"worker-cpu\"} |~ \"(?i)(error|exception|fail|fatal|critical|panic|warn|warning)\")[5m]))"} + {"refId": "A", "expr": "sum(count_over_time(({namespace=\"echo-prod\", component=\"worker-cpu\"} |~ \"\\\\b(ERROR|FATAL|CRITICAL|Exception|Traceback)\\\\b|\\\\[(error|fatal|critical)\\\\]\")[5m]))"} ] }, {"type": "logs", "title": "Worker-CPU Error Logs", "id": 208, "gridPos": {"h": 8, "w": 18, "x": 6, "y": 60}, "datasource": {"type": "loki", "uid": "loki"}, "targets": [ - {"refId": "A", "expr": "{namespace=\"echo-prod\", component=\"worker-cpu\"} |~ \"(?i)(error|exception|fail|fatal|critical|panic|warn|warning)\""} + {"refId": "A", "expr": "{namespace=\"echo-prod\", component=\"worker-cpu\"} |~ \"\\\\b(ERROR|FATAL|CRITICAL|Exception|Traceback)\\\\b|\\\\[(error|fatal|critical)\\\\]\""} ], "options": {"dedupStrategy": "none", "enableLogDetails": true, "prettifyLogMessage": false, "showCommonLabels": false, "showLabels": false, "showTime": true, "sortOrder": "Descending", "wrapLogMessage": true} } @@ -1290,13 +1290,13 @@ data: "fieldConfig": {"defaults": {"unit": "short"}}, "datasource": {"type": "loki", "uid": "loki"}, "targets": [ - {"refId": "A", "expr": "sum(count_over_time(({namespace=\"echo-prod\", container=\"neo4j\"} |~ \"(?i)(error|exception|fail|fatal|critical|panic|warn|warning)\")[5m]))"} + {"refId": "A", "expr": "sum(count_over_time(({namespace=\"echo-prod\", container=\"neo4j\"} |~ \"\\\\b(ERROR|FATAL|CRITICAL|Exception|Traceback)\\\\b|\\\\[(error|fatal|critical)\\\\]\")[5m]))"} ] }, {"type": "logs", "title": "Neo4j Error Logs", "id": 210, "gridPos": {"h": 8, "w": 18, "x": 6, "y": 68}, "datasource": {"type": "loki", "uid": "loki"}, "targets": [ - {"refId": "A", "expr": "{namespace=\"echo-prod\", container=\"neo4j\"} |~ \"(?i)(error|exception|fail|fatal|critical|panic|warn|warning)\""} + {"refId": "A", "expr": "{namespace=\"echo-prod\", container=\"neo4j\"} |~ \"\\\\b(ERROR|FATAL|CRITICAL|Exception|Traceback)\\\\b|\\\\[(error|fatal|critical)\\\\]\""} ], "options": {"dedupStrategy": "none", "enableLogDetails": true, "prettifyLogMessage": false, "showCommonLabels": false, "showLabels": false, "showTime": true, "sortOrder": "Descending", "wrapLogMessage": true} } @@ -1805,12 +1805,12 @@ data: { "selected": false, "text": "Error Logs", - "value": "|~ \"(?i)error|exception|fail|fatal\"" + "value": "|~ \"\\\\b(ERROR|FATAL|CRITICAL|Exception|Traceback)\\\\b|\\\\[(error|fatal|critical)\\\\]\"" }, { "selected": false, "text": "Warning Logs", - "value": "|~ \"(?i)warn|warning\"" + "value": "|~ \"\\\\b(WARN|WARNING)\\\\b|\\\\[(warn|warning)\\\\]\"" }, { "selected": false, @@ -1818,7 +1818,7 @@ data: "value": "|~ \"worker-cpu\"" } ], - "query": "All Logs : ,Error Logs : |~ \"(?i)error|exception|fail|fatal\",Warning Logs : |~ \"(?i)warn|warning\",Worker CPU Logs : |~ \"worker-cpu\"", + "query": "All Logs : ,Error Logs : |~ \"\\\\b(ERROR|FATAL|CRITICAL|Exception|Traceback)\\\\b|\\\\[(error|fatal|critical)\\\\]\",Warning Logs : |~ \"\\\\b(WARN|WARNING)\\\\b|\\\\[(warn|warning)\\\\]\",Worker CPU Logs : |~ \"worker-cpu\"", "skipUrlSync": false, "type": "custom" },