diff --git a/Vagrantfile b/Vagrantfile index a204b68518..99eace0e39 100644 --- a/Vagrantfile +++ b/Vagrantfile @@ -11,12 +11,15 @@ Vagrant.configure("2") do |config| # host_ip "0.0.0.0" lets WSL2 reach the forwarded ports through the host. config.vm.network "forwarded_port", guest: 22, host: 2222, host_ip: "0.0.0.0", id: "ssh", auto_correct: true config.vm.network "forwarded_port", guest: 8000, host: 8000, host_ip: "0.0.0.0", id: "app", auto_correct: true + config.vm.network "forwarded_port", guest: 3000, host: 3000, host_ip: "0.0.0.0", id: "grafana", auto_correct: true + config.vm.network "forwarded_port", guest: 3100, host: 3100, host_ip: "0.0.0.0", id: "loki", auto_correct: true + config.vm.network "forwarded_port", guest: 9080, host: 9080, host_ip: "0.0.0.0", id: "promtail", auto_correct: true config.ssh.insert_key = true config.vm.provider "virtualbox" do |vb| - vb.name = "lab05-ansible" - vb.memory = 2048 + vb.name = "lab07-monitoring" + vb.memory = 3072 vb.cpus = 2 end end diff --git a/ansible/README.md b/ansible/README.md index 60c1ec3a54..82a1582690 100644 --- a/ansible/README.md +++ b/ansible/README.md @@ -49,3 +49,13 @@ Workflow file: `.github/workflows/ansible-deploy.yml`. For Vagrant/VirtualBox setups behind NAT, prefer a **self-hosted Linux runner** on the same machine where you run Ansible (for example, WSL2 Ubuntu). This avoids inbound SSH exposure and stays free. + +## Lab07 monitoring stack + +```bash +# Deploy Loki + Promtail + Grafana + app stack on the target VM +ansible-playbook -i inventory/hosts.ini playbooks/deploy-monitoring.yml +``` + +The monitoring role builds the Python app locally on the target VM, so you do not need to push a new +application image to Docker Hub for Lab07. 
diff --git a/ansible/group_vars/webservers.yml b/ansible/group_vars/webservers.yml index 310cc192e1..2d245cbfeb 100644 --- a/ansible/group_vars/webservers.yml +++ b/ansible/group_vars/webservers.yml @@ -13,4 +13,3 @@ app_internal_port: 8000 # Extra environment variables for the container (optional) app_env: {} -dockerhub_username: dorley174 diff --git a/ansible/playbooks/deploy-monitoring.yml b/ansible/playbooks/deploy-monitoring.yml new file mode 100644 index 0000000000..b36ea3a65a --- /dev/null +++ b/ansible/playbooks/deploy-monitoring.yml @@ -0,0 +1,9 @@ +--- +- name: Deploy monitoring stack + hosts: webservers + become: true + + roles: + - role: monitoring + tags: + - monitoring diff --git a/ansible/roles/monitoring/defaults/main.yml b/ansible/roles/monitoring/defaults/main.yml new file mode 100644 index 0000000000..76fb4745d6 --- /dev/null +++ b/ansible/roles/monitoring/defaults/main.yml @@ -0,0 +1,48 @@ +--- +monitoring_project_dir: /opt/monitoring +monitoring_compose_project_name: devops-monitoring + +monitoring_loki_version: "3.0.0" +monitoring_promtail_version: "3.0.0" +monitoring_grafana_version: "12.3.1" + +monitoring_loki_port: 3100 +monitoring_promtail_port: 9080 +monitoring_grafana_port: 3000 +monitoring_app_port: 8000 +monitoring_app_internal_port: 8000 + +monitoring_loki_retention_period: "168h" +monitoring_grafana_admin_user: admin +monitoring_grafana_admin_password: ChangeMe_Lab07! 
+monitoring_grafana_datasource_uid: loki +monitoring_grafana_datasource_name: Loki + +monitoring_app_service_name: app-python +monitoring_app_container_name: devops-python +monitoring_app_label: devops-python +monitoring_app_image: devops-info-service:lab07 +monitoring_app_source_dir: "{{ playbook_dir }}/../../app_python" +monitoring_app_source_files: + - .dockerignore + - Dockerfile + - requirements.txt + - app.py +monitoring_app_env: + HOST: "0.0.0.0" + PORT: "{{ monitoring_app_internal_port | string }}" + DEBUG: "false" + +monitoring_resource_limits: + loki: + limits: { cpus: '1.0', memory: 1G } + reservations: { cpus: '0.25', memory: 256M } + promtail: + limits: { cpus: '0.75', memory: 512M } + reservations: { cpus: '0.10', memory: 128M } + grafana: + limits: { cpus: '1.0', memory: 1G } + reservations: { cpus: '0.25', memory: 256M } + app_python: + limits: { cpus: '0.75', memory: 512M } + reservations: { cpus: '0.10', memory: 128M } diff --git a/ansible/roles/monitoring/meta/main.yml b/ansible/roles/monitoring/meta/main.yml new file mode 100644 index 0000000000..cb7d8e0460 --- /dev/null +++ b/ansible/roles/monitoring/meta/main.yml @@ -0,0 +1,3 @@ +--- +dependencies: + - role: docker diff --git a/ansible/roles/monitoring/tasks/main.yml b/ansible/roles/monitoring/tasks/main.yml new file mode 100644 index 0000000000..a30f1d9812 --- /dev/null +++ b/ansible/roles/monitoring/tasks/main.yml @@ -0,0 +1,117 @@ +--- +- name: Deploy monitoring stack + block: + - name: Create monitoring directory structure + ansible.builtin.file: + path: "{{ item }}" + state: directory + owner: root + group: root + mode: "0755" + loop: + - "{{ monitoring_project_dir }}" + - "{{ monitoring_project_dir }}/loki" + - "{{ monitoring_project_dir }}/promtail" + - "{{ monitoring_project_dir }}/grafana" + - "{{ monitoring_project_dir }}/grafana/provisioning" + - "{{ monitoring_project_dir }}/grafana/provisioning/datasources" + - "{{ monitoring_project_dir }}/grafana/provisioning/dashboards" + - "{{ 
monitoring_project_dir }}/grafana/dashboards" + - "{{ monitoring_project_dir }}/app-python" + + - name: Copy application source files for local image build + ansible.builtin.copy: + src: "{{ monitoring_app_source_dir }}/{{ item }}" + dest: "{{ monitoring_project_dir }}/app-python/{{ item }}" + owner: root + group: root + mode: "0644" + loop: "{{ monitoring_app_source_files }}" + + - name: Template monitoring environment file + ansible.builtin.template: + src: env.j2 + dest: "{{ monitoring_project_dir }}/.env" + owner: root + group: root + mode: "0600" + + - name: Template monitoring stack files + ansible.builtin.template: + src: "{{ item.src }}" + dest: "{{ monitoring_project_dir }}/{{ item.dest }}" + owner: root + group: root + mode: "0644" + loop: + - { src: 'docker-compose.yml.j2', dest: 'docker-compose.yml' } + - { src: 'loki-config.yml.j2', dest: 'loki/config.yml' } + - { src: 'promtail-config.yml.j2', dest: 'promtail/config.yml' } + - { src: 'grafana-datasource.yml.j2', dest: 'grafana/provisioning/datasources/loki.yml' } + - { src: 'grafana-dashboard-provider.yml.j2', dest: 'grafana/provisioning/dashboards/dashboard-provider.yml' } + - { src: 'grafana-dashboard.json.j2', dest: 'grafana/dashboards/lab07-logging.json' } + + - name: Deploy monitoring stack with Docker Compose v2 + community.docker.docker_compose_v2: + project_src: "{{ monitoring_project_dir }}" + project_name: "{{ monitoring_compose_project_name }}" + state: present + build: always + recreate: auto + register: monitoring_compose + + - name: Wait for Loki to become ready + ansible.builtin.uri: + url: "http://127.0.0.1:{{ monitoring_loki_port }}/ready" + method: GET + status_code: 200 + register: loki_ready + retries: 20 + delay: 3 + until: loki_ready.status == 200 + + - name: Wait for Grafana API health endpoint + ansible.builtin.uri: + url: "http://127.0.0.1:{{ monitoring_grafana_port }}/api/health" + method: GET + user: "{{ monitoring_grafana_admin_user }}" + password: "{{ 
monitoring_grafana_admin_password }}" + force_basic_auth: true + status_code: 200 + register: grafana_health + retries: 20 + delay: 3 + until: grafana_health.status == 200 + + - name: Wait for monitored application health endpoint + ansible.builtin.uri: + url: "http://127.0.0.1:{{ monitoring_app_port }}/health" + method: GET + status_code: 200 + register: monitoring_app_health + retries: 20 + delay: 3 + until: monitoring_app_health.status == 200 + + - name: Verify Loki data source was provisioned + ansible.builtin.uri: + url: "http://127.0.0.1:{{ monitoring_grafana_port }}/api/datasources/uid/{{ monitoring_grafana_datasource_uid }}" + method: GET + user: "{{ monitoring_grafana_admin_user }}" + password: "{{ monitoring_grafana_admin_password }}" + force_basic_auth: true + status_code: 200 + + rescue: + - name: Monitoring deployment failure hint + ansible.builtin.debug: + msg: | + Monitoring stack deployment failed. + Helpful follow-up commands on the target host: + cd {{ monitoring_project_dir }} + docker compose ps + docker compose logs --no-color --tail=200 + + tags: + - monitoring + - monitoring_deploy diff --git a/ansible/roles/monitoring/templates/docker-compose.yml.j2 b/ansible/roles/monitoring/templates/docker-compose.yml.j2 new file mode 100644 index 0000000000..52f249fbf9 --- /dev/null +++ b/ansible/roles/monitoring/templates/docker-compose.yml.j2 @@ -0,0 +1,151 @@ +name: {{ monitoring_compose_project_name }} + +services: + loki: + image: grafana/loki:{{ monitoring_loki_version }} + container_name: devops-loki + command: + - -config.file=/etc/loki/config.yml + ports: + - "{{ monitoring_loki_port }}:3100" + volumes: + - ./loki/config.yml:/etc/loki/config.yml:ro + - loki-data:/loki + networks: + - logging + labels: + logging: "promtail" + app: "devops-loki" + restart: unless-stopped + healthcheck: + test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://127.0.0.1:3100/ready"] + interval: 15s + timeout: 5s + retries: 10 + start_period: 20s + 
deploy: + resources: + limits: + cpus: "{{ monitoring_resource_limits.loki.limits.cpus }}" + memory: {{ monitoring_resource_limits.loki.limits.memory }} + reservations: + cpus: "{{ monitoring_resource_limits.loki.reservations.cpus }}" + memory: {{ monitoring_resource_limits.loki.reservations.memory }} + + promtail: + image: grafana/promtail:{{ monitoring_promtail_version }} + container_name: devops-promtail + command: + - -config.file=/etc/promtail/config.yml + ports: + - "{{ monitoring_promtail_port }}:9080" + volumes: + - ./promtail/config.yml:/etc/promtail/config.yml:ro + - promtail-data:/var/lib/promtail + - /var/lib/docker/containers:/var/lib/docker/containers:ro + - /var/run/docker.sock:/var/run/docker.sock:ro + networks: + - logging + labels: + logging: "promtail" + app: "devops-promtail" + restart: unless-stopped + depends_on: + loki: + condition: service_healthy + healthcheck: + test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://127.0.0.1:9080/ready"] + interval: 15s + timeout: 5s + retries: 10 + start_period: 20s + deploy: + resources: + limits: + cpus: "{{ monitoring_resource_limits.promtail.limits.cpus }}" + memory: {{ monitoring_resource_limits.promtail.limits.memory }} + reservations: + cpus: "{{ monitoring_resource_limits.promtail.reservations.cpus }}" + memory: {{ monitoring_resource_limits.promtail.reservations.memory }} + + grafana: + image: grafana/grafana:{{ monitoring_grafana_version }} + container_name: devops-grafana + env_file: + - .env + environment: + GF_SECURITY_ADMIN_USER: "${GRAFANA_ADMIN_USER:-{{ monitoring_grafana_admin_user }}}" + GF_SECURITY_ADMIN_PASSWORD: "${GRAFANA_ADMIN_PASSWORD}" + GF_AUTH_ANONYMOUS_ENABLED: "false" + GF_SECURITY_ALLOW_EMBEDDING: "false" + ports: + - "{{ monitoring_grafana_port }}:3000" + volumes: + - grafana-data:/var/lib/grafana + - ./grafana/provisioning:/etc/grafana/provisioning:ro + - ./grafana/dashboards:/etc/grafana/dashboards:ro + networks: + - logging + labels: + logging: "promtail" 
+ app: "devops-grafana" + restart: unless-stopped + depends_on: + loki: + condition: service_healthy + healthcheck: + test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://127.0.0.1:3000/api/health"] + interval: 15s + timeout: 5s + retries: 10 + start_period: 30s + deploy: + resources: + limits: + cpus: "{{ monitoring_resource_limits.grafana.limits.cpus }}" + memory: {{ monitoring_resource_limits.grafana.limits.memory }} + reservations: + cpus: "{{ monitoring_resource_limits.grafana.reservations.cpus }}" + memory: {{ monitoring_resource_limits.grafana.reservations.memory }} + + {{ monitoring_app_service_name }}: + build: + context: ./app-python + dockerfile: Dockerfile + image: {{ monitoring_app_image }} + container_name: {{ monitoring_app_container_name }} + environment: +{% for key, value in monitoring_app_env.items() %} + {{ key }}: {{ value | quote }} +{% endfor %} + ports: + - "{{ monitoring_app_port }}:{{ monitoring_app_internal_port }}" + networks: + - logging + labels: + logging: "promtail" + app: "{{ monitoring_app_label }}" + restart: unless-stopped + healthcheck: + test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://127.0.0.1:{{ monitoring_app_internal_port }}/health')"] + interval: 15s + timeout: 5s + retries: 10 + start_period: 20s + deploy: + resources: + limits: + cpus: "{{ monitoring_resource_limits.app_python.limits.cpus }}" + memory: {{ monitoring_resource_limits.app_python.limits.memory }} + reservations: + cpus: "{{ monitoring_resource_limits.app_python.reservations.cpus }}" + memory: {{ monitoring_resource_limits.app_python.reservations.memory }} + +networks: + logging: + driver: bridge + +volumes: + loki-data: + grafana-data: + promtail-data: diff --git a/ansible/roles/monitoring/templates/env.j2 b/ansible/roles/monitoring/templates/env.j2 new file mode 100644 index 0000000000..6afa6c1271 --- /dev/null +++ b/ansible/roles/monitoring/templates/env.j2 @@ -0,0 +1,6 @@ +GRAFANA_ADMIN_USER={{ 
monitoring_grafana_admin_user }} +GRAFANA_ADMIN_PASSWORD={{ monitoring_grafana_admin_password }} +GRAFANA_PORT={{ monitoring_grafana_port }} +LOKI_PORT={{ monitoring_loki_port }} +PROMTAIL_PORT={{ monitoring_promtail_port }} +APP_PORT={{ monitoring_app_port }} diff --git a/ansible/roles/monitoring/templates/grafana-dashboard-provider.yml.j2 b/ansible/roles/monitoring/templates/grafana-dashboard-provider.yml.j2 new file mode 100644 index 0000000000..e85b723a5c --- /dev/null +++ b/ansible/roles/monitoring/templates/grafana-dashboard-provider.yml.j2 @@ -0,0 +1,12 @@ +apiVersion: 1 + +providers: + - name: lab07-logging + orgId: 1 + folder: Lab07 Logging + type: file + disableDeletion: false + allowUiUpdates: true + updateIntervalSeconds: 30 + options: + path: /etc/grafana/dashboards diff --git a/ansible/roles/monitoring/templates/grafana-dashboard.json.j2 b/ansible/roles/monitoring/templates/grafana-dashboard.json.j2 new file mode 100644 index 0000000000..e2b3426b9d --- /dev/null +++ b/ansible/roles/monitoring/templates/grafana-dashboard.json.j2 @@ -0,0 +1,163 @@ +{ + "id": null, + "uid": "lab07-logging-overview", + "title": "Lab07 - Loki Logging Overview", + "tags": [ + "lab07", + "loki", + "logging" + ], + "timezone": "browser", + "schemaVersion": 39, + "version": 1, + "refresh": "10s", + "time": { + "from": "now-30m", + "to": "now" + }, + "panels": [ + { + "id": 1, + "type": "logs", + "title": "Recent Logs (all apps)", + "datasource": { + "type": "loki", + "uid": "loki" + }, + "gridPos": { + "h": 9, + "w": 24, + "x": 0, + "y": 0 + }, + "targets": [ + { + "refId": "A", + "expr": "{app=~\"devops-.*\"}", + "queryType": "range", + "datasource": { + "type": "loki", + "uid": "loki" + } + } + ], + "options": { + "showTime": true, + "showLabels": true, + "sortOrder": "Descending" + } + }, + { + "id": 2, + "type": "timeseries", + "title": "Request Rate by App", + "datasource": { + "type": "loki", + "uid": "loki" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 9 + 
}, + "targets": [ + { + "refId": "A", + "expr": "sum by (app) (rate({app=~\"devops-.*\"}[1m]))", + "queryType": "range", + "datasource": { + "type": "loki", + "uid": "loki" + } + } + ], + "fieldConfig": { + "defaults": { + "unit": "reqps" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + } + } + }, + { + "id": 3, + "type": "logs", + "title": "Error Logs Only", + "datasource": { + "type": "loki", + "uid": "loki" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 9 + }, + "targets": [ + { + "refId": "A", + "expr": "{app=~\"devops-.*\"} | json | level=\"ERROR\"", + "queryType": "range", + "datasource": { + "type": "loki", + "uid": "loki" + } + } + ], + "options": { + "showTime": true, + "showLabels": true, + "sortOrder": "Descending" + } + }, + { + "id": 4, + "type": "piechart", + "title": "Log Level Distribution (last 5m)", + "datasource": { + "type": "loki", + "uid": "loki" + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 17 + }, + "targets": [ + { + "refId": "A", + "expr": "sum by (level) (count_over_time({app=~\"devops-.*\"} | json [5m]))", + "queryType": "range", + "datasource": { + "type": "loki", + "uid": "loki" + } + } + ], + "options": { + "legend": { + "displayMode": "list", + "placement": "right" + }, + "pieType": "pie", + "reduceOptions": { + "values": false, + "calcs": [ + "lastNotNull" + ], + "fields": "" + } + } + } + ] +} \ No newline at end of file diff --git a/ansible/roles/monitoring/templates/grafana-datasource.yml.j2 b/ansible/roles/monitoring/templates/grafana-datasource.yml.j2 new file mode 100644 index 0000000000..f607bfadda --- /dev/null +++ b/ansible/roles/monitoring/templates/grafana-datasource.yml.j2 @@ -0,0 +1,19 @@ +apiVersion: 1 + +deleteDatasources: + - name: {{ monitoring_grafana_datasource_name }} + orgId: 1 + +prune: true + +datasources: + - name: {{ monitoring_grafana_datasource_name }} + uid: {{ 
monitoring_grafana_datasource_uid }} + type: loki + access: proxy + url: http://loki:3100 + isDefault: true + editable: false + jsonData: + maxLines: 1000 + timeout: 60 diff --git a/ansible/roles/monitoring/templates/loki-config.yml.j2 b/ansible/roles/monitoring/templates/loki-config.yml.j2 new file mode 100644 index 0000000000..dc0ada6071 --- /dev/null +++ b/ansible/roles/monitoring/templates/loki-config.yml.j2 @@ -0,0 +1,45 @@ +auth_enabled: false + +server: + http_listen_port: 3100 + grpc_listen_port: 9096 + +common: + path_prefix: /loki + replication_factor: 1 + ring: + kvstore: + store: inmemory + storage: + filesystem: + chunks_directory: /loki/chunks + rules_directory: /loki/rules + +schema_config: + configs: + - from: "2024-01-01" + store: tsdb + object_store: filesystem + schema: v13 + index: + prefix: index_ + period: 24h + +storage_config: + tsdb_shipper: + active_index_directory: /loki/tsdb-index + cache_location: /loki/tsdb-cache + +compactor: + working_directory: /loki/compactor + compaction_interval: 10m + retention_enabled: true + retention_delete_delay: 2h + delete_request_store: filesystem + +limits_config: + retention_period: {{ monitoring_loki_retention_period }} + volume_enabled: true + +analytics: + reporting_enabled: false diff --git a/ansible/roles/monitoring/templates/promtail-config.yml.j2 b/ansible/roles/monitoring/templates/promtail-config.yml.j2 new file mode 100644 index 0000000000..8db835ba55 --- /dev/null +++ b/ansible/roles/monitoring/templates/promtail-config.yml.j2 @@ -0,0 +1,30 @@ +server: + http_listen_port: 9080 + grpc_listen_port: 0 + +positions: + filename: /var/lib/promtail/positions.yml + +clients: + - url: http://loki:3100/loki/api/v1/push + +scrape_configs: + - job_name: docker + docker_sd_configs: + - host: unix:///var/run/docker.sock + refresh_interval: 5s + filters: + - name: label + values: ["logging=promtail"] + relabel_configs: + - source_labels: [__meta_docker_container_name] + regex: '/(.*)' + target_label: 
container + - source_labels: [__meta_docker_container_label_app] + target_label: app + - source_labels: [__meta_docker_container_log_stream] + target_label: stream + - target_label: job + replacement: docker + pipeline_stages: + - docker: {} diff --git a/app_python/Dockerfile b/app_python/Dockerfile index d8c40287b9..2ca732ad78 100644 --- a/app_python/Dockerfile +++ b/app_python/Dockerfile @@ -1,4 +1,3 @@ -# syntax=docker/dockerfile:1 # Production-oriented image for a small Flask app. # Pin a specific Python version for reproducible builds. diff --git a/app_python/app.py b/app_python/app.py index 931e2ee42b..ddca2ada41 100644 --- a/app_python/app.py +++ b/app_python/app.py @@ -1,212 +1,339 @@ -""" -DevOps Info Service -Main application module (Flask) - -Endpoints: -- GET / : service + system + runtime + request info -- GET /health : health check (for probes/monitoring) -""" - -from __future__ import annotations - -import logging -import os -import platform -import socket -from datetime import datetime, timezone -from typing import Any, Dict - -from flask import Flask, jsonify, request - -# ----------------------------------------------------------------------------- -# App & Config -# ----------------------------------------------------------------------------- - -app = Flask(__name__) - -HOST: str = os.getenv("HOST", "0.0.0.0") -PORT: int = int(os.getenv("PORT", "5000")) -DEBUG: bool = os.getenv("DEBUG", "False").strip().lower() == "true" - -SERVICE_NAME = "devops-info-service" -SERVICE_VERSION = "1.0.0" -SERVICE_DESCRIPTION = "DevOps course info service" -SERVICE_FRAMEWORK = "Flask" - -START_TIME_UTC = datetime.now(timezone.utc) - -# ----------------------------------------------------------------------------- -# Logging -# ----------------------------------------------------------------------------- - -logging.basicConfig( - level=logging.DEBUG if DEBUG else logging.INFO, - format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", -) -logger = 
logging.getLogger("devops-info-service") - - -@app.before_request -def log_request() -> None: - logger.debug("Request: %s %s", request.method, request.path) - - -@app.after_request -def add_headers(response): - # Helpful defaults - response.headers["Content-Type"] = "application/json; charset=utf-8" - return response - - -# ----------------------------------------------------------------------------- -# Helpers -# ----------------------------------------------------------------------------- - -def iso_utc_now_z() -> str: - """Return current UTC time in ISO format with 'Z' suffix.""" - return datetime.now(timezone.utc).isoformat(timespec="milliseconds").replace("+00:00", "Z") - - -def get_client_ip() -> str: - """ - Best-effort client IP resolution. - Prefers X-Forwarded-For (common behind reverse proxies). - """ - forwarded_for = request.headers.get("X-Forwarded-For", "") - if forwarded_for: - # "client, proxy1, proxy2" - return forwarded_for.split(",")[0].strip() - return request.remote_addr or "unknown" - - -def get_uptime() -> Dict[str, Any]: - """Calculate service uptime since START_TIME_UTC.""" - delta = datetime.now(timezone.utc) - START_TIME_UTC - seconds = int(delta.total_seconds()) - hours = seconds // 3600 - minutes = (seconds % 3600) // 60 - - hours_part = f"{hours} hour" + ("" if hours == 1 else "s") - minutes_part = f"{minutes} minute" + ("" if minutes == 1 else "s") - - return { - "seconds": seconds, - "human": f"{hours_part}, {minutes_part}", - } - - -def get_system_info() -> Dict[str, Any]: - """Collect system information using Python standard library.""" - return { - "hostname": socket.gethostname(), - "platform": platform.system(), - # platform.platform() gives a more descriptive string than version/release alone - "platform_version": platform.platform(), - "architecture": platform.machine(), - "cpu_count": os.cpu_count() or 0, - "python_version": platform.python_version(), - } - - -def build_endpoints() -> list[Dict[str, str]]: - return [ - 
{"path": "/", "method": "GET", "description": "Service information"}, - {"path": "/health", "method": "GET", "description": "Health check"}, - ] - - -# ----------------------------------------------------------------------------- -# Routes -# ----------------------------------------------------------------------------- - -@app.get("/") -def index(): - """Main endpoint - service and system information.""" - uptime = get_uptime() - - payload: Dict[str, Any] = { - "service": { - "name": SERVICE_NAME, - "version": SERVICE_VERSION, - "description": SERVICE_DESCRIPTION, - "framework": SERVICE_FRAMEWORK, - }, - "system": get_system_info(), - "runtime": { - "uptime_seconds": uptime["seconds"], - "uptime_human": uptime["human"], - "current_time": iso_utc_now_z(), - "timezone": "UTC", - }, - "request": { - "client_ip": get_client_ip(), - "user_agent": request.headers.get("User-Agent", "unknown"), - "method": request.method, - "path": request.path, - }, - "endpoints": build_endpoints(), - } - - return jsonify(payload), 200 - - -@app.get("/health") -def health(): - """Health check endpoint (for probes/monitoring).""" - uptime = get_uptime() - return jsonify( - { - "status": "healthy", - "timestamp": iso_utc_now_z(), - "uptime_seconds": uptime["seconds"], - } - ), 200 - - -# ----------------------------------------------------------------------------- -# Error Handlers -# ----------------------------------------------------------------------------- - -@app.errorhandler(404) -def not_found(_error): - return ( - jsonify( - { - "error": "Not Found", - "message": "Endpoint does not exist", - "timestamp": iso_utc_now_z(), - } - ), - 404, - ) - - -@app.errorhandler(500) -def internal_error(_error): - logger.exception("Unhandled error") - return ( - jsonify( - { - "error": "Internal Server Error", - "message": "An unexpected error occurred", - "timestamp": iso_utc_now_z(), - } - ), - 500, - ) - - -# ----------------------------------------------------------------------------- -# 
Entrypoint -# ----------------------------------------------------------------------------- - -def main() -> None: - logger.info("Starting %s v%s (%s)", SERVICE_NAME, SERVICE_VERSION, SERVICE_FRAMEWORK) - logger.info("Config: HOST=%s PORT=%s DEBUG=%s", HOST, PORT, DEBUG) - # NOTE: Flask built-in server is fine for lab/dev. - # For production you'd run via a WSGI server (e.g., gunicorn). - app.run(host=HOST, port=PORT, debug=DEBUG) - - -if __name__ == "__main__": - main() +""" +DevOps Info Service +Main application module (Flask) + +Endpoints: +- GET / : service + system + runtime + request info +- GET /health : health check (for probes/monitoring) +""" + +from __future__ import annotations + +import json +import logging +import os +import platform +import socket +import sys +import time +from datetime import datetime, timezone +from typing import Any, Dict + +from flask import Flask, g, jsonify, request + +# ----------------------------------------------------------------------------- +# App & Config +# ----------------------------------------------------------------------------- + +app = Flask(__name__) + +HOST: str = os.getenv("HOST", "0.0.0.0") +PORT: int = int(os.getenv("PORT", "5000")) +DEBUG: bool = os.getenv("DEBUG", "False").strip().lower() == "true" + +SERVICE_NAME = "devops-info-service" +SERVICE_VERSION = "1.1.0" +SERVICE_DESCRIPTION = "DevOps course info service" +SERVICE_FRAMEWORK = "Flask" + +START_TIME_UTC = datetime.now(timezone.utc) + + +# ----------------------------------------------------------------------------- +# Logging +# ----------------------------------------------------------------------------- + + +def iso_utc_now_z() -> str: + """Return current UTC time in ISO format with 'Z' suffix.""" + return datetime.now(timezone.utc).isoformat(timespec="milliseconds").replace("+00:00", "Z") + + +class JSONFormatter(logging.Formatter): + """Small JSON formatter for container-friendly structured logs.""" + + EXTRA_FIELDS = ( + "event", + "service", 
+ "version", + "method", + "path", + "status_code", + "client_ip", + "duration_ms", + "user_agent", + ) + + def format(self, record: logging.LogRecord) -> str: + payload: Dict[str, Any] = { + "timestamp": iso_utc_now_z(), + "level": record.levelname, + "logger": record.name, + "message": record.getMessage(), + } + + for field in self.EXTRA_FIELDS: + value = getattr(record, field, None) + if value is not None: + payload[field] = value + + if record.exc_info: + payload["exception"] = self.formatException(record.exc_info) + + return json.dumps(payload, ensure_ascii=False) + + + +def configure_logging() -> logging.Logger: + handler = logging.StreamHandler(sys.stdout) + handler.setFormatter(JSONFormatter()) + + root_logger = logging.getLogger() + root_logger.handlers.clear() + root_logger.addHandler(handler) + root_logger.setLevel(logging.DEBUG if DEBUG else logging.INFO) + + for logger_name in ("werkzeug", "gunicorn.error", "gunicorn.access"): + current_logger = logging.getLogger(logger_name) + current_logger.handlers.clear() + current_logger.propagate = True + current_logger.setLevel(logging.DEBUG if DEBUG else logging.INFO) + + return logging.getLogger(SERVICE_NAME) + + +logger = configure_logging() + + +# ----------------------------------------------------------------------------- +# Request hooks +# ----------------------------------------------------------------------------- + + +def get_client_ip() -> str: + """ + Best-effort client IP resolution. + Prefers X-Forwarded-For (common behind reverse proxies). 
+ """ + forwarded_for = request.headers.get("X-Forwarded-For", "") + if forwarded_for: + # "client, proxy1, proxy2" + return forwarded_for.split(",")[0].strip() + return request.remote_addr or "unknown" + + +@app.before_request +def log_request_started() -> None: + g.request_started_at = time.perf_counter() + logger.debug( + "request_started", + extra={ + "event": "request_started", + "service": SERVICE_NAME, + "version": SERVICE_VERSION, + "method": request.method, + "path": request.path, + "client_ip": get_client_ip(), + "user_agent": request.headers.get("User-Agent", "unknown"), + }, + ) + + +@app.after_request +def add_headers(response): + response.headers["Content-Type"] = "application/json; charset=utf-8" + + duration_ms = round((time.perf_counter() - getattr(g, "request_started_at", time.perf_counter())) * 1000, 2) + logger.info( + "request_completed", + extra={ + "event": "request_completed", + "service": SERVICE_NAME, + "version": SERVICE_VERSION, + "method": request.method, + "path": request.path, + "status_code": response.status_code, + "client_ip": get_client_ip(), + "duration_ms": duration_ms, + "user_agent": request.headers.get("User-Agent", "unknown"), + }, + ) + return response + + +# ----------------------------------------------------------------------------- +# Helpers +# ----------------------------------------------------------------------------- + + +def get_uptime() -> Dict[str, Any]: + """Calculate service uptime since START_TIME_UTC.""" + delta = datetime.now(timezone.utc) - START_TIME_UTC + seconds = int(delta.total_seconds()) + hours = seconds // 3600 + minutes = (seconds % 3600) // 60 + + hours_part = f"{hours} hour" + ("" if hours == 1 else "s") + minutes_part = f"{minutes} minute" + ("" if minutes == 1 else "s") + + return { + "seconds": seconds, + "human": f"{hours_part}, {minutes_part}", + } + + + +def get_system_info() -> Dict[str, Any]: + """Collect system information using Python standard library.""" + return { + "hostname": 
socket.gethostname(), + "platform": platform.system(), + "platform_version": platform.platform(), + "architecture": platform.machine(), + "cpu_count": os.cpu_count() or 0, + "python_version": platform.python_version(), + } + + + +def build_endpoints() -> list[Dict[str, str]]: + return [ + {"path": "/", "method": "GET", "description": "Service information"}, + {"path": "/health", "method": "GET", "description": "Health check"}, + ] + + +# ----------------------------------------------------------------------------- +# Routes +# ----------------------------------------------------------------------------- + +@app.get("/") +def index(): + """Main endpoint - service and system information.""" + uptime = get_uptime() + + payload: Dict[str, Any] = { + "service": { + "name": SERVICE_NAME, + "version": SERVICE_VERSION, + "description": SERVICE_DESCRIPTION, + "framework": SERVICE_FRAMEWORK, + }, + "system": get_system_info(), + "runtime": { + "uptime_seconds": uptime["seconds"], + "uptime_human": uptime["human"], + "current_time": iso_utc_now_z(), + "timezone": "UTC", + }, + "request": { + "client_ip": get_client_ip(), + "user_agent": request.headers.get("User-Agent", "unknown"), + "method": request.method, + "path": request.path, + }, + "endpoints": build_endpoints(), + } + + return jsonify(payload), 200 + + +@app.get("/health") +def health(): + """Health check endpoint (for probes/monitoring).""" + uptime = get_uptime() + return jsonify( + { + "status": "healthy", + "timestamp": iso_utc_now_z(), + "uptime_seconds": uptime["seconds"], + } + ), 200 + + +# ----------------------------------------------------------------------------- +# Error Handlers +# ----------------------------------------------------------------------------- + +@app.errorhandler(404) +def not_found(_error): + logger.warning( + "endpoint_not_found", + extra={ + "event": "endpoint_not_found", + "service": SERVICE_NAME, + "version": SERVICE_VERSION, + "method": request.method, + "path": request.path, + 
"status_code": 404, + "client_ip": get_client_ip(), + }, + ) + return ( + jsonify( + { + "error": "Not Found", + "message": "Endpoint does not exist", + "timestamp": iso_utc_now_z(), + } + ), + 404, + ) + + +@app.errorhandler(500) +def internal_error(_error): + logger.exception( + "unhandled_error", + extra={ + "event": "unhandled_error", + "service": SERVICE_NAME, + "version": SERVICE_VERSION, + "method": request.method, + "path": request.path, + "status_code": 500, + "client_ip": get_client_ip(), + }, + ) + return ( + jsonify( + { + "error": "Internal Server Error", + "message": "An unexpected error occurred", + "timestamp": iso_utc_now_z(), + } + ), + 500, + ) + + +# ----------------------------------------------------------------------------- +# Entrypoint +# ----------------------------------------------------------------------------- + + +def main() -> None: + logger.info( + "service_starting", + extra={ + "event": "service_starting", + "service": SERVICE_NAME, + "version": SERVICE_VERSION, + }, + ) + logger.info( + "runtime_configuration host=%s port=%s debug=%s", + HOST, + PORT, + DEBUG, + extra={ + "event": "runtime_configuration", + "service": SERVICE_NAME, + "version": SERVICE_VERSION, + }, + ) + app.run(host=HOST, port=PORT, debug=DEBUG) + + +if __name__ == "__main__": + main() diff --git a/app_python/tests/test_logging.py b/app_python/tests/test_logging.py new file mode 100644 index 0000000000..4e25aa23bc --- /dev/null +++ b/app_python/tests/test_logging.py @@ -0,0 +1,35 @@ +import json +import logging + +from app import JSONFormatter + + +def test_json_formatter_renders_expected_fields(): + formatter = JSONFormatter() + record = logging.LogRecord( + name="devops-info-service", + level=logging.INFO, + pathname=__file__, + lineno=10, + msg="request_completed", + args=(), + exc_info=None, + ) + record.event = "request_completed" + record.service = "devops-info-service" + record.version = "1.1.0" + record.method = "GET" + record.path = "/health" + 
record.status_code = 200 + record.client_ip = "127.0.0.1" + record.duration_ms = 3.14 + + payload = json.loads(formatter.format(record)) + + assert payload["message"] == "request_completed" + assert payload["level"] == "INFO" + assert payload["event"] == "request_completed" + assert payload["service"] == "devops-info-service" + assert payload["method"] == "GET" + assert payload["path"] == "/health" + assert payload["status_code"] == 200 diff --git a/monitoring/.env.example b/monitoring/.env.example new file mode 100644 index 0000000000..7b1de1cf95 --- /dev/null +++ b/monitoring/.env.example @@ -0,0 +1,10 @@ +# Copy this file to .env before starting the stack. +# PowerShell: Copy-Item .env.example .env +# Bash: cp .env.example .env + +GRAFANA_ADMIN_USER=admin +GRAFANA_ADMIN_PASSWORD=ChangeMe_Lab07! +GRAFANA_PORT=3000 +LOKI_PORT=3100 +PROMTAIL_PORT=9080 +APP_PORT=8000 diff --git a/monitoring/docker-compose.yml b/monitoring/docker-compose.yml new file mode 100644 index 0000000000..91209015d4 --- /dev/null +++ b/monitoring/docker-compose.yml @@ -0,0 +1,152 @@ +name: devops-monitoring + +services: + loki: + image: grafana/loki:3.0.0 + container_name: devops-loki + command: + - -config.file=/etc/loki/config.yml + ports: + - "${LOKI_PORT:-3100}:3100" + volumes: + - ./loki/config.yml:/etc/loki/config.yml:ro + - loki-data:/loki + networks: + - logging + labels: + logging: "promtail" + app: "devops-loki" + restart: unless-stopped + healthcheck: + test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://127.0.0.1:3100/ready"] + interval: 15s + timeout: 5s + retries: 10 + start_period: 20s + deploy: + resources: + limits: + cpus: "1.0" + memory: 1G + reservations: + cpus: "0.25" + memory: 256M + + promtail: + image: grafana/promtail:3.0.0 + container_name: devops-promtail + command: + - -config.file=/etc/promtail/config.yml + ports: + - "${PROMTAIL_PORT:-9080}:9080" + volumes: + - ./promtail/config.yml:/etc/promtail/config.yml:ro + - 
promtail-data:/var/lib/promtail + - /var/lib/docker/containers:/var/lib/docker/containers:ro + - /var/run/docker.sock:/var/run/docker.sock:ro + networks: + - logging + labels: + logging: "promtail" + app: "devops-promtail" + restart: unless-stopped + depends_on: + loki: + condition: service_healthy + healthcheck: + test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://127.0.0.1:9080/ready"] + interval: 15s + timeout: 5s + retries: 10 + start_period: 20s + deploy: + resources: + limits: + cpus: "0.75" + memory: 512M + reservations: + cpus: "0.10" + memory: 128M + + grafana: + image: grafana/grafana:12.3.1 + container_name: devops-grafana + env_file: + - .env + environment: + GF_SECURITY_ADMIN_USER: "${GRAFANA_ADMIN_USER:-admin}" + GF_SECURITY_ADMIN_PASSWORD: "${GRAFANA_ADMIN_PASSWORD}" + GF_AUTH_ANONYMOUS_ENABLED: "false" + GF_AUTH_ANONYMOUS_ORG_ROLE: Viewer + GF_SECURITY_ALLOW_EMBEDDING: "false" + ports: + - "${GRAFANA_PORT:-3000}:3000" + volumes: + - grafana-data:/var/lib/grafana + - ./grafana/provisioning:/etc/grafana/provisioning:ro + - ./grafana/dashboards:/etc/grafana/dashboards:ro + networks: + - logging + labels: + logging: "promtail" + app: "devops-grafana" + restart: unless-stopped + depends_on: + loki: + condition: service_healthy + healthcheck: + test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://127.0.0.1:3000/api/health"] + interval: 15s + timeout: 5s + retries: 10 + start_period: 30s + deploy: + resources: + limits: + cpus: "1.0" + memory: 1G + reservations: + cpus: "0.25" + memory: 256M + + app-python: + build: + context: ../app_python + dockerfile: Dockerfile + image: devops-info-service:lab07 + container_name: devops-python + environment: + HOST: "0.0.0.0" + PORT: "8000" + DEBUG: "false" + ports: + - "${APP_PORT:-8000}:8000" + networks: + - logging + labels: + logging: "promtail" + app: "devops-python" + restart: unless-stopped + healthcheck: + test: ["CMD", "python", "-c", "import urllib.request; 
urllib.request.urlopen('http://127.0.0.1:8000/health')"] + interval: 15s + timeout: 5s + retries: 10 + start_period: 20s + deploy: + resources: + limits: + cpus: "0.75" + memory: 512M + reservations: + cpus: "0.10" + memory: 128M + +networks: + logging: + driver: bridge + +volumes: + loki-data: + grafana-data: + promtail-data: diff --git a/monitoring/docs/LAB07.md b/monitoring/docs/LAB07.md new file mode 100644 index 0000000000..30563cd6ff --- /dev/null +++ b/monitoring/docs/LAB07.md @@ -0,0 +1,244 @@ +# Lab07 — Observability & Logging with Loki Stack + +## Summary + +This implementation follows the lab requirements and is adapted to a fully local, free workflow on Windows 11 using WSL2. The repository now contains: + +- a ready-to-run Loki + Promtail + Grafana stack in `monitoring/` +- structured JSON logging in the Python Flask app +- Grafana data source provisioning +- a prebuilt Grafana dashboard definition with four required panels +- an optional Ansible automation role (`roles/monitoring`) and playbook (`playbooks/deploy-monitoring.yml`) + +The final practical validation for this lab was completed locally with Docker Compose in WSL2. 
+ +## Architecture + +```text ++-------------------+ +-------------------+ +| Python App | | Grafana | +| JSON logs stdout | | dashboards/query | ++---------+---------+ +---------+---------+ + | | + | Docker json-file logs | HTTP API / UI + v | ++---------+---------+ | +| Promtail | push logs | +| docker_sd_configs +----------------> | ++---------+---------+ | + | v + | +-----+------+ + +--------------------> | Loki | + | TSDB + FS | + +------------+ +``` + +## Repository Structure + +```text +monitoring/ +├── .env.example +├── docker-compose.yml +├── docs/ +│ ├── LAB07.md +│ ├── LOCAL_VALIDATION_WINDOWS.md +│ └── screenshots/ +├── grafana/ +│ ├── dashboards/ +│ │ └── lab07-logging.json +│ └── provisioning/ +│ ├── dashboards/ +│ │ └── dashboard-provider.yml +│ └── datasources/ +│ └── loki.yml +├── loki/ +│ └── config.yml +└── promtail/ + └── config.yml +``` + +## Configuration Notes + +### Loki + +- single-node deployment +- `store: tsdb` +- `schema: v13` +- filesystem-backed local storage +- 7-day retention (`168h`) +- compactor enabled for retention processing + +### Promtail + +- Docker service discovery via `docker_sd_configs` +- filters only containers with label `logging=promtail` +- extracts container name into the `container` label +- keeps the application label from Docker metadata as `app` +- uses the `docker` pipeline stage so Docker JSON logs are parsed correctly + +### Grafana + +- anonymous access disabled +- admin password loaded from `.env` +- Loki data source provisioned automatically on startup +- dashboard provider loads a ready-made dashboard from disk + +## Application Logging + +The Flask app emits structured JSON logs to stdout using a custom `JSONFormatter`. 
+ +Logged events include: +- service startup +- request start +- request completion with method/path/status/client IP/duration +- 404 events +- unhandled 500 errors with exception details + +Example log line: + +```json +{"timestamp":"2026-03-13T01:11:28.923Z","level":"INFO","logger":"werkzeug","message":"127.0.0.1 - - [13/Mar/2026 01:11:28] \"GET /health HTTP/1.1\" 200 -"} +``` + +## Dashboard + +The provisioned dashboard contains four required panels: + +1. **Recent Logs (all apps)** — logs panel +2. **Request Rate by App** — time series +3. **Error Logs Only** — logs panel filtered by log level +4. **Log Level Distribution (last 5m)** — pie chart grouped by log level + +## Production Readiness Choices + +- resource constraints were added to all services +- Grafana anonymous authentication is disabled +- Grafana admin credentials are externalized through `.env` +- health checks were added for Loki, Promtail, Grafana, and the Python app +- Loki retention is set to 7 days to prevent unbounded log growth + +## Local Validation Workflow + +The stack was validated locally from WSL2 using Docker Compose. + +Commands used: + +```bash +cd monitoring +cp .env.example .env +# edit Grafana admin password in .env + +docker compose up -d --build +docker compose ps +curl http://127.0.0.1:3100/ready +curl http://127.0.0.1:3000 +curl http://127.0.0.1:8000/health +``` + +Traffic generation used for log validation: + +```bash +curl http://127.0.0.1:8000/ +curl http://127.0.0.1:8000/health +curl http://127.0.0.1:8000/ +curl http://127.0.0.1:8000/health +``` + +## Validation Results + +The monitoring stack was validated successfully in a fully local Windows 11 + WSL2 environment using Docker Compose. 
+ +Confirmed results: +- Grafana is reachable on `http://127.0.0.1:3000` +- Loki is reachable on `http://127.0.0.1:3100/ready` and returns `ready` +- the Flask application is reachable on `http://127.0.0.1:8000` +- the application health endpoint returns a successful JSON response +- container logs are ingested into Loki and visible in Grafana Explore + +The following LogQL query was confirmed to work locally: + +```logql +{job="docker", app="devops-python"} +``` + +A broader query also returned logs successfully: + +```logql +{job="docker"} +``` + +The returned logs included Flask access entries such as `GET /health` and `GET /`. + +## Recommended LogQL Queries + +Working queries confirmed locally: + +```logql +{job="docker"} +{job="docker", app="devops-python"} +{job="docker", container="devops-python"} +{job="docker", service_name="devops-python"} +``` + +Additional useful queries: + +```logql +{app="devops-python"} | json | level="INFO" +{app="devops-python"} | json | level="ERROR" +{app="devops-python"} | json | method="GET" +sum by (app) (rate({job="docker"}[1m])) +sum by (level) (count_over_time({job="docker"} | json [5m])) +``` + +## Ansible Bonus Automation + +The repository also includes bonus automation for the existing Lab06 Ansible layout: + +- role: `ansible/roles/monitoring` +- playbook: `ansible/playbooks/deploy-monitoring.yml` + +The role: +- depends on the existing `docker` role +- creates the monitoring directory structure on the VM +- copies the local Python app source to the target host +- templates Loki, Promtail, Grafana, and Docker Compose files +- builds the Python app locally on the target VM with Docker Compose v2 +- verifies Loki, Grafana, the app health endpoint, and the provisioned data source + +## Challenges and Practical Notes + +1. **Windows + WSL2 networking** + - forwarded ports from VirtualBox may work on Windows `127.0.0.1` but require the current Windows host IP from inside WSL +2. 
**Registry/network instability** + - in this environment, network timeouts to Docker infrastructure are possible + - the final successful validation was completed locally with Docker Compose in WSL2 +3. **Promtail lifecycle** + - the lab still requires Promtail, so this solution keeps it + - for long-term production use, plan a future migration path to Grafana Alloy + +## Captured Screenshots + +The following screenshots should be saved under `monitoring/docs/screenshots/`: + +- `grafana-login.png` — Grafana login page +- `grafana-datasource-loki.png` — Loki datasource with successful connection test +- `grafana-explore-all-logs.png` — Grafana Explore with the broad query `{job="docker"}` +- `grafana-explore-app-logs2.png` — Grafana Explore with application-specific logs using `{job="docker", app="devops-python"}` +- `grafana-dashboard.png` — provisioned dashboard with visible log/metrics panels +- `app-health-and-stack.png` — terminal validation with `docker compose ps`, Loki ready check, and app health check + +## Evidence Captured Locally + +The following evidence was captured after the stack started successfully: + +- Grafana login page +- Loki datasource connection test +- Grafana Explore with working LogQL queries +- provisioned dashboard +- successful Loki readiness check +- successful application health check +- running Docker Compose services + +## Conclusion + +Lab07 was completed successfully in a local Windows 11 + WSL2 environment. The Grafana + Loki + Promtail stack is running, the Flask application emits structured JSON logs, and those logs are visible in Grafana through Loki. Local validation confirmed successful service startup, log ingestion, and dashboard availability. 
\ No newline at end of file diff --git a/monitoring/docs/LOCAL_VALIDATION_WINDOWS.md b/monitoring/docs/LOCAL_VALIDATION_WINDOWS.md new file mode 100644 index 0000000000..7055bf9974 --- /dev/null +++ b/monitoring/docs/LOCAL_VALIDATION_WINDOWS.md @@ -0,0 +1,80 @@ +# Lab07 Local Validation on Windows 11 (WSL2 + Vagrant-friendly) + +This guide is tailored for the same environment used in earlier labs: +- Windows 11 host +- WSL2 Ubuntu for Linux commands +- VirtualBox + Vagrant for the free local VM option +- Russia / unstable access to Docker registries is possible, so retries are expected + +## Option A — Run the stack directly from WSL (if you already have a Linux Docker engine) + +1. Copy the example environment file: + ```bash + cd monitoring + cp .env.example .env + ``` +2. Edit `.env` and set a real Grafana password. +3. Start the stack: + ```bash + docker compose up -d --build + docker compose ps + ``` +4. Verify endpoints: + ```bash + curl http://127.0.0.1:3100/ready + curl http://127.0.0.1:9080/targets + curl http://127.0.0.1:8000/health + ``` +5. Open Grafana in a browser: `http://127.0.0.1:3000` + - user: `admin` + - password: value from `.env` + +## Option B — 100% free path using the Vagrant VM and Ansible bonus automation + +1. Reload Vagrant to apply the new forwarded ports: + ```powershell + vagrant reload + vagrant status + vagrant port + ``` +2. Make sure the VM is reachable from WSL. If forwarded ports do not work via `127.0.0.1` inside WSL, use the current Windows host IP from the `vEthernet (WSL...)` adapter. +3. From WSL, activate your Python environment and run: + ```bash + source ~/venvs/devops-lab/bin/activate + export ANSIBLE_CONFIG=/home/$USER/work/ansible/ansible.cfg + cd /path/to/repo/ansible + ansible-galaxy collection install -r requirements.yml + ansible-playbook -i inventory/hosts.ini playbooks/deploy-monitoring.yml + ``` +4. 
From Windows open: + - Grafana: `http://127.0.0.1:3000` + - Loki ready endpoint: `http://127.0.0.1:3100/ready` + - App health: `http://127.0.0.1:8000/health` +5. From WSL, if `127.0.0.1` forwarding does not work, use the current Windows host IP instead of localhost. + +## Generate example traffic + +### Bash / WSL +```bash +for i in {1..20}; do curl -s http://127.0.0.1:8000/ > /dev/null; done +for i in {1..20}; do curl -s http://127.0.0.1:8000/health > /dev/null; done +curl -s http://127.0.0.1:8000/does-not-exist > /dev/null +``` + +### PowerShell +```powershell +1..20 | ForEach-Object { Invoke-WebRequest -UseBasicParsing http://127.0.0.1:8000/ | Out-Null } +1..20 | ForEach-Object { Invoke-WebRequest -UseBasicParsing http://127.0.0.1:8000/health | Out-Null } +try { Invoke-WebRequest -UseBasicParsing http://127.0.0.1:8000/does-not-exist | Out-Null } catch {} +``` + +## Useful LogQL checks + +```logql +{job="docker"} +{app="devops-python"} +{app="devops-python"} | json | level="INFO" +{app="devops-python"} | json | level="ERROR" +sum by (app) (rate({app=~"devops-.*"}[1m])) +sum by (level) (count_over_time({app=~"devops-.*"} | json [5m])) +``` diff --git a/monitoring/docs/screenshots/README.md b/monitoring/docs/screenshots/README.md new file mode 100644 index 0000000000..25d39fa7a3 --- /dev/null +++ b/monitoring/docs/screenshots/README.md @@ -0,0 +1,6 @@ +Store your local proof screenshots here after you run the stack: + +- grafana-explore-all-logs.png +- grafana-dashboard.png +- grafana-login.png +- app-health-and-stack.png diff --git a/monitoring/docs/screenshots/app-health-and-stack.png b/monitoring/docs/screenshots/app-health-and-stack.png new file mode 100644 index 0000000000..32b9216f89 Binary files /dev/null and b/monitoring/docs/screenshots/app-health-and-stack.png differ diff --git a/monitoring/docs/screenshots/grafana-dashboard.png b/monitoring/docs/screenshots/grafana-dashboard.png new file mode 100644 index 0000000000..3d2537b045 Binary files /dev/null and 
b/monitoring/docs/screenshots/grafana-dashboard.png differ diff --git a/monitoring/docs/screenshots/grafana-datasource-loki.png b/monitoring/docs/screenshots/grafana-datasource-loki.png new file mode 100644 index 0000000000..afbb8afc4e Binary files /dev/null and b/monitoring/docs/screenshots/grafana-datasource-loki.png differ diff --git a/monitoring/docs/screenshots/grafana-explore-all-logs.png b/monitoring/docs/screenshots/grafana-explore-all-logs.png new file mode 100644 index 0000000000..1c163543fe Binary files /dev/null and b/monitoring/docs/screenshots/grafana-explore-all-logs.png differ diff --git a/monitoring/docs/screenshots/grafana-explore-app-logs2.png b/monitoring/docs/screenshots/grafana-explore-app-logs2.png new file mode 100644 index 0000000000..f6dd6243fd Binary files /dev/null and b/monitoring/docs/screenshots/grafana-explore-app-logs2.png differ diff --git a/monitoring/docs/screenshots/grafana-login.png b/monitoring/docs/screenshots/grafana-login.png new file mode 100644 index 0000000000..f277b7a146 Binary files /dev/null and b/monitoring/docs/screenshots/grafana-login.png differ diff --git a/monitoring/grafana/dashboards/lab07-logging.json b/monitoring/grafana/dashboards/lab07-logging.json new file mode 100644 index 0000000000..e2b3426b9d --- /dev/null +++ b/monitoring/grafana/dashboards/lab07-logging.json @@ -0,0 +1,163 @@ +{ + "id": null, + "uid": "lab07-logging-overview", + "title": "Lab07 - Loki Logging Overview", + "tags": [ + "lab07", + "loki", + "logging" + ], + "timezone": "browser", + "schemaVersion": 39, + "version": 1, + "refresh": "10s", + "time": { + "from": "now-30m", + "to": "now" + }, + "panels": [ + { + "id": 1, + "type": "logs", + "title": "Recent Logs (all apps)", + "datasource": { + "type": "loki", + "uid": "loki" + }, + "gridPos": { + "h": 9, + "w": 24, + "x": 0, + "y": 0 + }, + "targets": [ + { + "refId": "A", + "expr": "{app=~\"devops-.*\"}", + "queryType": "range", + "datasource": { + "type": "loki", + "uid": "loki" + } + } 
+ ], + "options": { + "showTime": true, + "showLabels": true, + "sortOrder": "Descending" + } + }, + { + "id": 2, + "type": "timeseries", + "title": "Request Rate by App", + "datasource": { + "type": "loki", + "uid": "loki" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 9 + }, + "targets": [ + { + "refId": "A", + "expr": "sum by (app) (rate({app=~\"devops-.*\"}[1m]))", + "queryType": "range", + "datasource": { + "type": "loki", + "uid": "loki" + } + } + ], + "fieldConfig": { + "defaults": { + "unit": "reqps" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + } + } + }, + { + "id": 3, + "type": "logs", + "title": "Error Logs Only", + "datasource": { + "type": "loki", + "uid": "loki" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 9 + }, + "targets": [ + { + "refId": "A", + "expr": "{app=~\"devops-.*\"} | json | level=\"ERROR\"", + "queryType": "range", + "datasource": { + "type": "loki", + "uid": "loki" + } + } + ], + "options": { + "showTime": true, + "showLabels": true, + "sortOrder": "Descending" + } + }, + { + "id": 4, + "type": "piechart", + "title": "Log Level Distribution (last 5m)", + "datasource": { + "type": "loki", + "uid": "loki" + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 17 + }, + "targets": [ + { + "refId": "A", + "expr": "sum by (level) (count_over_time({app=~\"devops-.*\"} | json [5m]))", + "queryType": "range", + "datasource": { + "type": "loki", + "uid": "loki" + } + } + ], + "options": { + "legend": { + "displayMode": "list", + "placement": "right" + }, + "pieType": "pie", + "reduceOptions": { + "values": false, + "calcs": [ + "lastNotNull" + ], + "fields": "" + } + } + } + ] +} \ No newline at end of file diff --git a/monitoring/grafana/provisioning/dashboards/dashboard-provider.yml b/monitoring/grafana/provisioning/dashboards/dashboard-provider.yml new file mode 100644 index 0000000000..e85b723a5c --- /dev/null +++ 
b/monitoring/grafana/provisioning/dashboards/dashboard-provider.yml @@ -0,0 +1,12 @@ +apiVersion: 1 + +providers: + - name: lab07-logging + orgId: 1 + folder: Lab07 Logging + type: file + disableDeletion: false + allowUiUpdates: true + updateIntervalSeconds: 30 + options: + path: /etc/grafana/dashboards diff --git a/monitoring/grafana/provisioning/datasources/loki.yml b/monitoring/grafana/provisioning/datasources/loki.yml new file mode 100644 index 0000000000..9c54f7c886 --- /dev/null +++ b/monitoring/grafana/provisioning/datasources/loki.yml @@ -0,0 +1,19 @@ +apiVersion: 1 + +deleteDatasources: + - name: Loki + orgId: 1 + +prune: true + +datasources: + - name: Loki + uid: loki + type: loki + access: proxy + url: http://loki:3100 + isDefault: true + editable: false + jsonData: + maxLines: 1000 + timeout: 60 diff --git a/monitoring/loki/config.yml b/monitoring/loki/config.yml new file mode 100644 index 0000000000..370b47e5fb --- /dev/null +++ b/monitoring/loki/config.yml @@ -0,0 +1,45 @@ +auth_enabled: false + +server: + http_listen_port: 3100 + grpc_listen_port: 9096 + +common: + path_prefix: /loki + replication_factor: 1 + ring: + kvstore: + store: inmemory + storage: + filesystem: + chunks_directory: /loki/chunks + rules_directory: /loki/rules + +schema_config: + configs: + - from: "2024-01-01" + store: tsdb + object_store: filesystem + schema: v13 + index: + prefix: index_ + period: 24h + +storage_config: + tsdb_shipper: + active_index_directory: /loki/tsdb-index + cache_location: /loki/tsdb-cache + +compactor: + working_directory: /loki/compactor + compaction_interval: 10m + retention_enabled: true + retention_delete_delay: 2h + delete_request_store: filesystem + +limits_config: + retention_period: 168h + volume_enabled: true + +analytics: + reporting_enabled: false diff --git a/monitoring/promtail/config.yml b/monitoring/promtail/config.yml new file mode 100644 index 0000000000..8db835ba55 --- /dev/null +++ b/monitoring/promtail/config.yml @@ -0,0 +1,30 @@ 
+server: + http_listen_port: 9080 + grpc_listen_port: 0 + +positions: + filename: /var/lib/promtail/positions.yml + +clients: + - url: http://loki:3100/loki/api/v1/push + +scrape_configs: + - job_name: docker + docker_sd_configs: + - host: unix:///var/run/docker.sock + refresh_interval: 5s + filters: + - name: label + values: ["logging=promtail"] + relabel_configs: + - source_labels: [__meta_docker_container_name] + regex: '/(.*)' + target_label: container + - source_labels: [__meta_docker_container_label_app] + target_label: app + - source_labels: [__meta_docker_container_log_stream] + target_label: stream + - target_label: job + replacement: docker + pipeline_stages: + - docker: {}