Merge pull request 'deploy grafana; refactor' (#1) from deploy_monitoring into main

Reviewed-on: homedungeon.loc/ansible#1
deploy grafana; refactor
2026-05-22 16:30:15 +00:00 · 2026-05-22 20:29:13 +04:00
17 changed files with 927 additions and 105 deletions
--- a/group_vars/all/vars.yml
+++ b/group_vars/all/vars.yml
@@ -2,6 +2,9 @@ base_users:
  - name: "reaper"
    groups: "sudo,docker"
    group: "reaper"
+    uid: "1001"
    ssh_key: "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIMudcsD9pqea/4Gv6PkwtzoDck9MHRkYKEc8hfTvtBAc"
    password: "$y$j9T$o1x4cPajXw.XUxo/UjlxD1$Wq4hI6kkuq4D5WR4jzGr12Easn0rO1E8TCNYcJGnZy6"
-    settings: ""
+    settings: ""
+
+docker_group: "reaper"
--- a/group_vars/monitoring/docker_compose_configs.yml
+++ b/group_vars/monitoring/docker_compose_configs.yml
@@ -0,0 +1,13 @@
+docker_compose_configs:  # config files per directory; names match the ./configs/<dir_name>/<file> bind mounts in the compose definition
+  - dir_name: "vmalert"
+    files:
+      - alerts-single-node.yml
+      - alerts-health.yml
+      - alerts-vmagent.yml
+      - alerts-vmalert.yml
+  - dir_name: "alertmanager"
+    files:
+      - alertmanager.yml
+  - dir_name: "prometheus"
+    files:
+      - prometheus.yml
--- a/group_vars/monitoring/docker_compose_definition.yml
+++ b/group_vars/monitoring/docker_compose_definition.yml
@@ -0,0 +1,102 @@
+docker_compose_definition:
+  services:
+    grafana:
+      container_name: grafana
+      image: grafana/grafana:13.0
+      restart: unless-stopped
+      user: "1001"  # run the container process as UID 1001
+      env_file: .env
+      depends_on:
+        - victoriametrics
+      ports:
+        - "3000:3000"
+      volumes:
+        - /monitoring/grafana:/var/lib/grafana  # Grafana state kept on the host
+
+    vmagent:  # scrapes the targets from prometheus.yml and forwards samples to victoriametrics
+      image: victoriametrics/vmagent:v1.143.0
+      depends_on:
+        - "victoriametrics"
+      user: "1001"
+      ports:
+        - "8429:8429"  # quoted: Compose recommends HOST:CONTAINER mappings as strings
+      volumes:
+        - /monitoring/vmagentdata:/vmagentdata
+        - ./configs/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml
+      command:
+        - "--promscrape.config=/etc/prometheus/prometheus.yml"
+        - "--remoteWrite.url=http://victoriametrics:8428/api/v1/write"
+        - "--remoteWrite.tmpDataPath=/vmagentdata/vmagent-remotewrite-data"
+      restart: always
+
+    victoriametrics:  # single-node metrics storage; vmagent writes to it, vmalert and grafana depend on it
+      image: victoriametrics/victoria-metrics:v1.143.0
+      container_name: victoriametrics
+      restart: unless-stopped
+      ports:
+        - "8428:8428"  # quoted: Compose recommends HOST:CONTAINER mappings as strings
+      user: "1001"
+      volumes:
+        - /monitoring/victoriametrics/victoria-metrics-data:/victoria-metrics-data
+      command:
+        - "--storageDataPath=/victoria-metrics-data"
+        - "--vmalert.proxyURL=http://vmalert:8880"
+
+    # vmalert executes alerting and recording rules
+    vmalert:
+      image: victoriametrics/vmalert:v1.143.0
+      depends_on:
+        - "victoriametrics"
+        - "alertmanager"
+      ports:
+        - "8880:8880"  # quoted: Compose recommends HOST:CONTAINER mappings as strings
+      user: "1001"
+      volumes:
+        - ./configs/vmalert/alerts-single-node.yml:/etc/alerts/alerts-single-node.yml
+        - ./configs/vmalert/alerts-health.yml:/etc/alerts/alerts-health.yml
+        - ./configs/vmalert/alerts-vmagent.yml:/etc/alerts/alerts-vmagent.yml
+        - ./configs/vmalert/alerts-vmalert.yml:/etc/alerts/alerts-vmalert.yml
+      command:
+        - "--datasource.url=http://victoriametrics:8428/"
+        - "--remoteRead.url=http://victoriametrics:8428/"
+        - "--remoteWrite.url=http://vmagent:8429/"
+        - "--notifier.url=http://alertmanager:9093/"
+        - "--rule=/etc/alerts/*.yml"
+        # display source of alerts in grafana
+        - "--external.url=http://127.0.0.1:3000"  # Grafana URL as reached from outside the container
+        - '{% raw %}--external.alert.source=explore?orgId=1&left={"datasource":"VictoriaMetrics","queries":[{"expr":{{.Expr|jsonEscape|queryEscape}},"refId":"A"}],"range":{"from":"{{ .ActiveAt.UnixMilli }}","to":"now"}}{% endraw %}'
+      restart: always
+
+    # alertmanager receives alerting notifications from vmalert
+    # and distributes them according to --config.file.
+    alertmanager:
+      image: prom/alertmanager:v0.28.1
+      volumes:
+        - ./configs/alertmanager/alertmanager.yml:/config/alertmanager.yml
+      command:
+        - "--config.file=/config/alertmanager.yml"
+      ports:
+        - "9093:9093"  # quoted: Compose recommends HOST:CONTAINER mappings as strings
+      restart: always
+
+    node_exporter:
+      container_name: node_exporter
+      image: quay.io/prometheus/node-exporter:v1.11.1-distroless
+      restart: unless-stopped
+      network_mode: host  # shares the host network namespace, so no ports mapping is declared
+      pid: host
+      volumes:
+        - "/:/host:ro,rslave"  # host root filesystem, read-only, mounted at /host
+      command:
+        - "--path.rootfs=/host"
+    cadvisor:  # per-container resource metrics, read from the read-only host mounts below
+      image: ghcr.io/google/cadvisor:v0.57.0
+      container_name: cadvisor
+      volumes:
+        - /:/rootfs:ro
+        - /var/run:/var/run:ro
+        - /var/lib/docker/:/var/lib/docker:ro
+        - /dev/disk/:/dev/disk:ro
+      ports:
+        - "8081:8080"  # host 8081 -> container 8080; peers on the compose network must use cadvisor:8080
+      restart: unless-stopped
--- a/group_vars/monitoring/vars.yml
+++ b/group_vars/monitoring/vars.yml
@@ -0,0 +1,23 @@
+# Docker Compose project settings (presumably consumed by the docker_compose role — confirm)
+docker_compose_project_name: monitoring
+docker_compose_data_dir: /compose/monitoring
+docker_compose_dotenv: |
+  GF_SERVER_ROOT_URL: /
+  GF_PLUGINS_PREINSTALL: victoriametrics-metrics-datasource, yesoreyeram-infinity-datasource
+
+scrape:  # scrape jobs rendered into prometheus.yml (one job_name per entry) and read by vmagent
+  - name: "node_exporter"
+    targets:
+      - nextcloud.homedungeon.loc:9100
+  - name: "cadvisor"
+    targets:
+      - cadvisor:8080  # container port on the compose network; 8081 is only the host-side mapping
+  - name: "vmagent"
+    targets:
+      - vmagent:8429
+  - name: "vmalert"
+    targets:
+      - vmalert:8880
+  - name: "victoriametrics"
+    targets:
+      - victoriametrics:8428
--- a/group_vars/nextcloudaio/vars.yml
+++ b/group_vars/nextcloudaio/vars.yml
@@ -0,0 +1,88 @@
+# compose
+docker_compose_project_name: nextcloudaio
+docker_compose_data_dir: /compose/nextcloudaio
+docker_compose_dotenv: |
+  # AIO_DISABLE_BACKUP_SECTION: false # Setting this to true allows to hide the backup section in the AIO interface. See https://github.com/nextcloud/all-in-one#how-to-disable-the-backup-section
+  APACHE_PORT: 11000 # Is needed when running behind a web server or reverse proxy (like Apache, Nginx, Caddy, Cloudflare Tunnel and else). See https://github.com/nextcloud/all-in-one/blob/main/reverse-proxy.md
+  APACHE_IP_BINDING: 127.0.0.1 # Should be set when running behind a web server or reverse proxy (like Apache, Nginx, Caddy, Cloudflare Tunnel and else) that is running on the same host. See https://github.com/nextcloud/all-in-one/blob/main/reverse-proxy.md
+  # APACHE_ADDITIONAL_NETWORK: frontend_net # (Optional) Connect the apache container to an additional docker network. Needed when behind a web server or reverse proxy (like Apache, Nginx, Caddy, Cloudflare Tunnel and else) running in a different docker network on same server. See https://github.com/nextcloud/all-in-one/blob/main/reverse-proxy.md
+  # BORG_RETENTION_POLICY: --keep-within=7d --keep-weekly=4 --keep-monthly=6 # Allows to adjust borgs retention policy. See https://github.com/nextcloud/all-in-one#how-to-adjust-borgs-retention-policy
+  # COLLABORA_SECCOMP_DISABLED: false # Setting this to true allows to disable Collabora's Seccomp feature. See https://github.com/nextcloud/all-in-one#how-to-disable-collaboras-seccomp-feature
+  # DOCKER_API_VERSION: 1.44 # You can adjust the internally used docker api version with this variable. ⚠️⚠️⚠️ Warning: please note that only the default api version (unset this variable) is supported and tested by the maintainers of Nextcloud AIO. So use this on your own risk and things might break without warning. See https://github.com/nextcloud/all-in-one#how-to-adjust-the-internally-used-docker-api-version
+  # FULLTEXTSEARCH_JAVA_OPTIONS: "-Xms1024M -Xmx1024M" # Allows to adjust the fulltextsearch java options. See https://github.com/nextcloud/all-in-one#how-to-adjust-the-fulltextsearch-java-options
+  NEXTCLOUD_DATADIR: /nextcloud # Allows to set the host directory for Nextcloud's datadir. ⚠️⚠️⚠️ Warning: do not set or adjust this value after the initial Nextcloud installation is done! See https://github.com/nextcloud/all-in-one#how-to-change-the-default-location-of-nextclouds-datadir
+  # NEXTCLOUD_MOUNT: /mnt/ # Allows the Nextcloud container to access the chosen directory on the host. See https://github.com/nextcloud/all-in-one#how-to-allow-the-nextcloud-container-to-access-directories-on-the-host
+  # NEXTCLOUD_UPLOAD_LIMIT: 16G # Can be adjusted if you need more. See https://github.com/nextcloud/all-in-one#how-to-adjust-the-upload-limit-for-nextcloud
+  # NEXTCLOUD_MAX_TIME: 3600 # Can be adjusted if you need more. See https://github.com/nextcloud/all-in-one#how-to-adjust-the-max-execution-time-for-nextcloud
+  # NEXTCLOUD_MEMORY_LIMIT: 512M # Can be adjusted if you need more. See https://github.com/nextcloud/all-in-one#how-to-adjust-the-php-memory-limit-for-nextcloud
+  # NEXTCLOUD_TRUSTED_CACERTS_DIR: /path/to/my/cacerts # CA certificates in this directory will be trusted by the OS of the nextcloud container (Useful e.g. for LDAPS) See https://github.com/nextcloud/all-in-one#how-to-trust-user-defined-certification-authorities-ca
+  # NEXTCLOUD_STARTUP_APPS: deck twofactor_totp tasks calendar contacts notes # Allows to modify the Nextcloud apps that are installed on starting AIO the first time. See https://github.com/nextcloud/all-in-one#how-to-change-the-nextcloud-apps-that-are-installed-on-the-first-startup
+  # NEXTCLOUD_ADDITIONAL_APKS: imagemagick # This allows to add additional packages to the Nextcloud container permanently. Default is imagemagick but can be overwritten by modifying this value. See https://github.com/nextcloud/all-in-one#how-to-add-os-packages-permanently-to-the-nextcloud-container
+  # NEXTCLOUD_ADDITIONAL_PHP_EXTENSIONS: imagick # This allows to add additional php extensions to the Nextcloud container permanently. Default is imagick but can be overwritten by modifying this value. See https://github.com/nextcloud/all-in-one#how-to-add-php-extensions-permanently-to-the-nextcloud-container
+  # NEXTCLOUD_ENABLE_NVIDIA_GPU: true # This allows to enable the NVIDIA runtime and GPU access for containers that profit from it. ⚠️⚠️⚠️ Warning: this only works if an NVIDIA gpu is installed on the server. See https://github.com/nextcloud/all-in-one#how-to-enable-hardware-acceleration-for-nextcloud.
+  # NEXTCLOUD_KEEP_DISABLED_APPS: false # Setting this to true will keep Nextcloud apps that are disabled in the AIO interface and not uninstall them if they should be installed. See https://github.com/nextcloud/all-in-one#how-to-keep-disabled-apps
+  SKIP_DOMAIN_VALIDATION: true # This should only be set to true if things are correctly configured. See https://github.com/nextcloud/all-in-one#how-to-skip-the-domain-validation
+  # TALK_PORT: 3478 # This allows to adjust the port that the talk container is using which is exposed on the host. See https://github.com/nextcloud/all-in-one#how-to-adjust-the-talk-port
+  # WATCHTOWER_DOCKER_SOCKET_PATH: /var/run/docker.sock # Needs to be specified if the docker socket on the host is not located in the default '/var/run/docker.sock'. Otherwise mastercontainer updates will fail. For macos it needs to be '/var/run/docker.sock'
+docker_compose_definition:
+  name: nextcloud-aio # Add the container to the same compose project like all the sibling containers are added to automatically.
+  services:
+    nextcloud-aio-mastercontainer:
+      image: ghcr.io/nextcloud-releases/all-in-one:latest # This is the container image used. You can switch to ghcr.io/nextcloud-releases/all-in-one:beta if you want to help testing new releases. See https://github.com/nextcloud/all-in-one#how-to-switch-the-channel
+      init: true # This setting makes sure that signals from main process inside the container are correctly forwarded to children. See https://docs.docker.com/reference/compose-file/services/#init
+      restart: always # This makes sure that the container starts always together with the host OS. See https://docs.docker.com/reference/compose-file/services/#restart
+      container_name: nextcloud-aio-mastercontainer # This line is not allowed to be changed as otherwise AIO will not work correctly
+      volumes:
+        - nextcloud_aio_mastercontainer:/mnt/docker-aio-config # This line is not allowed to be changed as otherwise the built-in backup solution will not work
+        - /var/run/docker.sock:/var/run/docker.sock:ro # May be changed on macOS, Windows or docker rootless. See the applicable documentation. If adjusting, don't forget to also set 'WATCHTOWER_DOCKER_SOCKET_PATH'!
+      # devices: ["/dev/dri"] # Uncomment to enable hardware acceleration. ⚠️⚠️⚠️ Warning: this only works if the '/dev/dri' device is present on the host! If it should not exist on your host, don't add this as otherwise the mastercontainer will fail to start! See https://github.com/nextcloud/all-in-one#how-to-enable-hardware-acceleration-for-nextcloud
+      network_mode: bridge # This adds the container to the same network as docker run would do. Comment this line and uncomment the line below and the networks section at the end of the file if you want to define a custom MTU size for the docker network
+      # networks: ["nextcloud-aio"]
+      ports:
+        # - "80:80" # Can be removed when running behind a web server or reverse proxy (like Apache, Nginx, Caddy, Cloudflare Tunnel and else). See https://github.com/nextcloud/all-in-one/blob/main/reverse-proxy.md
+        - "8080:8080" # This is the AIO interface, served via https and self-signed certificate. See https://github.com/nextcloud/all-in-one#explanation-of-used-ports
+        # - "8443:8443" # Can be removed when running behind a web server or reverse proxy (like Apache, Nginx, Caddy, Cloudflare Tunnel and else). See https://github.com/nextcloud/all-in-one/blob/main/reverse-proxy.md
+      # security_opt: ["label:disable"] # Is needed when using SELinux. See https://github.com/nextcloud/all-in-one#are-there-known-problems-when-selinux-is-enabled
+      env_file: ".env"
+
+    # Caddy reverse proxy in front of the AIO Apache container. See https://github.com/nextcloud/all-in-one/discussions/575
+    # Alternative if no domain is available: Tailscale. See https://github.com/nextcloud/all-in-one/discussions/6817
+    # Relies on APACHE_PORT: 11000 in the dotenv above; state lives in the ./caddy_* bind mounts below, not in named volumes.
+    # Further examples: https://github.com/nextcloud/all-in-one/discussions/588
+    caddy:
+      image: caddy:alpine
+      restart: always
+      container_name: caddy
+      volumes:
+        - ./caddy_certs:/certs
+        - ./caddy_config:/config
+        - ./caddy_data:/data
+        - ./caddy_sites:/srv
+      network_mode: "host"  # presumably so that localhost:11000 in the Caddyfile reaches the Apache port bound to 127.0.0.1 — confirm
+      configs:
+        - source: Caddyfile
+          target: /etc/caddy/Caddyfile
+  configs:  # inline Caddyfile, mounted into the caddy service at /etc/caddy/Caddyfile
+    Caddyfile:
+      content: |
+        {
+          local_certs
+        }
+        https://nextcloud.homedungeon.loc:443 {
+          reverse_proxy localhost:11000
+        }
+
+  volumes: # If you want to store the data on a different drive, see https://github.com/nextcloud/all-in-one#how-to-store-the-filesinstallation-on-a-separate-drive
+    nextcloud_aio_mastercontainer:
+      name: nextcloud_aio_mastercontainer # This line is not allowed to be changed as otherwise the built-in backup solution will not work
+    # caddy_certs:
+    # caddy_config:
+    # caddy_data:
+    # caddy_sites:
+
+  # # Adjust the MTU size of the docker network. See https://github.com/nextcloud/all-in-one#how-to-adjust-the-mtu-size-of-the-docker-network
+  # networks:
+  #   nextcloud-aio:
+  #     name: nextcloud-aio
+  #     driver_opts:
+  #       com.docker.network.driver.mtu: 1440
--- a/host_vars/cm3588.yml
+++ b/host_vars/cm3588.yml
@@ -37,99 +37,14 @@ btrfssubvol_subvolumes:
    mount_point: /var/log
  - name: snapshots
  - name: nextcloud
-    owner: reaper
+    owner: www-data
+    group: www-data
    mount_point: /nextcloud
  - name: docker
    mount_point: /docker
  - name: compose
    owner: reaper
    mount_point: /compose
-
-# compose
-docker_compose_project_name: nextcloudaio
-docker_compose_data_dir: /compose/nextcloudaio
-docker_compose_dotenv: |
-  # AIO_DISABLE_BACKUP_SECTION: false # Setting this to true allows to hide the backup section in the AIO interface. See https://github.com/nextcloud/all-in-one#how-to-disable-the-backup-section
-  APACHE_PORT: 11000 # Is needed when running behind a web server or reverse proxy (like Apache, Nginx, Caddy, Cloudflare Tunnel and else). See https://github.com/nextcloud/all-in-one/blob/main/reverse-proxy.md
-  APACHE_IP_BINDING: 127.0.0.1 # Should be set when running behind a web server or reverse proxy (like Apache, Nginx, Caddy, Cloudflare Tunnel and else) that is running on the same host. See https://github.com/nextcloud/all-in-one/blob/main/reverse-proxy.md
-  # APACHE_ADDITIONAL_NETWORK: frontend_net # (Optional) Connect the apache container to an additional docker network. Needed when behind a web server or reverse proxy (like Apache, Nginx, Caddy, Cloudflare Tunnel and else) running in a different docker network on same server. See https://github.com/nextcloud/all-in-one/blob/main/reverse-proxy.md
-  # BORG_RETENTION_POLICY: --keep-within=7d --keep-weekly=4 --keep-monthly=6 # Allows to adjust borgs retention policy. See https://github.com/nextcloud/all-in-one#how-to-adjust-borgs-retention-policy
-  # COLLABORA_SECCOMP_DISABLED: false # Setting this to true allows to disable Collabora's Seccomp feature. See https://github.com/nextcloud/all-in-one#how-to-disable-collaboras-seccomp-feature
-  # DOCKER_API_VERSION: 1.44 # You can adjust the internally used docker api version with this variable. ⚠️⚠️⚠️ Warning: please note that only the default api version (unset this variable) is supported and tested by the maintainers of Nextcloud AIO. So use this on your own risk and things might break without warning. See https://github.com/nextcloud/all-in-one#how-to-adjust-the-internally-used-docker-api-version
-  # FULLTEXTSEARCH_JAVA_OPTIONS: "-Xms1024M -Xmx1024M" # Allows to adjust the fulltextsearch java options. See https://github.com/nextcloud/all-in-one#how-to-adjust-the-fulltextsearch-java-options
-  NEXTCLOUD_DATADIR: /nextcloud # Allows to set the host directory for Nextcloud's datadir. ⚠️⚠️⚠️ Warning: do not set or adjust this value after the initial Nextcloud installation is done! See https://github.com/nextcloud/all-in-one#how-to-change-the-default-location-of-nextclouds-datadir
-  # NEXTCLOUD_MOUNT: /mnt/ # Allows the Nextcloud container to access the chosen directory on the host. See https://github.com/nextcloud/all-in-one#how-to-allow-the-nextcloud-container-to-access-directories-on-the-host
-  # NEXTCLOUD_UPLOAD_LIMIT: 16G # Can be adjusted if you need more. See https://github.com/nextcloud/all-in-one#how-to-adjust-the-upload-limit-for-nextcloud
-  # NEXTCLOUD_MAX_TIME: 3600 # Can be adjusted if you need more. See https://github.com/nextcloud/all-in-one#how-to-adjust-the-max-execution-time-for-nextcloud
-  # NEXTCLOUD_MEMORY_LIMIT: 512M # Can be adjusted if you need more. See https://github.com/nextcloud/all-in-one#how-to-adjust-the-php-memory-limit-for-nextcloud
-  # NEXTCLOUD_TRUSTED_CACERTS_DIR: /path/to/my/cacerts # CA certificates in this directory will be trusted by the OS of the nextcloud container (Useful e.g. for LDAPS) See https://github.com/nextcloud/all-in-one#how-to-trust-user-defined-certification-authorities-ca
-  # NEXTCLOUD_STARTUP_APPS: deck twofactor_totp tasks calendar contacts notes # Allows to modify the Nextcloud apps that are installed on starting AIO the first time. See https://github.com/nextcloud/all-in-one#how-to-change-the-nextcloud-apps-that-are-installed-on-the-first-startup
-  # NEXTCLOUD_ADDITIONAL_APKS: imagemagick # This allows to add additional packages to the Nextcloud container permanently. Default is imagemagick but can be overwritten by modifying this value. See https://github.com/nextcloud/all-in-one#how-to-add-os-packages-permanently-to-the-nextcloud-container
-  # NEXTCLOUD_ADDITIONAL_PHP_EXTENSIONS: imagick # This allows to add additional php extensions to the Nextcloud container permanently. Default is imagick but can be overwritten by modifying this value. See https://github.com/nextcloud/all-in-one#how-to-add-php-extensions-permanently-to-the-nextcloud-container
-  # NEXTCLOUD_ENABLE_NVIDIA_GPU: true # This allows to enable the NVIDIA runtime and GPU access for containers that profit from it. ⚠️⚠️⚠️ Warning: this only works if an NVIDIA gpu is installed on the server. See https://github.com/nextcloud/all-in-one#how-to-enable-hardware-acceleration-for-nextcloud.
-  # NEXTCLOUD_KEEP_DISABLED_APPS: false # Setting this to true will keep Nextcloud apps that are disabled in the AIO interface and not uninstall them if they should be installed. See https://github.com/nextcloud/all-in-one#how-to-keep-disabled-apps
-  SKIP_DOMAIN_VALIDATION: true # This should only be set to true if things are correctly configured. See https://github.com/nextcloud/all-in-one#how-to-skip-the-domain-validation
-  # TALK_PORT: 3478 # This allows to adjust the port that the talk container is using which is exposed on the host. See https://github.com/nextcloud/all-in-one#how-to-adjust-the-talk-port
-  # WATCHTOWER_DOCKER_SOCKET_PATH: /var/run/docker.sock # Needs to be specified if the docker socket on the host is not located in the default '/var/run/docker.sock'. Otherwise mastercontainer updates will fail. For macos it needs to be '/var/run/docker.sock'
-docker_compose_definition:
-  name: nextcloud-aio # Add the container to the same compose project like all the sibling containers are added to automatically.
-  services:
-    nextcloud-aio-mastercontainer:
-      image: ghcr.io/nextcloud-releases/all-in-one:latest # This is the container image used. You can switch to ghcr.io/nextcloud-releases/all-in-one:beta if you want to help testing new releases. See https://github.com/nextcloud/all-in-one#how-to-switch-the-channel
-      init: true # This setting makes sure that signals from main process inside the container are correctly forwarded to children. See https://docs.docker.com/reference/compose-file/services/#init
-      restart: always # This makes sure that the container starts always together with the host OS. See https://docs.docker.com/reference/compose-file/services/#restart
-      container_name: nextcloud-aio-mastercontainer # This line is not allowed to be changed as otherwise AIO will not work correctly
-      volumes:
-        - nextcloud_aio_mastercontainer:/mnt/docker-aio-config # This line is not allowed to be changed as otherwise the built-in backup solution will not work
-        - /var/run/docker.sock:/var/run/docker.sock:ro # May be changed on macOS, Windows or docker rootless. See the applicable documentation. If adjusting, don't forget to also set 'WATCHTOWER_DOCKER_SOCKET_PATH'!
-      # devices: ["/dev/dri"] # Uncomment to enable hardware acceleration. ⚠️⚠️⚠️ Warning: this only works if the '/dev/dri' device is present on the host! If it should not exist on your host, don't add this as otherwise the mastercontainer will fail to start! See https://github.com/nextcloud/all-in-one#how-to-enable-hardware-acceleration-for-nextcloud
-      network_mode: bridge # This adds the container to the same network as docker run would do. Comment this line and uncomment the line below and the networks section at the end of the file if you want to define a custom MTU size for the docker network
-      # networks: ["nextcloud-aio"]
-      ports:
-        # - "80:80" # Can be removed when running behind a web server or reverse proxy (like Apache, Nginx, Caddy, Cloudflare Tunnel and else). See https://github.com/nextcloud/all-in-one/blob/main/reverse-proxy.md
-        - "8080:8080" # This is the AIO interface, served via https and self-signed certificate. See https://github.com/nextcloud/all-in-one#explanation-of-used-ports
-        # - "8443:8443" # Can be removed when running behind a web server or reverse proxy (like Apache, Nginx, Caddy, Cloudflare Tunnel and else). See https://github.com/nextcloud/all-in-one/blob/main/reverse-proxy.md
-      # security_opt: ["label:disable"] # Is needed when using SELinux. See https://github.com/nextcloud/all-in-one#are-there-known-problems-when-selinux-is-enabled
-      env_file: ".env"
-
-  #   # Optional: Caddy reverse proxy. See https://github.com/nextcloud/all-in-one/discussions/575
-  #   # Alternatively, use Tailscale if you don't have a domain yet. See https://github.com/nextcloud/all-in-one/discussions/6817
-  #   # Hint: You need to uncomment APACHE_PORT: 11000 above, adjust cloud.example.com to your domain and uncomment the necessary docker volumes at the bottom of this file in order to make it work
-  #   # You can find further examples here: https://github.com/nextcloud/all-in-one/discussions/588
-    caddy:
-      image: caddy:alpine
-      restart: always
-      container_name: caddy
-      volumes:
-        - ./caddy_certs:/certs
-        - ./caddy_config:/config
-        - ./caddy_data:/data
-        - ./caddy_sites:/srv
-      network_mode: "host"
-      configs:
-        - source: Caddyfile
-          target: /etc/caddy/Caddyfile
-  configs:
-    Caddyfile:
-      content: |
-        # Adjust cloud.example.com to your domain below
-        https://nextcloud.homedungeon.loc:443 {
-          tls /certs/cert.pem /certs/key.pem
-          reverse_proxy localhost:11000
-        }
-
-  volumes: # If you want to store the data on a different drive, see https://github.com/nextcloud/all-in-one#how-to-store-the-filesinstallation-on-a-separate-drive
-    nextcloud_aio_mastercontainer:
-      name: nextcloud_aio_mastercontainer # This line is not allowed to be changed as otherwise the built-in backup solution will not work
-    # caddy_certs:
-    # caddy_config:
-    # caddy_data:
-    # caddy_sites:
-
-  # # Adjust the MTU size of the docker network. See https://github.com/nextcloud/all-in-one#how-to-adjust-the-mtu-size-of-the-docker-network
-  # networks:
-  #   nextcloud-aio:
-  #     name: nextcloud-aio
-  #     driver_opts:
-  #       com.docker.network.driver.mtu: 1440
-docker_group: "reaper"
+  - name: monitoring
+    owner: reaper
+    mount_point: /monitoring
--- a/8
+++ b/8
@@ -1,2 +1,8 @@
 [storage]
-cm3588
+cm3588
+
+[nextcloudaio]
+cm3588
+
+[monitoring]
+monitoring.homedungeon.loc
--- a/monitoring.yml
+++ b/monitoring.yml
@@ -0,0 +1,4 @@
+- name: "Deploy monitoring stack docker"
+  hosts: monitoring  # the [monitoring] inventory group
+  roles:
+    - role: docker_compose  # same role the nextcloudaio playbook applies
--- a/nextcloudaio.yml
+++ b/nextcloudaio.yml
@@ -1,4 +1,4 @@
 - name: "Deploy nextcloudaio docker"
-  hosts: storage
+  hosts: nextcloudaio
  roles:
    - role: docker_compose
--- a/prepare.yml
+++ b/prepare.yml
@@ -1,9 +1,9 @@
 - hosts: storage
-  vars:
-    ansible_ssh_user: pi
-    ansible_sudo_pass: pi
-    ansible_ssh_pass: pi
-    ansible_host: "{{ ip_before_reboot }}"
+  # vars:
+  #   ansible_ssh_user: pi
+  #   ansible_sudo_pass: pi
+  #   ansible_ssh_pass: pi
+  #   ansible_host: "{{ ip_before_reboot }}"
  roles:
    - role: dns
      tags: dns
@@ -46,7 +46,7 @@
      file:
        path: /etc/systemd/system/logrotate.timer.d
        state: directory
-        mode: '0755'
+        mode: "0755"
        owner: root
        group: root
    - name: Copy logrotate.timer
@@ -56,7 +56,7 @@
          [Timer]
          OnCalendar=hourly
          AccuracySec=1m
-        mode: '0755'
+        mode: "0755"
        owner: root
        group: root
    - name: Restart logrotate.timer
@@ -66,7 +66,7 @@

 - name: "Mount new /var/log and move logs"
  hosts: storage
-  tags: [ mount, logs ]
+  tags: [mount, logs]
  roles:
    - role: lingling9000.btrfssubvol
      tags: btrfssubvol
@@ -80,15 +80,14 @@

    - name: "Move logs to new directory"
      when:
-       - has_var_log_mount
-       - _var_log_stat.stat.isdir is not defined or not _var_log_stat.stat.isdir
+        - has_var_log_mount
+        - _var_log_stat.stat.isdir is not defined or not _var_log_stat.stat.isdir
      block:
        - name: "Mount original rootfs and move files"
          shell: "mkdir /tmp/mnt-rootfs && mount --bind / /tmp/mnt-rootfs && mv /tmp/mnt-rootfs/var/log/* /var/log/ && umount /tmp/mnt-rootfs && rmdir /tmp/mnt-rootfs"
      rescue:
        - name: "Unmount rootfs"
          shell: "umount /tmp/mnt-rootfs && rmdir /tmp/mnt-rootfs"
-
 # - name: install victoria exporters
 #   hosts: storage
 #   gather_facts: yes
@@ -96,4 +95,4 @@
 #   - role: cadvisor
 #     tags:
 #       - monitoring
-#       - cadvisor
+#       - cadvisor
--- a/ssh.cfg
+++ b/ssh.cfg
@@ -1,3 +1,6 @@
 Host cm3588
    User reaper
    Hostname 192.168.8.2
+
+Host monitoring.homedungeon.loc
+    User reaper
--- a/templates/monitoring/alertmanager/alertmanager.yml.j2
+++ b/templates/monitoring/alertmanager/alertmanager.yml.j2
@@ -0,0 +1,5 @@
+route:
+  receiver: blackhole  # default (and only) route
+
+receivers:
+  - name: blackhole  # declares no notifier configs, so nothing is sent for alerts routed here
--- a/templates/monitoring/prometheus/prometheus.yml.j2
+++ b/templates/monitoring/prometheus/prometheus.yml.j2
@@ -0,0 +1,12 @@
+global:
+  scrape_interval: 15s
+  scrape_timeout: 10s
+
+scrape_configs:  # one job per entry of the `scrape` variable, one static_configs item per target
+{% for service in scrape %}
+  - job_name: "{{ service.name }}"
+    static_configs:
+{% for target in service.targets %}
+    - targets: ["{{ target }}"]
+{% endfor %}
+{% endfor %}
--- a/templates/monitoring/vmalert/alerts-health.yml.j2
+++ b/templates/monitoring/vmalert/alerts-health.yml.j2
@@ -0,0 +1,128 @@
+{% raw %}
+# File contains default list of alerts for various VM components.
+# The following alerts are recommended for use for any VM installation.
+# The alerts below are just recommendations and may require some updates
+# and threshold calibration according to every specific setup.
+groups:
+  - name: vm-health
+    # note the `job` filter and update accordingly to your setup
+    rules:
+      - alert: TooManyRestarts
+        expr: changes(process_start_time_seconds{job=~".*(victoriametrics|vmselect|vminsert|vmstorage|vmagent|vmalert|vmsingle|vmalertmanager|vmauth).*"}[15m]) > 2
+        labels:
+          severity: critical
+        annotations:
+          summary: "{{ $labels.job }} too many restarts (instance {{ $labels.instance }})"
+          description: >
+            Job {{ $labels.job }} (instance {{ $labels.instance }}) has restarted more than twice in the last 15 minutes.
+            It might be crashlooping.
+
+      - alert: ServiceDown
+        expr: up{job=~".*(victoriametrics|vmselect|vminsert|vmstorage|vmagent|vmalert|vmsingle|vmalertmanager|vmauth).*"} == 0
+        for: 2m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Service {{ $labels.job }} is down on {{ $labels.instance }}"
+          description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 2 minutes."
+
+      - alert: ProcessNearFDLimits
+        expr: (process_max_fds - process_open_fds) < 100
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: 'Number of free file descriptors is less than 100 for "{{ $labels.job }}"("{{ $labels.instance }}") for the last 5m'
+          description: |
+            Exhausting OS file descriptors limit can cause severe degradation of the process.
+            Consider to increase the limit as fast as possible.
+
+      - alert: TooHighMemoryUsage
+        expr: (min_over_time(process_resident_memory_anon_bytes[10m]) / vm_available_memory_bytes) > 0.8
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: 'It is more than 80% of memory used by "{{ $labels.job }}"("{{ $labels.instance }}")'
+          description: |
+            Too high memory usage may result into multiple issues such as OOMs or degraded performance.
+            Consider to either increase available memory or decrease the load on the process.
+
+      - alert: TooHighCPUUsage
+        expr: rate(process_cpu_seconds_total[5m]) / process_cpu_cores_available > 0.9
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: 'More than 90% of CPU is used by "{{ $labels.job }}"("{{ $labels.instance }}") during the last 5m'
+          description: >
+            Too high CPU usage may be a sign of insufficient resources and make process unstable.
+            Consider to either increase available CPU resources or decrease the load on the process.
+
+      - alert: TooHighGoroutineSchedulingLatency
+        expr: histogram_quantile(0.99, sum(rate(go_sched_latencies_seconds_bucket{job=~".*(victoriametrics|vmselect|vminsert|vmstorage|vmagent|vmalert|vmsingle|vmalertmanager|vmauth).*"}[5m])) by (le, job, instance)) > 0.1
+        for: 15m
+        labels:
+          severity: critical
+        annotations:
+          summary: '"{{ $labels.job }}"("{{ $labels.instance }}") has insufficient CPU resources for >15m'
+          description: >
+            Go runtime is unable to schedule goroutines execution in acceptable time. This is usually a sign of
+            insufficient CPU resources or CPU throttling. Verify that service has enough CPU resources. Otherwise,
+            the service could work unreliably with delays in processing.
+
+      - alert: TooManyLogs
+        expr: sum(increase(vm_log_messages_total{level!="info"}[5m])) without (app_version, location, is_printed) > 0
+        for: 15m
+        labels:
+          severity: warning
+        annotations:
+          summary: 'Too many logs are generated for job "{{ $labels.job }}" ({{ $labels.instance }})'
+          description: >  # folded block scalar: backslash is not an escape here, so quotes are written bare
+            The job "{{ $labels.job }}" ({{ $labels.instance }}) generated {{ $value }} log messages with the level higher than info for the last 5 minutes.
+            Check the logs for the given target. Check also the "location" label at the vm_log_messages_total metric if -loggerLevel command-line flag is set to value other than INFO.
+            This label contains code locations responsible for generating log messages suppressed by -loggerLevel.
+
+      - alert: ConcurrentInsertsHitTheLimit
+        expr: avg_over_time(vm_concurrent_insert_current[1m]) >= vm_concurrent_insert_capacity
+        for: 15m
+        labels:
+          severity: warning
+        annotations:
+          summary: "{{ $labels.job }} on instance {{ $labels.instance }} is constantly hitting concurrent inserts limit"
+          description: |
+            The limit of concurrent inserts on instance {{ $labels.instance }} depends on the number of CPUs.
+            Usually, when component constantly hits the limit it is likely the component is overloaded and requires more CPU.
+            In some cases for components like vmagent or vminsert the alert might trigger if there are too many clients
+            making write attempts. If vmagent's or vminsert's CPU usage and network saturation are at normal level, then
+            it might be worth adjusting `-maxConcurrentInserts` cmd-line flag.
+
+      - alert: TooHighQueryLoad
+        expr: increase(vm_concurrent_select_limit_timeout_total[5m]) > 0
+        for: 15m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Read queries fail with timeout for {{ $labels.job }} on instance {{ $labels.instance }}"
+          description: |
+            Instance {{ $labels.instance }} ({{ $labels.job }}) is failing to serve read queries during last 15m.
+            Concurrency limit `-search.maxConcurrentRequests` was reached on this instance and extra queries were
+            put into the queue for `-search.maxQueueDuration` interval. But even after waiting in the queue these queries weren't served.
+            This happens if instance is overloaded with the current workload, or datasource is too slow to respond.
+            Possible solutions are the following:
+            * reduce the query load;
+            * increase compute resources or number of replicas;
+            * adjust limits `-search.maxConcurrentRequests` and `-search.maxQueueDuration`.
+            See more at https://docs.victoriametrics.com/victoriametrics/troubleshooting/#slow-queries.
+
+      - alert: RowsRejectedOnIngestion
+        expr: rate(vm_rows_ignored_total[5m]) > 0
+        for: 15m
+        labels:
+          severity: warning
+        annotations:
+          summary: 'Some rows are rejected on "{{ $labels.instance }}" on ingestion attempt'
+          description:
+            'Ingested rows on instance "{{ $labels.instance }}" are rejected due to the
+            following reason: "{{ $labels.reason }}"'
+{% endraw %}
--- a/templates/monitoring/vmalert/alerts-single-node.yml.j2
+++ b/templates/monitoring/vmalert/alerts-single-node.yml.j2
@@ -0,0 +1,203 @@
+{% raw %}
+# File contains default list of alerts for VictoriaMetrics single server.
+# The alerts below are just recommendations and may require some updates
+# and threshold calibration according to every specific setup.
+groups:
+  # Alerts group for VM single assumes that Grafana dashboard
+  # https://grafana.com/grafana/dashboards/10229 is installed.
+  # Please update the `dashboard` annotation according to your setup.
+  - name: vmsingle
+    interval: 30s
+    concurrency: 2
+    rules:
+      - alert: DiskRunsOutOfSpaceIn3Days
+        expr: |
+          sum(vm_free_disk_space_bytes) without(path) /
+          (
+            (rate(vm_rows_added_to_storage_total[1d]) - sum(rate(vm_deduplicated_samples_total[1d])) without(type)) * (
+              sum(vm_data_size_bytes{type!~"indexdb.*"}) without(type) /
+              sum(vm_rows{type!~"indexdb.*"}) without(type)
+            )
+            +
+            rate(vm_new_timeseries_created_total[1d]) * (
+              sum(vm_data_size_bytes{type="indexdb/file"}) without(type)/
+              sum(vm_rows{type="indexdb/file"}) without(type)
+            )
+          ) < 3 * 24 * 3600 > 0
+        for: 30m
+        labels:
+          severity: critical
+        annotations:
+          dashboard: "{{ $externalURL }}/d/wNf0q_kZk?viewPanel=53&var-instance={{ $labels.instance }}"
+          summary: "Instance {{ $labels.instance }} will run out of disk space soon"
+          description:
+            "Taking into account current ingestion rate, free disk space will be enough only
+            for {{ $value | humanizeDuration }} on instance {{ $labels.instance }}.\n
+            Consider to limit the ingestion rate, decrease retention or scale the disk space if possible."
+
+      - alert: NodeBecomesReadonlyIn3Days
+        expr: |
+          sum(vm_free_disk_space_bytes - vm_free_disk_space_limit_bytes) without(path) /
+          (
+              (rate(vm_rows_added_to_storage_total[1d]) - sum(rate(vm_deduplicated_samples_total[1d])) without(type)) * (
+                sum(vm_data_size_bytes{type!~"indexdb.*"}) without(type) /
+                sum(vm_rows{type!~"indexdb.*"}) without(type)
+              )
+              +
+              rate(vm_new_timeseries_created_total[1d]) * (
+                sum(vm_data_size_bytes{type="indexdb/file"}) without(type) /
+                sum(vm_rows{type="indexdb/file"}) without(type)
+              )
+          ) < 3 * 24 * 3600 > 0
+        for: 30m
+        labels:
+          severity: warning
+        annotations:
+          dashboard: "{{ $externalURL }}/d/wNf0q_kZk?viewPanel=53&var-instance={{ $labels.instance }}"  # same dashboard UID as the other vmsingle rules
+          summary: "Instance {{ $labels.instance }} will become read-only in 3 days"
+          description:
+            "Taking into account current ingestion rate and free disk space
+            instance {{ $labels.instance }} is writable for {{ $value | humanizeDuration }}.\n
+            Consider to limit the ingestion rate, decrease retention or scale the disk space up if possible."
+
+      - alert: DiskRunsOutOfSpace
+        expr: |
+          sum(vm_data_size_bytes) by(job, instance) /
+          (
+           sum(vm_free_disk_space_bytes) by(job, instance) +
+           sum(vm_data_size_bytes) by(job, instance)
+          ) > 0.8
+        for: 30m
+        labels:
+          severity: critical
+        annotations:
+          dashboard: "{{ $externalURL }}/d/wNf0q_kZk?viewPanel=53&var-instance={{ $labels.instance }}"
+          summary: "Instance {{ $labels.instance }} (job={{ $labels.job }}) will run out of disk space soon"
+          description:
+            "Disk utilisation on instance {{ $labels.instance }} is more than 80%.\n
+            Having less than 20% of free disk space could cripple merge processes and overall performance.
+            Consider to limit the ingestion rate, decrease retention or scale the disk space if possible."
+
+      - alert: RequestErrorsToAPI
+        expr: increase(vm_http_request_errors_total[5m]) > 0
+        for: 15m
+        labels:
+          severity: warning
+        annotations:
+          dashboard: "{{ $externalURL }}/d/wNf0q_kZk?viewPanel=35&var-instance={{ $labels.instance }}"
+          summary: "Too many errors served for path {{ $labels.path }} (instance {{ $labels.instance }})"
+          description:
+            "Requests to path {{ $labels.path }} are receiving errors.
+            Please verify if clients are sending correct requests."
+
+      - alert: TooHighChurnRate
+        expr: |
+          (
+             sum(rate(vm_new_timeseries_created_total[5m])) by(instance)
+             /
+             sum(rate(vm_rows_inserted_total[5m])) by(instance)
+           ) > 0.1
+        for: 15m
+        labels:
+          severity: warning
+        annotations:
+          dashboard: "{{ $externalURL }}/d/wNf0q_kZk?viewPanel=66&var-instance={{ $labels.instance }}"
+          summary: 'Churn rate is more than 10% on "{{ $labels.instance }}" for the last 15m'
+          description:
+            "VM constantly creates new time series on \"{{ $labels.instance }}\".\n
+            This effect is known as Churn Rate.\n
+            High Churn Rate is tightly connected with database performance and may
+            result in unexpected OOM's or slow queries."
+
+      - alert: TooHighChurnRate24h
+        expr: |
+          sum(increase(vm_new_timeseries_created_total[24h])) by(instance)
+          >
+          (sum(vm_cache_entries{type="storage/hour_metric_ids"}) by(instance) * 3)
+        for: 15m
+        labels:
+          severity: warning
+        annotations:
+          dashboard: "{{ $externalURL }}/d/wNf0q_kZk?viewPanel=66&var-instance={{ $labels.instance }}"
+          summary: 'Too high number of new series on "{{ $labels.instance }}" created over last 24h'
+          description:
+            "The number of created new time series over last 24h is 3x times higher than
+            current number of active series on \"{{ $labels.instance }}\".\n
+            This effect is known as Churn Rate.\n
+            High Churn Rate is tightly connected with database performance and may
+            result in unexpected OOM's or slow queries."
+
+      - alert: TooHighSlowInsertsRate
+        expr: |
+          (
+             sum(rate(vm_slow_row_inserts_total[5m])) by(instance)
+             /
+             sum(rate(vm_rows_inserted_total[5m])) by(instance)
+           ) > 0.05
+        for: 15m
+        labels:
+          severity: warning
+        annotations:
+          dashboard: "{{ $externalURL }}/d/wNf0q_kZk?viewPanel=68&var-instance={{ $labels.instance }}"
+          summary: 'Percentage of slow inserts is more than 5% on "{{ $labels.instance }}" for the last 15m'
+          description:
+            'High rate of slow inserts on "{{ $labels.instance }}" may be a sign of resource exhaustion
+            for the current load. It is likely more RAM is needed for optimal handling of the current number of active time series.
+            See also https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3976#issuecomment-1476883183'
+
+      - alert: MetadataCacheUtilizationIsTooHigh
+        expr: |
+          vm_metrics_metadata_storage_size_bytes / vm_metrics_metadata_storage_max_size_bytes > 0.95
+        for: 15m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Metadata cache capacity on {{ $labels.instance }} (job={{ $labels.job }}) is utilized for more than 95% for the last 15min"
+          description:
+            "Metadata cache stores meta information about ingested time series - see https://docs.victoriametrics.com/victoriametrics/#metrics-metadata.
+            When cache is overutilized, the oldest entries will be dropped out automatically. It may result into incomplete
+            response for /api/v1/metadata API calls. It doesn't impact regular queries or alerts. Cache size is controlled
+            via -storage.maxMetadataStorageSize cmd-line flag."
+
+      - alert: MetricNameStatsCacheUtilizationIsTooHigh
+        expr: |
+          vm_cache_size_bytes{type="storage/metricNamesStatsTracker"} / vm_cache_size_max_bytes{type="storage/metricNamesStatsTracker"} > 0.95
+        for: 15m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Cache capacity for tracking metric names usage on {{ $labels.instance }} (job={{ $labels.job }}) is utilized for more than 95% during the last 15min"
+          description:
+            "Metric names usage cache stores information about unique metric names and how frequently they are queried - see https://docs.victoriametrics.com/victoriametrics/#track-ingested-metrics-usage.
+            When cache is overutilized, it will stop tracking the new metric names. It has no other negative impact.
+            Usually, the number of unique metric names is very limited (thousands). The cache can be overutilized only if metric names
+            are changing too frequently or if the cache size is too low. There are following ways to mitigate cache overutilization:
+            - disable cache via `--storage.trackMetricNamesStats=false` flag, so metric names usage will stop tracking
+            - increase the cache size via `--storage.cacheSizeMetricNamesStats` flag
+            - reset the cache (see docs for details)"
+
+      - alert: IndexDBRecordsDrop
+        expr: increase(vm_indexdb_items_dropped_total[5m]) > 0
+        labels:
+          severity: critical
+        annotations:
+          summary: "IndexDB skipped registering items during data ingestion with reason={{ $labels.reason }}."
+          description: |
+            VictoriaMetrics could skip registering new timeseries during ingestion if they fail the validation process.
+            For example, `reason=too_long_item` means that time series cannot exceed 64KB. Please, reduce the number
+            of labels or label values for such series. Or enforce these limits via `-maxLabelsPerTimeseries` and
+            `-maxLabelValueLen` command-line flags.
+
+      - alert: TooManyTSIDMisses
+        expr: increase(vm_missing_tsids_for_metric_id_total[5m]) > 0
+        for: 15m
+        labels:
+          severity: critical
+        annotations:
+          summary: 'Unexpected TSID misses for job "{{ $labels.job }}" ({{ $labels.instance }}) for the last 15 minutes'
+          description: |  # literal block scalar: backslash is not an escape here, so quotes are written bare
+            Unexpected TSID misses for "{{ $labels.job }}" ({{ $labels.instance }}) for the last 15 minutes.
+            If this happens after unclean shutdown of VictoriaMetrics process (via "kill -9", OOM or power off),
+            then this is OK - the alert must go away in a few minutes after the restart.
+            Otherwise this may point to the corruption of index data.
+{% endraw %}
--- a/templates/monitoring/vmalert/alerts-vmagent.yml.j2
+++ b/templates/monitoring/vmalert/alerts-vmagent.yml.j2
@@ -0,0 +1,206 @@
+{% raw %}
+# File contains default list of alerts for vmagent service.
+# The alerts below are just recommendations and may require some updates
+# and threshold calibration according to every specific setup.
+groups:
+  # Alerts group for vmagent assumes that Grafana dashboard
+  # https://grafana.com/grafana/dashboards/12683 is installed.
+  # Please update the `dashboard` annotation according to your setup.
+  - name: vmagent
+    interval: 30s
+    concurrency: 2
+    rules:
+      - alert: PersistentQueueIsDroppingData
+        expr: sum(increase(vm_persistentqueue_bytes_dropped_total[5m])) without (path) > 0
+        for: 10m
+        labels:
+          severity: critical
+        annotations:
+          dashboard: "{{ $externalURL }}/d/G7Z9GzMGz?viewPanel=49&var-instance={{ $labels.instance }}"
+          summary: "Instance {{ $labels.instance }} is dropping data from persistent queue"
+          description: "Vmagent dropped {{ $value | humanize1024 }} from persistent queue
+              on instance {{ $labels.instance }} for the last 10m."
+
+      - alert: RejectedRemoteWriteDataBlocksAreDropped
+        expr: sum(increase(vmagent_remotewrite_packets_dropped_total[5m])) without (url) > 0
+        for: 15m
+        labels:
+          severity: warning
+        annotations:
+          dashboard: "{{ $externalURL }}/d/G7Z9GzMGz?viewPanel=79&var-instance={{ $labels.instance }}"
+          summary: "Vmagent is dropping data blocks that are rejected by remote storage"
+          description: "Job \"{{ $labels.job }}\" on instance {{ $labels.instance }} drops the rejected by
+            remote-write server data blocks. Check the logs to find the reason for rejects."
+
+      - alert: TooManyScrapeErrors
+        expr: increase(vm_promscrape_scrapes_failed_total[5m]) > 0
+        for: 15m
+        labels:
+          severity: warning
+        annotations:
+          dashboard: "{{ $externalURL }}/d/G7Z9GzMGz?viewPanel=31&var-instance={{ $labels.instance }}"
+          summary: "Vmagent fails to scrape one or more targets"
+          description: "Job \"{{ $labels.job }}\" on instance {{ $labels.instance }} fails to scrape targets for last 15m"
+
+      - alert: ScrapePoolHasNoTargets
+        expr: sum(vm_promscrape_scrape_pool_targets) without (status, instance, pod) == 0
+        for: 30m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Vmagent has scrape_pool with 0 configured/discovered targets"
+          description: "Vmagent \"{{ $labels.job }}\" has scrape_pool \"{{ $labels.scrape_job }}\"
+            with 0 discovered targets. It is likely a misconfiguration. Please follow https://docs.victoriametrics.com/victoriametrics/vmagent/#debugging-scrape-targets
+            to troubleshoot the scraping config."
+
+      - alert: TooManyWriteErrors
+        expr: |
+          (sum(increase(vm_ingestserver_request_errors_total[5m])) without (name,net,type)
+          +
+          sum(increase(vmagent_http_request_errors_total[5m])) without (path,protocol)) > 0
+        for: 15m
+        labels:
+          severity: warning
+        annotations:
+          dashboard: "{{ $externalURL }}/d/G7Z9GzMGz?viewPanel=77&var-instance={{ $labels.instance }}"
+          summary: "Vmagent responds with too many errors on data ingestion protocols"
+          description: "Job \"{{ $labels.job }}\" on instance {{ $labels.instance }} responds with errors to write requests for last 15m."
+
+      - alert: TooManyRemoteWriteErrors
+        expr: rate(vmagent_remotewrite_retries_count_total[5m]) > 0
+        for: 15m
+        labels:
+          severity: warning
+        annotations:
+          dashboard: "{{ $externalURL }}/d/G7Z9GzMGz?viewPanel=61&var-instance={{ $labels.instance }}"
+          summary: "Job \"{{ $labels.job }}\" on instance {{ $labels.instance }} fails to push to remote storage"
+          description: "Vmagent fails to push data via remote write protocol to destination \"{{ $labels.url }}\"\n
+            Ensure that destination is up and reachable."
+
+      - alert: RemoteWriteConnectionIsSaturated
+        expr: |
+          (
+           rate(vmagent_remotewrite_send_duration_seconds_total[5m])
+           /
+           vmagent_remotewrite_queues
+          ) > 0.9
+        for: 15m
+        labels:
+          severity: warning
+        annotations:
+          dashboard: "{{ $externalURL }}/d/G7Z9GzMGz?viewPanel=84&var-instance={{ $labels.instance }}"
+          summary: "Remote write connection from \"{{ $labels.job }}\" (instance {{ $labels.instance }}) to {{ $labels.url }} is saturated"
+          description: "The remote write connection between vmagent \"{{ $labels.job }}\" (instance {{ $labels.instance }}) and destination \"{{ $labels.url }}\"
+            is saturated by more than 90% and vmagent won't be able to keep up.\n
+            There could be the following reasons for this:\n
+             * vmagent can't send data fast enough through the existing network connections. Increase `-remoteWrite.queues` cmd-line flag value to establish more connections per destination.\n
+             * remote destination can't accept data fast enough. Check if remote destination has enough resources for processing."
+
+      - alert: PersistentQueueForWritesIsSaturated
+        expr: rate(vm_persistentqueue_write_duration_seconds_total[5m]) > 0.9
+        for: 15m
+        labels:
+          severity: warning
+        annotations:
+          dashboard: "{{ $externalURL }}/d/G7Z9GzMGz?viewPanel=98&var-instance={{ $labels.instance }}"
+          summary: "Persistent queue writes for instance {{ $labels.instance }} are saturated"
+          description: "Persistent queue writes for vmagent \"{{ $labels.job }}\" (instance {{ $labels.instance }})
+            are saturated by more than 90% and vmagent won't be able to keep up with flushing data on disk.
+            In this case, consider to decrease load on the vmagent or improve the disk throughput."
+
+      - alert: PersistentQueueForReadsIsSaturated
+        expr: rate(vm_persistentqueue_read_duration_seconds_total[5m]) > 0.9
+        for: 15m
+        labels:
+          severity: warning
+        annotations:
+          dashboard: "{{ $externalURL }}/d/G7Z9GzMGz?viewPanel=99&var-instance={{ $labels.instance }}"
+          summary: "Persistent queue reads for instance {{ $labels.instance }} are saturated"
+          description: "Persistent queue reads for vmagent \"{{ $labels.job }}\" (instance {{ $labels.instance }})
+            are saturated by more than 90% and vmagent won't be able to keep up with reading data from the disk.
+            In this case, consider to decrease load on the vmagent or improve the disk throughput."
+
+      - alert: SeriesLimitHourReached
+        expr: (vmagent_hourly_series_limit_current_series / vmagent_hourly_series_limit_max_series) > 0.9
+        labels:
+          severity: critical
+        annotations:
+          dashboard: "{{ $externalURL }}/d/G7Z9GzMGz?viewPanel=88&var-instance={{ $labels.instance }}"
+          summary: "Instance {{ $labels.instance }} reached 90% of the limit"
+          description: "Max series limit set via -remoteWrite.maxHourlySeries flag is close to reaching the max value.
+            Then samples for new time series will be dropped instead of sending them to remote storage systems."
+
+      - alert: SeriesLimitDayReached
+        expr: (vmagent_daily_series_limit_current_series / vmagent_daily_series_limit_max_series) > 0.9
+        labels:
+          severity: critical
+        annotations:
+          dashboard: "{{ $externalURL }}/d/G7Z9GzMGz?viewPanel=90&var-instance={{ $labels.instance }}"
+          summary: "Instance {{ $labels.instance }} reached 90% of the limit"
+          description: "Max series limit set via -remoteWrite.maxDailySeries flag is close to reaching the max value.
+            Then samples for new time series will be dropped instead of sending them to remote storage systems."
+
+      - alert: ConfigurationReloadFailure
+        expr: |  # matches when either reload-status metric is not 1
+          vm_promscrape_config_last_reload_successful != 1
+          or
+          vmagent_relabel_config_last_reload_successful != 1
+        labels:
+          severity: warning
+        annotations:
+          summary: "Configuration reload failed for vmagent instance {{ $labels.instance }}"
+          description: "Configuration hot-reload failed for vmagent on instance {{ $labels.instance }}.
+            Check vmagent's logs for detailed error message."
+
+      - alert: StreamAggrFlushTimeout
+        expr: |
+          increase(vm_streamaggr_flush_timeouts_total[5m]) > 0
+        labels:
+          severity: warning
+        annotations:
+          summary: "Streaming aggregation at \"{{ $labels.job }}\" (instance {{ $labels.instance }}) can't be finished within the configured aggregation interval."
+          description: "Stream aggregation process can't keep up with the load and might produce incorrect aggregation results. Check logs for more details.
+            Possible solutions: increase aggregation interval; aggregate smaller number of series; reduce samples' ingestion rate to stream aggregation."
+
+      - alert: StreamAggrDedupFlushTimeout
+        expr: |
+          increase(vm_streamaggr_dedup_flush_timeouts_total[5m]) > 0
+        labels:
+          severity: warning
+        annotations:
+          summary: "Deduplication \"{{ $labels.job }}\" (instance {{ $labels.instance }}) can't be finished within configured deduplication interval."
+          description: "Deduplication process can't keep up with the load and might produce incorrect results. Check docs https://docs.victoriametrics.com/victoriametrics/stream-aggregation/#deduplication and logs for more details.
+            Possible solutions: increase deduplication interval; deduplicate smaller number of series; reduce samples' ingestion rate."
+
+      - alert: PersistentQueueRunsOutOfSpaceIn12Hours
+        expr: |
+          (
+            (sum(vm_persistentqueue_free_disk_space_bytes) by (job, instance, path)) /
+            (sum(deriv(vm_persistentqueue_bytes_pending[1m])) by (job, instance, path) > 0)
+          ) *
+          on(job, instance, path) group_left(url)
+          (vmagent_remotewrite_pending_data_bytes * 0 + 1) < 12 * 3600 >0
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Persistent Queue (url {{ $labels.url }}) of {{ $labels.instance }} (job:{{ $labels.job }}) will run out of space in 12 hours."
+          description: "RemoteWrite destination ({{ $labels.url }}) is unavailable or unable to receive data in a timely manner, so the persistent queue size is growing.
+            Once the available space is exhausted, some samples will be discarded and cause incident. Please check the health of remoteWrite destination ({{ $labels.url }})."
+
+      - alert: PersistentQueueRunsOutOfSpaceIn4Hours
+        expr: |
+          (
+            (sum(vm_persistentqueue_free_disk_space_bytes) by (job, instance, path)) /
+            (sum(deriv(vm_persistentqueue_bytes_pending[1m])) by (job, instance, path) > 0)
+          ) *
+          on(job, instance, path) group_left(url)
+          (vmagent_remotewrite_pending_data_bytes * 0 + 1) < 4 * 3600 >0
+        for: 10m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Persistent Queue (url {{ $labels.url }}) of {{ $labels.instance }} (job:{{ $labels.job }}) will run out of space in 4 hours."
+          description: "RemoteWrite destination ({{ $labels.url }}) is unavailable or unable to receive data in a timely manner, so the persistent queue size is growing.
+            Once the available space is exhausted, some samples will be discarded and cause incident. Please check the health of remoteWrite destination ({{ $labels.url }})."
+{% endraw %}
--- a/templates/monitoring/vmalert/alerts-vmalert.yml.j2
+++ b/templates/monitoring/vmalert/alerts-vmalert.yml.j2
@@ -0,0 +1,112 @@
+{% raw %}
+# File contains default list of alerts for vmalert service.
+# The alerts below are just recommendations and may require some updates
+# and threshold calibration according to every specific setup.
+groups:
+  # Alerts group for vmalert assumes that Grafana dashboard
+  # https://grafana.com/grafana/dashboards/14950 is installed.
+  # Please update the `dashboard` annotation according to your setup.
+  - name: vmalert
+    interval: 30s
+    rules:
+      - alert: ConfigurationReloadFailure  # warning: vmalert reports that its last configuration reload was not successful
+        expr: vmalert_config_last_reload_successful != 1  # no `for` clause is set on this rule
+        labels:
+          severity: warning
+        annotations:
+          summary: "Configuration reload failed for vmalert instance {{ $labels.instance }}"
+          description: "Configuration hot-reload failed for vmalert on instance {{ $labels.instance }}.
+            Check vmalert's logs for detailed error message."
+
+      - alert: AlertingRulesError  # warning: alerting-rule execution errors were counted within the last 5 minutes
+        expr: sum(increase(vmalert_alerting_rules_errors_total[5m])) without(id) > 0  # aggregated over the id label only; other labels are kept
+        for: 5m  # the expression must keep returning results for 5 minutes before the alert fires
+        labels:
+          severity: warning
+        annotations:
+          dashboard: "{{ $externalURL }}/d/LzldHAVnz?viewPanel=13&var-instance={{ $labels.instance }}&var-file={{ $labels.file }}&var-group={{ $labels.group }}"
+          summary: "Alerting rules are failing for vmalert instance {{ $labels.instance }}"
+          description: "Alerting rules execution is failing for \"{{ $labels.alertname }}\" from group \"{{ $labels.group }}\" in file \"{{ $labels.file }}\".
+            Check vmalert's logs for detailed error message."
+
+      - alert: RecordingRulesError  # warning: recording-rule execution errors were counted within the last 5 minutes
+        expr: sum(increase(vmalert_recording_rules_errors_total[5m])) without(id) > 0  # aggregated over the id label only; other labels are kept
+        for: 5m  # the expression must keep returning results for 5 minutes before the alert fires
+        labels:
+          severity: warning
+        annotations:
+          dashboard: "{{ $externalURL }}/d/LzldHAVnz?viewPanel=30&var-instance={{ $labels.instance }}&var-file={{ $labels.file }}&var-group={{ $labels.group }}"
+          summary: "Recording rules are failing for vmalert instance {{ $labels.instance }}"
+          description: "Recording rules execution is failing for \"{{ $labels.recording }}\" from group \"{{ $labels.group }}\" in file \"{{ $labels.file }}\".
+            Check vmalert's logs for detailed error message."
+
+      - alert: RecordingRulesNoData  # info: a recording rule's last evaluation produced less than one sample
+        expr: sum(vmalert_recording_rules_last_evaluation_samples) without(id) < 1  # aggregated over the id label only; other labels are kept
+        for: 30m  # matches the "last 30min" wording used in the description
+        labels:
+          severity: info
+        annotations:
+          dashboard: "{{ $externalURL }}/d/LzldHAVnz?viewPanel=33&var-file={{ $labels.file }}&var-group={{ $labels.group }}"
+          summary: "Recording rule {{ $labels.recording }} ({{ $labels.group }}) produces no data"
+          description: "Recording rule \"{{ $labels.recording }}\" from group \"{{ $labels.group }}\" in file \"{{ $labels.file }}\"
+            produces 0 samples over the last 30min. It might be caused by a misconfiguration
+            or incorrect query expression."
+
+      - alert: TooManyMissedIterations  # warning: the missed-iterations counter grew within the last 5 minutes
+        expr: increase(vmalert_iteration_missed_total[5m]) > 0
+        for: 15m  # the expression must keep returning results for 15 minutes before the alert fires
+        labels:
+          severity: warning
+        annotations:
+          summary: "vmalert instance {{ $labels.instance }} is missing rules evaluations"
+          description: "vmalert instance {{ $labels.instance }} is missing rules evaluations for group \"{{ $labels.group }}\" in file \"{{ $labels.file }}\".
+            The group evaluation time takes longer than the configured evaluation interval. This may result in missed
+            alerting notifications or recording rules samples. Try increasing evaluation interval or concurrency of
+            group \"{{ $labels.group }}\". See https://docs.victoriametrics.com/victoriametrics/vmalert/#groups.
+            If rule expressions are taking longer than expected, please see https://docs.victoriametrics.com/victoriametrics/troubleshooting/#slow-queries."
+
+      - alert: RemoteWriteErrors  # warning: the remote write error counter grew within the last 5 minutes
+        expr: increase(vmalert_remotewrite_errors_total[5m]) > 0
+        for: 15m  # the expression must keep returning results for 15 minutes before the alert fires
+        labels:
+          severity: warning
+        annotations:
+          summary: "vmalert instance {{ $labels.instance }} is failing to push metrics to remote write URL"
+          description: "vmalert instance {{ $labels.instance }} is failing to push metrics generated via alerting
+            or recording rules to the configured remote write URL. Check vmalert's logs for detailed error message."
+
+      - alert: RemoteWriteDroppingData  # critical: the dropped-rows counter for remote write grew within the last 5 minutes
+        expr: increase(vmalert_remotewrite_dropped_rows_total[5m]) > 0
+        for: 5m  # the expression must keep returning results for 5 minutes before the alert fires
+        labels:
+          severity: critical
+        annotations:
+          summary: "vmalert instance {{ $labels.instance }} is dropping data sent to remote write URL"
+          description: "vmalert instance {{ $labels.instance }} is failing to send results of alerting or recording rules
+            to the configured remote write URL. This may result into gaps in recording rules or alerts state.
+            Check vmalert's logs for detailed error message."
+
+      - alert: RemoteWriteQueueHighUsage  # warning: 0.99 quantile of the remote write queue size (5m window) is above 0.8 of vmalert_remotewrite_queue_capacity
+        expr: histogram_quantile(0.99, sum(increase(vmalert_remotewrite_queue_size_bucket[5m])) by (job, instance, vmrange)) / vmalert_remotewrite_queue_capacity > 0.8
+        for: 15m  # the expression must keep returning results for 15 minutes before the alert fires
+        labels:
+          severity: warning
+        annotations:
+          summary: "Remote write queue capacity on the vmalert instance {{ $labels.instance }} has exceeded 80% utilization"
+          description: "The remote write queue on vmalert instance {{ $labels.instance }} has consistently high utilization.
+            The queue acts as a buffer between rules generating series and remote-write client consuming and pushing these series. When queue overflows, vmalert will start dropping newly generated series.
+            Queue may overflow due to multiple reasons:
+             1. Some bad rules produce too many series at once. This can be limited using the global `-rule.resultsLimit` flag or `limit` param at the rule group level.
+             2. Remote write connection is slow.  Increase `-remoteWrite.concurrency`,  so vmalert could establish more concurrent connections.
+             3. The queue size is too small. Increase `-remoteWrite.maxQueueSize` to extend the buffer size. Note that a larger queue will result in higher memory consumption when the queue is full."
+
+      - alert: AlertmanagerErrors  # warning: the alert-send error counter grew within the last 5 minutes
+        expr: increase(vmalert_alerts_send_errors_total[5m]) > 0
+        for: 15m  # the expression must keep returning results for 15 minutes before the alert fires
+        labels:
+          severity: warning
+        annotations:
+          summary: "vmalert instance {{ $labels.instance }} is failing to send notifications to Alertmanager"
+          description: "vmalert instance {{ $labels.instance }} is failing to send alert notifications to \"{{ $labels.addr }}\".
+            Check vmalert's logs for detailed error message."
+{% endraw %}
Author	SHA1	Message	Date
nikitsim	31ca57f22c	Merge pull request 'deploy grafana; refactor' (#1 ) from deploy_monitoring into main Reviewed-on: homedungeon.loc/ansible#1	2026-05-22 16:30:15 +00:00
Nikita Simonov	4f93ab0416	deploy grafana; refactor	2026-05-22 20:29:13 +04:00