Compare commits

...

2 Commits

Author SHA1 Message Date
31ca57f22c Merge pull request 'deploy grafana; refactor' (#1) from deploy_monitoring into main
Reviewed-on: homedungeon.loc/ansible#1
2026-05-22 16:30:15 +00:00
Nikita Simonov
4f93ab0416 deploy grafana; refactor 2026-05-22 20:29:13 +04:00
17 changed files with 927 additions and 105 deletions

View File

@@ -2,6 +2,9 @@ base_users:
- name: "reaper"
groups: "sudo,docker"
group: "reaper"
uid: "1001"
ssh_key: "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIMudcsD9pqea/4Gv6PkwtzoDck9MHRkYKEc8hfTvtBAc"
password: "$y$j9T$o1x4cPajXw.XUxo/UjlxD1$Wq4hI6kkuq4D5WR4jzGr12Easn0rO1E8TCNYcJGnZy6"
settings: ""
settings: ""
docker_group: "reaper"

View File

@@ -0,0 +1,13 @@
docker_compose_configs:
- dir_name: vmalert
files:
- "alerts-single-node.yml"
- "alerts-health.yml"
- "alerts-vmagent.yml"
- "alerts-vmalert.yml"
- dir_name: alertmanager
files:
- "alertmanager.yml"
- dir_name: prometheus
files:
- "prometheus.yml"

View File

@@ -0,0 +1,102 @@
docker_compose_definition:
services:
grafana:
image: grafana/grafana:13.0
depends_on:
- "victoriametrics"
env_file: .env
container_name: grafana
restart: unless-stopped
user: "1001"
ports:
- "3000:3000"
volumes:
- /monitoring/grafana:/var/lib/grafana
vmagent:
image: victoriametrics/vmagent:v1.143.0
depends_on:
- "victoriametrics"
user: "1001"
ports:
- 8429:8429
volumes:
- /monitoring/vmagentdata:/vmagentdata
- ./configs/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml
command:
- "--promscrape.config=/etc/prometheus/prometheus.yml"
- "--remoteWrite.url=http://victoriametrics:8428/api/v1/write"
- --remoteWrite.tmpDataPath=/vmagentdata/vmagent-remotewrite-data
restart: always
victoriametrics:
image: victoriametrics/victoria-metrics:v1.143.0
container_name: victoriametrics
restart: unless-stopped
ports:
- 8428:8428
user: "1001"
volumes:
- /monitoring/victoriametrics/victoria-metrics-data:/victoria-metrics-data
command:
- "--storageDataPath=/victoria-metrics-data"
- "--vmalert.proxyURL=http://vmalert:8880"
# vmalert executes alerting and recording rules
vmalert:
image: victoriametrics/vmalert:v1.143.0
depends_on:
- "victoriametrics"
- "alertmanager"
ports:
- 8880:8880
user: "1001"
volumes:
- ./configs/vmalert/alerts-single-node.yml:/etc/alerts/alerts-single-node.yml
- ./configs/vmalert/alerts-health.yml:/etc/alerts/alerts-health.yml
- ./configs/vmalert/alerts-vmagent.yml:/etc/alerts/alerts-vmagent.yml
- ./configs/vmalert/alerts-vmalert.yml:/etc/alerts/alerts-vmalert.yml
command:
- "--datasource.url=http://victoriametrics:8428/"
- "--remoteRead.url=http://victoriametrics:8428/"
- "--remoteWrite.url=http://vmagent:8429/"
- "--notifier.url=http://alertmanager:9093/"
- "--rule=/etc/alerts/*.yml"
# display source of alerts in grafana
- "--external.url=http://127.0.0.1:3000" #grafana outside container
- '{% raw %}--external.alert.source=explore?orgId=1&left={"datasource":"VictoriaMetrics","queries":[{"expr":{{.Expr|jsonEscape|queryEscape}},"refId":"A"}],"range":{"from":"{{ .ActiveAt.UnixMilli }}","to":"now"}}{% endraw %}'
restart: always
# alertmanager receives alerting notifications from vmalert
# and distributes them according to --config.file.
alertmanager:
image: prom/alertmanager:v0.28.1
volumes:
- ./configs/alertmanager/alertmanager.yml:/config/alertmanager.yml
command:
- "--config.file=/config/alertmanager.yml"
ports:
- 9093:9093
restart: always
node_exporter:
image: quay.io/prometheus/node-exporter:v1.11.1-distroless
container_name: node_exporter
command:
- "--path.rootfs=/host"
network_mode: host
pid: host
restart: unless-stopped
volumes:
- "/:/host:ro,rslave"
cadvisor:
image: ghcr.io/google/cadvisor:v0.57.0
container_name: cadvisor
volumes:
- /:/rootfs:ro
- /var/run:/var/run:ro
- /var/lib/docker/:/var/lib/docker:ro
- /dev/disk/:/dev/disk:ro
ports:
- 8081:8080
restart: unless-stopped

View File

@@ -0,0 +1,23 @@
# compose
docker_compose_project_name: monitoring
docker_compose_data_dir: /compose/monitoring
docker_compose_dotenv: |
GF_SERVER_ROOT_URL: /
GF_PLUGINS_PREINSTALL: victoriametrics-metrics-datasource, yesoreyeram-infinity-datasource
scrape:
- name: "node_exporter"
targets:
- nextcloud.homedungeon.loc:9100
- name: "cadvisor"
targets:
- cadvisor:8081
- name: "vmagent"
targets:
- vmagent:8429
- name: "vmalert"
targets:
- vmalert:8880
- name: "victoriametrics"
targets:
- victoriametrics:8428

View File

@@ -0,0 +1,88 @@
# compose
docker_compose_project_name: nextcloudaio
docker_compose_data_dir: /compose/nextcloudaio
docker_compose_dotenv: |
# AIO_DISABLE_BACKUP_SECTION: false # Setting this to true allows to hide the backup section in the AIO interface. See https://github.com/nextcloud/all-in-one#how-to-disable-the-backup-section
APACHE_PORT: 11000 # Is needed when running behind a web server or reverse proxy (like Apache, Nginx, Caddy, Cloudflare Tunnel and else). See https://github.com/nextcloud/all-in-one/blob/main/reverse-proxy.md
APACHE_IP_BINDING: 127.0.0.1 # Should be set when running behind a web server or reverse proxy (like Apache, Nginx, Caddy, Cloudflare Tunnel and else) that is running on the same host. See https://github.com/nextcloud/all-in-one/blob/main/reverse-proxy.md
# APACHE_ADDITIONAL_NETWORK: frontend_net # (Optional) Connect the apache container to an additional docker network. Needed when behind a web server or reverse proxy (like Apache, Nginx, Caddy, Cloudflare Tunnel and else) running in a different docker network on same server. See https://github.com/nextcloud/all-in-one/blob/main/reverse-proxy.md
# BORG_RETENTION_POLICY: --keep-within=7d --keep-weekly=4 --keep-monthly=6 # Allows to adjust borgs retention policy. See https://github.com/nextcloud/all-in-one#how-to-adjust-borgs-retention-policy
# COLLABORA_SECCOMP_DISABLED: false # Setting this to true allows to disable Collabora's Seccomp feature. See https://github.com/nextcloud/all-in-one#how-to-disable-collaboras-seccomp-feature
# DOCKER_API_VERSION: 1.44 # You can adjust the internally used docker api version with this variable. ⚠️⚠️⚠️ Warning: please note that only the default api version (unset this variable) is supported and tested by the maintainers of Nextcloud AIO. So use this on your own risk and things might break without warning. See https://github.com/nextcloud/all-in-one#how-to-adjust-the-internally-used-docker-api-version
# FULLTEXTSEARCH_JAVA_OPTIONS: "-Xms1024M -Xmx1024M" # Allows to adjust the fulltextsearch java options. See https://github.com/nextcloud/all-in-one#how-to-adjust-the-fulltextsearch-java-options
NEXTCLOUD_DATADIR: /nextcloud # Allows to set the host directory for Nextcloud's datadir. ⚠️⚠️⚠️ Warning: do not set or adjust this value after the initial Nextcloud installation is done! See https://github.com/nextcloud/all-in-one#how-to-change-the-default-location-of-nextclouds-datadir
# NEXTCLOUD_MOUNT: /mnt/ # Allows the Nextcloud container to access the chosen directory on the host. See https://github.com/nextcloud/all-in-one#how-to-allow-the-nextcloud-container-to-access-directories-on-the-host
# NEXTCLOUD_UPLOAD_LIMIT: 16G # Can be adjusted if you need more. See https://github.com/nextcloud/all-in-one#how-to-adjust-the-upload-limit-for-nextcloud
# NEXTCLOUD_MAX_TIME: 3600 # Can be adjusted if you need more. See https://github.com/nextcloud/all-in-one#how-to-adjust-the-max-execution-time-for-nextcloud
# NEXTCLOUD_MEMORY_LIMIT: 512M # Can be adjusted if you need more. See https://github.com/nextcloud/all-in-one#how-to-adjust-the-php-memory-limit-for-nextcloud
# NEXTCLOUD_TRUSTED_CACERTS_DIR: /path/to/my/cacerts # CA certificates in this directory will be trusted by the OS of the nextcloud container (Useful e.g. for LDAPS) See https://github.com/nextcloud/all-in-one#how-to-trust-user-defined-certification-authorities-ca
# NEXTCLOUD_STARTUP_APPS: deck twofactor_totp tasks calendar contacts notes # Allows to modify the Nextcloud apps that are installed on starting AIO the first time. See https://github.com/nextcloud/all-in-one#how-to-change-the-nextcloud-apps-that-are-installed-on-the-first-startup
# NEXTCLOUD_ADDITIONAL_APKS: imagemagick # This allows to add additional packages to the Nextcloud container permanently. Default is imagemagick but can be overwritten by modifying this value. See https://github.com/nextcloud/all-in-one#how-to-add-os-packages-permanently-to-the-nextcloud-container
# NEXTCLOUD_ADDITIONAL_PHP_EXTENSIONS: imagick # This allows to add additional php extensions to the Nextcloud container permanently. Default is imagick but can be overwritten by modifying this value. See https://github.com/nextcloud/all-in-one#how-to-add-php-extensions-permanently-to-the-nextcloud-container
# NEXTCLOUD_ENABLE_NVIDIA_GPU: true # This allows to enable the NVIDIA runtime and GPU access for containers that profit from it. ⚠️⚠️⚠️ Warning: this only works if an NVIDIA gpu is installed on the server. See https://github.com/nextcloud/all-in-one#how-to-enable-hardware-acceleration-for-nextcloud.
# NEXTCLOUD_KEEP_DISABLED_APPS: false # Setting this to true will keep Nextcloud apps that are disabled in the AIO interface and not uninstall them if they should be installed. See https://github.com/nextcloud/all-in-one#how-to-keep-disabled-apps
SKIP_DOMAIN_VALIDATION: true # This should only be set to true if things are correctly configured. See https://github.com/nextcloud/all-in-one#how-to-skip-the-domain-validation
# TALK_PORT: 3478 # This allows to adjust the port that the talk container is using which is exposed on the host. See https://github.com/nextcloud/all-in-one#how-to-adjust-the-talk-port
# WATCHTOWER_DOCKER_SOCKET_PATH: /var/run/docker.sock # Needs to be specified if the docker socket on the host is not located in the default '/var/run/docker.sock'. Otherwise mastercontainer updates will fail. For macos it needs to be '/var/run/docker.sock'
docker_compose_definition:
name: nextcloud-aio # Add the container to the same compose project like all the sibling containers are added to automatically.
services:
nextcloud-aio-mastercontainer:
image: ghcr.io/nextcloud-releases/all-in-one:latest # This is the container image used. You can switch to ghcr.io/nextcloud-releases/all-in-one:beta if you want to help testing new releases. See https://github.com/nextcloud/all-in-one#how-to-switch-the-channel
init: true # This setting makes sure that signals from main process inside the container are correctly forwarded to children. See https://docs.docker.com/reference/compose-file/services/#init
restart: always # This makes sure that the container starts always together with the host OS. See https://docs.docker.com/reference/compose-file/services/#restart
container_name: nextcloud-aio-mastercontainer # This line is not allowed to be changed as otherwise AIO will not work correctly
volumes:
- nextcloud_aio_mastercontainer:/mnt/docker-aio-config # This line is not allowed to be changed as otherwise the built-in backup solution will not work
- /var/run/docker.sock:/var/run/docker.sock:ro # May be changed on macOS, Windows or docker rootless. See the applicable documentation. If adjusting, don't forget to also set 'WATCHTOWER_DOCKER_SOCKET_PATH'!
# devices: ["/dev/dri"] # Uncomment to enable hardware acceleration. ⚠️⚠️⚠️ Warning: this only works if the '/dev/dri' device is present on the host! If it should not exist on your host, don't add this as otherwise the mastercontainer will fail to start! See https://github.com/nextcloud/all-in-one#how-to-enable-hardware-acceleration-for-nextcloud
network_mode: bridge # This adds the container to the same network as docker run would do. Comment this line and uncomment the line below and the networks section at the end of the file if you want to define a custom MTU size for the docker network
# networks: ["nextcloud-aio"]
ports:
# - "80:80" # Can be removed when running behind a web server or reverse proxy (like Apache, Nginx, Caddy, Cloudflare Tunnel and else). See https://github.com/nextcloud/all-in-one/blob/main/reverse-proxy.md
- "8080:8080" # This is the AIO interface, served via https and self-signed certificate. See https://github.com/nextcloud/all-in-one#explanation-of-used-ports
# - "8443:8443" # Can be removed when running behind a web server or reverse proxy (like Apache, Nginx, Caddy, Cloudflare Tunnel and else). See https://github.com/nextcloud/all-in-one/blob/main/reverse-proxy.md
# security_opt: ["label:disable"] # Is needed when using SELinux. See https://github.com/nextcloud/all-in-one#are-there-known-problems-when-selinux-is-enabled
env_file: ".env"
# # Optional: Caddy reverse proxy. See https://github.com/nextcloud/all-in-one/discussions/575
# # Alternatively, use Tailscale if you don't have a domain yet. See https://github.com/nextcloud/all-in-one/discussions/6817
# # Hint: You need to uncomment APACHE_PORT: 11000 above, adjust cloud.example.com to your domain and uncomment the necessary docker volumes at the bottom of this file in order to make it work
# # You can find further examples here: https://github.com/nextcloud/all-in-one/discussions/588
caddy:
image: caddy:alpine
restart: always
container_name: caddy
volumes:
- ./caddy_certs:/certs
- ./caddy_config:/config
- ./caddy_data:/data
- ./caddy_sites:/srv
network_mode: "host"
configs:
- source: Caddyfile
target: /etc/caddy/Caddyfile
configs:
Caddyfile:
content: |
{
local_certs
}
https://nextcloud.homedungeon.loc:443 {
reverse_proxy localhost:11000
}
volumes: # If you want to store the data on a different drive, see https://github.com/nextcloud/all-in-one#how-to-store-the-filesinstallation-on-a-separate-drive
nextcloud_aio_mastercontainer:
name: nextcloud_aio_mastercontainer # This line is not allowed to be changed as otherwise the built-in backup solution will not work
# caddy_certs:
# caddy_config:
# caddy_data:
# caddy_sites:
# # Adjust the MTU size of the docker network. See https://github.com/nextcloud/all-in-one#how-to-adjust-the-mtu-size-of-the-docker-network
# networks:
# nextcloud-aio:
# name: nextcloud-aio
# driver_opts:
# com.docker.network.driver.mtu: 1440

View File

@@ -37,99 +37,14 @@ btrfssubvol_subvolumes:
mount_point: /var/log
- name: snapshots
- name: nextcloud
owner: reaper
owner: www-data
group: www-data
mount_point: /nextcloud
- name: docker
mount_point: /docker
- name: compose
owner: reaper
mount_point: /compose
# compose
docker_compose_project_name: nextcloudaio
docker_compose_data_dir: /compose/nextcloudaio
docker_compose_dotenv: |
# AIO_DISABLE_BACKUP_SECTION: false # Setting this to true allows to hide the backup section in the AIO interface. See https://github.com/nextcloud/all-in-one#how-to-disable-the-backup-section
APACHE_PORT: 11000 # Is needed when running behind a web server or reverse proxy (like Apache, Nginx, Caddy, Cloudflare Tunnel and else). See https://github.com/nextcloud/all-in-one/blob/main/reverse-proxy.md
APACHE_IP_BINDING: 127.0.0.1 # Should be set when running behind a web server or reverse proxy (like Apache, Nginx, Caddy, Cloudflare Tunnel and else) that is running on the same host. See https://github.com/nextcloud/all-in-one/blob/main/reverse-proxy.md
# APACHE_ADDITIONAL_NETWORK: frontend_net # (Optional) Connect the apache container to an additional docker network. Needed when behind a web server or reverse proxy (like Apache, Nginx, Caddy, Cloudflare Tunnel and else) running in a different docker network on same server. See https://github.com/nextcloud/all-in-one/blob/main/reverse-proxy.md
# BORG_RETENTION_POLICY: --keep-within=7d --keep-weekly=4 --keep-monthly=6 # Allows to adjust borgs retention policy. See https://github.com/nextcloud/all-in-one#how-to-adjust-borgs-retention-policy
# COLLABORA_SECCOMP_DISABLED: false # Setting this to true allows to disable Collabora's Seccomp feature. See https://github.com/nextcloud/all-in-one#how-to-disable-collaboras-seccomp-feature
# DOCKER_API_VERSION: 1.44 # You can adjust the internally used docker api version with this variable. ⚠️⚠️⚠️ Warning: please note that only the default api version (unset this variable) is supported and tested by the maintainers of Nextcloud AIO. So use this on your own risk and things might break without warning. See https://github.com/nextcloud/all-in-one#how-to-adjust-the-internally-used-docker-api-version
# FULLTEXTSEARCH_JAVA_OPTIONS: "-Xms1024M -Xmx1024M" # Allows to adjust the fulltextsearch java options. See https://github.com/nextcloud/all-in-one#how-to-adjust-the-fulltextsearch-java-options
NEXTCLOUD_DATADIR: /nextcloud # Allows to set the host directory for Nextcloud's datadir. ⚠️⚠️⚠️ Warning: do not set or adjust this value after the initial Nextcloud installation is done! See https://github.com/nextcloud/all-in-one#how-to-change-the-default-location-of-nextclouds-datadir
# NEXTCLOUD_MOUNT: /mnt/ # Allows the Nextcloud container to access the chosen directory on the host. See https://github.com/nextcloud/all-in-one#how-to-allow-the-nextcloud-container-to-access-directories-on-the-host
# NEXTCLOUD_UPLOAD_LIMIT: 16G # Can be adjusted if you need more. See https://github.com/nextcloud/all-in-one#how-to-adjust-the-upload-limit-for-nextcloud
# NEXTCLOUD_MAX_TIME: 3600 # Can be adjusted if you need more. See https://github.com/nextcloud/all-in-one#how-to-adjust-the-max-execution-time-for-nextcloud
# NEXTCLOUD_MEMORY_LIMIT: 512M # Can be adjusted if you need more. See https://github.com/nextcloud/all-in-one#how-to-adjust-the-php-memory-limit-for-nextcloud
# NEXTCLOUD_TRUSTED_CACERTS_DIR: /path/to/my/cacerts # CA certificates in this directory will be trusted by the OS of the nextcloud container (Useful e.g. for LDAPS) See https://github.com/nextcloud/all-in-one#how-to-trust-user-defined-certification-authorities-ca
# NEXTCLOUD_STARTUP_APPS: deck twofactor_totp tasks calendar contacts notes # Allows to modify the Nextcloud apps that are installed on starting AIO the first time. See https://github.com/nextcloud/all-in-one#how-to-change-the-nextcloud-apps-that-are-installed-on-the-first-startup
# NEXTCLOUD_ADDITIONAL_APKS: imagemagick # This allows to add additional packages to the Nextcloud container permanently. Default is imagemagick but can be overwritten by modifying this value. See https://github.com/nextcloud/all-in-one#how-to-add-os-packages-permanently-to-the-nextcloud-container
# NEXTCLOUD_ADDITIONAL_PHP_EXTENSIONS: imagick # This allows to add additional php extensions to the Nextcloud container permanently. Default is imagick but can be overwritten by modifying this value. See https://github.com/nextcloud/all-in-one#how-to-add-php-extensions-permanently-to-the-nextcloud-container
# NEXTCLOUD_ENABLE_NVIDIA_GPU: true # This allows to enable the NVIDIA runtime and GPU access for containers that profit from it. ⚠️⚠️⚠️ Warning: this only works if an NVIDIA gpu is installed on the server. See https://github.com/nextcloud/all-in-one#how-to-enable-hardware-acceleration-for-nextcloud.
# NEXTCLOUD_KEEP_DISABLED_APPS: false # Setting this to true will keep Nextcloud apps that are disabled in the AIO interface and not uninstall them if they should be installed. See https://github.com/nextcloud/all-in-one#how-to-keep-disabled-apps
SKIP_DOMAIN_VALIDATION: true # This should only be set to true if things are correctly configured. See https://github.com/nextcloud/all-in-one#how-to-skip-the-domain-validation
# TALK_PORT: 3478 # This allows to adjust the port that the talk container is using which is exposed on the host. See https://github.com/nextcloud/all-in-one#how-to-adjust-the-talk-port
# WATCHTOWER_DOCKER_SOCKET_PATH: /var/run/docker.sock # Needs to be specified if the docker socket on the host is not located in the default '/var/run/docker.sock'. Otherwise mastercontainer updates will fail. For macos it needs to be '/var/run/docker.sock'
docker_compose_definition:
name: nextcloud-aio # Add the container to the same compose project like all the sibling containers are added to automatically.
services:
nextcloud-aio-mastercontainer:
image: ghcr.io/nextcloud-releases/all-in-one:latest # This is the container image used. You can switch to ghcr.io/nextcloud-releases/all-in-one:beta if you want to help testing new releases. See https://github.com/nextcloud/all-in-one#how-to-switch-the-channel
init: true # This setting makes sure that signals from main process inside the container are correctly forwarded to children. See https://docs.docker.com/reference/compose-file/services/#init
restart: always # This makes sure that the container starts always together with the host OS. See https://docs.docker.com/reference/compose-file/services/#restart
container_name: nextcloud-aio-mastercontainer # This line is not allowed to be changed as otherwise AIO will not work correctly
volumes:
- nextcloud_aio_mastercontainer:/mnt/docker-aio-config # This line is not allowed to be changed as otherwise the built-in backup solution will not work
- /var/run/docker.sock:/var/run/docker.sock:ro # May be changed on macOS, Windows or docker rootless. See the applicable documentation. If adjusting, don't forget to also set 'WATCHTOWER_DOCKER_SOCKET_PATH'!
# devices: ["/dev/dri"] # Uncomment to enable hardware acceleration. ⚠️⚠️⚠️ Warning: this only works if the '/dev/dri' device is present on the host! If it should not exist on your host, don't add this as otherwise the mastercontainer will fail to start! See https://github.com/nextcloud/all-in-one#how-to-enable-hardware-acceleration-for-nextcloud
network_mode: bridge # This adds the container to the same network as docker run would do. Comment this line and uncomment the line below and the networks section at the end of the file if you want to define a custom MTU size for the docker network
# networks: ["nextcloud-aio"]
ports:
# - "80:80" # Can be removed when running behind a web server or reverse proxy (like Apache, Nginx, Caddy, Cloudflare Tunnel and else). See https://github.com/nextcloud/all-in-one/blob/main/reverse-proxy.md
- "8080:8080" # This is the AIO interface, served via https and self-signed certificate. See https://github.com/nextcloud/all-in-one#explanation-of-used-ports
# - "8443:8443" # Can be removed when running behind a web server or reverse proxy (like Apache, Nginx, Caddy, Cloudflare Tunnel and else). See https://github.com/nextcloud/all-in-one/blob/main/reverse-proxy.md
# security_opt: ["label:disable"] # Is needed when using SELinux. See https://github.com/nextcloud/all-in-one#are-there-known-problems-when-selinux-is-enabled
env_file: ".env"
# # Optional: Caddy reverse proxy. See https://github.com/nextcloud/all-in-one/discussions/575
# # Alternatively, use Tailscale if you don't have a domain yet. See https://github.com/nextcloud/all-in-one/discussions/6817
# # Hint: You need to uncomment APACHE_PORT: 11000 above, adjust cloud.example.com to your domain and uncomment the necessary docker volumes at the bottom of this file in order to make it work
# # You can find further examples here: https://github.com/nextcloud/all-in-one/discussions/588
caddy:
image: caddy:alpine
restart: always
container_name: caddy
volumes:
- ./caddy_certs:/certs
- ./caddy_config:/config
- ./caddy_data:/data
- ./caddy_sites:/srv
network_mode: "host"
configs:
- source: Caddyfile
target: /etc/caddy/Caddyfile
configs:
Caddyfile:
content: |
# Adjust cloud.example.com to your domain below
https://nextcloud.homedungeon.loc:443 {
tls /certs/cert.pem /certs/key.pem
reverse_proxy localhost:11000
}
volumes: # If you want to store the data on a different drive, see https://github.com/nextcloud/all-in-one#how-to-store-the-filesinstallation-on-a-separate-drive
nextcloud_aio_mastercontainer:
name: nextcloud_aio_mastercontainer # This line is not allowed to be changed as otherwise the built-in backup solution will not work
# caddy_certs:
# caddy_config:
# caddy_data:
# caddy_sites:
# # Adjust the MTU size of the docker network. See https://github.com/nextcloud/all-in-one#how-to-adjust-the-mtu-size-of-the-docker-network
# networks:
# nextcloud-aio:
# name: nextcloud-aio
# driver_opts:
# com.docker.network.driver.mtu: 1440
docker_group: "reaper"
- name: monitoring
owner: reaper
mount_point: /monitoring

View File

@@ -1,2 +1,8 @@
[storage]
cm3588
cm3588
[nextcloudaio]
cm3588
[monitoring]
monitoring.homedungeon.loc

4
monitoring.yml Normal file
View File

@@ -0,0 +1,4 @@
- name: "Deploy monitoring stack docker"
hosts: monitoring
roles:
- role: docker_compose

View File

@@ -1,4 +1,4 @@
- name: "Deploy nextcloudaio docker"
hosts: storage
hosts: nextcloudaio
roles:
- role: docker_compose

View File

@@ -1,9 +1,9 @@
- hosts: storage
vars:
ansible_ssh_user: pi
ansible_sudo_pass: pi
ansible_ssh_pass: pi
ansible_host: "{{ ip_before_reboot }}"
# vars:
# ansible_ssh_user: pi
# ansible_sudo_pass: pi
# ansible_ssh_pass: pi
# ansible_host: "{{ ip_before_reboot }}"
roles:
- role: dns
tags: dns
@@ -46,7 +46,7 @@
file:
path: /etc/systemd/system/logrotate.timer.d
state: directory
mode: '0755'
mode: "0755"
owner: root
group: root
- name: Copy logrotate.timer
@@ -56,7 +56,7 @@
[Timer]
OnCalendar=hourly
AccuracySec=1m
mode: '0755'
mode: "0755"
owner: root
group: root
- name: Restart logrotate.timer
@@ -66,7 +66,7 @@
- name: "Mount new /var/log and move logs"
hosts: storage
tags: [ mount, logs ]
tags: [mount, logs]
roles:
- role: lingling9000.btrfssubvol
tags: btrfssubvol
@@ -80,15 +80,14 @@
- name: "Move logs to new directory"
when:
- has_var_log_mount
- _var_log_stat.stat.isdir is not defined or not _var_log_stat.stat.isdir
- has_var_log_mount
- _var_log_stat.stat.isdir is not defined or not _var_log_stat.stat.isdir
block:
- name: "Mount original rootfs and move files"
shell: "mkdir /tmp/mnt-rootfs && mount --bind / /tmp/mnt-rootfs && mv /tmp/mnt-rootfs/var/log/* /var/log/ && umount /tmp/mnt-rootfs && rmdir /tmp/mnt-rootfs"
rescue:
- name: "Unmount rootfs"
shell: "umount /tmp/mnt-rootfs && rmdir /tmp/mnt-rootfs"
# - name: install victoria exporters
# hosts: storage
# gather_facts: yes
@@ -96,4 +95,4 @@
# - role: cadvisor
# tags:
# - monitoring
# - cadvisor
# - cadvisor

View File

@@ -1,3 +1,6 @@
Host cm3588
User reaper
Hostname 192.168.8.2
Host monitoring.homedungeon.loc
User reaper

View File

@@ -0,0 +1,5 @@
route:
receiver: blackhole
receivers:
- name: blackhole

View File

@@ -0,0 +1,12 @@
global:
scrape_interval: 15s
scrape_timeout: 10s
scrape_configs:
{% for service in scrape %}
- job_name: "{{ service.name }}"
static_configs:
{% for target in service.targets %}
- targets: ["{{ target }}"]
{% endfor %}
{% endfor %}

View File

@@ -0,0 +1,128 @@
{% raw %}
# File contains default list of alerts for various VM components.
# The following alerts are recommended for use for any VM installation.
# The alerts below are just recommendations and may require some updates
# and threshold calibration according to every specific setup.
groups:
- name: vm-health
# note the `job` filter and update accordingly to your setup
rules:
- alert: TooManyRestarts
expr: changes(process_start_time_seconds{job=~".*(victoriametrics|vmselect|vminsert|vmstorage|vmagent|vmalert|vmsingle|vmalertmanager|vmauth).*"}[15m]) > 2
labels:
severity: critical
annotations:
summary: "{{ $labels.job }} too many restarts (instance {{ $labels.instance }})"
description: >
Job {{ $labels.job }} (instance {{ $labels.instance }}) has restarted more than twice in the last 15 minutes.
It might be crashlooping.
- alert: ServiceDown
expr: up{job=~".*(victoriametrics|vmselect|vminsert|vmstorage|vmagent|vmalert|vmsingle|vmalertmanager|vmauth).*"} == 0
for: 2m
labels:
severity: critical
annotations:
summary: "Service {{ $labels.job }} is down on {{ $labels.instance }}"
description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 2 minutes."
- alert: ProcessNearFDLimits
expr: (process_max_fds - process_open_fds) < 100
for: 5m
labels:
severity: critical
annotations:
summary: 'Number of free file descriptors is less than 100 for "{{ $labels.job }}"("{{ $labels.instance }}") for the last 5m'
description: |
Exhausting OS file descriptors limit can cause severe degradation of the process.
Consider to increase the limit as fast as possible.
- alert: TooHighMemoryUsage
expr: (min_over_time(process_resident_memory_anon_bytes[10m]) / vm_available_memory_bytes) > 0.8
for: 5m
labels:
severity: critical
annotations:
summary: 'It is more than 80% of memory used by "{{ $labels.job }}"("{{ $labels.instance }}")'
description: |
Too high memory usage may result into multiple issues such as OOMs or degraded performance.
Consider to either increase available memory or decrease the load on the process.
- alert: TooHighCPUUsage
expr: rate(process_cpu_seconds_total[5m]) / process_cpu_cores_available > 0.9
for: 5m
labels:
severity: critical
annotations:
summary: 'More than 90% of CPU is used by "{{ $labels.job }}"("{{ $labels.instance }}") during the last 5m'
description: >
Too high CPU usage may be a sign of insufficient resources and make process unstable.
Consider to either increase available CPU resources or decrease the load on the process.
- alert: TooHighGoroutineSchedulingLatency
expr: histogram_quantile(0.99, sum(rate(go_sched_latencies_seconds_bucket{job=~".*(victoriametrics|vmselect|vminsert|vmstorage|vmagent|vmalert|vmsingle|vmalertmanager|vmauth).*"}[5m])) by (le, job, instance)) > 0.1
for: 15m
labels:
severity: critical
annotations:
summary: '"{{ $labels.job }}"("{{ $labels.instance }}") has insufficient CPU resources for >15m'
description: >
Go runtime is unable to schedule goroutines execution in acceptable time. This is usually a sign of
insufficient CPU resources or CPU throttling. Verify that service has enough CPU resources. Otherwise,
the service could work unreliably with delays in processing.
- alert: TooManyLogs
expr: sum(increase(vm_log_messages_total{level!="info"}[5m])) without (app_version, location, is_printed) > 0
for: 15m
labels:
severity: warning
annotations:
summary: 'Too many logs are generated for job "{{ $labels.job }}" ({{ $labels.instance }})'
description: >
The job \"{{ $labels.job }}\" ({{ $labels.instance }}) generated {{ $value }} log messages with the level higher than info for the last 5 minutes.
Check the logs for the given target. Check also the \"location\" label at the vm_log_messages_total metric if -loggerLevel command-line flag is set to value other than INFO.
This label contains code locations responsible for generating log messages suppressed by -loggerLevel.
- alert: ConcurrentInsertsHitTheLimit
expr: avg_over_time(vm_concurrent_insert_current[1m]) >= vm_concurrent_insert_capacity
for: 15m
labels:
severity: warning
annotations:
summary: "{{ $labels.job }} on instance {{ $labels.instance }} is constantly hitting concurrent inserts limit"
description: |
The limit of concurrent inserts on instance {{ $labels.instance }} depends on the number of CPUs.
Usually, when component constantly hits the limit it is likely the component is overloaded and requires more CPU.
In some cases for components like vmagent or vminsert the alert might trigger if there are too many clients
making write attempts. If vmagent's or vminsert's CPU usage and network saturation are at normal level, then
it might be worth adjusting `-maxConcurrentInserts` cmd-line flag.
- alert: TooHighQueryLoad
expr: increase(vm_concurrent_select_limit_timeout_total[5m]) > 0
for: 15m
labels:
severity: warning
annotations:
summary: "Read queries fail with timeout for {{ $labels.job }} on instance {{ $labels.instance }}"
description: |
Instance {{ $labels.instance }} ({{ $labels.job }}) is failing to serve read queries during last 15m.
Concurrency limit `-search.maxConcurrentRequests` was reached on this instance and extra queries were
put into the queue for `-search.maxQueueDuration` interval. But even after waiting in the queue these queries weren't served.
This happens if instance is overloaded with the current workload, or datasource is too slow to respond.
Possible solutions are the following:
* reduce the query load;
* increase compute resources or number of replicas;
* adjust limits `-search.maxConcurrentRequests` and `-search.maxQueueDuration`.
See more at https://docs.victoriametrics.com/victoriametrics/troubleshooting/#slow-queries.
- alert: RowsRejectedOnIngestion
expr: rate(vm_rows_ignored_total[5m]) > 0
for: 15m
labels:
severity: warning
annotations:
summary: 'Some rows are rejected on "{{ $labels.instance }}" on ingestion attempt'
description:
'Ingested rows on instance "{{ $labels.instance }}" are rejected due to the
following reason: "{{ $labels.reason }}"'
{% endraw %}

View File

@@ -0,0 +1,203 @@
{% raw %}
# File contains default list of alerts for VictoriaMetrics single server.
# The alerts below are just recommendations and may require some updates
# and threshold calibration according to every specific setup.
groups:
# Alerts group for VM single assumes that Grafana dashboard
# https://grafana.com/grafana/dashboards/10229 is installed.
# Pls update the `dashboard` annotation according to your setup.
- name: vmsingle
interval: 30s
concurrency: 2
rules:
- alert: DiskRunsOutOfSpaceIn3Days
expr: |
sum(vm_free_disk_space_bytes) without(path) /
(
(rate(vm_rows_added_to_storage_total[1d]) - sum(rate(vm_deduplicated_samples_total[1d])) without(type)) * (
sum(vm_data_size_bytes{type!~"indexdb.*"}) without(type) /
sum(vm_rows{type!~"indexdb.*"}) without(type)
)
+
rate(vm_new_timeseries_created_total[1d]) * (
sum(vm_data_size_bytes{type="indexdb/file"}) without(type)/
sum(vm_rows{type="indexdb/file"}) without(type)
)
) < 3 * 24 * 3600 > 0
for: 30m
labels:
severity: critical
annotations:
dashboard: "{{ $externalURL }}/d/wNf0q_kZk?viewPanel=53&var-instance={{ $labels.instance }}"
summary: "Instance {{ $labels.instance }} will run out of disk space soon"
description:
"Taking into account current ingestion rate, free disk space will be enough only
for {{ $value | humanizeDuration }} on instance {{ $labels.instance }}.\n
Consider to limit the ingestion rate, decrease retention or scale the disk space if possible."
- alert: NodeBecomesReadonlyIn3Days
expr: |
sum(vm_free_disk_space_bytes - vm_free_disk_space_limit_bytes) without(path) /
(
(rate(vm_rows_added_to_storage_total[1d]) - sum(rate(vm_deduplicated_samples_total[1d])) without(type)) * (
sum(vm_data_size_bytes{type!~"indexdb.*"}) without(type) /
sum(vm_rows{type!~"indexdb.*"}) without(type)
)
+
rate(vm_new_timeseries_created_total[1d]) * (
sum(vm_data_size_bytes{type="indexdb/file"}) without(type) /
sum(vm_rows{type="indexdb/file"}) without(type)
)
) < 3 * 24 * 3600 > 0
for: 30m
labels:
severity: warning
annotations:
dashboard: "{{ $externalURL }}/d/oS7Bi_0Wz?viewPanel=53&var-instance={{ $labels.instance }}"
summary: "Instance {{ $labels.instance }} will become read-only in 3 days"
description:
"Taking into account current ingestion rate and free disk space
instance {{ $labels.instance }} is writable for {{ $value | humanizeDuration }}.\n
Consider to limit the ingestion rate, decrease retention or scale the disk space up if possible."
- alert: DiskRunsOutOfSpace
expr: |
sum(vm_data_size_bytes) by(job, instance) /
(
sum(vm_free_disk_space_bytes) by(job, instance) +
sum(vm_data_size_bytes) by(job, instance)
) > 0.8
for: 30m
labels:
severity: critical
annotations:
dashboard: "{{ $externalURL }}/d/wNf0q_kZk?viewPanel=53&var-instance={{ $labels.instance }}"
summary: "Instance {{ $labels.instance }} (job={{ $labels.job }}) will run out of disk space soon"
description:
"Disk utilisation on instance {{ $labels.instance }} is more than 80%.\n
Having less than 20% of free disk space could cripple merge processes and overall performance.
Consider to limit the ingestion rate, decrease retention or scale the disk space if possible."
- alert: RequestErrorsToAPI
expr: increase(vm_http_request_errors_total[5m]) > 0
for: 15m
labels:
severity: warning
annotations:
dashboard: "{{ $externalURL }}/d/wNf0q_kZk?viewPanel=35&var-instance={{ $labels.instance }}"
summary: "Too many errors served for path {{ $labels.path }} (instance {{ $labels.instance }})"
description:
"Requests to path {{ $labels.path }} are receiving errors.
Please verify if clients are sending correct requests."
- alert: TooHighChurnRate
expr: |
(
sum(rate(vm_new_timeseries_created_total[5m])) by(instance)
/
sum(rate(vm_rows_inserted_total[5m])) by(instance)
) > 0.1
for: 15m
labels:
severity: warning
annotations:
dashboard: "{{ $externalURL }}/d/wNf0q_kZk?viewPanel=66&var-instance={{ $labels.instance }}"
summary: 'Churn rate is more than 10% on "{{ $labels.instance }}" for the last 15m'
description:
"VM constantly creates new time series on \"{{ $labels.instance }}\".\n
This effect is known as Churn Rate.\n
High Churn Rate is tightly connected with database performance and may
result in unexpected OOM's or slow queries."
- alert: TooHighChurnRate24h
expr: |
sum(increase(vm_new_timeseries_created_total[24h])) by(instance)
>
(sum(vm_cache_entries{type="storage/hour_metric_ids"}) by(instance) * 3)
for: 15m
labels:
severity: warning
annotations:
dashboard: "{{ $externalURL }}/d/wNf0q_kZk?viewPanel=66&var-instance={{ $labels.instance }}"
summary: 'Too high number of new series on "{{ $labels.instance }}" created over last 24h'
description:
"The number of created new time series over last 24h is 3x times higher than
current number of active series on \"{{ $labels.instance }}\".\n
This effect is known as Churn Rate.\n
High Churn Rate is tightly connected with database performance and may
result in unexpected OOM's or slow queries."
- alert: TooHighSlowInsertsRate
expr: |
(
sum(rate(vm_slow_row_inserts_total[5m])) by(instance)
/
sum(rate(vm_rows_inserted_total[5m])) by(instance)
) > 0.05
for: 15m
labels:
severity: warning
annotations:
dashboard: "{{ $externalURL }}/d/wNf0q_kZk?viewPanel=68&var-instance={{ $labels.instance }}"
summary: 'Percentage of slow inserts is more than 5% on "{{ $labels.instance }}" for the last 15m'
description:
'High rate of slow inserts on "{{ $labels.instance }}" may be a sign of resource exhaustion
for the current load. It is likely more RAM is needed for optimal handling of the current number of active time series.
See also https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3976#issuecomment-1476883183'
- alert: MetadataCacheUtilizationIsTooHigh
expr: |
vm_metrics_metadata_storage_size_bytes / vm_metrics_metadata_storage_max_size_bytes > 0.95
for: 15m
labels:
severity: warning
annotations:
summary: "Metadata cache capacity on {{ $labels.instance }} (job={{ $labels.job }}) is utilized for more than 95% for the last 15min"
description:
"Metadata cache stores meta information about ingested time series - see https://docs.victoriametrics.com/victoriametrics/#metrics-metadata.
When cache is overutilized, the oldest entries will be dropped out automatically. It may result into incomplete
response for /api/v1/metadata API calls. It doesn't impact regular queries or alerts. Cache size is controlled
via -storage.maxMetadataStorageSize cmd-line flag."
- alert: MetricNameStatsCacheUtilizationIsTooHigh
expr: |
vm_cache_size_bytes{type="storage/metricNamesStatsTracker"} / vm_cache_size_max_bytes{type="storage/metricNamesStatsTracker"} > 0.95
for: 15m
labels:
severity: warning
annotations:
summary: "Cache capacity for tracking metric names usage on {{ $labels.instance }} (job={{ $labels.job }}) is utilized for more than 95% during the last 15min"
description:
"Metric names usage cache stores information about unique metric names and how frequently they are queried - see https://docs.victoriametrics.com/victoriametrics/#track-ingested-metrics-usage.
When cache is overutilized, it will stop tracking the new metric names. It has no other negative impact.
Usually, the number of unique metric names is very limited (thousands). The cache can be overutilized only if metric names
are changing too frequently or if the cache size is too low. There are following ways to mitigate cache overutilization:
- disable cache via `--storage.trackMetricNamesStats=false` flag, so metric names usage will stop tracking
- increase the cache size via `--storage.cacheSizeMetricNamesStats` flag
- reset the cache (see docs for details)"
- alert: IndexDBRecordsDrop
expr: increase(vm_indexdb_items_dropped_total[5m]) > 0
labels:
severity: critical
annotations:
summary: "IndexDB skipped registering items during data ingestion with reason={{ $labels.reason }}."
description: |
VictoriaMetrics could skip registering new timeseries during ingestion if they fail the validation process.
For example, `reason=too_long_item` means that time series cannot exceed 64KB. Please, reduce the number
of labels or label values for such series. Or enforce these limits via `-maxLabelsPerTimeseries` and
`-maxLabelValueLen` command-line flags.
- alert: TooManyTSIDMisses
expr: increase(vm_missing_tsids_for_metric_id_total[5m]) > 0
for: 15m
labels:
severity: critical
annotations:
summary: 'Unexpected TSID misses for job "{{ $labels.job }}" ({{ $labels.instance }}) for the last 15 minutes'
description: |
Unexpected TSID misses for \"{{ $labels.job }}\" ({{ $labels.instance }}) for the last 15 minutes.
If this happens after unclean shutdown of VictoriaMetrics process (via \"kill -9\", OOM or power off),
then this is OK - the alert must go away in a few minutes after the restart.
Otherwise this may point to the corruption of index data.
{% endraw %}

View File

@@ -0,0 +1,206 @@
{% raw %}
# File contains default list of alerts for vmagent service.
# The alerts below are just recommendations and may require some updates
# and threshold calibration according to every specific setup.
groups:
# Alerts group for vmagent assumes that Grafana dashboard
# https://grafana.com/grafana/dashboards/12683 is installed.
# Pls update the `dashboard` annotation according to your setup.
- name: vmagent
interval: 30s
concurrency: 2
rules:
- alert: PersistentQueueIsDroppingData
expr: sum(increase(vm_persistentqueue_bytes_dropped_total[5m])) without (path) > 0
for: 10m
labels:
severity: critical
annotations:
dashboard: "{{ $externalURL }}/d/G7Z9GzMGz?viewPanel=49&var-instance={{ $labels.instance }}"
summary: "Instance {{ $labels.instance }} is dropping data from persistent queue"
description: "Vmagent dropped {{ $value | humanize1024 }} from persistent queue
on instance {{ $labels.instance }} for the last 10m."
- alert: RejectedRemoteWriteDataBlocksAreDropped
expr: sum(increase(vmagent_remotewrite_packets_dropped_total[5m])) without (url) > 0
for: 15m
labels:
severity: warning
annotations:
dashboard: "{{ $externalURL }}/d/G7Z9GzMGz?viewPanel=79&var-instance={{ $labels.instance }}"
summary: "Vmagent is dropping data blocks that are rejected by remote storage"
description: "Job \"{{ $labels.job }}\" on instance {{ $labels.instance }} drops the rejected by
remote-write server data blocks. Check the logs to find the reason for rejects."
- alert: TooManyScrapeErrors
expr: increase(vm_promscrape_scrapes_failed_total[5m]) > 0
for: 15m
labels:
severity: warning
annotations:
dashboard: "{{ $externalURL }}/d/G7Z9GzMGz?viewPanel=31&var-instance={{ $labels.instance }}"
summary: "Vmagent fails to scrape one or more targets"
description: "Job \"{{ $labels.job }}\" on instance {{ $labels.instance }} fails to scrape targets for last 15m"
- alert: ScrapePoolHasNoTargets
expr: sum(vm_promscrape_scrape_pool_targets) without (status, instance, pod) == 0
for: 30m
labels:
severity: warning
annotations:
summary: "Vmagent has scrape_pool with 0 configured/discovered targets"
description: "Vmagent \"{{ $labels.job }}\" has scrape_pool \"{{ $labels.scrape_job }}\"
with 0 discovered targets. It is likely a misconfiguration. Please follow https://docs.victoriametrics.com/victoriametrics/vmagent/#debugging-scrape-targets
to troubleshoot the scraping config."
- alert: TooManyWriteErrors
expr: |
(sum(increase(vm_ingestserver_request_errors_total[5m])) without (name,net,type)
+
sum(increase(vmagent_http_request_errors_total[5m])) without (path,protocol)) > 0
for: 15m
labels:
severity: warning
annotations:
dashboard: "{{ $externalURL }}/d/G7Z9GzMGz?viewPanel=77&var-instance={{ $labels.instance }}"
summary: "Vmagent responds with too many errors on data ingestion protocols"
description: "Job \"{{ $labels.job }}\" on instance {{ $labels.instance }} responds with errors to write requests for last 15m."
- alert: TooManyRemoteWriteErrors
expr: rate(vmagent_remotewrite_retries_count_total[5m]) > 0
for: 15m
labels:
severity: warning
annotations:
dashboard: "{{ $externalURL }}/d/G7Z9GzMGz?viewPanel=61&var-instance={{ $labels.instance }}"
summary: "Job \"{{ $labels.job }}\" on instance {{ $labels.instance }} fails to push to remote storage"
description: "Vmagent fails to push data via remote write protocol to destination \"{{ $labels.url }}\"\n
Ensure that destination is up and reachable."
- alert: RemoteWriteConnectionIsSaturated
expr: |
(
rate(vmagent_remotewrite_send_duration_seconds_total[5m])
/
vmagent_remotewrite_queues
) > 0.9
for: 15m
labels:
severity: warning
annotations:
dashboard: "{{ $externalURL }}/d/G7Z9GzMGz?viewPanel=84&var-instance={{ $labels.instance }}"
summary: "Remote write connection from \"{{ $labels.job }}\" (instance {{ $labels.instance }}) to {{ $labels.url }} is saturated"
description: "The remote write connection between vmagent \"{{ $labels.job }}\" (instance {{ $labels.instance }}) and destination \"{{ $labels.url }}\"
is saturated by more than 90% and vmagent won't be able to keep up.\n
There could be the following reasons for this:\n
* vmagent can't send data fast enough through the existing network connections. Increase `-remoteWrite.queues` cmd-line flag value to establish more connections per destination.\n
* remote destination can't accept data fast enough. Check if remote destination has enough resources for processing."
- alert: PersistentQueueForWritesIsSaturated
expr: rate(vm_persistentqueue_write_duration_seconds_total[5m]) > 0.9
for: 15m
labels:
severity: warning
annotations:
dashboard: "{{ $externalURL }}/d/G7Z9GzMGz?viewPanel=98&var-instance={{ $labels.instance }}"
summary: "Persistent queue writes for instance {{ $labels.instance }} are saturated"
description: "Persistent queue writes for vmagent \"{{ $labels.job }}\" (instance {{ $labels.instance }})
are saturated by more than 90% and vmagent won't be able to keep up with flushing data on disk.
In this case, consider to decrease load on the vmagent or improve the disk throughput."
- alert: PersistentQueueForReadsIsSaturated
expr: rate(vm_persistentqueue_read_duration_seconds_total[5m]) > 0.9
for: 15m
labels:
severity: warning
annotations:
dashboard: "{{ $externalURL }}/d/G7Z9GzMGz?viewPanel=99&var-instance={{ $labels.instance }}"
summary: "Persistent queue reads for instance {{ $labels.instance }} are saturated"
description: "Persistent queue reads for vmagent \"{{ $labels.job }}\" (instance {{ $labels.instance }})
are saturated by more than 90% and vmagent won't be able to keep up with reading data from the disk.
In this case, consider to decrease load on the vmagent or improve the disk throughput."
- alert: SeriesLimitHourReached
expr: (vmagent_hourly_series_limit_current_series / vmagent_hourly_series_limit_max_series) > 0.9
labels:
severity: critical
annotations:
dashboard: "{{ $externalURL }}/d/G7Z9GzMGz?viewPanel=88&var-instance={{ $labels.instance }}"
summary: "Instance {{ $labels.instance }} reached 90% of the limit"
description: "Max series limit set via -remoteWrite.maxHourlySeries flag is close to reaching the max value.
Then samples for new time series will be dropped instead of sending them to remote storage systems."
- alert: SeriesLimitDayReached
expr: (vmagent_daily_series_limit_current_series / vmagent_daily_series_limit_max_series) > 0.9
labels:
severity: critical
annotations:
dashboard: "{{ $externalURL }}/d/G7Z9GzMGz?viewPanel=90&var-instance={{ $labels.instance }}"
summary: "Instance {{ $labels.instance }} reached 90% of the limit"
description: "Max series limit set via -remoteWrite.maxDailySeries flag is close to reaching the max value.
Then samples for new time series will be dropped instead of sending them to remote storage systems."
- alert: ConfigurationReloadFailure
expr: |
vm_promscrape_config_last_reload_successful != 1
or
vmagent_relabel_config_last_reload_successful != 1
labels:
severity: warning
annotations:
summary: "Configuration reload failed for vmagent instance {{ $labels.instance }}"
description: "Configuration hot-reload failed for vmagent on instance {{ $labels.instance }}.
Check vmagent's logs for detailed error message."
- alert: StreamAggrFlushTimeout
expr: |
increase(vm_streamaggr_flush_timeouts_total[5m]) > 0
labels:
severity: warning
annotations:
summary: "Streaming aggregation at \"{{ $labels.job }}\" (instance {{ $labels.instance }}) can't be finished within the configured aggregation interval."
description: "Stream aggregation process can't keep up with the load and might produce incorrect aggregation results. Check logs for more details.
Possible solutions: increase aggregation interval; aggregate smaller number of series; reduce samples' ingestion rate to stream aggregation."
- alert: StreamAggrDedupFlushTimeout
expr: |
increase(vm_streamaggr_dedup_flush_timeouts_total[5m]) > 0
labels:
severity: warning
annotations:
summary: "Deduplication \"{{ $labels.job }}\" (instance {{ $labels.instance }}) can't be finished within configured deduplication interval."
description: "Deduplication process can't keep up with the load and might produce incorrect results. Check docs https://docs.victoriametrics.com/victoriametrics/stream-aggregation/#deduplication and logs for more details.
Possible solutions: increase deduplication interval; deduplicate smaller number of series; reduce samples' ingestion rate."
- alert: PersistentQueueRunsOutOfSpaceIn12Hours
expr: |
(
(sum(vm_persistentqueue_free_disk_space_bytes) by (job, instance, path)) /
(sum(deriv(vm_persistentqueue_bytes_pending[1m])) by (job, instance, path) > 0)
) *
on(job, instance, path) group_left(url)
(vmagent_remotewrite_pending_data_bytes * 0 + 1) < 12 * 3600 >0
for: 10m
labels:
severity: warning
annotations:
summary: "Persistent Queue (url {{ $labels.url }}) of {{ $labels.instance }} (job:{{ $labels.job }}) will run out of space in 12 hours."
description: "RemoteWrite destination ({{ $labels.url }}) is unavailable or unable to receive data in a timely manner, so the persistent queue size is growing.
Once the available space is exhausted, some samples will be discarded and cause incident. Please check the health of remoteWrite destination ({{ $labels.url }})."
- alert: PersistentQueueRunsOutOfSpaceIn4Hours
expr: |
(
(sum(vm_persistentqueue_free_disk_space_bytes) by (job, instance, path)) /
(sum(deriv(vm_persistentqueue_bytes_pending[1m])) by (job, instance, path) > 0)
) *
on(job, instance, path) group_left(url)
(vmagent_remotewrite_pending_data_bytes * 0 + 1) < 4 * 3600 >0
for: 10m
labels:
severity: critical
annotations:
summary: "Persistent Queue (url {{ $labels.url }}) of {{ $labels.instance }} (job:{{ $labels.job }}) will run out of space in 4 hours."
description: "RemoteWrite destination ({{ $labels.url }}) is unavailable or unable to receive data in a timely manner, so the persistent queue size is growing.
Once the available space is exhausted, some samples will be discarded and cause incident. Please check the health of remoteWrite destination ({{ $labels.url }})."
{% endraw %}

View File

@@ -0,0 +1,112 @@
{% raw %}
# File contains default list of alerts for vmalert service.
# The alerts below are just recommendations and may require some updates
# and threshold calibration according to every specific setup.
groups:
# Alerts group for vmalert assumes that Grafana dashboard
# https://grafana.com/grafana/dashboards/14950 is installed.
# Pls update the `dashboard` annotation according to your setup.
- name: vmalert
interval: 30s
rules:
- alert: ConfigurationReloadFailure
expr: vmalert_config_last_reload_successful != 1
labels:
severity: warning
annotations:
summary: "Configuration reload failed for vmalert instance {{ $labels.instance }}"
description: "Configuration hot-reload failed for vmalert on instance {{ $labels.instance }}.
Check vmalert's logs for detailed error message."
- alert: AlertingRulesError
expr: sum(increase(vmalert_alerting_rules_errors_total[5m])) without(id) > 0
for: 5m
labels:
severity: warning
annotations:
dashboard: "{{ $externalURL }}/d/LzldHAVnz?viewPanel=13&var-instance={{ $labels.instance }}&var-file={{ $labels.file }}&var-group={{ $labels.group }}"
summary: "Alerting rules are failing for vmalert instance {{ $labels.instance }}"
description: "Alerting rules execution is failing for \"{{ $labels.alertname }}\" from group \"{{ $labels.group }}\" in file \"{{ $labels.file }}\".
Check vmalert's logs for detailed error message."
- alert: RecordingRulesError
expr: sum(increase(vmalert_recording_rules_errors_total[5m])) without(id) > 0
for: 5m
labels:
severity: warning
annotations:
dashboard: "{{ $externalURL }}/d/LzldHAVnz?viewPanel=30&var-instance={{ $labels.instance }}&var-file={{ $labels.file }}&var-group={{ $labels.group }}"
summary: "Recording rules are failing for vmalert instance {{ $labels.instance }}"
description: "Recording rules execution is failing for \"{{ $labels.recording }}\" from group \"{{ $labels.group }}\" in file \"{{ $labels.file }}\".
Check vmalert's logs for detailed error message."
- alert: RecordingRulesNoData
expr: sum(vmalert_recording_rules_last_evaluation_samples) without(id) < 1
for: 30m
labels:
severity: info
annotations:
dashboard: "{{ $externalURL }}/d/LzldHAVnz?viewPanel=33&var-file={{ $labels.file }}&var-group={{ $labels.group }}"
summary: "Recording rule {{ $labels.recording }} ({{ $labels.group }}) produces no data"
description: "Recording rule \"{{ $labels.recording }}\" from group \"{{ $labels.group }}\ in file \"{{ $labels.file }}\"
produces 0 samples over the last 30min. It might be caused by a misconfiguration
or incorrect query expression."
- alert: TooManyMissedIterations
expr: increase(vmalert_iteration_missed_total[5m]) > 0
for: 15m
labels:
severity: warning
annotations:
summary: "vmalert instance {{ $labels.instance }} is missing rules evaluations"
description: "vmalert instance {{ $labels.instance }} is missing rules evaluations for group \"{{ $labels.group }}\" in file \"{{ $labels.file }}\".
The group evaluation time takes longer than the configured evaluation interval. This may result in missed
alerting notifications or recording rules samples. Try increasing evaluation interval or concurrency of
group \"{{ $labels.group }}\". See https://docs.victoriametrics.com/victoriametrics/vmalert/#groups.
If rule expressions are taking longer than expected, please see https://docs.victoriametrics.com/victoriametrics/troubleshooting/#slow-queries."
- alert: RemoteWriteErrors
expr: increase(vmalert_remotewrite_errors_total[5m]) > 0
for: 15m
labels:
severity: warning
annotations:
summary: "vmalert instance {{ $labels.instance }} is failing to push metrics to remote write URL"
description: "vmalert instance {{ $labels.instance }} is failing to push metrics generated via alerting
or recording rules to the configured remote write URL. Check vmalert's logs for detailed error message."
- alert: RemoteWriteDroppingData
expr: increase(vmalert_remotewrite_dropped_rows_total[5m]) > 0
for: 5m
labels:
severity: critical
annotations:
summary: "vmalert instance {{ $labels.instance }} is dropping data sent to remote write URL"
description: "vmalert instance {{ $labels.instance }} is failing to send results of alerting or recording rules
to the configured remote write URL. This may result into gaps in recording rules or alerts state.
Check vmalert's logs for detailed error message."
- alert: RemoteWriteQueueHighUsage
expr: histogram_quantile(0.99, sum(increase(vmalert_remotewrite_queue_size_bucket[5m])) by (job, instance, vmrange)) / vmalert_remotewrite_queue_capacity > 0.8
for: 15m
labels:
severity: warning
annotations:
summary: "Remote write queue capacity on the vmalert instance {{ $labels.instance }} has exceeded 80% utilization"
description: "The remote write queue on vmalert instance {{ $labels.instance }} has consistently high utilization.
The queue acts as a buffer between rules generating series and remote-write client consuming and pushing these series. When queue overflows, vmalert will start dropping newly generated series.
Queue may overflow due to multiple reasons:
1. Some bad rules produce too many series at once. This can be limited using the global `-rule.resultsLimit` flag or `limit` param at the rule group level.
2. Remote write connection is slow. Increase `-remoteWrite.concurrency`, so vmalert could establish more concurrent connections.
3. The queue size is too small. Increase `-remoteWrite.maxQueueSize` to extend the buffer size. Note that a larger queue will result in higher memory consumption when the queue is full."
- alert: AlertmanagerErrors
expr: increase(vmalert_alerts_send_errors_total[5m]) > 0
for: 15m
labels:
severity: warning
annotations:
summary: "vmalert instance {{ $labels.instance }} is failing to send notifications to Alertmanager"
description: "vmalert instance {{ $labels.instance }} is failing to send alert notifications to \"{{ $labels.addr }}\".
Check vmalert's logs for detailed error message."
{% endraw %}