diff --git a/ansible/roles/prometheus/tasks/main.yml b/ansible/roles/prometheus/tasks/main.yml deleted file mode 100644 index e28a039..0000000 --- a/ansible/roles/prometheus/tasks/main.yml +++ /dev/null @@ -1,153 +0,0 @@ ---- -- name: check if installing a different version of prometheus - tags: - - prometheus - shell: "cat {{ prometheus_config }}/.sha256" - ignore_errors: true - register: checksum - -- name: determine if prometheus {{ prometheus_version}} should be installed - tags: - - prometheus - set_fact: - prometheus_install: (checksum.stdout.find(prometheus_sha256) == -1) or (prometheus_force is defined) - -- name: download and checksum prometheus {{ prometheus_version }} tarball - tags: - - prometheus - when: prometheus_install - get_url: - url: "{{ prometheus_url }}" - dest: "{{ prometheus_tgz }}" - checksum: "sha256:{{ prometheus_sha256 }}" - -- name: unpack prometheus tarball - tags: - - prometheus - when: prometheus_install - unarchive: - src: "{{ prometheus_tgz }}" - dest: "/tmp/" - copy: false - -- name: create prometheus user - tags: - - prometheus - when: prometheus_install - user: - name: prometheus - shell: "/bin/false" - comment: "prometheus service user" - -- name: check if prometheus service exists - tags: - - prometheus - stat: - path: "/etc/systemd/system/prometheus.service" - register: svccheck - -- name: stop prometheus service if it exists - tags: - - prometheus - when: svccheck.stat.exists - service: - name: prometheus - state: stopped - -- name: copy prometheus binaries to {{ host_bin_path }} - tags: - - prometheus - when: prometheus_install - command: "cp {{ prometheus_tmp }}/{{ item }} {{ host_bin_path }}/" - with_items: - - "prometheus" - - "promtool" - -- name: set permissions on prometheus binaries - tags: - - prometheus - when: prometheus_install - file: - path: "{{ host_bin_path }}/{{ item }}" - owner: prometheus - group: prometheus - mode: 0755 - with_items: - - "prometheus" - - "promtool" - -- name: create prometheus config directory - tags: - - prometheus - when: prometheus_install - file: - path: "{{ prometheus_config }}" - state: directory - owner: prometheus - group: prometheus - mode: 0755 - -- name: create prometheus configuration - tags: - - prometheus - template: - src: prometheus.yml.j2 - dest: "{{ prometheus_config }}/prometheus.yml" - with_items: "{{ prometheus_jobs }}" - -- name: create prometheus alerts file - tags: - - prometheus - template: - src: alerts.rules.j2 - dest: "{{ prometheus_config }}/alerts.rules" - -- name: create prometheus data directory - tags: - - prometheus - when: prometheus_install - file: - path: "{{ prometheus_data }}" - state: directory - owner: prometheus - group: prometheus - mode: 0755 - -- name: create prometheus systemd unit - tags: - - prometheus - template: - src: prometheus.service.j2 - dest: "/etc/systemd/system/prometheus.service" - -- name: reload systemd configurations - tags: - - prometheus - command: "systemctl daemon-reload" - -- name: start prometheus service - tags: - - prometheus - service: - name: prometheus - state: started - enabled: true - -- name: remove temporary prometheus files - tags: - - prometheus - when: prometheus_install - file: - path: "{{ item }}" - state: absent - with_items: - - "{{ prometheus_tmp }}" - - "{{ prometheus_tgz }}" - -- name: create checksum file for prometheus {{ prometheus_version }} - tags: - - prometheus - when: prometheus_install - template: - src: sha256.j2 - dest: "{{ prometheus_config }}/.sha256" diff --git a/ansible/roles/prometheus/templates/alerts.rules.j2 b/ansible/roles/prometheus/templates/alerts.rules.j2 deleted file mode 100644 index 1f70e1b..0000000 --- a/ansible/roles/prometheus/templates/alerts.rules.j2 +++ /dev/null @@ -1,53 +0,0 @@ -# {{ ansible_managed }} -groups: -- name: default - rules: - - alert: InstanceDown - # rtorrent can be flappy and needs a longer interval. - expr: up{instance!~"nerr-3.*",job!="rtorrent"} == 0 - for: 2m - labels: - severity: critical - annotations: -{% raw %} - summary: "Instance {{ $labels.instance }} is down." - description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 2 minutes." -{% endraw %} - - alert: ServiceDown - expr: probe_success{instance!~"nerr-3.*"} == 0 - for: 10m - labels: - severity: critical - annotations: -{% raw %} - summary: "Service {{ $labels.instance }} is down." - description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 2 minutes." -{% endraw %} - - alert: RtorrentDown - # rtorrent can be flappy and needs a longer interval. - expr: up{job="rtorrent"} == 0 - for: 120m - labels: - severity: critical - annotations: -{% raw %} - summary: "rTorrent {{ $labels.instance }} is down." -{% endraw %} - - alert: TLSCertificateNearExpiration - expr: probe_ssl_earliest_cert_expiry - time() < 60 * 60 * 24 * 2 - for: 1m - labels: - severity: critical - annotations: -{% raw %} - summary: "TLS certificate for {{ $labels.instance }} will expire in less than 2 days." -{% endraw %} - - alert: DiskUsageHigh - expr: (1 - node_filesystem_free_bytes{fstype=~"ext4|vfat"} / node_filesystem_size_bytes) > 0.75 - for: 1m - labels: - severity: critical - annotations: -{% raw %} - summary: "Disk usage on {{ $labels.instance }}:{{$labels.mountpoint}} ({{$labels.device}}) exceeds 75%." -{% endraw %} diff --git a/ansible/roles/prometheus/templates/prometheus.service.j2 b/ansible/roles/prometheus/templates/prometheus.service.j2 deleted file mode 100644 index a93e3cc..0000000 --- a/ansible/roles/prometheus/templates/prometheus.service.j2 +++ /dev/null @@ -1,20 +0,0 @@ -# {{ ansible_managed }} -[Unit] -Description=Prometheus metrics and monitoring system -Documentation=https://prometheus.io/docs/introduction/overview/ -Wants=network-online.target -After=network-online.target - -[Service] -User=prometheus -Group=prometheus -Type=simple -ExecStart={{ host_bin_path }}/prometheus \ - --config.file {{ prometheus_config }}/prometheus.yml \ - --storage.tsdb.path {{ prometheus_data }} \ - --storage.tsdb.retention.time 180d \ - --log.level debug \ - --web.external-url {{ prometheus_external_url }} - -[Install] -WantedBy=multi-user.target diff --git a/ansible/roles/prometheus/templates/prometheus.yml.j2 b/ansible/roles/prometheus/templates/prometheus.yml.j2 deleted file mode 100644 index 5dd6cb4..0000000 --- a/ansible/roles/prometheus/templates/prometheus.yml.j2 +++ /dev/null @@ -1,42 +0,0 @@ -# {{ ansible_managed }} -# prometheus configuration, Matt Layher, 2016-04-15 - -global: - scrape_interval: 15s - evaluation_interval: 15s - -rule_files: - - alerts.rules - -scrape_configs: -{% for job in prometheus_jobs %} - - job_name: "{{ job.job }}" - {% if job.scrape_interval is defined %} - scrape_interval: "{{ job.scrape_interval }}" - {% endif %} - static_configs: - - targets: {{ job.targets | to_yaml }} -{% if job.scheme is defined %} - scheme: "{{ job.scheme }}" -{% endif %} -{% if job.path is defined %} - metrics_path: "{{ job.path }}" -{% endif %} -{% if job.params is defined %} - params: - {{ job.params | to_yaml }} -{% endif %} -{% if job.relabel_target is defined %} - relabel_configs: - - source_labels: [__address__] - target_label: __param_target - - source_labels: [__param_target] - target_label: instance - - target_label: __address__ - replacement: "{{ job.relabel_target }}" -{% endif %} -{% endfor %} -alerting: - alertmanagers: - - static_configs: - - targets: {{ prometheus_alertmanagers | to_yaml }} diff --git a/ansible/roles/prometheus/templates/sha256.j2 b/ansible/roles/prometheus/templates/sha256.j2 deleted file mode 100644 index 789d586..0000000 --- a/ansible/roles/prometheus/templates/sha256.j2 +++ /dev/null @@ -1,3 +0,0 @@ -# {{ ansible_managed }} -# Remove this file to force ansible to reinstall prometheus {{ prometheus_version }}. -{{ prometheus_sha256 }} \ No newline at end of file diff --git a/ansible/roles/prometheus/vars/main.yml b/ansible/roles/prometheus/vars/main.yml deleted file mode 100644 index bdbe6d2..0000000 --- a/ansible/roles/prometheus/vars/main.yml +++ /dev/null @@ -1,117 +0,0 @@ ---- -# Prometheus version and checksum configuration. -prometheus_version: "2.15.2" -prometheus_sha256: "579f800ec3ec2dc9a36d2d513e7800552cf6b0898f87a8abafd54e73b53f8ad0" -# Prometheus alerting configuration. -prometheus_alertmanagers: - - "monitnerr-1:9093" -# Prometheus jobs configuration. -prometheus_jobs: - - job: "apcupsd" - targets: - - "servnerr-3:9162" - - "nerr-3:9162" - - job: "blackbox_http_2xx" - path: "/probe" - params: - module: [http_2xx] - targets: - - "https://grafana.servnerr.com" - # TODO: move onto server and re-enable. - # - "https://homeassistant.servnerr.com" - relabel_target: "monitnerr-1:9115" - - job: "blackbox_http_401" - path: "/probe" - params: - module: [http_401] - targets: - - "https://alertmanager.servnerr.com" - - "https://plex.servnerr.com" - - "https://prometheus.servnerr.com" - relabel_target: "monitnerr-1:9115" - - job: "blackbox_mdlayhercom" - # Netlify can be flappy at times, so check this less often. - scrape_interval: "1m" - path: "/probe" - params: - module: [http_2xx] - targets: - - "https://mdlayher.com" - relabel_target: "monitnerr-1:9115" - - job: "blackbox_ssh" - # This generates a lot of noise in OpenSSH logs, so do it less often. - scrape_interval: "1m" - path: "/probe" - params: - module: [ssh_banner] - targets: - - "monitnerr-1:22" - - "nerr-3:22" - - "routnerr-2:22" - - "servnerr-3:22" - - "unifi.servnerr.com:22" - relabel_target: "monitnerr-1:9115" - - job: "blackbox_exporter" - targets: - - "monitnerr-1:9115" - - job: "coredns" - targets: - - "routnerr-2:9153" - - job: "corerad" - targets: - - "routnerr-2:9430" - - job: "hdhomerun" - targets: - - "hdhomerun" - relabel_target: "servnerr-3:9137" - - job: "loki" - targets: - - "servnerr-3:3100" - - job: "node" - targets: - - "monitnerr-1:9100" - - "nerr-3:9100" - - "routnerr-2:9100" - - "servnerr-3:9100" - - job: "prometheus" - scrape_interval: "5s" - targets: - - "servnerr-3:9090" - - job: "promtail" - targets: - - "servnerr-3:9080" - - job: "rtorrent" - scrape_interval: "30s" - targets: - - "servnerr-3:9135" - - job: "snmp_exporter" - targets: - - "servnerr-3:9116" - - job: "snmp" - path: "/snmp" - params: - module: [if_mib] - targets: - - "switch-livingroom01" - - "switch-office01" - - "ap-livingroom02" - relabel_target: "servnerr-3:9116" - - job: "traefik" - targets: - - "routnerr-2:8080" -# TODO: disabled until wg-dynamic work continues. -# - job: "wgipamd" -# targets: -# - "routnerr-2:9475" -# TODO: temporarily disabled while I get this working on NixOS. -# - job: "wireguard" -# targets: -# - "routnerr-2:9586" -# Static configuration. -prometheus_config: "/etc/prometheus" -prometheus_data: "/var/lib/prometheus" -prometheus_external_url: "https://prometheus.servnerr.com" -prometheus_dir: "prometheus-{{ prometheus_version }}.linux-amd64" -prometheus_tmp: "/tmp/{{ prometheus_dir }}" -prometheus_url: "https://github.com/prometheus/prometheus/releases/download/v{{ prometheus_version }}/{{ prometheus_dir }}.tar.gz" -prometheus_tgz: "/tmp/prometheus.tar.gz"