网站运营
位置:首页>> 网站运营>> 使用docker部署grafana+prometheus配置

使用docker部署grafana+prometheus配置

作者:runzhao  发布时间:2022-05-02 15:36:33 

标签:docker,grafana,prometheus

docker-compose-monitor.yml


# Monitoring stack: InfluxDB + Telegraf alongside Prometheus, Alertmanager,
# Grafana, node-exporter and cAdvisor.
version: '2'

networks:
  # Bridge network joined by the Prometheus-side services below.
  monitor:
    driver: bridge

services:
  influxdb:
    image: influxdb:latest
    container_name: tig-influxdb
    ports:
      - "18083:8083"
      - "18086:8086"
      - "18090:8090"
    env_file:
      - 'env.influxdb'
    volumes:
      # Data persistence. Create the host directory first:
      #   sudo mkdir -p ./influxdb/data
      - ./influxdb/data:/var/lib/influxdb
      # Read-only timezone files so the container clock uses UTC+8
      # (assumes ./timezone and ./localtime hold UTC+8 data -- confirm).
      - ./timezone:/etc/timezone:ro
      - ./localtime:/etc/localtime:ro
    # Restart automatically unless the container was stopped explicitly.
    restart: unless-stopped

  telegraf:
    image: telegraf:latest
    container_name: tig-telegraf
    links:
      - influxdb
    volumes:
      - ./telegraf.conf:/etc/telegraf/telegraf.conf:ro
      - ./timezone:/etc/timezone:ro
      - ./localtime:/etc/localtime:ro
    restart: unless-stopped

  prometheus:
    image: prom/prometheus
    container_name: prometheus
    hostname: prometheus
    restart: always
    volumes:
      # Main configuration and the alert rule file it references.
      - /home/qa/docker/grafana/prometheus.yml:/etc/prometheus/prometheus.yml
      - /home/qa/docker/grafana/node_down.yml:/etc/prometheus/node_down.yml
    ports:
      - '9090:9090'
    networks:
      - monitor

  alertmanager:
    image: prom/alertmanager
    container_name: alertmanager
    hostname: alertmanager
    restart: always
    volumes:
      - /home/qa/docker/grafana/alertmanager.yml:/etc/alertmanager/alertmanager.yml
    ports:
      - '9093:9093'
    networks:
      - monitor

  grafana:
    image: grafana/grafana:6.7.4
    container_name: grafana
    hostname: grafana
    restart: always
    ports:
      # Grafana UI is published on host port 13000.
      - '13000:3000'
    networks:
      - monitor

  node-exporter:
    image: quay.io/prometheus/node-exporter
    container_name: node-exporter
    hostname: node-exporter
    restart: always
    ports:
      - '9100:9100'
    networks:
      - monitor

  cadvisor:
    image: google/cadvisor:latest
    container_name: cadvisor
    hostname: cadvisor
    restart: always
    volumes:
      # Host paths mounted into the cAdvisor container.
      - /:/rootfs:ro
      - /var/run:/var/run:rw
      - /sys:/sys:ro
      - /var/lib/docker/:/var/lib/docker:ro
    ports:
      - '18080:8080'
    networks:
      - monitor

alertmanager.yml


# Alertmanager configuration: every alert is routed to one e-mail receiver.
global:
  resolve_timeout: 5m
  # SMTP settings. The mailbox, password and recipient values below are
  # placeholders and must be replaced with real ones.
  smtp_from: '邮箱'
  smtp_smarthost: 'smtp.exmail.qq.com:25'
  smtp_auth_username: '邮箱'
  smtp_auth_password: '密码'
  # NOTE(review): TLS is not required here while SMTP auth is configured on
  # port 25 -- confirm that sending credentials without TLS is acceptable.
  smtp_require_tls: false
  smtp_hello: 'qq.com'

route:
  # Alerts are grouped by alert name before notification.
  group_by: ['alertname']
  group_wait: 5s
  group_interval: 5s
  repeat_interval: 5m
  receiver: 'email'

receivers:
  - name: 'email'
    email_configs:
      - to: '收件邮箱'
        # Also notify when the alert resolves.
        send_resolved: true

inhibit_rules:
  # Mute 'warning' alerts while a 'critical' alert with the same
  # alertname, dev and instance labels is firing.
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'dev', 'instance']

prometheus.yml


# Prometheus server configuration: global intervals, Alertmanager endpoint,
# rule files, and the node-exporter / cAdvisor scrape jobs.
global:
  # Scrape every 15 seconds (the default is every 1 minute).
  scrape_interval: 15s
  # Evaluate rules every 15 seconds (the default is every 1 minute).
  evaluation_interval: 15s
  # scrape_timeout is left at the global default (10s).

# Alertmanager configuration
alerting:
  alertmanagers:
    - static_configs:
        - targets: ['192.168.32.117:9093']
        # - alertmanager:9093

# Rules are loaded once and evaluated periodically according to the global
# 'evaluation_interval'.
rule_files:
  - "node_down.yml"
  # - "node-exporter-alert-rules.yml"
  # - "first_rules.yml"
  # - "second_rules.yml"

# The job name is added as a label `job=<job_name>` to every time series
# scraped from that job.
scrape_configs:
  # I/O storage node group
  - job_name: 'io'
    scrape_interval: 8s
    # Ports are the ones node-exporter is started on.
    static_configs:
      - targets:
          - '192.168.32.117:9100'
          - '192.168.32.196:9100'
          - '192.168.32.136:9100'
          - '192.168.32.193:9100'
          - '192.168.32.153:9100'
          - '192.168.32.185:9100'
          - '192.168.32.190:19100'
          - '192.168.32.192:9100'

  - job_name: 'cadvisor'
    # Ports are the ones cAdvisor is started on.
    static_configs:
      - targets:
          - '192.168.32.117:18080'
          - '192.168.32.193:8080'
          - '192.168.32.153:8080'
          - '192.168.32.185:8080'
          - '192.168.32.190:18080'
          - '192.168.32.192:18080'

node_down.yml


# Prometheus alerting rules, all in the single rule group "node_down".
groups:
  - name: node_down
    rules:
      # A scrape target has reported up == 0 for one minute.
      - alert: InstanceDown
        expr: up == 0
        for: 1m
        labels:
          user: test
        annotations:
          summary: 'Instance {{ $labels.instance }} down'
          description: '{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 1 minutes.'

      # Available memory is below 10%.
      - alert: '剩余内存小于10%'
        expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: 'Host out of memory (instance {{ $labels.instance }})'
          description: "Node memory is filling up (< 10% left)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

      # Available disk space is below 10% (read-only filesystems excluded).
      - alert: '剩余磁盘小于10%'
        expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: 'Host out of disk space (instance {{ $labels.instance }})'
          description: "Disk is almost full (< 10% left)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

      # Non-idle CPU, averaged per instance over 2 minutes, exceeds 80%.
      - alert: 'CPU负载 > 80%'
        expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])) * 100) > 80
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: 'Host high CPU load (instance {{ $labels.instance }})'
          description: "CPU load is > 80%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

更多告警规则参考:https://awesome-prometheus-alerts.grep.to/rules#prometheus-self-monitoring

Grafana 官方仪表盘:https://grafana.com/grafana/dashboards/

来源:https://www.cnblogs.com/runzhao/p/15716274.html

0
投稿

猜你喜欢

手机版 网站运营 asp之家 www.aspxhome.com