參考:docker-prometheus/docker-compose.yml at master · Kev1nChan/docker-prometheus · GitHub
使用 Raspberry Pi 3/4 注意使用 32 位元都會有機會遇到記憶體問題
建議使用 64 bit
uname -m
不是 aarch64 都是 32bit
架設 Prometheus
調整部份 Raspberry PI
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
|
version: '3.3'

volumes:
  # Named volumes so metric and dashboard data survive container re-creation.
  prometheus_data: {}
  grafana_data: {}

networks:
  monitoring:
    driver: bridge

services:
  prometheus:
    image: prom/prometheus:v2.26.0
    container_name: prometheus
    restart: always
    volumes:
      - /etc/localtime:/etc/localtime:ro
      - $PWD/prometheus/:/etc/prometheus/
      - prometheus_data:/prometheus
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--web.console.libraries=/usr/share/prometheus/console_libraries'
      - '--web.console.templates=/usr/share/prometheus/consoles'
    networks:
      - monitoring
    # NOTE(review): `links` is legacy Compose syntax; services on the same
    # user-defined network already resolve each other by service name.
    links:
      - alertmanager
      - cadvisor
    expose:
      - '9090'
    ports:
      # Port mappings are quoted so YAML never applies implicit typing to them.
      - '9090:9090'
    depends_on:
      - cadvisor

  alertmanager:
    image: prom/alertmanager:v0.21.0
    container_name: alertmanager
    restart: always
    volumes:
      - /etc/localtime:/etc/localtime:ro
      - $PWD/alertmanager/:/etc/alertmanager/
    command:
      - '--config.file=/etc/alertmanager/config.yml'
      - '--storage.path=/alertmanager'
    networks:
      - monitoring
    expose:
      - '9093'
    ports:
      - '9093:9093'

  cadvisor:
    # ARM build of cAdvisor; upstream image is gcr.io/cadvisor/cadvisor:v0.39.0
    image: budry/cadvisor-arm:latest
    container_name: cadvisor
    restart: always
    volumes:
      - /etc/localtime:/etc/localtime:ro
      - /:/rootfs:ro
      - /var/run:/var/run:rw
      - /sys:/sys:ro
      - /var/lib/docker/:/var/lib/docker:ro
    networks:
      - monitoring
    expose:
      - '8080'

  node_exporter:
    # Official image; previously prom/node-exporter:v0.18.0
    image: quay.io/prometheus/node-exporter:latest
    container_name: node-exporter
    restart: always
    volumes:
      - /etc/localtime:/etc/localtime:ro
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/rootfs:ro
    command:
      - '--path.procfs=/host/proc'
      - '--path.sysfs=/host/sys'
      - '--collector.filesystem.ignored-mount-points'
      # $$ escapes a literal $ for Compose variable interpolation
      # (regex end-of-string anchor).
      - '^/(sys|proc|dev|host|etc|rootfs/var/lib/docker/containers|rootfs/var/lib/docker/overlay2|rootfs/run/docker/netns|rootfs/var/lib/docker/aufs)($$|/)'
    networks:
      - monitoring
    expose:
      - '9100'

  grafana:
    image: grafana/grafana:7.5.2
    # Run as a non-root uid so the data volume is writable.
    # NOTE(review): Grafana >= 5.1 images use uid 472, not 104 — confirm this
    # matches the image's grafana user.
    user: "104"
    container_name: grafana
    restart: always
    volumes:
      - /etc/localtime:/etc/localtime:ro
      - grafana_data:/var/lib/grafana
      - $PWD/grafana/provisioning/:/etc/grafana/provisioning/
    env_file:
      - $PWD/grafana/config.monitoring
    networks:
      - monitoring
    links:
      - prometheus
    ports:
      - '3000:3000'
    depends_on:
      - prometheus
|
接下來設定在 Grafana 設定 Promethus Datasource 設定
就完成了
相關 alertmanager 調整請看 15分鐘建製 Promethus 環境
Node Exporter
我原本以為 Node Exporter 要抓本機機器一定要用 Systemd 去執行程式
但看範例是用 Docker
感覺不一定要用 Systemd
這邊我機器都有安裝 Docker
所以我就沒選擇用這個方式
使用 Systemd 方式記錄一下
可參考
- node_exporter 配置 - 简书
- Installing node_exporter as systemd serivice · GitHub
1
2
3
4
5
6
7
8
9
10
11
12
13
|
# Write a systemd unit for node_exporter. The quoted heredoc delimiter
# (<<"EOF") stops the shell from expanding $OPTIONS here; systemd fills it in
# from EnvironmentFile, and the leading "-" makes that file optional.
sudo tee /etc/systemd/system/node_exporter.service <<"EOF"
[Unit]
Description=Node Exporter
[Service]
User=node_exporter
Group=node_exporter
EnvironmentFile=-/etc/sysconfig/node_exporter
ExecStart=/usr/local/bin/node_exporter $OPTIONS
[Install]
WantedBy=multi-user.target
EOF
|
docker 要怎麼用呢?
其實上面就寫了
官方寫法跟我的有點不太一樣
1
2
3
4
5
6
|
# Official node_exporter invocation: share the host's network and PID
# namespaces and bind-mount the whole root filesystem read-only at /host.
docker run -d \
--net="host" \
--pid="host" \
-v "/:/host:ro,rslave" \
quay.io/prometheus/node-exporter:latest \
--path.rootfs=/host
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
|
---
version: '3.8'
services:
  node_exporter:
    image: quay.io/prometheus/node-exporter:latest
    container_name: node_exporter
    command:
      - '--path.rootfs=/host'
    # Host network and PID namespaces let the exporter report the host's real
    # interfaces and processes instead of the container's (this is the layout
    # recommended in the node_exporter README).
    network_mode: host
    pid: host
    restart: unless-stopped
    volumes:
      - '/:/host:ro,rslave'
|
這邊不太明白 network_mode 要用 host 模式?
最後我選擇調整別台使用 docker
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
|
---
version: '3.8'
services:
  node_exporter:
    image: quay.io/prometheus/node-exporter:latest
    container_name: node-exporter
    restart: always
    volumes:
      - /etc/localtime:/etc/localtime:ro
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/rootfs:ro
    command:
      - '--path.procfs=/host/proc'
      - '--path.sysfs=/host/sys'
      - '--collector.filesystem.ignored-mount-points'
      # $$ escapes a literal $ for Compose variable interpolation.
      - '^/(sys|proc|dev|host|etc|rootfs/var/lib/docker/containers|rootfs/var/lib/docker/overlay2|rootfs/run/docker/netns|rootfs/var/lib/docker/aufs)($$|/)'
    expose:
      - '9100'
|
docker-compose up 起來
prometheus/config.yml 修改
1
2
3
4
5
6
7
8
9
10
|
- job_name: 'node-exporter'
  # Override the global default and scrape targets from this job every 5 seconds.
  # metrics_path defaults to '/metrics'
  # scheme defaults to 'http'.
  scrape_interval: 5s
  static_configs:
    - targets:
        # Service name resolves via the Compose network; the rest are remote hosts.
        - 'node_exporter:9100'
        - '192.168.1.xx:9100'
        - '192.168.1.xx:9100'
|
有遇到
1
2
|
ERROR: Version in "./docker-compose.yml" is unsupported. You might be seeing this error because you're using the wrong Compose file version. Either specify a supported version (e.g "2.2" or "3.3") and place your service definitions under the `services` key, or omit the `version` key and place your service definitions at the root of the file to use version 1.
For more on the Compose file format versions, see https://docs.docker.com/compose/compose-file/
|
請更新 docker-compose version
armv6 (pi 1 ) 不能跑
自己找網路上 build 都失敗
仔細想想,真的 PI 效能真的很低
所以就不放這個了
1 Node Exporter for Prometheus Dashboard CN v20201010 dashboard for Grafana | Grafana Labs
Node Exporter Full dashboard for Grafana | Grafana Labs
transmission
仔細看了一下 dashboard
兩個差異還滿大的
主要我是要看網路傳輸速度
所以我這邊選擇 python 版本
1
2
3
|
git clone https://github.com/sandrotosi/simple-transmission-exporter.git
# Edit the Dockerfile to use an arm32v7 base image
|
1
2
|
FROM arm32v7/python:3.9
...
|
1
2
3
4
5
6
7
|
# Build the exporter image, then run it pointed at the Transmission RPC host.
docker build -t transmission_exporter .
docker run -e TRANSMISSION_HOST=192.168.x.x \
-e TRANSMISSION_PORT=9091 \
-e TRANSMISSION_USERNAME=admin \
-e TRANSMISSION_PASSWORD=xxx \
-d -p 29091:29091 transmission_exporter
|
prometheus 設定增加後,重啟服務docker-compose restart prometheus
1
2
3
4
5
6
7
|
- job_name: 'transmission-exporter'
  # No scrape_interval override here, so the global default applies.
  # metrics_path defaults to '/metrics'
  # scheme defaults to 'http'.
  static_configs:
    - targets:
        - '192.168.1.203:29091'
|
調整成 docker-compose
換成 docker-compose 就不能運作了…
但一般docker 指令可正常執行…
真的很奇怪
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
|
version: '2'
services:
  transmission_exporter:
    # Build from the locally-cloned repo (Dockerfile edited for arm32v7).
    build:
      context: .
      dockerfile: Dockerfile
    image: transmission_exporter
    container_name: transmission_exporter
    ports:
      - "29091:29091"
    environment:
      - "TRANSMISSION_HOST=192.168.1.203"
      - "TRANSMISSION_PORT=9091"
      - "TRANSMISSION_USERNAME=admin"
      - "TRANSMISSION_PASSWORD=admin"
|
20210408
後來發現 docker-compose up 起來 IP 好像不一樣
所以才不能跑
看 transmission 設定有一個設定白名單
1
2
|
"rpc-whitelist": "127.0.0.1,192.168.1.*,172.17.0.*",
"rpc-whitelist-enabled": true,
|
172.17.0.* 預設會過
但是 docker-compose 起來不是這段 IP
透過 Docker Compose 設定 network | Titangene Blog
目前指令不先研究怎麼使用
最快方法就是把IP調成172.*.*.*
traefik
1
2
3
4
5
|
command:
- "--api.insecure=true"
....(省略)
- "--accesslog=true"
- "--metrics.prometheus=true"
|
原生 traefik 就有支援 prometheus
加上這個--metrics.prometheus=true
就能使用
grafana 加上 Traefik dashboard for Grafana | Grafana Labs 會看到
1
|
Panel plugin not found: grafana-piechart-panel
|
docker-traefik-prometheus/config.monitoring at master · vegasbrianc/docker-traefik-prometheus · GitHub
這邊有看到跑 docker 前
可以在這邊設定這個
這樣可以忽略下面安裝 plugin
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
|
[root@prometheus prometheus]# grafana-cli plugins install grafana-piechart-panel
installing grafana-piechart-panel @ 1.6.1
from: https://grafana.com/api/plugins/grafana-piechart-panel/versions/1.6.1/download
into: /var/lib/grafana/plugins
✔ Installed grafana-piechart-panel successfully
Restart grafana after installing plugins . <service grafana-server restart>
[root@prometheus prometheus]# systemctl restart grafana-server.service
|
參考:解决Panel plugin not found: grafana-piechart-panel - 知我知行
但我這邊是用docker
1
2
|
# Install the pie-chart panel from inside the running Grafana container,
# then restart Grafana for it to be picked up.
docker-compose exec grafana bash
grafana-cli plugins install grafana-piechart-panel
|
我後來是用這個 dashboard
Traefik 2.2 dashboard for Grafana | Grafana Labs
後來我發現 docker-traefik-prometheus/config.monitoring at master · vegasbrianc/docker-traefik-prometheus · GitHub
1
|
GF_INSTALL_PLUGINS=grafana-piechart-panel
|
process exporter
目前沒用到
是看程式有沒有執行
但目前相對程式都有 exporter
就沒有使用
Prometheus — Process-exporter进程监控 - huandada - 博客园
Blackbox Exporter
老實說,剛看好難入手
實作完才知道大概在做什麼
簡單來說,所有控制點都在 prometheus
blackbox 都是收到接收做事情簡查
不多說,馬上實作
先git clone https://github.com/prometheus/blackbox_exporter.git
下來
1
|
docker run --rm -d -p 9115:9115 --name blackbox_exporter -v `pwd`:/config prom/blackbox-exporter:master --config.file=/config/blackbox.yml
|
prometheus 設定
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
|
- job_name: 'blackbox'
  metrics_path: /probe
  params:
    module: [http_2xx]  # Look for a HTTP 200 response.
  static_configs:
    - targets:
        - https://xxx.web  # Target to probe with https.
  relabel_configs:
    # Rewrite so the listed URL becomes the ?target= parameter and the scrape
    # itself goes to the blackbox exporter.
    - source_labels: [__address__]
      target_label: __param_target
    - source_labels: [__param_target]
      target_label: instance
    - target_label: __address__
      replacement: 192.168.1.203:9115  # The blackbox exporter's real hostname:port.
- job_name: blackbox-ping
  metrics_path: /probe
  params:
    module: [icmp]
  static_configs:
    - targets:
        - 192.168.1.xx  # <== Put here your targets
        - 192.168.1.xx  # <== Put here your targets
  relabel_configs:  # <== This comes from the blackbox exporter README
    - source_labels: [__address__]
      target_label: __param_target
    - source_labels: [__param_target]
      target_label: instance
    - target_label: __address__
      replacement: 192.168.1.203:9115  # Blackbox exporter.
|
接下來重啟 docker-compose restart prometheus
就能解決了
調整為 docker-compose
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
|
version: '3.3'
services:
  blackbox_exporter:
    image: prom/blackbox-exporter:master
    container_name: blackbox_exporter
    restart: always
    volumes:
      - /etc/localtime:/etc/localtime:ro
      # Working directory (holding blackbox.yml) mounted at /config.
      - $PWD:/config
    command:
      - '--config.file=/config/blackbox.yml'
    expose:
      - '9115'
    ports:
      # Quoted to avoid YAML implicit typing on the port mapping.
      - '9115:9115'
|
參考:
How to ping targets using blackbox_exporter with prometheus - Stack Overflow
Blackbox Exporter 小記 - Potioneer’s Essays
网络探测:Blackbox Exporter - prometheus-book
grafana 7 监控https证书过期时间 - 海口-熟练工 - 博客园
官方 exporter 清單
Exporters and integrations | Prometheus
安裝 alertmanager-discord
首先 git clone 下來改 arm32v7 映像檔
golang build 參數調整
再 build 看看有沒有問題
1
|
git clone https://github.com/benjojo/alertmanager-discord.git
|
Dockerfile build 調整如下
中間有遇到一些狀況
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
|
# Built following
# https://medium.com/@chemidy/create-the-smallest-and-secured-golang-docker-image-based-on-scratch-4752223b7324

# STEP 1: build the executable binary on an ARMv7 Go image.
FROM arm32v7/golang as builder

# Install git and SSL CA certificates; -y keeps the build non-interactive.
RUN apt update
RUN apt install -y git ca-certificates

# Create an unprivileged user to run the binary as.
# --disabled-password/--gecos keep adduser from prompting during the build.
RUN adduser --disabled-password --gecos "" appuser

COPY . $GOPATH/src/mypackage/myapp/
WORKDIR $GOPATH/src/mypackage/myapp/

# Get dependencies.
RUN go get -d -v

# Build a static ARMv7 binary. (The original had a line-wrap artifact that
# split the output name into "alertma nager-discord".)
RUN CGO_ENABLED=0 GOOS=linux GOARCH=arm GOARM=7 go build -a -installsuffix cgo -ldflags="-w -s" -o /go/bin/alertmanager-discord

# STEP 2: build a small runtime image.
FROM hypriot/rpi-alpine-scratch
COPY --from=builder /etc/ssl/certs/ca-certificates.crt /etc/ssl/certs/
COPY --from=builder /etc/passwd /etc/passwd
# Copy our static executable.
COPY --from=builder /go/bin/alertmanager-discord /go/bin/alertmanager-discord
ENV LISTEN_ADDRESS=0.0.0.0:9094
EXPOSE 9094
USER appuser
ENTRYPOINT ["/go/bin/alertmanager-discord"]
|
1
|
docker build -t alertmanager-discord .
|
完成,之前我寫的 line 測試的 curl 用在這個會失敗
直接設定到 alertmanager 測試正常
直接docker run 跑起來吧
1
|
docker run --rm -d -e DISCORD_WEBHOOK=https://discord.com/api/webhooks/xxxxxxx -p 9094:9094 alertmanager-discord
|
alertmanager/config.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
|
route:
  group_by: ['alertname']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 10m
  # Default receiver — all alerts go to the Discord bridge below.
  receiver: discord_webhook
receivers:
  - name: 'live-monitoring'
    # Mailbox that receives alert mail (unused unless a route selects it).
    email_configs:
      - to: 'your-email@163.com'
  - name: 'discord_webhook'
    webhook_configs:
      # alertmanager-discord bridge listening on 9094.
      - url: 'http://192.168.1.203:9094'
|
把一個 exporter 看會不會有任何通知
這邊注意
我用 alertmanager-discord 發現 job_name 不能設定 _
樣子
導致有時候發送訊息會異常!!
這邊要注意一下
接下來學習目標
如何設定 alertmanager
發現 PromQL 還是有學必要
因為設定 alertmanager 必須要會這個東西
還有細項動態設定 label …
先訂好一個大坑
https://www.itread01.com/content/1545670504.html
暫時用的 alertmanager rule
參考一些規則:node_exporter 配置 - 簡書
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
|
groups:
  - name: example
    rules:
      # Alert for any instance that is unreachable for >2 minutes.
      - alert: service_down
        expr: up == 0
        for: 2m
        labels:
          severity: page
        annotations:
          summary: "Instance {{ $labels.instance }} down"
          description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 2 minutes."
      # Disk: fires when less than 5% of the filesystem is available.
      - alert: 磁盤容量小於 5%
        expr: 100 - ((node_filesystem_avail_bytes{job="node-exporter",mountpoint=~".*",fstype=~"ext4|xfs|ext2|ext3"} * 100) / node_filesystem_size_bytes {job="node-exporter",mountpoint=~".*",fstype=~"ext4|xfs|ext2|ext3"}) > 95
        for: 2m
        annotations:
          summary: "服務器實例 {{ $labels.instance }} 磁盤不足 告警通知"
          description: "{{ $labels.instance }}磁盤 {{ $labels.device }} 資源 已不足 5%, 當前值: {{ $value }}"
      # Memory: fires when more than 80% of RAM is in use.
      - alert: "內存容量小於 20%"
        expr: ((node_memory_MemTotal_bytes - node_memory_MemFree_bytes - node_memory_Buffers_bytes - node_memory_Cached_bytes) / (node_memory_MemTotal_bytes )) * 100 > 80
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "服務器實例 {{ $labels.instance }} 內存不足 告警通知"
          description: "{{ $labels.instance }}內存資源已不足 20%,當前值: {{ $value }}"
      # NOTE(review): the annotation key below was misspelled "sumary" in the
      # original on every remaining rule — fixed to "summary" so notification
      # templates that read .Annotations.summary work.
      - alert: "CPU 平均負載大於 4 個"
        expr: node_load5 > 4
        for: 2m
        annotations:
          summary: "服務器實例 {{ $labels.instance }} CPU 負載 告警通知"
          description: "{{ $labels.instance }}CPU 平均負載(5 分鐘) 已超過 4 ,當前值: {{ $value }}"
      - alert: "磁盤讀 I/O 超過 30MB/s"
        expr: irate(node_disk_read_bytes_total{device="sda"}[1m]) > 30000000
        for: 2m
        annotations:
          summary: "服務器實例 {{ $labels.instance }} I/O 讀負載 告警通知"
          description: "{{ $labels.instance }}I/O 每分鐘讀已超過 30MB/s,當前值: {{ $value }}"
      - alert: "磁盤寫 I/O 超過 30MB/s"
        expr: irate(node_disk_written_bytes_total{device="sda"}[1m]) > 30000000
        for: 2m
        annotations:
          summary: "服務器實例 {{ $labels.instance }} I/O 寫負載 告警通知"
          description: "{{ $labels.instance }}I/O 每分鐘寫已超過 30MB/s,當前值: {{ $value }}"
      - alert: "網卡流出速率大於 10MB/s"
        expr: (irate(node_network_transmit_bytes_total{device!~"lo"}[1m]) / 1000) > 1000000
        for: 2m
        annotations:
          summary: "服務器實例 {{ $labels.instance }} 網卡流量負載 告警通知"
          description: "{{ $labels.instance }}網卡 {{ $labels.device }} 流量已經超過 10MB/s, 當前值: {{ $value }}"
      - alert: "CPU 使用率大於 90%"
        expr: 100 - ((avg by (instance,job,env)(irate(node_cpu_seconds_total{mode="idle"}[2m]))) *100) > 90
        for: 2m
        annotations:
          summary: "服務器實例 {{ $labels.instance }} CPU 使用率 告警通知"
          description: "{{ $labels.instance }}CPU 使用率已超過 90%, 當前值: {{ $value }}"
      # Cert expiring within 30 days (probed via blackbox); no annotations yet.
      - alert: SSLCertExpiringSoon
        expr: probe_ssl_earliest_cert_expiry{job="blackbox"} - time() < 86400 * 30
        for: 1d
|
最後有看到這個網站有一堆 rule 可以參考
Awesome Prometheus alerts | Collection of alerting rules
2021-04-14 發生問題
上 Grafana 看不到 Prometheus 資料
看 log 發現
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
|
prometheus | level=info ts=2021-04-14T12:32:25.883Z caller=head.go:768 component=tsdb msg="WAL segment loaded" segment=185 maxSegment=503
prometheus | panic: runtime error: invalid memory address or nil pointer dereference
prometheus | [signal SIGSEGV: segmentation violation code=0x1 addr=0xc pc=0x17127f8]
prometheus |
prometheus | goroutine 590 [running]:
prometheus | bufio.(*Writer).Available(...)
prometheus | /usr/local/go/src/bufio/bufio.go:624
prometheus | github.com/prometheus/prometheus/tsdb/chunks.(*ChunkDiskMapper).WriteChunk(0x3d39b90, 0x145c, 0x0, 0xcced3c4b, 0x178, 0xcd0878f3, 0x178, 0x26914c4, 0x4712680, 0x0, ...)
prometheus | /app/tsdb/chunks/head_chunks.go:291 +0x54c
prometheus | github.com/prometheus/prometheus/tsdb.(*memSeries).mmapCurrentHeadChunk(0x49bf340, 0x3d39b90)
prometheus | /app/tsdb/head.go:2230 +0x6c
prometheus | github.com/prometheus/prometheus/tsdb.(*memSeries).cutNewHeadChunk(0x49bf340, 0xcd08b38b, 0x178, 0x3d39b90, 0x0)
prometheus | /app/tsdb/head.go:2204 +0x24
prometheus | github.com/prometheus/prometheus/tsdb.(*memSeries).append(0x49bf340, 0xcd08b38b, 0x178, 0xc1422185, 0x3fe4f164, 0x0, 0x0, 0x3d39b90, 0x10001)
prometheus | /app/tsdb/head.go:2360 +0x3a8
prometheus | github.com/prometheus/prometheus/tsdb.(*Head).processWALSamples(0x3d22120, 0xccd1ba00, 0x178, 0x68a2180, 0x68a2140, 0x0, 0x0)
prometheus | /app/tsdb/head.go:425 +0x270
prometheus | github.com/prometheus/prometheus/tsdb.(*Head).loadWAL.func5(0x3d22120, 0x416dde0, 0x416ddf0, 0x68a2180, 0x68a2140)
prometheus | /app/tsdb/head.go:519 +0x40
prometheus | created by github.com/prometheus/prometheus/tsdb.(*Head).loadWAL
prometheus | /app/tsdb/head.go:518 +0x268
prometheus | level=info ts=2021-04-14T12:32:41.305Z caller=main.go:380 msg="No time or size retention was set so using the default time retention" duration=15d
|
因為我沒有用 healthcheck
後來沒看到有用的 healthcheck
web 介面都正常…
docker-compose restart 好像都失敗
所以docker-compose down -v
直接清掉重啟就正常…
swap 都設2GB
RAM 看起來都正常…
後來我猜測是 Prometheus 問題
把 volumes 清掉就正常
解決樹莓派遇到記憶體問題
2021-12-15
最近在回味重新看這篇時候,發現我沒有補上我調整完設定。不確定新版會不會修正,不過先留個紀錄。
主要是加這段 - '--storage.tsdb.retention.size=500MB'
。
留個prometheus docker-compose 紀錄
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
|
services:
  prometheus:
    image: prom/prometheus-linux-armv7
    container_name: prometheus
    restart: always
    volumes:
      - /etc/localtime:/etc/localtime:ro
      - $PWD/prometheus/:/etc/prometheus/
      - prometheus_data:/prometheus
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--web.console.libraries=/usr/share/prometheus/console_libraries'
      - '--web.console.templates=/usr/share/prometheus/consoles'
      #- '--storage.tsdb.retention=90d'
      #- '--storage.tsdb.min-block-duration=2h'
      #- '--storage.tsdb.max-block-duration=2h'
      # Cap on-disk TSDB size — the workaround for the 32-bit ARM crashes
      # described above.
      - '--storage.tsdb.retention.size=500MB'
    networks:
      - monitoring
    links:
      - alertmanager
      - cadvisor
    expose:
      - '9090'
    ports:
      - '9090:9090'
    depends_on:
      - cadvisor
|
下面留個失敗紀錄
失敗紀錄
rpi4 docker panic: mmap, size 134217728: cannot allocate memory · Issue #8661 · prometheus/prometheus · GitHub
後來我有找到這個跟我一樣問題
我目前估計用docker-compose up 執行是跑舊版
後來我跑最新版
但是我沒清掉 volumes 關係?(忘記)
看似 issue 問題也是升級問題
但我是跑到第五天才遇到問題
麻煩的是 alertmanager 沒發通知
很容易沒注意到
確定我過5天後又發生問題
這邊當初我應該不用下docker-compose down -v
應該先 docker-compose stop prometheus
docker volume rm (volume_name)
才對
2021-04-25
最近又遇到這個問題
查了一下,可能跟 Raspberry PI OS 32 位元有關係
聽說64 位元能解決?!(不確定)
參考:
Had the same problem. Fixed it by adding a size limit for the storage with –storage.tsdb.retention.size=500MB.
Maybe on 32bit systems there could be a default/maximum value for storage.tsdb.retention.size, along with the warning.
https://github.com/prometheus/prometheus/issues/7483#issuecomment-670512677
這個我還沒測試
2021-05-03 測試中
Raspberry PI 3 B+
Raspbian Stretch Lite October 2018
prometheus, version 2.4.3+ds (branch: debian/sid, revision: 2.4.3+ds-2) installed from armhf deb package (https://packages.debian.org/sid/net/prometheus) (so I guess no 64bit on 32bit)
–storage.tsdb.retention=15y –storage.tsdb.min-block-duration=2h –storage.tsdb.max-block-duration=2h but problem also appears on default settings of min/max block durations.
https://github.com/prometheus/prometheus/issues/4392#issuecomment-433717839
Prometheus命令
目前使用這個方案觀察看看
2021-05-03 確定跑到後面會出問題
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
|
prometheus:
  image: prom/prometheus-linux-armv7
  container_name: prometheus
  restart: always
  volumes:
    - /etc/localtime:/etc/localtime:ro
    - $PWD/prometheus/:/etc/prometheus/
    - prometheus_data:/prometheus
  command:
    - '--config.file=/etc/prometheus/prometheus.yml'
    - '--storage.tsdb.path=/prometheus'
    - '--web.console.libraries=/usr/share/prometheus/console_libraries'
    - '--web.console.templates=/usr/share/prometheus/consoles'
    # NOTE(review): --storage.tsdb.retention is the old flag name; newer
    # Prometheus releases use --storage.tsdb.retention.time instead.
    - '--storage.tsdb.retention=90d'
    - '--storage.tsdb.min-block-duration=2h'
    - '--storage.tsdb.max-block-duration=2h'
  networks:
    - monitoring
  links:
    - alertmanager
    - cadvisor
  expose:
    - '9090'
  ports:
    - '9090:9090'
  depends_on:
    - cadvisor
|