Prometheus系列--pushgateway的安装与使用
对于自定义采集指标官方不推荐使用pushgateway,他们觉得pushgateway不是很优雅,官方推荐使用node_exporter的collector.textfile的功能,这样自定义的监控就可以通过node_exporter的方式被pull到server端。
如果是有不便利的环境比如某些机器不能连接server端,那可能就需要用pushgateway来中转一下了。
一、 简介
pushgateway是prometheus生态中的一个重要工具,他的作用是这样的:
1、prometheus 的server和target不在一个网段,或者因为某些原因,不能直接把target开放给server直接获取指标采集信息;
2、监控数据的时候,可以收集采集信息,一起发送给server,让server统一收集,某种程度上,降低了server端的连接数;
不过pushgateway如果挂掉,所有的target信息都采集不到;prometheus-server只是对pushgateway进行信息采集,所以不知道后端target是否up。pushgateway可以像server一样持久化数据;
pushgateway持久化数据需要加上启动参数--persistence.file=<持久化文件路径>(注意是文件路径,不是目录)和--persistence.interval=5m
二、安装pushgateway
# Install pushgateway
cd /usr/local/src/
wget https://github.com/prometheus/pushgateway/releases/download/v1.4.0/pushgateway-1.4.0.linux-amd64.tar.gz
# tar -C fails when the target directory does not exist, so create it first.
mkdir -p /usr/local/prometheus
tar xzf pushgateway-1.4.0.linux-amd64.tar.gz -C /usr/local/prometheus
# -sfn replaces an existing link in place, so this step can be re-run (e.g. on upgrade).
ln -sfn /usr/local/prometheus/pushgateway-1.4.0.linux-amd64 /usr/local/prometheus/pushgateway
# Prepare the service definition.
# Metrics are persisted to disk, so create the directory that will hold the persistence file.
mkdir -p /usr/local/prometheus/pushgateway/data
# Manage pushgateway start/stop with systemd.
# '>' (not '>>') is used so that re-running this step overwrites the unit
# instead of appending a second copy to it.
cat > /usr/lib/systemd/system/pushgateway.service <<"EOF"
[Unit]
Description=Prometheus pushgateway
Requires=network.target remote-fs.target
After=network.target remote-fs.target
[Service]
Type=simple
User=root
Group=root
# --persistence.file must name a file (inside the data directory), not the directory itself.
ExecStart=/usr/local/prometheus/pushgateway/pushgateway --persistence.file="/usr/local/prometheus/pushgateway/data/pushgateway.data" --persistence.interval=5m
ExecReload=/bin/kill -HUP $MAINPID
KillMode=process
Restart=on-failure
RestartSec=5s
[Install]
WantedBy=multi-user.target
EOF
# Manage pushgateway start/stop with supervisor.
# '>' (not '>>') is used so that re-running this step overwrites the file
# instead of appending a second [program:pushgateway] section.
cat > /etc/supervisor.d/pushgateway.ini <<"EOF"
[program:pushgateway]
; --persistence.file must name a file (inside the data directory), not the directory itself.
command=/usr/local/prometheus/pushgateway/pushgateway --persistence.file="/usr/local/prometheus/pushgateway/data/pushgateway.data" --persistence.interval=5m ; the program (relative uses PATH, can take args)
numprocs=1 ; number of processes copies to start (def 1)
directory=/usr/local/prometheus/pushgateway/ ; directory to cwd to before exec (def no cwd)
autostart=true ; start at supervisord start (default: true)
autorestart=true ; restart at unexpected quit (default: true)
startsecs=30 ; number of secs prog must stay running (def. 1)
startretries=3 ; max # of serial start failures (default 3)
exitcodes=0,2 ; 'expected' exit codes for process (default 0,2)
; pushgateway shuts down gracefully (and writes its persistence file) on SIGTERM/SIGINT;
; SIGQUIT is not handled by it, so TERM is used as the stop signal.
stopsignal=TERM ; signal used to kill process (default TERM)
stopwaitsecs=10 ; max num secs to wait b4 SIGKILL (default 10)
user=root ; setuid to this UNIX account to run the program
redirect_stderr=true ; redirect proc stderr to stdout (default false)
stdout_logfile=/usr/local/prometheus/pushgateway/pushgateway.stdout.log ; stdout log path (stderr is redirected here too), NONE for none; default AUTO
stdout_logfile_maxbytes=64MB ; max # logfile bytes b4 rotation (default 50MB)
stdout_logfile_backups=4 ; # of stdout logfile backups (default 10)
stdout_capture_maxbytes=1MB ; number of bytes in 'capturemode' (default 0)
stdout_events_enabled=false ; emit events on stdout writes (default false)
stopasgroup=true
killasgroup=true
EOF
# Start the service (use ONE of the two options below, matching how it was installed)
# Option 1: start through systemd
systemctl daemon-reload
systemctl enable pushgateway
systemctl start pushgateway
systemctl status pushgateway
# Option 2: start through supervisor
supervisorctl update
supervisorctl status
supervisorctl start pushgateway
supervisorctl restart pushgateway
# Check that it started: a listener on port 9091 and a running pushgateway process
ss -untlp |grep 9091
ps -ef |grep pushgateway
如果启动不成功,使用systemd的,使用journalctl -xe 检查启动报错;使用supervisor的,去日志文件检查启动报错。
三、配置prometheus-server
## Scrape job for the pushgateway (also shows whether its port is alive)
- job_name: 'pushgateway'
  file_sd_configs:
    - refresh_interval: 30s
      files:
        - ./conf/pushgateway/*.json
  honor_labels: true
  # Prometheus attaches its own job and instance labels to this target, but those
  # only identify the pushgateway instance, not the source of the pushed data.
  # honor_labels: true keeps the job and instance labels carried by the pushed
  # metrics from being overwritten.
target文件
[
{
"targets": [
"xx.xx.xx.xx:9091"
],
"labels": {
"category": "ops",
"drtype": "online",
"env": "prod",
"lifecycle": "long",
"module": "xxxxx",
"project": "base",
"provider": "cloud",
"resource": "ecs",
"software": "xx-xx",
"node_name": "xx-xx-0001"
}
}
]
配置好文件之后,重启或重载prometheus-server,以下方式二选一(reload接口需要prometheus启动时带--web.enable-lifecycle参数)
systemctl restart prometheus
curl -X POST http://localhost:9090/-/reload
查看prometheus的web端,可以看到已经能看到target了。
四、配置grafana
如果是监控基础监控,和使用node_exporter的图形是一样的,如果是自定义的监控信息,请往下看。
五、自定义监控
同时告警规则也需要自行编写。
自定义监控自定义脚本
custom_monitor.py
# -*- coding: utf-8 -*-
# Python3
import requests
import socket
import subprocess
import arrow
# 获取本机正在使用的IP
def get_Local_ip():
    """Return the local IPv4 address used to reach 8.8.8.8.

    A UDP socket is connected to ``8.8.8.8:80`` and the address the socket
    ended up bound to is read back with ``getsockname()``.

    Returns:
        str: the local address, or ``"127.0.0.1"`` when a socket error occurs.
    """
    try:
        # The context manager closes the socket even when connect() raises,
        # which the manual close() call did not.
        with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as csock:
            csock.connect(('8.8.8.8', 80))
            addr, _port = csock.getsockname()
            return addr
    except socket.error:
        return "127.0.0.1"
# 获取本机的主机名
def get_Hostname():
    """Return this machine's host name as reported by ``socket.gethostname()``."""
    name = socket.gethostname()
    return name
# wo服务授权到期时间
def get_Wocert_time():
    """Return the number of days until the WO (weboffice) license expires.

    For each instance ``weboffice_1`` .. ``weboffice_5`` the last matching
    license line of ``catalina.out`` is extracted with grep/tail/awk, parsed
    with ``arrow.get`` and compared with the current UTC time.

    Returns:
        int: the smallest number of whole days remaining across the
        instances, so the earliest expiry is the one reported. ``0`` if any
        step inside the loop raises.
    """
    command_template = "grep '文件授权,授权到期日期' /data/yozo/weboffice_%s/logs/catalina.out |tail -1 |awk -F '[]:]+' '{ print $(NF-1) }'"
    today_time = arrow.utcnow()
    expiration_days = set()
    try:
        for i in range(1, 6):
            return_result = subprocess.getoutput(command_template % i)
            remaining = arrow.get(return_result) - today_time
            expiration_days.add(remaining.days)
        # Previously the first instance's value was returned even if a later
        # instance expired sooner; report the earliest expiry instead.
        return min(expiration_days)
    except Exception:
        # Best effort: any lookup/parse failure is reported as 0 days.
        # Exception (not a bare except) so Ctrl-C / SystemExit still propagate.
        return 0
# dcs服务授权到期时间
def get_Dcscert_time():
    """Return the remaining DCS license days reported by the local DCS service.

    Reads ``["data"]["activatDays"]`` from the JSON body returned by
    ``http://localhost:9380/dcs.web/queryInfo``.

    Returns:
        The ``activatDays`` value, or ``0`` when the request fails, times out,
        or the response does not have the expected structure.
    """
    try:
        # Without a timeout requests can block forever on a hung service,
        # which would stall the whole monitoring run.
        response = requests.get("http://localhost:9380/dcs.web/queryInfo", timeout=5)
        return response.json()["data"]["activatDays"]
    except Exception:
        # Best effort: report 0 on any failure.
        # Exception (not a bare except) so Ctrl-C / SystemExit still propagate.
        return 0
# wo 服务可用性检测
def wo_isalive():
    """Probe the local WO service and return the HTTP status code.

    Sends a GET to ``http://localhost:8080/api.do``.

    Returns:
        int: the response status code, or ``0`` when the request fails or
        times out.
    """
    try:
        # Without a timeout requests can block forever on a hung service.
        response = requests.get("http://localhost:8080/api.do", timeout=5)
        return response.status_code
    except Exception:
        # Exception (not a bare except) so Ctrl-C / SystemExit still propagate.
        return 0
# dcs 服务可用性检测
def dcs_isalive():
    """Probe the local DCS service and return the HTTP status code.

    Sends a GET to ``http://localhost:9380/dcs.web/queryInfo``.

    Returns:
        int: the response status code, or ``0`` when the request fails or
        times out.
    """
    try:
        # Without a timeout requests can block forever on a hung service.
        response = requests.get("http://localhost:9380/dcs.web/queryInfo", timeout=5)
        return response.status_code
    except Exception:
        # Exception (not a bare except) so Ctrl-C / SystemExit still propagate.
        return 0
def _submit_date(url, job_name, hostname, value):
    """POST a metrics payload to the pushgateway.

    The payload is sent to
    ``http://<url>/metrics/job/<job_name>/instance/<hostname>``.

    Args:
        url: pushgateway address as ``host:port`` (no scheme).
        job_name: value used for the ``job`` path segment.
        hostname: value used for the ``instance`` path segment.
        value: metrics text; a trailing newline is appended before sending.

    Raises:
        requests.RequestException: on connection failure, timeout, or when
            the pushgateway answers with an HTTP error status.
    """
    headers = {'X-Requested-With': 'Python requests', 'Content-type': 'text/xml'}
    endpoint = 'http://%s/metrics/job/%s/instance/%s' % (url, job_name, hostname)
    # A timeout keeps a hung gateway from blocking the script forever.
    response = requests.post(endpoint, data='%s\n' % (value), headers=headers, timeout=5)
    # Surface rejected pushes (e.g. malformed metrics) instead of silently dropping them.
    response.raise_for_status()
def push_metrics(job_name, hostname, ipaddr, url):
    """Collect the four Yozo metrics, print them and push them to the pushgateway.

    Args:
        job_name: value for the ``job`` label and the push URL's job segment.
        hostname: value for the ``instance`` label and the push URL's
            instance segment.
        ipaddr: value for the ``ipaddr`` label.
        url: pushgateway address as ``host:port``.

    Raises:
        Whatever ``_submit_date`` raises when the push fails.
    """
    # Collected in the same order as before: license days first, then liveness.
    samples = (
        ("wocert_expiraton_time", get_Wocert_time()),
        ("dcscert_expiration_time", get_Dcscert_time()),
        ("wo_status", wo_isalive()),
        ("dcs_status", dcs_isalive()),
    )
    # One line per sample; each line must follow the metrics text format
    # (compare with the /metrics output of any existing target), e.g.:
    #   uat_tomcat{id="tomcat_1018",host="uat",type="mem",instance="uat",job="uat-app-status"} 3.2
    #   uat_tomcat{id="tomcat_1018",host="uat",type="cpu",instance="uat",job="uat-app-status"} 2.4
    payload = "".join(
        '%s{ipaddr="%s",type="%s",instance="%s",job="%s"} %s\n'
        % (name, ipaddr, name, hostname, job_name, value)
        for name, value in samples
    )
    # Printed first so the payload stays visible even when the push fails.
    print(payload)
    # The push call used to be commented out, so nothing was ever sent and
    # the `url` argument went unused.
    _submit_date(url=url, job_name=job_name, hostname=hostname, value=payload)
if __name__ == "__main__":
    # Author's note: the script is rolled out in bulk through Ansible, where
    # this value was a Jinja variable; set it to a literal when running it
    # standalone.
    job_name = "monitor_yozo"
    # Author's note: '-' was found not to be accepted in the label value,
    # so it is replaced with '_'.
    hostname = get_Hostname().replace("-","_")
    ipaddr = get_Local_ip()
    # Address (host:port) of the pushgateway that receives the metrics;
    # it is passed to push_metrics as `url`.
    url = "localhost:9091"
    push_metrics(job_name=job_name,hostname=hostname,ipaddr=ipaddr,url=url)
rules.yaml
groups:
  - name: YOZOCertexpiringSoon
    rules:
      # WO license is about to expire (metric is the number of days remaining).
      - alert: WOCertExpiringSoon
        expr: wocert_expiraton_time < 30
        for: 5m
        labels:
          severity: warning
          level: 2
        annotations:
          summary: "WO服务授权即将过期"
          # The scrape job uses honor_labels: true, so the pushed label is kept
          # as `instance`; `exported_instance` only exists when it is false.
          description: "服务{{ $labels.instance }}WO服务授权将在30天内过期,请尽快联系供应商获取新授权!"
      # DCS license is about to expire (metric is the number of days remaining).
      - alert: DCSCertExpiringSoon
        expr: dcscert_expiration_time < 30
        for: 5m
        labels:
          severity: warning
          level: 2
        annotations:
          summary: "DCS服务授权即将过期"
          # See the note on the WO rule: with honor_labels: true the label is `instance`.
          description: "服务{{ $labels.instance }}DCS服务授权将在30天内过期,请尽快联系供应商获取新授权!"
grafana图形的json
{
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": "-- Grafana --",
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"description": "查看永中服务的可用性以及授权到期时间",
"editable": true,
"gnetId": null,
"graphTooltip": 0,
"id": 5,
"links": [],
"panels": [
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": null,
"description": "Wo服务可用性",
"fill": 1,
"fillGradient": 0,
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 0
},
"id": 4,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"nullPointMode": "null",
"options": {
"dataLinks": []
},
"percentage": false,
"pointradius": 2,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "wo_status",
"legendFormat": "{{exported_instance}}--{{ipaddr}}",
"refId": "A"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "Wo服务可用性",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": null,
"fill": 1,
"fillGradient": 0,
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 0
},
"id": 2,
"interval": "",
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"nullPointMode": "null",
"options": {
"dataLinks": []
},
"percentage": false,
"pointradius": 2,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "dcs_status",
"legendFormat": "{{exported_instance}}--{{ipaddr}}",
"refId": "A"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "DCS服务可用性",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": null,
"fill": 1,
"fillGradient": 0,
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 8
},
"id": 8,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"nullPointMode": "null",
"options": {
"dataLinks": []
},
"percentage": false,
"pointradius": 2,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "wocert_expiraton_time",
"legendFormat": "{{exported_instance}}--{{ipaddr}}",
"refId": "A"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "WO授权到期时间",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": null,
"fill": 1,
"fillGradient": 0,
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 8
},
"id": 6,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"nullPointMode": "null",
"options": {
"dataLinks": []
},
"percentage": false,
"pointradius": 2,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "dcscert_expiration_time",
"legendFormat": "{{exported_instance}}--{{ipaddr}}",
"refId": "A"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "DCS授权到期时间",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
}
],
"schemaVersion": 20,
"style": "dark",
"tags": [],
"templating": {
"list": []
},
"time": {
"from": "now-6h",
"to": "now"
},
"timepicker": {
"refresh_intervals": [
"5s",
"10s",
"30s",
"1m",
"5m",
"15m",
"30m",
"1h",
"2h",
"1d"
]
},
"timezone": "",
"title": "Yozo_monitor",
"uid": "V-idLoQMz",
"version": 20
}
- 点赞
- 收藏
- 关注作者
评论(0)