- Telegraf uses the hddtemp input plugin to record hard-disk temperatures (see the daemon check sketched after this list);
- InfluxDB2 receives the data collected by Telegraf;
- Grafana visualizes the data.
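Telegraf's hddtemp input does not invoke the hddtemp binary itself; it reads from an already running hddtemp daemon over TCP, the same `localhost:7634` address configured in the input section below. Here is a minimal Python sketch (not part of the original setup) to confirm the daemon is answering before wiring up Telegraf; the exact payload layout depends on your hddtemp version, so treat the format hint in the comment as an assumption:

```python
import socket

# Assumes hddtemp is already running in daemon mode (e.g. `hddtemp -d /dev/sd?`)
# on the same address that Telegraf's inputs.hddtemp points at.
with socket.create_connection(("localhost", 7634), timeout=5) as sock:
    payload = b""
    while chunk := sock.recv(4096):
        payload += chunk

# Typically a pipe-delimited string such as "|/dev/sda|WDC WD40EFRX|38|C|";
# if nothing is printed here, Telegraf will not see any temperatures either.
print(payload.decode(errors="replace"))
```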
Telegraf Configuration
# Configuration for telegraf agent
[agent]
## Default data collection interval for all inputs
interval = "10s"
## Rounds collection interval to 'interval'
## ie, if interval="10s" then always collect on :00, :10, :20, etc.
round_interval = true
## Telegraf will send metrics to outputs in batches of at most
## metric_batch_size metrics.
## This controls the size of writes that Telegraf sends to output plugins.
metric_batch_size = 1000
## Maximum number of unwritten metrics per output. Increasing this value
## allows for longer periods of output downtime without dropping metrics at the
## cost of higher maximum memory usage.
metric_buffer_limit = 10000
## Collection jitter is used to jitter the collection by a random amount.
## Each plugin will sleep for a random time within jitter before collecting.
## This can be used to avoid many plugins querying things like sysfs at the
## same time, which can have a measurable effect on the system.
collection_jitter = "0s"
## Default flushing interval for all outputs. Maximum flush_interval will be
## flush_interval + flush_jitter
flush_interval = "10s"
## Jitter the flush interval by a random amount. This is primarily to avoid
## large write spikes for users running a large number of telegraf instances.
## ie, a jitter of 5s and interval 10s means flushes will happen every 10-15s
flush_jitter = "0s"
## By default or when set to "0s", precision will be set to the same
## timestamp order as the collection interval, with the maximum being 1s.
## ie, when interval = "10s", precision will be "1s"
## when interval = "250ms", precision will be "1ms"
## Precision will NOT be used for service inputs. It is up to each individual
## service input to set the timestamp at the appropriate precision.
## Valid time units are "ns", "us" (or "µs"), "ms", "s".
precision = ""
## Log at debug level.
# debug = false
## Log only error level messages.
# quiet = false
## Log target controls the destination for logs and can be one of "file",
## "stderr" or, on Windows, "eventlog". When set to "file", the output file
## is determined by the "logfile" setting.
# logtarget = "file"
## Name of the file to be logged to when using the "file" logtarget. If set to
## the empty string then logs are written to stderr.
# logfile = ""
## The logfile will be rotated after the time interval specified. When set
## to 0 no time based rotation is performed. Logs are rotated only when
## written to, if there is no log activity rotation may be delayed.
# logfile_rotation_interval = "0d"
## The logfile will be rotated when it becomes larger than the specified
## size. When set to 0 no size based rotation is performed.
# logfile_rotation_max_size = "0MB"
## Maximum number of rotated archives to keep, any older logs are deleted.
## If set to -1, no archives are removed.
# logfile_rotation_max_archives = 5
## Pick a timezone to use when logging or type 'local' for local time.
## Example: America/Chicago
# log_with_timezone = ""
## Override default hostname, if empty use os.Hostname()
hostname = ""
## If set to true, do not set the "host" tag in the telegraf agent.
omit_hostname = false
[[outputs.influxdb_v2]]
## The URLs of the InfluxDB cluster nodes.
##
## Multiple URLs can be specified for a single cluster, only ONE of the
## urls will be written to each interval.
## ex: urls = ["https://us-west-2-1.aws.cloud2.influxdata.com"]
urls = ["http://localhost:8086"]
## Token for authentication.
token = "$INFLUX_TOKEN"
## Organization is the name of the organization you wish to write to; must exist.
organization = "proxmox"
## Destination bucket to write into.
bucket = "hddtemp"
## The value of this tag will be used to determine the bucket. If this
## tag is not set the 'bucket' option is used as the default.
# bucket_tag = ""
## If true, the bucket tag will not be added to the metric.
# exclude_bucket_tag = false
## Timeout for HTTP messages.
# timeout = "5s"
## Additional HTTP headers
# http_headers = {"X-Special-Header" = "Special-Value"}
## HTTP Proxy override, if unset values the standard proxy environment
## variables are consulted to determine which proxy, if any, should be used.
# http_proxy = "http://corporate.proxy:3128"
## HTTP User-Agent
# user_agent = "telegraf"
## Content-Encoding for write request body, can be set to "gzip" to
## compress body or "identity" to apply no encoding.
# content_encoding = "gzip"
## Enable or disable uint support for writing uints influxdb 2.0.
# influx_uint_support = false
## Optional TLS Config for use on HTTP connections.
# tls_ca = "/etc/telegraf/ca.pem"
# tls_cert = "/etc/telegraf/cert.pem"
# tls_key = "/etc/telegraf/key.pem"
## Use TLS but skip chain & host verification
# insecure_skip_verify = false
# Monitor disks' temperatures using hddtemp
[[inputs.hddtemp]]
## By default, telegraf gathers temperature data from all disks detected by
## the hddtemp daemon.
##
## Only collect temperatures from the selected disks.
##
## A * as the device name will return the temperature values of all disks.
##
address = "localhost:7634"
#devices = ["sd*", "nvme*"]
interval = "5m"
flush_interval = "5m"
InfluxDB2/Grafana Data Visualization Code
import "date"
import "strings"
import "interpolate"
from(bucket: "hddtemp")
|> range(
start: date.truncate(
t: v.timeRangeStart,
unit: if uint(v: v.windowPeriod) > 60000000000 then 1d else 1h, // 根据所选择的时间范围来确定 date.truncate 到什么程度
location: {zone: "Asia/Shanghai", offset: 0h}
),
stop: v.timeRangeStop
)
|> filter(fn: (r) => r["_measurement"] == "hddtemp")
|> filter(fn: (r) => r["_field"] == "temperature")
|> filter(fn: (r) => r["host"] == "telegraf")
|> filter(fn: (r) => r["_value"] > 0)
|> map(fn: (r) => ({r with device: strings.trimPrefix(v: r.device, prefix: "ata-")})) // 删除 device 的前缀
|> map(fn: (r) => ({r with device: strings.trimPrefix(v: r.device, prefix: "nvme-")})) // 删除 device 的前缀
|> map(fn: (r) => ({r with _value: float(v: r._value)})) // 后面的线性插值需要 _value 为 float 类型
|> keep(columns: ["_value", "_start", "_stop", "_time", "device"]) // 只保留用来作图的数据
|> interpolate.linear(every: v.windowPeriod) // 线性插值
|> aggregateWindow(
every:
if uint(v: v.windowPeriod) <= 60000000000 then //1天以内取1个像素所代表的时间范围的20倍为最小绘制单元
date.scale(d: v.windowPeriod, n: 20)
else if uint(v: v.windowPeriod) > 60000000000 and uint(v: v.windowPeriod) <= 120000000000 then //1-2天取40分钟为最小绘制单元
40m
else if uint(v: v.windowPeriod) > 120000000000 and uint(v: v.windowPeriod) <= 600000000000 then //2-7天取2小时为最小绘制单元
2h
else if uint(v: v.windowPeriod) > 600000000000 and uint(v: v.windowPeriod) <= 1800000000000 then //7-30天取6小时为最小绘制单元
6h
else if uint(v: v.windowPeriod) > 1800000000000 and uint(v: v.windowPeriod) <= 7200000000000 then //30-90天取12小时为最小绘制单元
12h
else //90天以上取1天为最小绘制单元
1d,
fn: mean,
timeSrc: "_start",
location: {zone: "Asia/Shanghai", offset: 0h}
)
Note: NVMe disk temperatures cannot be collected through the hddtemp plugin; you have to write them into InfluxDB yourself with a script that uses influxdb2-client (a sketch follows below).
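A minimal sketch of such a script, assuming the Python influxdb-client package and smartctl's JSON output (smartmontools 7.0+) are available; neither tool nor the smartctl JSON layout is specified in the original, so adjust to your environment. It writes into the same measurement, field, and tags that the Flux query above filters on:

```python
import json
import subprocess

from influxdb_client import InfluxDBClient, Point
from influxdb_client.client.write_api import SYNCHRONOUS

def nvme_temperature(device: str) -> int:
    """Read the current temperature via smartctl's JSON output (assumed layout)."""
    out = subprocess.run(["smartctl", "-A", "-j", device],
                         capture_output=True, text=True, check=True)
    return json.loads(out.stdout)["temperature"]["current"]

# Same url/org/bucket/token as the [[outputs.influxdb_v2]] section above.
client = InfluxDBClient(url="http://localhost:8086", token="$INFLUX_TOKEN", org="proxmox")
write_api = client.write_api(write_options=SYNCHRONOUS)

point = (
    Point("hddtemp")                      # same measurement the hddtemp plugin uses
    .tag("host", "telegraf")              # must match the host filter in the Flux query
    .tag("device", "nvme-example-disk")   # hypothetical tag value; the query strips "nvme-"
    .field("temperature", nvme_temperature("/dev/nvme0"))
)
write_api.write(bucket="hddtemp", record=point)
client.close()
```

Scheduling this from cron or a systemd timer at the same cadence as the hddtemp input (5 minutes above) keeps both series aligned in the Grafana panel.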