adding some base for monitoring, before adding prometheus and the rest
Commit 9487fb10aa
.gitignore (vendored, new file, 73 lines)
@@ -0,0 +1,73 @@
# swp vim edits
*.swp
*.swo
.os_release.sh
.configuration.cluster
.configuration.installs
.awscli-bundle/

# Python
*.pyc
**/.cache
**/.coverage

# Eclipse related files
.classpath
.cproject
.deps
.project
.settings
.metadata/

# MAC DS files
.DS_Store

# STS
.springBeans
# Maven and build related
target/
packages.mk

# IntelliJ IDEA related files
*.iml
*.iws
*.ipr
.idea/

.sonar-ide.properties


*.orig
*rebel*.xml
.idea/

/INSTALL
/Makefile
/aclocal.m4
/autom4te.cache/
/build/aux/
/config.log
/config.status
/configure

# sed related files
**/*.bak_remove

# terraform state files
**/.terraform
**/*.tfstate
**/*.tfstate.backup
**/.terraform.tfstate.lock.info

# Folders and files
certificates
terraform
configuration.cluster
configuration.installs
other
ec2
monitoring.backup
monitoring/monitoring_*_dir
**/*.bak_remove
**/*.wrapped
data
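A quick way to confirm the runtime artifacts from this stack stay untracked is git check-ignore. A sketch only; the two paths are examples of files the compose file and scripts below generate:

  git check-ignore -v data/influxdb grafana/dashboards/telegraf-dashboard.json.wrapped
  # expected to match the "data" and "**/*.wrapped" rules respectively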
docker-compose.yml (new file, 54 lines)
@@ -0,0 +1,54 @@
version: '3'

networks:
  public: {}

volumes:
  grafana_lib: {}
  grafana_ds:

services:
  influxdb:
    image: influxdb:alpine
    container_name: influxdb
    ports:
      - "8086:8086"
    networks:
      - public
    volumes:
      - ./data/influxdb:/var/lib/influxdb
    environment:
      INFLUXDB_REPORTING_DISABLED: "true"
      INFLUXDB_DB: telegraf
      INFLUXDB_USER: telegraf
      INFLUXDB_USER_PASSWORD: nimda

  grafana:
    image: grafana/grafana:5.1.3
    container_name: grafana
    ports:
      - "3000:3000"
    networks:
      - public
    volumes:
      - grafana_lib:/var/lib/grafana
      - grafana_ds:/var/lib/grafana/ds:rw
      - ${PWD}/grafana/add_datasources.sh:/var/lib/grafana/ds/add_datasources.sh
    environment:
      GF_AUTH_ANONYMOUS_ENABLED: "true"
      GF_AUTH_ANONYMOUS_ORG_ROLE: "Admin"
      INFLUXDB_URI: "http://influxdb:8086"
      INFLUXDB_DB: telegraf
      INFLUXDB_USER: telegraf
      INFLUXDB_USER_PASSWORD: nimda
    command: ["bash", "/var/lib/grafana/ds/add_datasources.sh"]

  telegraf:
    image: telegraf:latest
    container_name: telegraf
    network_mode: "host"
    volumes:
      - ./telegraf.conf:/etc/telegraf/telegraf.conf:ro
    environment:
      # real influx host
      INFLUXDB_URI: "http://localhost:8086"
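The commit does not document how the stack is meant to be started, but with the ports published above a local smoke test could look like this. A sketch, assuming the docker-compose CLI and that it is run from the repository root where docker-compose.yml lives:

  docker-compose up -d                   # start influxdb, grafana and telegraf
  curl -s -o /dev/null -w "%{http_code}\n" http://localhost:8086/ping   # InfluxDB liveness, expect 204
  curl -s http://localhost:3000/api/health                              # Grafana health endpoint
  docker-compose logs -f telegraf        # watch the agent flush metrics on its 10s interval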
grafana/add_dashboards.sh (new file, 97 lines)
@@ -0,0 +1,97 @@
#!/bin/bash
set -e

NC='\033[0m'

RED='\033[00;31m'
GREEN='\033[00;32m'
YELLOW='\033[00;33m'
BLUE='\033[00;34m'
PURPLE='\033[00;35m'
CYAN='\033[00;36m'
LIGHTGRAY='\033[00;37m'
MAGENTA='\033[00;35m'
LRED='\033[01;31m'
LGREEN='\033[01;32m'
LYELLOW='\033[01;33m'
LBLUE='\033[01;34m'
LPURPLE='\033[01;35m'
LCYAN='\033[01;36m'
WHITE='\033[01;37m'

GRAFANA_URL=http://admin:admin@localhost:3000


grafana_api() {
  local verb=$1
  local url=$2
  local params=$3
  local bodyfile=$4
  local response
  local cmd

  cmd="curl -L -s --fail -H \"Accept: application/json\" -H \"Content-Type: application/json\" -X ${verb} -k ${GRAFANA_URL}${url}"
  [[ -n "${params}" ]] && cmd="${cmd} -d \"${params}\""
  [[ -n "${bodyfile}" ]] && cmd="${cmd} --data @${bodyfile}"
  echo -e "Running ${cmd}"
  eval ${cmd} || return 1
  return 0
}

wait_for_api() {
  echo -e "${BLUE}Waiting for Grafana to be available...${NC}"
  while ! grafana_api GET /api/user/preferences
  do
    echo -e "${BLUE}Waiting still...${NC}"
    sleep 15
  done
}

replace_datasource() {
  local dashboard_file=$1
  local datasource_name=$2
  cmd="sed -i.bak_remove \"s/\\\${DS_INFLUXDB}/${datasource_name}/g\" ${dashboard_file}"
  eval ${cmd} || return 1
  return 0
}

install_dashboards() {
  local dashboard

  for dashboard in dashboards/*.json
  do
    if [[ $(grep "\"name\": \"DS_INFLUXDB\"," ${dashboard}) ]]; then
      echo -e "${PURPLE}Dashboard ${dashboard} seems to be for InfluxDB datasource${NC}"
      datasource_name="influxdb"
    fi
    if [[ $(grep "\"name\": \"DS_PROMETHEUS\"," ${dashboard}) ]]; then
      echo -e "${PURPLE}Dashboard ${dashboard} seems to be for Prometheus datasource${NC}"
      datasource_name="prometheus"
    fi
    if [[ -f "${dashboard}" ]]; then
      echo -e "${LCYAN}Installing dashboard ${dashboard}${NC}"
      replace_datasource ${dashboard} ${datasource_name}
      # backup will be created before wrapping dashboard ^
      #echo -e "{\"dashboard\": `cat $dashboard`}" > "${dashboard}.wrapped"
      cp ${dashboard} ${dashboard}.wrapped
      sed -i '1s/^/{"dashboard":\n/' ${dashboard}.wrapped
      echo "}" >> ${dashboard}.wrapped

      if grafana_api POST /api/dashboards/db "" "${dashboard}.wrapped"; then
        echo -e "\n** ${GREEN}installed ok **${NC}"
      else
        echo -e "\n** ${RED}installation of: ${PURPLE}\"${dashboard}\"${RED} failed **${NC}"
      fi
    fi
    #rm ${dashboard}.wrapped
  done
}

configure_grafana() {
  wait_for_api
  install_dashboards
}

configure_grafana
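add_dashboards.sh is not referenced from docker-compose.yml, so presumably it is run by hand once the stack is up. A plausible invocation, an assumption rather than something spelled out in the commit, is to run it from the grafana/ directory so the relative dashboards/*.json glob resolves:

  cd grafana
  bash add_dashboards.sh   # waits for the Grafana API, substitutes ${DS_INFLUXDB}, then POSTs each dashboard
  # the generated *.bak_remove and *.wrapped files are covered by the .gitignore added in this commit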
grafana/add_datasources.sh (new file, 31 lines)
@@ -0,0 +1,31 @@
#!/bin/bash
#set -e

# ADD INFLUXDB DATASOURCE
curl -s -v -H "Content-Type: application/json" \
  -XPOST http://admin:admin@localhost:3000/api/datasources \
  -d @- <<EOF
{
  "name": "influxdb",
  "type": "influxdb",
  "access": "proxy",
  "url": "http://influxdb:8086",
  "database": "telegraf",
  "user": "telegraf",
  "password": "nimda",
  "basicAuth": false
}
EOF

## ADD PROMETHEUS DATASOURCE
curl -s -v -H "Content-Type: application/json" \
  -XPOST http://admin:admin@localhost:3000/api/datasources \
  -d @- <<EOF
{
  "name": "prometheus",
  "type": "prometheus",
  "access": "proxy",
  "url": "http://prometheus:9090"
}
EOF
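To confirm both datasources were registered, the same admin credentials can be used to list them back via Grafana's GET /api/datasources endpoint:

  curl -s http://admin:admin@localhost:3000/api/datasources   # should include entries named "influxdb" and "prometheus"

The prometheus entry points at http://prometheus:9090 and will stay unusable until a Prometheus service is actually added, which is what the commit message defers to a later change.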
grafana/dashboards/prometheus-system.json (new file, 2122 lines)
File diff suppressed because it is too large.
grafana/dashboards/telegraf-dashboard.json (new file, 8131 lines)
File diff suppressed because it is too large.
telegraf.conf (new file, 148 lines)
@@ -0,0 +1,148 @@
[agent]
  ## Default data collection interval for all inputs
  interval = "10s"
  ## Rounds collection interval to 'interval'
  ## ie, if interval="10s" then always collect on :00, :10, :20, etc.
  round_interval = true
  ## Telegraf will send metrics to outputs in batches of at
  ## most metric_batch_size metrics.
  metric_batch_size = 1000
  ## For failed writes, telegraf will cache metric_buffer_limit metrics for each
  ## output, and will flush this buffer on a successful write. Oldest metrics
  ## are dropped first when this buffer fills.
  metric_buffer_limit = 10000
  ## Collection jitter is used to jitter the collection by a random amount.
  ## Each plugin will sleep for a random time within jitter before collecting.
  ## This can be used to avoid many plugins querying things like sysfs at the
  ## same time, which can have a measurable effect on the system.
  collection_jitter = "0s"
  ## Default flushing interval for all outputs. You shouldn't set this below
  ## interval. Maximum flush_interval will be flush_interval + flush_jitter
  flush_interval = "10s"
  ## Jitter the flush interval by a random amount. This is primarily to avoid
  ## large write spikes for users running a large number of telegraf instances.
  ## ie, a jitter of 5s and interval 10s means flushes will happen every 10-15s
  flush_jitter = "0s"
  ## By default, precision will be set to the same timestamp order as the
  ## collection interval, with the maximum being 1s.
  ## Precision will NOT be used for service inputs, such as logparser and statsd.
  ## Valid values are "ns", "us" (or "µs"), "ms", "s".
  precision = ""
  ## Run telegraf in debug mode
  debug = false
  ## Run telegraf in quiet mode
  quiet = false
  ## Override default hostname, if empty use os.Hostname()
  hostname = ""
  ## If set to true, do not set the "host" tag in the telegraf agent.
  omit_hostname = false


###############################################################################
#                              OUTPUT PLUGINS                                 #
###############################################################################

# Configuration for influxdb server to send metrics to
[[outputs.influxdb]]

  ## The full HTTP or UDP endpoint URL for your InfluxDB instance.
  ## Multiple urls can be specified as part of the same cluster,
  ## this means that only ONE of the urls will be written to each interval.
  # urls = ["udp://localhost:8089"] # UDP endpoint example

  urls = ["$INFLUXDB_URI"] # required


  ## The target database for metrics (telegraf will create it if it does not exist).
  database = "telegraf" # required
  ## Retention policy to write to. Empty string writes to the default rp.
  retention_policy = ""
  ## Write consistency (clusters only), can be: "any", "one", "quorum", "all"
  write_consistency = "any"
  ## Write timeout (for the InfluxDB client), formatted as a string.
  ## If not provided, will default to 5s. 0s means no timeout (not recommended).
  timeout = "5s"
  username = "telegraf"
  password = "nimda"
  ## Set the user agent for HTTP POSTs (can be useful for log differentiation)
  user_agent = "telegraf-agent"
  ## Set UDP payload size, defaults to InfluxDB UDP Client default (512 bytes)
  # udp_payload = 512

  ## Optional SSL Config
  # ssl_ca = "/etc/telegraf/ca.pem"
  # ssl_cert = "/etc/telegraf/cert.pem"
  # ssl_key = "/etc/telegraf/key.pem"
  ## Use SSL but skip chain & host verification
  # insecure_skip_verify = false


###############################################################################
#                              INPUT PLUGINS                                  #
###############################################################################

# Read metrics about cpu usage
[[inputs.cpu]]
  ## Whether to report per-cpu stats or not
  percpu = true
  ## Whether to report total system cpu stats or not
  totalcpu = true
  ## Comment this line if you want the raw CPU time metrics
  fielddrop = ["time_*"]
  collect_cpu_time = true
  report_active = false

# Read metrics about disk usage by mount point
[[inputs.disk]]
  ## By default, telegraf gathers stats for all mountpoints.
  ## Setting mountpoints will restrict the stats to the specified mountpoints.
  # mount_points = ["/"]

  ## Ignore some mountpoints by filesystem type. For example (dev)tmpfs (usually
  ## present on /run, /var/run, /dev/shm or /dev).
  ignore_fs = ["tmpfs", "devtmpfs"]

[[inputs.net]]
  # no configuration
# Read metrics about memory usage
[[inputs.mem]]
  # no configuration

# Read metrics about system load & uptime
[[inputs.system]]
  # no configuration
[[inputs.processes]]
[[inputs.netstat]]
[[inputs.kernel]]

# Read flattened metrics from one or more JSON HTTP endpoints
#[[inputs.httpjson]]
  ## a name for the service being polled
  #name_override = "monitoring-sebson"

  ## Your servers
  #servers = [
  #  "http://localhost:4444/status"
  #]
  #response_timeout = "5s"
  ## HTTP method to use: GET or POST (case-sensitive)
  #method = "GET"
[[inputs.interrupts]]
  ## To filter which IRQs to collect, make use of tagpass / tagdrop, i.e.
  # [inputs.interrupts.tagdrop]
  #   irq = [ "NET_RX", "TASKLET" ]
# Collects conntrack stats from the configured directories and files.
[[inputs.conntrack]]
  ## The following defaults would work with multiple versions of conntrack.
  ## Note the nf_ and ip_ filename prefixes are mutually exclusive across
  ## kernel versions, as are the directory locations.

  ## Superset of filenames to look for within the conntrack dirs.
  ## Missing files will be ignored.
  files = ["ip_conntrack_count","ip_conntrack_max",
           "nf_conntrack_count","nf_conntrack_max"]

  ## Directories to search within for the conntrack files above.
  ## Missing directories will be ignored.
  dirs = ["/proc/sys/net/ipv4/netfilter","/proc/sys/net/netfilter"]
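The agent configuration can be sanity-checked without waiting for a flush interval by using telegraf's built-in test mode, which runs each input once and prints the metrics to stdout instead of writing to InfluxDB. A sketch, assuming either a local telegraf binary or the container defined in docker-compose.yml:

  telegraf --config telegraf.conf --test
  # or, against the already-running container:
  docker exec telegraf telegraf --config /etc/telegraf/telegraf.conf --test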