Adding a monitoring base, before adding Prometheus and the rest

This commit is contained in:
Sebastian Blasiak 2018-06-09 10:29:15 +02:00
commit 9487fb10aa
8 changed files with 10657 additions and 0 deletions

73
.gitignore vendored Normal file
View File

@ -0,0 +1,73 @@
# swp vim edits
*.swp
*.swo
.os_release.sh
.configuration.cluster
.configuration.installs
.awscli-bundle/
# Python
*.pyc
**/.cache
**/.coverage
# Eclipse related files
.classpath
.cproject
.deps
.project
.settings
.metadata/
# MAC DS files
.DS_Store
# STS
.springBeans
# Maven and build related
target/
packages.mk
# IntelliJ IDEA related files
*.iml
*.iws
*.ipr
.idea/
.sonar-ide.properties
*.orig
*rebel*.xml
.idea/
/INSTALL
/Makefile
/aclocal.m4
/autom4te.cache/
/build/aux/
/config.log
/config.status
/configure
# sed related files
**/*.bak_remove
# terraform state files
**/.terraform
**/*.tfstate
**/*.tfstate.backup
**/.terraform.tfstate.lock.info
# Folders and files
certificates
terraform
configuration.cluster
configuration.installs
other
ec2
monitoring.backup
monitoring/monitoring_*_dir
**/*.bak_remove
**/*.wrapped
data

1
README.md Normal file
View File

@ -0,0 +1 @@
# monitoring-grafana-influxdb-telegraf-prometheus

54
docker-compose.yml Normal file
View File

@ -0,0 +1,54 @@
version: '3'
networks:
public: {}
volumes:
grafana_lib: {}
grafana_ds:
services:
influxdb:
image: influxdb:alpine
container_name: influxdb
ports:
- "8086:8086"
networks:
- public
volumes:
- ./data/influxdb:/var/lib/influxdb
environment:
INFLUXDB_REPORTING_DISABLED: "true"
INFLUXDB_DB: telegraf
INFLUXDB_USER: telegraf
INFLUXDB_USER_PASSWORD: nimda
grafana:
image: grafana/grafana:5.1.3
container_name: grafana
ports:
- "3000:3000"
networks:
- public
volumes:
- grafana_lib:/var/lib/grafana
- grafana_ds:/var/lib/grafana/ds:rw
- ${PWD}/grafana/add_datasources.sh:/var/lib/grafana/ds/add_datasources.sh
environment:
GF_AUTH_ANONYMOUS_ENABLED: "true"
GF_AUTH_ANONYMOUS_ORG_ROLE: "Admin"
INFLUXDB_URI: "http://influxdb:8086"
INFLUXDB_DB: telegraf
INFLUXDB_USER: telegraf
INFLUXDB_USER_PASSWORD: nimda
command: ["bash", "/var/lib/grafana/ds/add_datasources.sh"]
telegraf:
image: telegraf:latest
container_name: telegraf
network_mode: "host"
volumes:
- ./telegraf.conf:/etc/telegraf/telegraf.conf:ro
environment:
# real influx host
INFLUXDB_URI: "http://localhost:8086"

97
grafana/add_dashboards.sh Normal file
View File

@ -0,0 +1,97 @@
#!/bin/bash
# Install Grafana dashboards through the Grafana HTTP API.
set -e

# ANSI color codes used for log output (rendered via `echo -e`).
readonly NC='\033[0m'
readonly RED='\033[00;31m'
readonly GREEN='\033[00;32m'
readonly YELLOW='\033[00;33m'
readonly BLUE='\033[00;34m'
readonly PURPLE='\033[00;35m'
readonly CYAN='\033[00;36m'
readonly LIGHTGRAY='\033[00;37m'
readonly MAGENTA='\033[00;35m'
readonly LRED='\033[01;31m'
readonly LGREEN='\033[01;32m'
readonly LYELLOW='\033[01;33m'
readonly LBLUE='\033[01;34m'
readonly LPURPLE='\033[01;35m'
readonly LCYAN='\033[01;36m'
readonly WHITE='\033[01;37m'

# Grafana endpoint (admin:admin matches docker-compose defaults).
# Overridable from the environment; the default is unchanged.
GRAFANA_URL=${GRAFANA_URL:-http://admin:admin@localhost:3000}
#######################################
# Perform one Grafana HTTP API call.
# Globals:   GRAFANA_URL (read)
# Arguments: $1 - HTTP verb (GET/POST/...)
#            $2 - API path, e.g. /api/dashboards/db
#            $3 - optional inline request body
#            $4 - optional file whose contents become the request body
# Outputs:   the command being run, then the API response, on stdout
# Returns:   curl's exit status (non-zero on HTTP/network failure)
#######################################
grafana_api() {
  local verb=$1
  local url=$2
  local params=$3
  local bodyfile=$4
  # Build curl's argv as an array instead of eval-ing a string: this
  # survives spaces/quotes in the body and removes the injection hazard.
  local -a args=(
    -L -s --fail
    -H "Accept: application/json"
    -H "Content-Type: application/json"
    -X "${verb}"
    -k
  )
  [[ -n "${params}" ]] && args+=(-d "${params}")
  [[ -n "${bodyfile}" ]] && args+=(--data "@${bodyfile}")
  echo -e "Running curl ${args[*]} ${GRAFANA_URL}${url}"
  curl "${args[@]}" "${GRAFANA_URL}${url}" || return 1
  return 0
}
# Block until the Grafana HTTP API responds, polling every 15 seconds.
# A probe against /api/user/preferences doubles as an auth check.
wait_for_api() {
  echo -e "${BLUE}Waiting for Grafana to be available...${NC}"
  until grafana_api GET /api/user/preferences; do
    echo -e "${BLUE}Waiting still...${NC}"
    sleep 15
  done
}
#######################################
# Replace the templated ${DS_INFLUXDB} placeholder in a dashboard JSON
# file with a concrete datasource name, in place.
# Arguments: $1 - dashboard file to edit
#            $2 - datasource name to substitute
# Returns:   0 on success, 1 if sed fails
#######################################
replace_datasource() {
  local dashboard_file=$1
  local datasource_name=$2
  # Call sed directly (no eval): quoting is simpler and arguments with
  # spaces cannot be re-split. -i.bak_remove keeps a backup file, which
  # .gitignore matches via **/*.bak_remove.
  sed -i.bak_remove "s/\${DS_INFLUXDB}/${datasource_name}/g" "${dashboard_file}" || return 1
  return 0
}
#######################################
# Install every dashboards/*.json file into Grafana: detect which
# datasource the dashboard was exported against, substitute a concrete
# datasource name, wrap the JSON in {"dashboard": ...} and POST it.
# Globals:   color constants (read); calls replace_datasource, grafana_api
# Returns:   0; individual dashboard failures are reported but not fatal
#######################################
install_dashboards() {
  local dashboard
  local datasource_name
  for dashboard in dashboards/*.json; do
    # Skip the literal glob when dashboards/ has no .json files.
    [[ -f "${dashboard}" ]] || continue
    # Reset per file: previously a name from an earlier iteration could
    # leak into a dashboard that matched neither pattern.
    datasource_name=""
    if grep -q '"name": "DS_INFLUXDB",' "${dashboard}"; then
      echo -e "${PURPLE}Dashboard ${dashboard} seems to be for InfluxDB datasource${NC}"
      datasource_name="influxdb"
    fi
    # Checked second on purpose: if both markers appear, prometheus wins
    # (same precedence as the original two independent ifs).
    if grep -q '"name": "DS_PROMETHEUS",' "${dashboard}"; then
      echo -e "${PURPLE}Dashboard ${dashboard} seems to be for Prometheus datasource${NC}"
      datasource_name="prometheus"
    fi
    echo -e "${LCYAN}Installing dashboard ${dashboard}${NC}"
    replace_datasource "${dashboard}" "${datasource_name}"
    # A .bak_remove backup was created before wrapping the dashboard ^
    # Wrap the exported JSON as {"dashboard": <json>} — the shape the
    # /api/dashboards/db endpoint expects.
    cp "${dashboard}" "${dashboard}.wrapped"
    sed -i '1s/^/{"dashboard":\n/' "${dashboard}.wrapped"
    echo "}" >> "${dashboard}.wrapped"
    if grafana_api POST /api/dashboards/db "" "${dashboard}.wrapped"; then
      echo -e "\n** ${GREEN}installed ok **${NC}"
    else
      echo -e "\n** ${RED}installation of: ${PURPLE}\"${dashboard}\"${RED} failed **${NC}"
    fi
    #rm ${dashboard}.wrapped
  done
}
# Entry point: wait until the Grafana API is reachable, then push all
# dashboards found under dashboards/.
configure_grafana() {
  wait_for_api
  install_dashboards
}

configure_grafana

View File

@ -0,0 +1,31 @@
#!/bin/bash
# Register the InfluxDB and Prometheus datasources with Grafana via its
# HTTP API. Credentials match the docker-compose defaults.
#
# Errors are deliberately non-fatal (set -e stays commented out): when the
# script is re-run against a Grafana that already has these datasources,
# the POSTs fail and that is acceptable best-effort behavior.
#set -e

# Grafana endpoint; overridable from the environment, default unchanged.
GRAFANA_URL=${GRAFANA_URL:-http://admin:admin@localhost:3000}

# ADD INFLUXDB DATASOURCE
curl -s -v -H "Content-Type: application/json" \
  -XPOST "${GRAFANA_URL}/api/datasources" \
  -d @- <<EOF
{
  "name": "influxdb",
  "type": "influxdb",
  "access": "proxy",
  "url": "http://influxdb:8086",
  "database": "telegraf",
  "user":"telegraf",
  "password":"nimda",
  "basicAuth":false
}
EOF

## ADD PROMETHEUS DATASOURCE
curl -s -v -H "Content-Type: application/json" \
  -XPOST "${GRAFANA_URL}/api/datasources" \
  -d @- <<EOF
{
  "name": "prometheus",
  "type": "prometheus",
  "access": "proxy",
  "url": "http://prometheus:9090"
}
EOF

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

148
telegraf.conf Normal file
View File

@ -0,0 +1,148 @@
[agent]
## Default data collection interval for all inputs
interval = "10s"
## Rounds collection interval to 'interval'
## ie, if interval="10s" then always collect on :00, :10, :20, etc.
round_interval = true
## Telegraf will send metrics to outputs in batches of at
## most metric_batch_size metrics.
metric_batch_size = 1000
## For failed writes, telegraf will cache metric_buffer_limit metrics for each
## output, and will flush this buffer on a successful write. Oldest metrics
## are dropped first when this buffer fills.
metric_buffer_limit = 10000
## Collection jitter is used to jitter the collection by a random amount.
## Each plugin will sleep for a random time within jitter before collecting.
## This can be used to avoid many plugins querying things like sysfs at the
## same time, which can have a measurable effect on the system.
collection_jitter = "0s"
## Default flushing interval for all outputs. You shouldn't set this below
## interval. Maximum flush_interval will be flush_interval + flush_jitter
flush_interval = "10s"
## Jitter the flush interval by a random amount. This is primarily to avoid
## large write spikes for users running a large number of telegraf instances.
## ie, a jitter of 5s and interval 10s means flushes will happen every 10-15s
flush_jitter = "0s"
## By default, precision will be set to the same timestamp order as the
## collection interval, with the maximum being 1s.
## Precision will NOT be used for service inputs, such as logparser and statsd.
## Valid values are "ns", "us" (or "µs"), "ms", "s".
precision = ""
## Run telegraf in debug mode
debug = false
## Run telegraf in quiet mode
quiet = false
## Override default hostname, if empty use os.Hostname()
hostname = ""
## If set to true, do not set the "host" tag in the telegraf agent.
omit_hostname = false
###############################################################################
# OUTPUT PLUGINS #
###############################################################################
# Configuration for influxdb server to send metrics to
[[outputs.influxdb]]
## The full HTTP or UDP endpoint URL for your InfluxDB instance.
## Multiple urls can be specified as part of the same cluster,
## this means that only ONE of the urls will be written to each interval.
# urls = ["udp://localhost:8089"] # UDP endpoint example
urls = ["$INFLUXDB_URI"] # required
## The target database for metrics (telegraf will create it if not exists).
database = "telegraf" # required
## Retention policy to write to. Empty string writes to the default rp.
retention_policy = ""
## Write consistency (clusters only), can be: "any", "one", "quorum", "all"
write_consistency = "any"
## Write timeout (for the InfluxDB client), formatted as a string.
## If not provided, will default to 5s. 0s means no timeout (not recommended).
timeout = "5s"
username = "telegraf"
password = "nimda"
## Set the user agent for HTTP POSTs (can be useful for log differentiation)
user_agent = "telegraf-agent"
## Set UDP payload size, defaults to InfluxDB UDP Client default (512 bytes)
# udp_payload = 512
## Optional SSL Config
# ssl_ca = "/etc/telegraf/ca.pem"
# ssl_cert = "/etc/telegraf/cert.pem"
# ssl_key = "/etc/telegraf/key.pem"
## Use SSL but skip chain & host verification
# insecure_skip_verify = false
###############################################################################
# INPUT PLUGINS #
###############################################################################
# Read metrics about cpu usage
[[inputs.cpu]]
## Whether to report per-cpu stats or not
percpu = true
## Whether to report total system cpu stats or not
totalcpu = true
## Comment this line if you want the raw CPU time metrics
fielddrop = ["time_*"]
collect_cpu_time = true
report_active = false
# Read metrics about disk usage by mount point
[[inputs.disk]]
#
#
## By default, telegraf gathers stats for all mountpoints.
## Setting mountpoints will restrict the stats to the specified mountpoints.
# mount_points = ["/"]
## Ignore some mountpoints by filesystem type. For example (dev)tmpfs (usually
## present on /run, /var/run, /dev/shm or /dev).
ignore_fs = ["tmpfs", "devtmpfs"]
[[inputs.net]]
# no configuration
# Read metrics about memory usage
[[inputs.mem]]
# no configuration
# Read metrics about system load & uptime
[[inputs.system]]
# no configuration
[[inputs.processes]]
[[inputs.netstat]]
[[inputs.kernel]]
# Read flattened metrics from one or more JSON HTTP endpoints
#[[inputs.httpjson]]
# a name for the service being polled
#name_override = "monitoring-sebson"
# Your servers
#servers = [
# "http://localhost:4444/status"
#]
#response_timeout = "5s"
## HTTP method to use: GET or POST (case-sensitive)
#method = "GET"
[[inputs.interrupts]]
## To filter which IRQs to collect, make use of tagpass / tagdrop, i.e.
# [inputs.interrupts.tagdrop]
# irq = [ "NET_RX", "TASKLET" ]
# Collects conntrack stats from the configured directories and files.
[[inputs.conntrack]]
## The following defaults would work with multiple versions of conntrack.
## Note the nf_ and ip_ filename prefixes are mutually exclusive across
## kernel versions, as are the directory locations.
## Superset of filenames to look for within the conntrack dirs.
## Missing files will be ignored.
files = ["ip_conntrack_count","ip_conntrack_max",
"nf_conntrack_count","nf_conntrack_max"]
## Directories to search within for the conntrack files above.
## Missing directories will be ignored.
dirs = ["/proc/sys/net/ipv4/netfilter","/proc/sys/net/netfilter"]