Containerized Openstack Monitoring Solution
Change-Id: I66ea0711dd0319c1153a13b159dc5be6f7a7016c
@@ -7,7 +7,8 @@ Methodologies
 =======================

 .. toctree::
-   :maxdepth: 2
+   :maxdepth: 4

    tools
    hyper-scale
+   monitoring/index
doc/source/methodologies/monitoring/configs/ccp/ccp.yaml (new file)
@@ -0,0 +1,15 @@
builder:
  push: true
  no_cache: false
registry:
  address: "172.20.8.35:5000/env-1"
repositories:
  skip_empty: True
kubernetes:
  server: http://172.20.9.234:8080
---
!include
- versions.yaml
- topology.yaml
- configs.yaml
- repos.yaml
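Note: deploy-ccp.sh below copies this file to /root/.ccp.yaml on the Kubernetes master, where the fuel-ccp CLI picks it up as its default config. A minimal manual run on that node (the same subcommands the script automates) would be:

    ccp fetch    # clone the component repos listed in repos.yaml
    ccp build    # build images and push them to 172.20.8.35:5000/env-1
    ccp deploy   # render and apply the Kubernetes objects
    ccp status   # poll until the deployment reports "ok"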
doc/source/methodologies/monitoring/configs/ccp/configs.yaml (new file)
@@ -0,0 +1,38 @@
configs:
  private_interface: p1p1.602
  public_interface: p1p1.602
  ingress:
    enabled: true
  glance:
    bootstrap:
      enable: true
  # nova:
  #   allocation_ratio:
  #     cpu: 16.0
  neutron:
    physnets:
      - name: "physnet1"
        bridge_name: "br-ex"
        interface: "p1p1.649"
        flat: true
        vlan_range: false
    bootstrap:
      internal:
        enable: true
      external:
        enable: true
        net_name: ext-net
        subnet_name: ext-subnet
        physnet: physnet1
        network: 10.144.0.0/12
        gateway: 10.144.0.1
        nameserver: 10.144.0.1
        pool:
          start: 10.144.1.0
          end: 10.159.255.250
  keystone:
    debug: true
  heat:
    debug: true
  memcached:
    ram: 30720
doc/source/methodologies/monitoring/configs/ccp/deploy-ccp.sh (new executable file)
@@ -0,0 +1,78 @@
#!/bin/bash
set -ex
if [ -z "$1" ]; then
    echo "Please set number of env as argument"
    exit 1
fi

DEPLOY_TIMEOUT=1200
export SSH_USER="root"
export SSH_PASS="r00tme"
cd $(dirname $(realpath $0))

NODE1="172.20.8.6${1}"

SSH_OPTS="-q -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no"
SSH_CMD="sshpass -p ${SSH_PASS} ssh ${SSH_OPTS} ${SSH_USER}@${NODE1}"
SCP_CMD="sshpass -p ${SSH_PASS} scp ${SSH_OPTS}"

if [ ! -d ./env-${1} ]; then
    echo "Yaml files for env-${1} were not found"
    echo "Please create and commit deployment/ccp/rackspace/env-${1}/configs with correct yaml files"
    echo "Main file should be deployment/ccp/rackspace/env-${1}/configs/ccp.yaml"
    exit 1
fi

$SCP_CMD ./env-${1}/configs/ccp.yaml ${SSH_USER}@${NODE1}:/root/.ccp.yaml
for i in $(ls -1 ./env-${1}/configs/ | grep -v ccp.yaml ); do
    $SCP_CMD ./env-${1}/configs/${i} ${SSH_USER}@${NODE1}:/root/
done

$SSH_CMD "rm -rf /root/fuel-ccp; cd /root; git clone https://git.openstack.org/openstack/fuel-ccp"
$SSH_CMD "apt-get -y install python-pip"
$SSH_CMD "/usr/bin/pip install --upgrade pip"
$SSH_CMD "/usr/bin/pip install /root/fuel-ccp/"

CCP_STATUS=$($SSH_CMD "/usr/local/bin/ccp status")
if [ -n "$CCP_STATUS" ]; then
    echo "Active deployment was found"
    echo "$CCP_STATUS"
    echo "Please execute 'ccp cleanup' and 'rm -rf /var/lib/mysql/*' on ${NODE1} manually"
    exit 1
fi

$SSH_CMD "echo '172.20.8.6${1} cloudformation.ccp.external console.ccp.external identity.ccp.external object-store.ccp.external compute.ccp.external orchestration.ccp.external network.ccp.external image.ccp.external volume.ccp.external horizon.ccp.external' >> /etc/hosts"
# $SSH_CMD kubectl delete configmaps traefik-conf -n kube-system
# $SSH_CMD kubectl delete service traefik -n kube-system
# $SSH_CMD kubectl delete secret traefik-cert -n kube-system
# $SSH_CMD kubectl delete deployment traefik -n kube-system
$SSH_CMD "/root/fuel-ccp/tools/ingress/deploy-ingress-controller.sh -i 172.20.8.6${1}" || echo "Already configured"
$SSH_CMD "echo 172.20.8.6${1} \$(ccp domains list -f value) >> /etc/hosts"
$SSH_CMD "openssl s_client -status -connect identity.ccp.external:8443 < /dev/null 2>&1 | awk 'BEGIN {pr=0;} /-----BEGIN CERTIFICATE-----/ {pr=1;} {if (pr) print;} /-----END CERTIFICATE-----/ {exit;}' >> /usr/local/lib/python2.7/dist-packages/requests/cacert.pem"
$SSH_CMD "openssl s_client -status -connect identity.ccp.external:8443 < /dev/null 2>&1 | awk 'BEGIN {pr=0;} /-----BEGIN CERTIFICATE-----/ {pr=1;} {if (pr) print;} /-----END CERTIFICATE-----/ {exit;}' > /usr/share/ca-certificates/ingress.crt"
$SSH_CMD "cp /usr/share/ca-certificates/ingress.crt /usr/local/share/ca-certificates/"
$SSH_CMD "update-ca-certificates"
if [ "$($SSH_CMD "curl -s 'https://identity.ccp.external:8443/' > /dev/null; echo \$?")" != "0" ]
then
    echo "keystone is unreachable; check https://identity.ccp.external:8443"
    exit 1
fi

#$SSH_CMD "/root/fuel-ccp/tools/registry/deploy-registry.sh" &&
$SSH_CMD "/usr/local/bin/ccp fetch"
$SSH_CMD "/usr/local/bin/ccp build"
$SSH_CMD "/usr/local/bin/ccp deploy"

DEPLOY_TIME=0
while [ "$($SSH_CMD '/usr/local/bin/ccp status -s -f value' 2>/dev/null)" != "ok" ]
do
    sleep 5
    DEPLOY_TIME=$((${DEPLOY_TIME} + 5))
    if [ $DEPLOY_TIME -ge $DEPLOY_TIMEOUT ]; then
        echo "Deployment timeout"
        exit 1
    fi
done

$SSH_CMD "/usr/local/bin/ccp status"
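Usage sketch: the script takes the environment number as its only argument and derives the master address from it, e.g.:

    ./deploy-ccp.sh 1    # deploys CCP onto the env-1 master at 172.20.8.61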
@@ -0,0 +1,7 @@
export OS_PROJECT_DOMAIN_NAME=default
export OS_USER_DOMAIN_NAME=default
export OS_PROJECT_NAME=admin
export OS_USERNAME=admin
export OS_PASSWORD=password
export OS_IDENTITY_API_VERSION=3
export OS_AUTH_URL=https://identity.ccp.external:8443/v3
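These credentials can be smoke-tested against the ingress-exposed Keystone (assumes python-openstackclient is installed and the ingress CA is trusted, as arranged by deploy-ccp.sh):

    source ./openrc
    openstack token issue    # succeeds only if identity.ccp.external:8443 is reachable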
doc/source/methodologies/monitoring/configs/ccp/repos.yaml (new file)
@@ -0,0 +1,44 @@
repositories:
  repos:
    - git_url: https://git.openstack.org/openstack/fuel-ccp-ceph
      name: fuel-ccp-ceph
    - git_url: https://git.openstack.org/openstack/fuel-ccp-cinder
      name: fuel-ccp-cinder
    - git_url: https://git.openstack.org/openstack/fuel-ccp-debian-base
      name: fuel-ccp-debian-base
    - git_url: https://git.openstack.org/openstack/fuel-ccp-entrypoint
      name: fuel-ccp-entrypoint
    - git_url: https://git.openstack.org/openstack/fuel-ccp-etcd
      name: fuel-ccp-etcd
    - git_url: https://git.openstack.org/openstack/fuel-ccp-glance
      name: fuel-ccp-glance
    - git_url: https://git.openstack.org/openstack/fuel-ccp-heat
      name: fuel-ccp-heat
    - git_url: https://git.openstack.org/openstack/fuel-ccp-horizon
      name: fuel-ccp-horizon
    # - git_url: https://git.openstack.org/openstack/fuel-ccp-ironic
    #   name: fuel-ccp-ironic
    - git_url: https://git.openstack.org/openstack/fuel-ccp-keystone
      name: fuel-ccp-keystone
    # - git_url: https://git.openstack.org/openstack/fuel-ccp-mariadb
    #   name: fuel-ccp-mariadb
    - git_url: https://git.openstack.org/openstack/fuel-ccp-galera
      name: fuel-ccp-galera
    - git_url: https://git.openstack.org/openstack/fuel-ccp-memcached
      name: fuel-ccp-memcached
    # - git_url: https://git.openstack.org/openstack/fuel-ccp-murano
    #   name: fuel-ccp-murano
    - git_url: https://git.openstack.org/openstack/fuel-ccp-neutron
      name: fuel-ccp-neutron
    - git_url: https://git.openstack.org/openstack/fuel-ccp-nova
      name: fuel-ccp-nova
    - git_url: https://git.openstack.org/openstack/fuel-ccp-openstack-base
      name: fuel-ccp-openstack-base
    - git_url: https://git.openstack.org/openstack/fuel-ccp-rabbitmq
      name: fuel-ccp-rabbitmq
    # - git_url: https://git.openstack.org/openstack/fuel-ccp-sahara
    #   name: fuel-ccp-sahara
    # - git_url: https://git.openstack.org/openstack/fuel-ccp-searchlight
    #   name: fuel-ccp-searchlight
    # - git_url: https://git.openstack.org/openstack/fuel-ccp-stacklight
    #   name: fuel-ccp-stacklight
@@ -0,0 +1,77 @@
nodes:
  # node[1-3]: Kubernetes
  node([4-6])$: # 4-6
    roles:
      - controller
      - openvswitch
  node[7-9]$: # 7-9
    roles:
      - rabbitmq
  node10$: # 10
    roles:
      - galera
  node11$: # 11
    roles:
      - heat
  node(1[2-9])$: # 12-19
    roles:
      - compute
      - openvswitch
  node[2-9][0-9]$: # 20-99
    roles:
      - compute
      - openvswitch
  node(1[0-9][0-9])$: # 100-199
    roles:
      - compute
      - openvswitch
  node200$:
    roles:
      - backup
replicas:
  glance-api: 1
  glance-registry: 1
  keystone: 3
  nova-api: 3
  nova-scheduler: 3
  nova-conductor: 3
  neutron-server: 3
  neutron-metadata-agent: 3
  horizon: 3
  heat-api: 1
  heat-api-cfn: 1
  heat-engine: 1
roles:
  galera:
    - galera
  rabbitmq:
    - rabbitmq
  controller:
    - etcd
    - glance-api
    - glance-registry
    - horizon
    - keystone
    - memcached
    - neutron-dhcp-agent
    - neutron-l3-agent
    - neutron-metadata-agent
    - neutron-server
    - nova-api
    - nova-conductor
    - nova-consoleauth
    - nova-novncproxy
    - nova-scheduler
  compute:
    - nova-compute
    - nova-libvirt
  openvswitch:
    - neutron-openvswitch-agent
    - openvswitch-db
    - openvswitch-vswitchd
  backup:
    - backup
  heat:
    - heat-api
    - heat-api-cfn
    - heat-engine
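The node keys are anchored regular expressions matched against hostnames. A quick illustrative check of which role a given hostname would receive (a sketch, not part of the deployment):

    for n in node4 node8 node11 node42 node200; do
      if   echo "$n" | grep -qE 'node([4-6])$'; then echo "$n: controller+openvswitch"
      elif echo "$n" | grep -qE 'node[7-9]$'; then echo "$n: rabbitmq"
      elif echo "$n" | grep -qE 'node11$'; then echo "$n: heat"
      elif echo "$n" | grep -qE 'node(1[2-9]|[2-9][0-9]|1[0-9][0-9])$'; then echo "$n: compute+openvswitch"
      elif echo "$n" | grep -qE 'node200$'; then echo "$n: backup"
      fi
    done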
@@ -0,0 +1,71 @@
images:
  tag: newton
# image_specs:
#   keystone:
#     tag: newton
#   horizon:
#     tag: newton
#   nova-upgrade:
#     tag: newton
#   nova-api:
#     tag: newton
#   nova-conductor:
#     tag: newton
#   nova-consoleauth:
#     tag: newton
#   nova-novncproxy:
#     tag: newton
#   nova-scheduler:
#     tag: newton
#   nova-compute:
#     tag: newton
#   nova-libvirt:
#     tag: newton
#   neutron-dhcp-agent:
#     tag: newton
#   neutron-l3-agent:
#     tag: newton
#   neutron-metadata-agent:
#     tag: newton
#   neutron-server:
#     tag: newton
#   neutron-openvswitch-agent:
#     tag: newton
#   glance-api:
#     tag: newton
#   glance-registry:
#     tag: newton
#   glance-upgrade:
#     tag: newton
sources:
  openstack/cinder:
    git_ref: stable/newton
    git_url: https://github.com/openstack/cinder.git
  openstack/glance:
    git_ref: stable/newton
    git_url: https://github.com/openstack/glance.git
  openstack/heat:
    git_ref: stable/newton
    git_url: https://github.com/openstack/heat.git
  openstack/horizon:
    git_ref: stable/newton
    git_url: https://github.com/openstack/horizon.git
  openstack/keystone:
    git_ref: stable/newton
    git_url: https://github.com/openstack/keystone.git
  openstack/neutron:
    git_ref: stable/newton
    git_url: https://github.com/openstack/neutron.git
  openstack/nova:
    git_ref: stable/newton
    git_url: https://github.com/openstack/nova.git
  openstack/requirements:
    git_ref: stable/newton
    git_url: https://git.openstack.org/openstack/requirements.git
  openstack/sahara-dashboard:
    git_ref: stable/newton
    git_url: https://git.openstack.org/openstack/sahara-dashboard.git
doc/source/methodologies/monitoring/configs/dashboards/ETCD.json (new file, 2086 lines; diff too large to display)
@@ -0,0 +1,103 @@
[
  {
    "_id": "Response-Time-Dashboard",
    "_type": "dashboard",
    "_source": {
      "title": "Response Time Dashboard",
      "hits": 0,
      "description": "",
      "panelsJSON": "[{\"id\":\"Env-1-Response-Time\",\"type\":\"visualization\",\"panelIndex\":1,\"size_x\":3,\"size_y\":2,\"col\":1,\"row\":1},{\"id\":\"Env-2-Response-Time\",\"type\":\"visualization\",\"panelIndex\":2,\"size_x\":3,\"size_y\":2,\"col\":4,\"row\":1},{\"id\":\"Env-3-Response-Time\",\"type\":\"visualization\",\"panelIndex\":3,\"size_x\":3,\"size_y\":2,\"col\":7,\"row\":1},{\"id\":\"Env-4-Response-Time\",\"type\":\"visualization\",\"panelIndex\":4,\"size_x\":3,\"size_y\":2,\"col\":1,\"row\":3},{\"id\":\"Env-5-Response-Time\",\"type\":\"visualization\",\"panelIndex\":5,\"size_x\":3,\"size_y\":2,\"col\":4,\"row\":3},{\"id\":\"Env-6-Response-Time\",\"type\":\"visualization\",\"panelIndex\":6,\"size_x\":3,\"size_y\":2,\"col\":7,\"row\":3}]",
      "optionsJSON": "{\"darkTheme\":true}",
      "uiStateJSON": "{}",
      "version": 1,
      "timeRestore": false,
      "kibanaSavedObjectMeta": {
        "searchSourceJSON": "{\"filter\":[{\"query\":{\"query_string\":{\"query\":\"*\",\"analyze_wildcard\":true}}}]}"
      }
    }
  },
  {
    "_id": "Env-1-Response-Time",
    "_type": "visualization",
    "_source": {
      "title": "Env-1 Response Time",
      "visState": "{\"title\":\"Env-1 Response Time\",\"type\":\"line\",\"params\":{\"shareYAxis\":true,\"addTooltip\":true,\"addLegend\":true,\"showCircles\":true,\"smoothLines\":false,\"interpolate\":\"linear\",\"scale\":\"linear\",\"drawLinesBetweenPoints\":true,\"radiusRatio\":9,\"times\":[],\"addTimeMarker\":false,\"defaultYExtents\":false,\"setYExtents\":false,\"yAxis\":{}},\"aggs\":[{\"id\":\"1\",\"type\":\"avg\",\"schema\":\"metric\",\"params\":{\"field\":\"ResponseTime\",\"customLabel\":\"Avg Response Time ms\"}},{\"id\":\"2\",\"type\":\"date_histogram\",\"schema\":\"segment\",\"params\":{\"field\":\"Timestamp\",\"interval\":\"auto\",\"customInterval\":\"2h\",\"min_doc_count\":1,\"extended_bounds\":{}}}],\"listeners\":{}}",
      "uiStateJSON": "{}",
      "description": "",
      "version": 1,
      "kibanaSavedObjectMeta": {
        "searchSourceJSON": "{\"index\":\"env-*-heka*\",\"query\":{\"query_string\":{\"query\":\"Environment: \\\"env-1\\\"\",\"analyze_wildcard\":true}},\"filter\":[]}"
      }
    }
  },
  {
    "_id": "Env-4-Response-Time",
    "_type": "visualization",
    "_source": {
      "title": "Env-4 Response Time",
      "visState": "{\"title\":\"Env-4 Response Time\",\"type\":\"line\",\"params\":{\"addLegend\":true,\"addTimeMarker\":false,\"addTooltip\":true,\"defaultYExtents\":false,\"drawLinesBetweenPoints\":true,\"interpolate\":\"linear\",\"radiusRatio\":9,\"scale\":\"linear\",\"setYExtents\":false,\"shareYAxis\":true,\"showCircles\":true,\"smoothLines\":false,\"times\":[],\"yAxis\":{}},\"aggs\":[{\"id\":\"1\",\"type\":\"avg\",\"schema\":\"metric\",\"params\":{\"field\":\"ResponseTime\",\"customLabel\":\"Avg Response Time ms\"}},{\"id\":\"2\",\"type\":\"date_histogram\",\"schema\":\"segment\",\"params\":{\"field\":\"Timestamp\",\"interval\":\"auto\",\"customInterval\":\"2h\",\"min_doc_count\":1,\"extended_bounds\":{}}}],\"listeners\":{}}",
      "uiStateJSON": "{}",
      "description": "",
      "version": 1,
      "kibanaSavedObjectMeta": {
        "searchSourceJSON": "{\"index\":\"env-*-heka*\",\"query\":{\"query_string\":{\"query\":\"Environment: \\\"env-4\\\"\",\"analyze_wildcard\":true}},\"filter\":[]}"
      }
    }
  },
  {
    "_id": "Env-5-Response-Time",
    "_type": "visualization",
    "_source": {
      "title": "Env-5 Response Time",
      "visState": "{\"title\":\"Env-5 Response Time\",\"type\":\"line\",\"params\":{\"addLegend\":true,\"addTimeMarker\":false,\"addTooltip\":true,\"defaultYExtents\":false,\"drawLinesBetweenPoints\":true,\"interpolate\":\"linear\",\"radiusRatio\":9,\"scale\":\"linear\",\"setYExtents\":false,\"shareYAxis\":true,\"showCircles\":true,\"smoothLines\":false,\"times\":[],\"yAxis\":{}},\"aggs\":[{\"id\":\"1\",\"type\":\"avg\",\"schema\":\"metric\",\"params\":{\"field\":\"ResponseTime\",\"customLabel\":\"Avg Response Time ms\"}},{\"id\":\"2\",\"type\":\"date_histogram\",\"schema\":\"segment\",\"params\":{\"field\":\"Timestamp\",\"interval\":\"auto\",\"customInterval\":\"2h\",\"min_doc_count\":1,\"extended_bounds\":{}}}],\"listeners\":{}}",
      "uiStateJSON": "{}",
      "description": "",
      "version": 1,
      "kibanaSavedObjectMeta": {
        "searchSourceJSON": "{\"index\":\"env-*-heka*\",\"query\":{\"query_string\":{\"query\":\"Environment: \\\"env-5\\\"\",\"analyze_wildcard\":true}},\"filter\":[]}"
      }
    }
  },
  {
    "_id": "Env-6-Response-Time",
    "_type": "visualization",
    "_source": {
      "title": "Env-6 Response Time",
      "visState": "{\"title\":\"Env-6 Response Time\",\"type\":\"line\",\"params\":{\"addLegend\":true,\"addTimeMarker\":false,\"addTooltip\":true,\"defaultYExtents\":false,\"drawLinesBetweenPoints\":true,\"interpolate\":\"linear\",\"radiusRatio\":9,\"scale\":\"linear\",\"setYExtents\":false,\"shareYAxis\":true,\"showCircles\":true,\"smoothLines\":false,\"times\":[],\"yAxis\":{}},\"aggs\":[{\"id\":\"1\",\"type\":\"avg\",\"schema\":\"metric\",\"params\":{\"field\":\"ResponseTime\",\"customLabel\":\"Avg Response Time ms\"}},{\"id\":\"2\",\"type\":\"date_histogram\",\"schema\":\"segment\",\"params\":{\"field\":\"Timestamp\",\"interval\":\"auto\",\"customInterval\":\"2h\",\"min_doc_count\":1,\"extended_bounds\":{}}}],\"listeners\":{}}",
      "uiStateJSON": "{}",
      "description": "",
      "version": 1,
      "kibanaSavedObjectMeta": {
        "searchSourceJSON": "{\"index\":\"env-*-heka*\",\"query\":{\"query_string\":{\"query\":\"Environment: \\\"env-6\\\"\",\"analyze_wildcard\":true}},\"filter\":[]}"
      }
    }
  },
  {
    "_id": "Env-3-Response-Time",
    "_type": "visualization",
    "_source": {
      "title": "Env-3 Response Time",
      "visState": "{\"aggs\":[{\"id\":\"1\",\"params\":{\"customLabel\":\"Avg Response Time ms\",\"field\":\"ResponseTime\"},\"schema\":\"metric\",\"type\":\"avg\"},{\"id\":\"2\",\"params\":{\"customInterval\":\"2h\",\"extended_bounds\":{},\"field\":\"Timestamp\",\"interval\":\"auto\",\"min_doc_count\":1},\"schema\":\"segment\",\"type\":\"date_histogram\"}],\"listeners\":{},\"params\":{\"addLegend\":true,\"addTimeMarker\":false,\"addTooltip\":true,\"defaultYExtents\":false,\"drawLinesBetweenPoints\":true,\"interpolate\":\"linear\",\"radiusRatio\":9,\"scale\":\"linear\",\"setYExtents\":false,\"shareYAxis\":true,\"showCircles\":true,\"smoothLines\":false,\"times\":[],\"yAxis\":{}},\"title\":\"Env-3 Response Time\",\"type\":\"line\"}",
      "uiStateJSON": "{}",
      "description": "",
      "version": 1,
      "kibanaSavedObjectMeta": {
        "searchSourceJSON": "{\"index\":\"env-*-heka*\",\"query\":{\"query_string\":{\"query\":\"Environment: \\\"env-3\\\"\",\"analyze_wildcard\":true}},\"filter\":[]}"
      }
    }
  },
  {
    "_id": "Env-2-Response-Time",
    "_type": "visualization",
    "_source": {
      "title": "Env-2 Response Time",
      "visState": "{\"aggs\":[{\"id\":\"1\",\"params\":{\"customLabel\":\"Avg Response Time ms\",\"field\":\"ResponseTime\"},\"schema\":\"metric\",\"type\":\"avg\"},{\"id\":\"2\",\"params\":{\"customInterval\":\"2h\",\"extended_bounds\":{},\"field\":\"Timestamp\",\"interval\":\"auto\",\"min_doc_count\":1},\"schema\":\"segment\",\"type\":\"date_histogram\"}],\"listeners\":{},\"params\":{\"addLegend\":true,\"addTimeMarker\":false,\"addTooltip\":true,\"defaultYExtents\":false,\"drawLinesBetweenPoints\":true,\"interpolate\":\"linear\",\"radiusRatio\":9,\"scale\":\"linear\",\"setYExtents\":false,\"shareYAxis\":true,\"showCircles\":true,\"smoothLines\":false,\"times\":[],\"yAxis\":{}},\"title\":\"Env-2 Response Time\",\"type\":\"line\"}",
      "uiStateJSON": "{}",
      "description": "",
      "version": 1,
      "kibanaSavedObjectMeta": {
        "searchSourceJSON": "{\"index\":\"env-*-heka*\",\"query\":{\"query_string\":{\"query\":\"Environment: \\\"env-2\\\"\",\"analyze_wildcard\":true}},\"filter\":[]}"
      }
    }
  }
]
(Three additional file diffs suppressed because they are too large to display.)
@@ -0,0 +1,77 @@
#!/usr/bin/env bash

: ${DB_CONNECTION_STRING:?"You need to specify DB_CONNECTION_STRING parameter"}
: ${ENV_NAME:?"You need to specify ENV_NAME parameter"}

: ${MANAGEMENT_INTERFACE:="p1p1.602"}
: ${COBBLER_ADDRESS:="172.20.8.34"}
: ${CUSTOM_YAML}
: ${KARGO_REPO}
: ${KARGO_COMMIT}
: ${FUEL_CCP_COMMIT}
: ${ADMIN_USER}
: ${ADMIN_PASSWORD}
: ${ADMIN_NODE_CLEANUP}
DEPLOY_METHOD="kargo"
WORKSPACE="${HOME}/kargo_workspace_${ENV_NAME}"  # a quoted literal "~" would not expand here
SSH_OPTIONS="-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null"

get_env_nodes ()
{
    ENV_NODES_NAMES=$(echo $(psql ${DB_CONNECTION_STRING} -c "select name from servers where environment_id in (select id from environments where name='${ENV_NAME}')" -P format=unaligned -t))
    if [ -z "${ENV_NODES_NAMES}" ]
    then
        echo "No nodes in environment with name ${ENV_NAME}"
        exit 1
    fi
}

get_env_nodes_ips ()
{
    ENV_NODES_IPS=$(echo $(ssh ${SSH_OPTIONS} root@${COBBLER_ADDRESS} bash -ex << EOF
for COBBLER_SYSTEM_NAME in ${ENV_NODES_NAMES}
do
    NODE_IP=\$(cobbler system dumpvars --name=\${COBBLER_SYSTEM_NAME} | grep ^ip_address_${MANAGEMENT_INTERFACE} | awk '{print \$3}')
    NODE_IPS+=\${NODE_IP}" "
done
echo \${NODE_IPS}
EOF
))
}

main ()
{
    get_env_nodes
    get_env_nodes_ips
    export ADMIN_IP=$(echo ${ENV_NODES_IPS} | awk '{print $1}')
    export SLAVE_IPS=$(echo ${ENV_NODES_IPS})

    # for SLAVE_IP in ${SLAVE_IPS}
    # do
    #     ssh ${SSH_OPTIONS} root@${SLAVE_IP} bash -ex << EOF
    #echo "deb https://apt.dockerproject.org/repo ubuntu-\$(grep DISTRIB_CODENAME /etc/lsb-release | awk -F"=" '{print \$2}') main" >> /etc/apt/sources.list
    #apt-get update && apt-get install -y --allow-unauthenticated -o Dpkg::Options::="--force-confdef" docker-engine
    #EOF
    # done

    if [ -d "$WORKSPACE" ] ; then
        rm -rf $WORKSPACE
    fi
    mkdir -p $WORKSPACE
    cd $WORKSPACE

    if [ -d './fuel-ccp-installer' ] ; then
        rm -rf ./fuel-ccp-installer
    fi
    git clone https://review.openstack.org/openstack/fuel-ccp-installer
    cd ./fuel-ccp-installer

    if [ "$FUEL_CCP_COMMIT" ]; then
        git fetch git://git.openstack.org/openstack/fuel-ccp-installer $FUEL_CCP_COMMIT && git checkout FETCH_HEAD
    fi

    echo "Running on $NODE_NAME: $ENV_NAME"

    bash -xe "./utils/jenkins/run_k8s_deploy_test.sh"
}
main
@@ -0,0 +1,46 @@
---
- hosts: main-kuber
  remote_user: root
  tasks:
    - name: Fetch heka package
      get_url:
        url: "{{ heka_package_url }}"
        dest: /tmp/heka_amd64.deb
        mode: 0664
        force: yes
    - name: Download heka package locally
      fetch:
        src: /tmp/heka_amd64.deb
        dest: ./heka_amd64.deb
        fail_on_missing: yes
        flat: yes

- hosts: cluster-nodes
  remote_user: root
  tasks:
    - name: Propagate heka package across cluster nodes
      copy:
        src: ./heka_amd64.deb
        dest: /tmp/heka_amd64.deb

- hosts: all-cluster-nodes
  remote_user: root
  tasks:
    - name: Install heka package
      apt: deb=/tmp/heka_amd64.deb
    - name: Add heka user to docker group
      user: name='heka' groups=docker append=yes
    - name: Copy heka conf
      template: src=heka/00-hekad.toml.j2 dest=/etc/heka/conf.d/00-hekad.toml
      notify: restart heka
    - name: Copy heka lua scripts
      template: src=heka/kubeapi_to_int.lua.j2 dest=/usr/share/heka/lua_filters/kubeapi_to_int.lua
      register: heka_lua
      notify: restart heka
    - name: Ensure heka is running
      systemd: state=started name=heka enabled=yes

  handlers:
    - name: restart heka
      systemd: state=restarted name=heka
@@ -0,0 +1,71 @@
#!/bin/bash -xe

HOSTNAME=`hostname`
ELASTICSEARCH_NODE=${ELASTICSEARCH_NODE:-172.20.9.3}

# install java
sudo add-apt-repository -y ppa:webupd8team/java
sudo apt-get update
sudo apt-get -y install oracle-java8-installer

# install elasticsearch by adding the elastic.co repository
wget -qO - https://packages.elastic.co/GPG-KEY-elasticsearch | sudo apt-key add -
echo "deb http://packages.elastic.co/elasticsearch/2.x/debian stable main" | sudo tee -a /etc/apt/sources.list.d/elasticsearch-2.x.list
sudo apt-get update
sudo apt-get -y install elasticsearch

# edit configuration:
sed -i -E -e 's/^.*cluster.name: .*$/ cluster.name: elasticsearch_k8s/g' /etc/elasticsearch/elasticsearch.yml
sed -i -E -e "s/^.*node.name: .*$/ node.name: ${HOSTNAME}/g" /etc/elasticsearch/elasticsearch.yml
sed -i -E -e "s/^.*network.host: .*$/ network.host: ${ELASTICSEARCH_NODE}/g" /etc/elasticsearch/elasticsearch.yml

# increase memory limits:
sed -i -E -e "s/^.*ES_HEAP_SIZE=.*$/ES_HEAP_SIZE=10g/g" /etc/default/elasticsearch

# start service:
sudo systemctl daemon-reload
sudo systemctl enable elasticsearch
sudo systemctl restart elasticsearch

# install kibana from the elastic.co repository:
echo "deb http://packages.elastic.co/kibana/4.5/debian stable main" | sudo tee -a /etc/apt/sources.list
sudo apt-get update
sudo apt-get -y install kibana
sed -i -E -e "s|^.*elasticsearch.url:.*$| elasticsearch.url: \"http://${ELASTICSEARCH_NODE}:9200\"|g" /opt/kibana/config/kibana.yml

# enable kibana service:
sudo systemctl daemon-reload
sudo systemctl enable kibana
sudo systemctl start kibana

# install nginx:
sudo apt-get -y install nginx

# set kibana admin:password (admin:admin)
echo "admin:`openssl passwd admin`" | sudo tee -a /etc/nginx/htpasswd.users

# prepare nginx config:
cat << EOF >> /etc/nginx/sites-available/default
server {
    listen 80;

    server_name ${HOSTNAME};

    auth_basic "Restricted Access";
    auth_basic_user_file /etc/nginx/htpasswd.users;

    location / {
        proxy_pass http://localhost:5601;
        proxy_http_version 1.1;
        proxy_set_header Upgrade \$http_upgrade;
        proxy_set_header Connection 'upgrade';
        proxy_set_header Host \$host;
        proxy_cache_bypass \$http_upgrade;
    }
}
EOF

# check and start nginx service:
sudo nginx -t
sudo systemctl restart nginx
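A minimal post-install smoke test (assumes the script ran on the Elasticsearch node itself):

    curl -s "http://${ELASTICSEARCH_NODE:-172.20.9.3}:9200/_cluster/health?pretty"    # expect status green or yellow
    curl -s -o /dev/null -w "%{http_code}\n" -u admin:admin http://localhost/         # Kibana behind nginx basic auth, expect 200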
@@ -0,0 +1,60 @@
#!/bin/bash
set -e
export ANSIBLE_HOST_KEY_CHECKING=False
export SSH_USER="root"
export SSH_PASS="r00tme"
cd $(dirname $(realpath $0))

ENV=${1}
if [ -z "${ENV}" ]; then
    echo "Please provide env number: $(basename $0) [1|2|3|4|5|6]"
    exit 1
fi
# elastic for k8s at rackspace as default
ELASTICSEARCH_NODE=${ELASTICSEARCH_NODE:-172.20.9.3}
# heka 0.10.0 as default
HEKA_PACKAGE_URL=${HEKA_PACKAGE_URL:-https://github.com/mozilla-services/heka/releases/download/v0.10.0/heka_0.10.0_amd64.deb}
KUBE_MAIN_NODE="172.20.8.6${ENV}"
SSH_OPTS="-q -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no"

echo "Get cluster nodes ..."
NODES_TMP=$(sshpass -p ${SSH_PASS} ssh ${SSH_OPTS} ${SSH_USER}@${KUBE_MAIN_NODE} 'kubectl get nodes -o jsonpath='"'"'{.items[*].status.addresses[?(@.type=="InternalIP")].address}'"'"'')
ALL_IP_ON_KUBER_NODE=$(sshpass -p ${SSH_PASS} ssh ${SSH_OPTS} ${SSH_USER}@${KUBE_MAIN_NODE} ip addr | grep 172.20 | awk '{print $2}' | awk -F'/' '{print $1}')
GREP_STRING_TMP=""
for i in $ALL_IP_ON_KUBER_NODE; do
    GREP_STRING_TMP="${GREP_STRING_TMP}${i}|"
done
GREP_STRING=${GREP_STRING_TMP:0:-1}
SSH_AUTH="ansible_ssh_user=${SSH_USER} ansible_ssh_pass=${SSH_PASS}"
echo "[main-kuber]" > cluster-hosts
echo "${KUBE_MAIN_NODE} ${SSH_AUTH}" >> cluster-hosts
echo "[cluster-nodes]" >> cluster-hosts
set +e
# Remove IPs of the kuber node itself
for i in ${NODES_TMP} ; do
    TMP_VAR=$(echo $i | grep -vE "(${GREP_STRING})")
    NODES="${NODES} ${TMP_VAR}"
done
set -e
for i in ${NODES} ; do
    if [ "$i" != "${KUBE_MAIN_NODE}" ]; then
        echo "${i} ${SSH_AUTH}" >> cluster-hosts
    fi
done
echo "[all-cluster-nodes:children]" >> cluster-hosts
echo "main-kuber" >> cluster-hosts
echo "cluster-nodes" >> cluster-hosts

# Calculate parallel ansible execution
NODES_IPS=( $NODES )
if [[ "${#NODES_IPS[@]}" -lt 50 ]] && [[ "${#NODES_IPS[@]}" -gt 5 ]]; then
    ANSIBLE_FORKS="${#NODES_IPS[@]}"
elif [[ "${#NODES_IPS[@]}" -ge 50 ]]; then
    ANSIBLE_FORKS=50
else
    ANSIBLE_FORKS=10
fi

echo "Starting ansible ..."
ansible-playbook -v --ssh-extra-args "-o\ StrictHostKeyChecking=no" -f ${ANSIBLE_FORKS} -i ./cluster-hosts -e env_num=${ENV} -e elasticsearch_node="${ELASTICSEARCH_NODE}" -e heka_package_url=${HEKA_PACKAGE_URL} ./deploy-heka.yaml --diff
@@ -0,0 +1,69 @@
# vim: set syntax=yaml

[hekad]
maxprocs = 2

[DockerLogInput]
endpoint = "unix:///var/run/docker.sock"
#decoder = "KubeAPI_decoder"
decoder = "MultiDecoder"

[MultiDecoder]
type = "MultiDecoder"
subs = ["KubeAPI_decoder", "EnvironmentScribbler"]
cascade_strategy = "all"
#log_sub_errors = true
{% raw %}
[KubeAPI_decoder]
type = "PayloadRegexDecoder"
match_regex = '\S+ \S+ .+ (?P<Code>\S+)\] (?P<Method>[A-Z]+) (?P<Url>\S+)\: \((?P<ResponseTime>\S+)ms\) (?P<StatusCode>\d+) \[\[(?P<Agent>.+)\] (?P<RemoteIP>\S+)\:(?P<RemotePort>\d+)\]'
[KubeAPI_decoder.message_fields]
Type = "KubeAPIlog"
Logger = "Docker"
Code = "%Code%"
Method = "%Method%"
Url|uri = "%Url%"
ResponseTime = "%ResponseTime%"
StatusCode = "%StatusCode%"
Agent = "%Agent%"
RemoteIP|ipv4 = "%RemoteIP%"
RemotePort = "%RemotePort%"
{% endraw %}
[EnvironmentScribbler]
type = "ScribbleDecoder"
[EnvironmentScribbler.message_fields]
Environment = "env-{{ env_num }}"

[KubeAPI_to_int]
type = "SandboxFilter"
filename = "lua_filters/kubeapi_to_int.lua"
message_matcher = "Type == 'KubeAPIlog'"

[ESJsonEncoder]
index = "env-{{ env_num }}-{{ '%{Type}-%{%Y.%m.%d}' }}"
#es_index_from_timestamp = true
type_name = "%{Type}"

[ElasticSearchOutput]
message_matcher = "Type == 'heka.sandbox.KubeAPIlog' || Type == 'DockerLog'"
server = "http://{{ elasticsearch_node }}:9200"
flush_interval = 5000
flush_count = 10
encoder = "ESJsonEncoder"

[PayloadEncoder]
append_newlines = false

[LogOutput]
message_matcher = "Type == 'heka.sandbox.KubeAPIlog' || Type == 'DockerLog'"
#message_matcher = "TRUE"
encoder = "ESJsonEncoder"
#encoder = "PayloadEncoder"
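For reference, the match_regex above targets kube-apiserver request log lines of roughly this shape (an illustrative line, not taken from the source):

    I1206 10:15:02.345678       1 wrap.go:42] GET /api/v1/nodes: (12.345ms) 200 [[kubectl/v1.4.0 (linux/amd64)] 172.20.8.61:43210]

from which the decoder extracts Code, Method, Url, ResponseTime, StatusCode, Agent, RemoteIP and RemotePort, after which EnvironmentScribbler stamps the env-N label onto each message.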
@@ -0,0 +1,30 @@
{% raw %}
-- Convert ResponseTime and a few more fields to integer type

local fields = {["ResponseTime"] = 0, ["RemotePort"] = 0, ["StatusCode"] = 0}
local msg = {
    Type = "KubeAPIlog",
    Severity = 6,
    Fields = fields
}

function process_message ()
    fields["ResponseTime"] = tonumber(read_message("Fields[ResponseTime]"))
    fields["RemotePort"] = tonumber(read_message("Fields[RemotePort]"))
    fields["StatusCode"] = tonumber(read_message("Fields[StatusCode]"))
    msg.Payload = read_message("Payload")
    fields["Code"] = read_message("Fields[Code]")
    fields["ContainerID"] = read_message("Fields[ContainerID]")
    fields["ContainerName"] = read_message("Fields[ContainerName]")
    fields["Environment"] = read_message("Fields[Environment]")
    fields["Method"] = read_message("Fields[Method]")
    fields["RemoteIP"] = read_message("Fields[RemoteIP]")
    fields["Url"] = read_message("Fields[Url]")
    local ok, err = pcall(inject_message, msg)
    if not ok then
        inject_payload("txt", "error", err)
    end
    return 0
end
{% endraw %}
doc/source/methodologies/monitoring/configs/node1.tar.gz (new binary file; not shown)
@@ -0,0 +1,124 @@
---
- hosts: common
  remote_user: root
  tasks:
    - name: Install common packages
      apt: name={{ item }} state=installed
      with_items:
        - python-pip
      tags: [ 'always' ]
    - name: Install docker for Ubuntu 14.04
      apt: name=docker.io state=installed
      when: ansible_distribution == 'Ubuntu' and ansible_distribution_version == '14.04'
      tags: [ 'always' ]
    - name: Install docker for Ubuntu 16.04
      apt: name=docker.io state=installed
      when: ansible_distribution == 'Ubuntu' and ansible_distribution_version == '16.04'
      tags: [ 'always' ]
    - name: Install python deps
      pip: name={{ item }}
      with_items:
        - docker-py
        - docker-compose
      tags: [ 'always' ]

- hosts: grafana
  remote_user: root
  vars:
    postgresql_root_user: root
    postgresql_root_password: aijoom1Shiex
    grafana_postgresql_user: grafana
    grafana_postgresql_password: sHskdhos6se
    grafana_postgresql_db: grafana
    grafana_user: admin
    grafana_password: admin
  tasks:
    - name: Install packages for grafana
      apt: name={{ item }} state=installed
      with_items:
        - postgresql-client-9.3
        - python-psycopg2
    - name: Create postgres data dir
      file: path=/var/lib/postgres/data/db state=directory
      tags: [ 'grafana' ]
    - name: Run postgres in docker
      docker_container:
        name: postgres
        image: 'postgres:latest'
        ports: 5432:5432
        volumes: '/var/lib/postgres/data:/var/lib/postgres/data'
        env:
          POSTGRES_USER: "{{ postgresql_root_user }}"
          POSTGRES_PASSWORD: "{{ postgresql_root_password }}"
          PGDATA: /var/lib/postgres/data/db
      tags: [ 'grafana' ]
    - name: Create DB for grafana
      postgresql_db:
        name: "{{ grafana_postgresql_db }}"
        login_user: "{{ postgresql_root_user }}"
        login_password: "{{ postgresql_root_password }}"
        login_host: localhost
        encoding: 'UTF-8'
      tags: [ 'grafana' ]
    - name: Create user for grafana in postgres
      postgresql_user:
        name: "{{ grafana_postgresql_user }}"
        login_user: "{{ postgresql_root_user }}"
        login_password: "{{ postgresql_root_password }}"
        login_host: localhost
        password: "{{ grafana_postgresql_password }}"
        db: grafana
        priv: ALL
      tags: [ 'grafana' ]
    - name: Create data dir for Grafana
      file: path=/var/lib/grafana state=directory
      tags: [ 'grafana' ]
    - name: Start Grafana container
      docker_container:
        name: grafana
        image: 'grafana/grafana:4.0.1'
        volumes: '/var/lib/grafana:/var/lib/grafana'
        ports: 3000:3000
        env:
          GF_SECURITY_ADMIN_USER: "{{ grafana_user }}"
          GF_SECURITY_ADMIN_PASSWORD: "{{ grafana_password }}"
          GF_DATABASE_TYPE: postgres
          GF_DATABASE_HOST: "{{ ansible_default_ipv4.address }}"
          GF_DATABASE_NAME: "{{ grafana_postgresql_db }}"
          GF_DATABASE_USER: "{{ grafana_postgresql_user }}"
          GF_DATABASE_PASSWORD: "{{ grafana_postgresql_password }}"
          GF_INSTALL_PLUGINS: grafana-piechart-panel
      tags: [ 'grafana' ]

- hosts: prometheuses
  remote_user: root
  tasks:
    - name: Data dir for prometheus
      file: path=/var/lib/prometheus state=directory
      tags: [ 'prometheus' ]
    - include: docker_prometheus.yaml

- hosts: prometheus-kuber
  remote_user: root
  tasks:
    - name: Copy prometheus config
      template: src=prometheus/prometheus-kuber.yml.j2 dest=/var/lib/prometheus/prometheus.yml
      register: prometheus_yml
      tags: [ 'prometheus', 'prometheus-conf' ]
    - include: docker_prometheus.yaml
    - name: Send kill -1 to prometheus if prometheus.yml changed
      command: pkill -1 prometheus
      when: prometheus_yml.changed
      tags: [ 'prometheus', 'prometheus-conf' ]

- hosts: prometheus-system
  remote_user: root
  tasks:
    - name: Copy prometheus config
      template: src=prometheus/prometheus-system.yml.j2 dest=/var/lib/prometheus/prometheus.yml
      register: prometheus_yml
      tags: [ 'prometheus', 'prometheus-conf' ]
    - include: docker_prometheus.yaml
    - name: Send kill -1 to prometheus if prometheus.yml changed
      command: pkill -1 prometheus
      when: prometheus_yml.changed
      tags: [ 'prometheus', 'prometheus-conf' ]
@@ -0,0 +1,118 @@
---
- hosts: all-cluster-nodes
  remote_user: root
  tasks:
    - name: Create user telegraf
      user: name=telegraf home=/opt/telegraf
    - name: Create /opt/telegraf
      file: path=/opt/telegraf state=directory owner=telegraf
    - name: Create bin dir for telegraf
      file: path=/opt/telegraf/bin state=directory owner=telegraf
    - name: Create etc dir for telegraf
      file: path=/opt/telegraf/etc state=directory owner=telegraf
    - name: Copy telegraf to server
      copy: src=../../telegraf/opt/bin/telegraf dest=/opt/telegraf/bin/telegraf mode=0755
      register: telegraf_bin
    - name: Copy telegraf.service
      copy: src=telegraf/telegraf.service dest=/etc/systemd/system/telegraf.service
      register: telegraf_service
    - name: Start and enable telegraf
      systemd: state=started enabled=yes daemon_reload=yes name=telegraf
    - name: Delete allmetrics.tmp.lock
      file: path=/opt/telegraf/bin/data/allmetrics.tmp.lock state=absent
      when: telegraf_service.changed or telegraf_bin.changed
    - name: Restart telegraf if telegraf binary has been changed
      systemd: state=restarted name=telegraf
      when: telegraf_bin.changed
    - name: Install software
      apt: name={{ item }} state=installed
      with_items:
        - sysstat
        - numactl
    - name: Copy system metric scripts
      copy: src=../../telegraf/opt/system_stats/{{ item }} dest=/opt/telegraf/bin/{{ item }} mode=0755
      with_items:
        - entropy.sh
        - iostat_per_device.sh
        - memory_bandwidth.sh
        - numa_stat_per_pid.sh
        - per_process_cpu_usage.sh
        - list_openstack_processes.sh
        - network_tcp_queue.sh
    - name: Copy pcm-memory-one-line.x
      copy: src=../../telegraf/opt/system_stats/intel_pcm_mem/pcm-memory-one-line.x dest=/opt/telegraf/bin/pcm-memory-one-line.x mode=0755
    - name: Add sysctl for pcm
      sysctl: name=kernel.nmi_watchdog value=0 state=present reload=yes
    - name: Load kernel module msr
      modprobe: name=msr state=present
    - name: Add module autoload
      lineinfile: dest=/etc/modules line='msr'
    - name: Add user telegraf to sudoers
      lineinfile:
        dest: /etc/sudoers
        state: present
        line: "telegraf ALL=(ALL) NOPASSWD: ALL"

- hosts: cluster-nodes
  remote_user: root
  tasks:
    - name: Copy telegraf config
      copy: src=./telegraf/telegraf-sys.conf dest=/opt/telegraf/etc/telegraf.conf
      register: telegraf_conf
    - name: Restart telegraf if config has been changed
      systemd: state=restarted name=telegraf
      when: telegraf_conf.changed

- hosts: main-kuber
  remote_user: root
  tasks:
    - name: Copy openstack scripts
      copy: src=../../telegraf/opt/osapi/{{ item }} dest=/opt/telegraf/bin/{{ item }} mode=0755
      with_items:
        - glog.sh
        - osapitime.sh
        - vmtime.sh
      tags: [ 'openstack' ]
    - name: Copy etcd scripts
      copy: src=../../telegraf/opt/k8s_etcd/{{ item }} dest=/opt/telegraf/bin/{{ item }} mode=0755
      with_items:
        - etcd_get_metrics.sh
        - k8s_get_metrics.sh
    - name: Install software for scripts
      apt: name={{ item }} state=installed
      with_items:
        - mysql-client
        - bc
        - jq
      tags: [ 'openstack' ]
    - name: Create dirs for scripts
      file: path=/opt/telegraf/bin/{{ item }} state=directory owner=telegraf
      with_items:
        - log
        - data
    - name: Copy telegraf config
      template: src=telegraf/telegraf-openstack.conf.j2 dest=/opt/telegraf/etc/telegraf.conf
      register: telegraf_conf
      tags: [ 'openstack' ]
    - name: Delete allmetrics.tmp.lock
      file: path=/opt/telegraf/bin/data/allmetrics.tmp.lock state=absent
      when: telegraf_conf.changed
    - name: Restart telegraf if config has been changed
      systemd: state=restarted name=telegraf
      when: telegraf_conf.changed
      tags: [ 'openstack' ]

- hosts: all-cluster-nodes
  remote_user: root
  tasks:
    - name: Reload telegraf if service file has been changed
      systemd: daemon_reload=yes state=reloaded name=telegraf
      when: telegraf_service.changed

- hosts: main
  remote_user: root
  tasks:
    - name: Update prometheus config
      template: src=./prometheus/targets.yml.j2 dest=/var/lib/prometheus/targets-{{ cluster_tag }}.yml
      tags: [ 'prometheus' ]
@@ -0,0 +1,46 @@
#!/bin/bash
TMP_YAML=$(mktemp -u)

export ANSIBLE_HOST_KEY_CHECKING=False
export SSH_USER="root"
export SSH_PASS="r00tme"
cd $(dirname $(realpath $0))

ENV=${1}
if [ -z "${ENV}" ]; then
    echo "Please provide env number: $(basename $0) [1|2|3|4|5|6]"
    exit 1
fi
PROMETHEUS_HOST="172.20.9.115"
KUBE_MAIN_NODE="172.20.8.6${ENV}"
CLUSTER_TAG="env-${ENV}"

ETCD=""

SSH_OPTS="-o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no"

TARGETS=$(sshpass -p ${SSH_PASS} ssh ${SSH_OPTS} ${SSH_USER}@${KUBE_MAIN_NODE} curl -ks https://127.0.0.1:2379/v2/members | python -m json.tool | grep 2379)

if [ -z "$TARGETS" ]; then
    echo "No etcd found"
    exit 1
fi

for i in ${TARGETS}; do
    TEMP_TARGET=${i#\"https://}
    ETCD="$ETCD ${TEMP_TARGET%\"}"
done

echo "- targets:" > ${TMP_YAML}
for i in ${ETCD}; do
    echo "  - $i" >> ${TMP_YAML}
done
echo "  labels:" >> ${TMP_YAML}
echo "    env: ${CLUSTER_TAG}" >> ${TMP_YAML}

echo "Targets file is ready"
cat ${TMP_YAML}
sshpass -p ${SSH_PASS} scp ${SSH_OPTS} ${TMP_YAML} root@${PROMETHEUS_HOST}:/var/lib/prometheus/etcd-env-${1}.yml
rm ${TMP_YAML}
@@ -0,0 +1,2 @@
#!/bin/bash
ansible-playbook -i ./hosts ./deploy-graf-prom.yaml --tags "grafana"
@@ -0,0 +1,2 @@
#!/bin/bash
ansible-playbook -i ./hosts ./deploy-graf-prom.yaml --tags "prometheus"
@@ -0,0 +1,65 @@
#!/bin/bash
set -e
export ANSIBLE_HOST_KEY_CHECKING=False
export SSH_USER="root"
export SSH_PASS="r00tme"
cd $(dirname $(realpath $0))

ENV=${1}
if [ -z "${ENV}" ]; then
    echo "Please provide env number: $(basename $0) [1|2|3|4|5|6]"
    exit 1
fi
PROMETHEUS_NODE="172.20.124.25"
KUBE_MAIN_NODE="172.20.8.6${ENV}"
CLUSTER_TAG="env-${ENV}"

# Secret option
ANSIBLE_TAG=$2

SSH_OPTS="-q -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no"

echo "Get cluster nodes"

NODES_TMP=$(sshpass -p ${SSH_PASS} ssh ${SSH_OPTS} ${SSH_USER}@${KUBE_MAIN_NODE} 'kubectl get nodes -o jsonpath='"'"'{.items[*].status.addresses[?(@.type=="InternalIP")].address}'"'"'')
ALL_IP_ON_KUBER_NODE=$(sshpass -p ${SSH_PASS} ssh ${SSH_OPTS} ${SSH_USER}@${KUBE_MAIN_NODE} ip addr | grep 172.20 | awk '{print $2}' | awk -F'/' '{print $1}')
GREP_STRING_TMP=""
for i in $ALL_IP_ON_KUBER_NODE; do
    GREP_STRING_TMP="${GREP_STRING_TMP}${i}|"
done
GREP_STRING=${GREP_STRING_TMP:0:-1}
SSH_AUTH="ansible_ssh_user=${SSH_USER} ansible_ssh_pass=${SSH_PASS}"
echo "[main]" > cluster-hosts
echo "${PROMETHEUS_NODE} ${SSH_AUTH}" >> cluster-hosts
echo "[main-kuber]" >> cluster-hosts
echo "${KUBE_MAIN_NODE} ${SSH_AUTH}" >> cluster-hosts
echo "[cluster-nodes]" >> cluster-hosts
set +e
# Remove IPs of the kuber node itself
for i in ${NODES_TMP} ; do
    TMP_VAR=$(echo $i | grep -vE "(${GREP_STRING})")
    NODES="${NODES} ${TMP_VAR}"
done
set -e
for i in ${NODES} ; do
    if [ "$i" != "${KUBE_MAIN_NODE}" ]; then
        echo "${i} ${SSH_AUTH}" >> cluster-hosts
    fi
done
echo "[all-cluster-nodes:children]" >> cluster-hosts
echo "main-kuber" >> cluster-hosts
echo "cluster-nodes" >> cluster-hosts
LINES=$(wc -l cluster-hosts | awk '{print $1}')
NUM_NODES=$(($LINES - 7))
if [ ${NUM_NODES} -le 0 ]; then
    echo "Something is wrong: $NUM_NODES nodes found"
    exit 1
else
    echo "${NUM_NODES} nodes found"
fi

if [ -z "${ANSIBLE_TAG}" ]; then
    ansible-playbook -f 40 -i ./cluster-hosts -e cluster_tag=${CLUSTER_TAG} ./deploy-telegraf.yaml
else
    ansible-playbook -f 40 -i ./cluster-hosts -e cluster_tag=${CLUSTER_TAG} -t ${ANSIBLE_TAG} ./deploy-telegraf.yaml
fi
@@ -0,0 +1,10 @@
---
- name: Deploy prometheus in docker
  docker_container:
    name: prometheus
    image: 'prom/prometheus:v1.4.0'
    ports: 9090:9090
    state: started
    volumes: ['/var/lib/prometheus:/prometheus']
    command: '-config.file=/prometheus/prometheus.yml -storage.local.retention 168h0m0s -storage.local.max-chunks-to-persist 3024288 -storage.local.memory-chunks=50502740 -storage.local.num-fingerprint-mutexes=300960'
  tags: [ 'prometheus' ]
@@ -0,0 +1,58 @@
global:
  scrape_interval: 15s     # By default, scrape targets every 15 seconds.
  evaluation_interval: 15s # By default, evaluate rules every 15 seconds.
  # Attach these labels to any time series or alerts when communicating with
  # external systems (federation, remote storage, Alertmanager).
  external_labels:
    monitor: 'codelab-monitor'

rule_files:
  # - "first.rules"
  # - "second.rules"

scrape_configs:

  - job_name: 'prometheus'
    scrape_interval: 5s
    scrape_timeout: 5s
    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.

    static_configs:
      - targets: ['172.20.9.115:9090']

{% for env_num in range(1,7) %}
  - job_name: 'k8-env-{{env_num}}'
    scrape_interval: 30s
    scrape_timeout: 30s
    scheme: https
    tls_config:
      insecure_skip_verify: true
    kubernetes_sd_configs:
      - api_server: 'https://172.20.8.6{{env_num}}:443'
        role: node
        tls_config:
          insecure_skip_verify: true
        basic_auth:
          username: kube
          password: changeme
    relabel_configs:
      - action: labelmap
        regex: __meta_kubernetes_node_label_(.+)
      - source_labels: [__address__]
        target_label: env
        regex: .*
        replacement: env-{{env_num}}

  - job_name: 'etcd-env-{{env_num}}'
    scrape_interval: 5s
    scrape_timeout: 5s
    scheme: https
    tls_config:
      insecure_skip_verify: true
    file_sd_configs:
      - files:
          - etcd-env-{{env_num}}.yml
{% endfor %}
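With the container from docker_prometheus.yaml running, target health can be checked from the shell through the Prometheus v1 HTTP API, e.g.:

    curl -s 'http://172.20.9.115:9090/api/v1/query?query=up' | python -m json.tool    # value 1 means the target was scraped successfully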
@@ -0,0 +1,33 @@
global:
  scrape_interval: 15s     # By default, scrape targets every 15 seconds.
  evaluation_interval: 15s # By default, evaluate rules every 15 seconds.
  # Attach these labels to any time series or alerts when communicating with
  # external systems (federation, remote storage, Alertmanager).
  external_labels:
    monitor: 'codelab-monitor'

rule_files:
  # - "first.rules"
  # - "second.rules"

scrape_configs:

  - job_name: 'prometheus'
    scrape_interval: 5s
    scrape_timeout: 5s
    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.

    static_configs:
      - targets: ['172.20.124.25:9090']

{% for env_num in range(1,7) %}
  - job_name: 'telegraf-systems-env-{{env_num}}'
    scrape_interval: 30s
    scrape_timeout: 30s
    file_sd_configs:
      - files:
          - targets-env-{{env_num}}.yml
{% endfor %}
@@ -0,0 +1,6 @@
- targets:
{% for host in groups['all-cluster-nodes'] %}
  - {{ hostvars[host]['inventory_hostname'] }}:9126
{% endfor %}
  labels:
    env: {{ cluster_tag }}
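Rendered against a small env-1 inventory, the template yields a file_sd targets file of this shape (addresses are illustrative):

    - targets:
      - 172.20.8.61:9126
      - 172.20.9.101:9126
      labels:
        env: env-1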
@@ -0,0 +1,5 @@
#!/bin/bash
export LANG=C
set -o nounset # Treat unset variables as an error
echo "system entropy=$(cat /proc/sys/kernel/random/entropy_avail)"
@@ -0,0 +1,33 @@
#!/bin/bash -e

ETCD=/usr/local/bin/etcdctl

type jq >/dev/null 2>&1 || ( echo "jq is not installed" ; exit 1 )
type curl >/dev/null 2>&1 || ( echo "curl is not installed" ; exit 1 )

# get etcd members credentials
MEMBERS="${ETCD} --endpoints https://127.0.0.1:2379 member list"
LEADER_ID=$(eval "$MEMBERS" | awk -F ':' '/isLeader=true/ {print $1}')
LEADER_ENDPOINT=$(eval "$MEMBERS" | awk '/isLeader=true/ {print $4}' | cut -d"=" -f2)
SLAVE_ID=$(eval "$MEMBERS" | grep 'isLeader=false' | head -n 1 | awk -F ":" '{print $1}')
SLAVE_ENDPOINT=$(eval "$MEMBERS" | grep 'isLeader=false' | head -n 1 | awk '{print $4}' | cut -d"=" -f2)

# member count:
metric_members_count=`curl -s -k https://172.20.9.15:2379/v2/members | jq -c '.members | length'`
metric_total_keys_count=`${ETCD} --endpoints https://127.0.0.1:2379 ls -r --sort | wc -l`
metric_total_size_dataset=`pidof etcd | xargs ps -o rss | awk '{rss+=$1} END {print rss}'`
metric_store_stats=`curl -s -k ${LEADER_ENDPOINT}/v2/stats/store | tr -d \"\{\} | sed -e 's/:/=/g'`
metric_latency_from_leader_avg=`curl -s -k ${LEADER_ENDPOINT}/v2/stats/leader | \
    jq -c ".followers.\"${SLAVE_ID}\".latency.average"`
metric_leader_stats=`curl -s -k ${LEADER_ENDPOINT}/v2/stats/self | \
    jq -c "{ sendBandwidthRate: .sendBandwidthRate, sendAppendRequestCnt: \
    .sendAppendRequestCnt, sendPkgRate: .sendPkgRate }" | tr -d \"\{\} | sed -e 's/:/=/g'`
metric_slave_stats=`curl -s -k ${SLAVE_ENDPOINT}/v2/stats/self | \
    jq -c "{ recvBandwidthRate: .recvBandwidthRate, recvAppendRequestCnt: \
    .recvAppendRequestCnt, recvPkgRate: .recvPkgRate }" | tr -d \"\{\} | sed -e 's/:/=/g'`
cat << EOF
etcd_general_stats,group=etcd_cluster_metrics members_count=${metric_members_count},dataset_size=${metric_total_size_dataset},total_keys_count=${metric_total_keys_count}
etcd_leader_stats,group=etcd_cluster_metrics $metric_leader_stats
etcd_follower_stats,group=etcd_cluster_metrics ${metric_slave_stats},latency_from_leader_avg=${metric_latency_from_leader_avg}
etcd_store_stats,group=etcd_cluster_metrics $metric_store_stats
EOF
@@ -0,0 +1,105 @@
#!/bin/bash
# Logs extractor / parser
# checking that we are good
if [[ -z "${TMP_DIR}" || -z "${POD}" || -z "${CONTAINER}" || -z "${K8S_NS}" || -z "${OS_LOG_FIELDS}" || -z ${CONTID} ]]; then
    echo "Required variables are not set, exiting!"
    exit 1
fi
# Variables declaration
SSH_USER="${SSH_USER:-root}"
SSH_PASS="${SSH_PASS:-r00tme}"
LOG_ENTRIES_NUMBER=${LOG_ENTRIES_NUMBER:-1000}
LAST_TIME_STAMP_FILE="${TMP_DIR}/timestamp.tmp"
# get | set last timestamp for log entries
function last_ts_data()
{
    local action
    action=${1}
    shift
    if [ "${action}" == "get" ]; then
        if [ -e ${LAST_TIME_STAMP_FILE} ]; then
            cat ${LAST_TIME_STAMP_FILE}
        fi
    else
        echo "$*" > ${LAST_TIME_STAMP_FILE}
    fi
}
function print_out()
{
    if [ -z "${TMP_METRICS}" ]; then
        echo "$@"
    else
        echo "$@" >> ${TMP_METRICS}
    fi
}
function micro_to_seconds()
{
    local input
    local output
    input="${1}"
    output=$(echo "scale=4;${input}/1000000" | bc)
    if echo ${output} | grep -q '^\..'; then
        output="0${output}"
    fi
    echo "${output}"
}
# extract container logs from k8s
function get_logs()
{
    local sdate
    local stime
    local scalltime
    local lasttimestamp
    local is_foundlast
    local tmpdata
    tmpdata="${TMP_DIR}/tmpdata.log"
    if [ -e "${tmpdata}" ]; then rm -f ${tmpdata}; fi
    if [ "${CONTAINER}" == "keystone" ]; then
        sshpass -p ${SSH_PASS} ssh -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no ${SSH_USER}@${HOST} "tail -n${LOG_ENTRIES_NUMBER} /var/log/ccp/keystone/keystone-access.log | cut -d' ' -f${OS_LOG_FIELDS} | sed -e 's#\[##g' -e 's#\]##g'" 2>/dev/null > ${tmpdata}
    else
        sshpass -p ${SSH_PASS} ssh -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no ${SSH_USER}@${HOST} "docker logs --tail ${LOG_ENTRIES_NUMBER} ${CONTID} 2>&1 | grep 'INFO' | grep 'GET /' | cut -d' ' -f${OS_LOG_FIELDS}" 2>/dev/null > ${tmpdata}
    fi
    is_foundlast=false
    lasttimestamp=$(last_ts_data "get")
    if [ -z "${lasttimestamp}" ]; then
        while read log
        do
            sdate=$(echo ${log} | cut -d' ' -f1)
            stime=$(echo ${log} | cut -d' ' -f2)
            scalltime=$(echo ${log} | cut -d' ' -f3)
            if [ "${CONTAINER}" == "keystone" ]; then scalltime=$(micro_to_seconds ${scalltime}); fi
            if [ ! -z "${scalltime}" ]; then
                print_out "os_api_response_time,container=${CONTAINER},pod=${POD},instance=${HOST},requestdate=${sdate},requesttime=${stime} processingtime=${scalltime}"
            fi
        done < <(cat ${tmpdata})
        sdate=$(tail -n 1 ${tmpdata} | cut -d' ' -f1)
        stime=$(tail -n 1 ${tmpdata} | cut -d' ' -f2)
        last_ts_data "set" "${sdate}${stime}"
    else
        while read log
        do
            sdate=$(echo ${log} | cut -d' ' -f1)
            stime=$(echo ${log} | cut -d' ' -f2)
            scalltime=$(echo ${log} | cut -d' ' -f3)
            if [ "${CONTAINER}" == "keystone" ]; then scalltime=$(micro_to_seconds ${scalltime}); fi
            if [[ "${is_foundlast}" = "false" && "${lasttimestamp}" = "${sdate}${stime}" ]]; then
                #echo "FOUND: ${sdate}${stime} ${scalltime}"
                is_foundlast=true
                continue
            fi
            if [ "${is_foundlast}" == "true" ]; then
                if [ ! -z "${scalltime}" ]; then
                    print_out "os_api_response_time,container=${CONTAINER},pod=${POD},instance=${HOST},requestdate=${sdate},requesttime=${stime} processingtime=${scalltime}"
                fi
            fi
        done < <(cat ${tmpdata})
        if [ "${is_foundlast}" == "true" ]; then
            sdate=$(tail -n 1 ${tmpdata} | cut -d' ' -f1)
            stime=$(tail -n 1 ${tmpdata} | cut -d' ' -f2)
            last_ts_data "set" "${sdate}${stime}"
        fi
    fi
    rm -f ${tmpdata}
}
# Main logic
get_logs
@@ -0,0 +1,6 @@
#!/bin/bash
# output from iostat -Ndx is:
# Device: rrqm/s wrqm/s r/s w/s rkB/s wkB/s avgrq-sz avgqu-sz await r_await w_await svctm %util
export LANG=C
iostat -Ndx | tail -n +4 | head -n -1 | awk '{print "system_per_device_iostat,device="$1" read_merge="$2",write_merge="$3",await="$10",read_await="$11",write_await="$12",util="$14",average_queue="$9}'
@@ -0,0 +1,75 @@
#!/bin/bash -e

K8S_MASTER=127.0.0.1

if [[ $1 ]] ; then
    K8S_MASTER=$1
fi

type jq >/dev/null 2>&1 || ( echo "Jq is not installed" ; exit 1 )
type curl >/dev/null 2>&1 || ( echo "Curl is not installed" ; exit 1 )

curl_get() {
    url="https://${K8S_MASTER}$@"
    curl -k -s -u kube:changeme $url || ( echo "Curl failed at: $url" 1>&2; exit 1 )
}
# gather the output of frequent API calls into separate files (in order to avoid long timeouts):
node_file=`mktemp /tmp/XXXXX`
pods_file=`mktemp /tmp/XXXXX`
endpoints_file=`mktemp /tmp/XXXXX`
curl_get "/api/v1/nodes" > $node_file
curl_get "/api/v1/pods" > $pods_file
curl_get "/api/v1/endpoints" > $endpoints_file
# metrics retrieval:
number_of_namespaces_total=`curl_get "/api/v1/namespaces" | jq '[ .items[] .metadata.name ] | length'`
number_of_services_total=`curl_get "/api/v1/services" | jq -c '[ .items[] .metadata.name ] | length'`
number_of_nodes_total=`jq -c '[ .items[] .metadata.name ] | length' $node_file`
number_of_unsched=`jq -c '[ .items[] | select(.spec.unschedulable != null) .metadata.name ] | length' $node_file`
number_in_each_status=`jq -c '[ .items[] | .status.conditions[] | select(.type == "Ready") .status \
| gsub("(?<a>.+)"; "number_of_status_\(.a)" ) ] | group_by(.) | map({(.[0]): length}) | add ' $node_file \
| tr -d \"\{\} | sed -e 's/:/=/g'`
number_of_pods_total=`jq -c '[ .items[] .metadata.name ] | length' $pods_file`
number_of_pods_state_Pending=`jq -c '[ .items[] .status.phase | select(. == "Pending")] | length' $pods_file`
number_of_pods_state_Running=`jq -c '[ .items[] .status.phase | select(. == "Running")] | length' $pods_file`
number_of_pods_state_Succeeded=`jq -c '[ .items[] .status.phase | select(. == "Succeeded")] | length' $pods_file`
number_of_pods_state_Failed=`jq -c '[ .items[] .status.phase | select(. == "Failed")] | length' $pods_file`
number_of_pods_state_Unknown=`jq -c '[ .items[] .status.phase | select(. == "Unknown")] | length' $pods_file`
number_of_pods_per_node=`jq -c '[ .items[] | .spec.nodeName ] | group_by(.) | \
map("k8s_pods_per_node,group=k8s_cluster_metrics,pod_node=\(.[0]) value=\(length)")' $pods_file \
| sed -e 's/\["//g' -e 's/"\]//g' -e 's/","/\n/g'`
number_of_pods_per_ns=`jq -c '[ .items[] | .metadata.namespace ] | group_by(.) | \
map("k8s_pods_per_namespace,group=k8s_cluster_metrics,ns=\(.[0]) value=\(length)")' $pods_file \
| sed -e 's/\["//g' -e 's/"\]//g' -e 's/","/\n/g'`
number_of_endpoints_each_service=`jq -c '[ .items[] | { service: .metadata.name, endpoints: .subsets[] } | \
. as { service: $svc, endpoints: $endp } | $endp.addresses | length | . as $addr | $endp.ports | length | \
. as $prts | "k8s_services,group=k8s_cluster_metrics,service=\($svc) endpoints_number=\($addr * $prts)" ] ' $endpoints_file \
| sed -e 's/\["//g' -e 's/"\]//g' -e 's/","/\n/g'`
number_of_endpoints_total=`jq -c '[ .items[] | .subsets[] | { addrs: .addresses, ports: .ports } \
| map (length ) | .[0] * .[1] ] | add' $endpoints_file`
number_of_API_instances=`curl_get "/api/" | jq -c '.serverAddressByClientCIDRs | length'`
number_of_controllers=`curl_get "/api/v1/replicationcontrollers" | jq '.items | length'`
number_of_scheduler_instances=`curl_get /api/v1/namespaces/kube-system/pods?labelSelector='k8s-app=kube-scheduler' \
| jq -c '.items | length' `
cluster_resources_CPU=`jq -c '[ .items[] .status.capacity.cpu | tonumber ] | add' $node_file`
cluster_resources_RAM=`jq -c '[ .items[] .status.capacity.memory| gsub("[a-z]+$"; "" ; "i") | tonumber] | add' $node_file`

# output:
cat << EOF
k8s_nodes,group=k8s_cluster_metrics number_of_nodes_total=${number_of_nodes_total},number_of_unsched=${number_of_unsched}
k8s_nodes_states,group=k8s_cluster_metrics ${number_in_each_status}
k8s_namespaces,group=k8s_cluster_metrics number_of_namespaces_total=${number_of_namespaces_total}
k8s_pods,group=k8s_cluster_metrics number_of_pods_total=${number_of_pods_total}
k8s_pods_states,group=k8s_cluster_metrics number_of_pods_state_Pending=${number_of_pods_state_Pending},number_of_pods_state_Running=${number_of_pods_state_Running},number_of_pods_state_Succeeded=${number_of_pods_state_Succeeded},number_of_pods_state_Failed=${number_of_pods_state_Failed},number_of_pods_state_Unknown=${number_of_pods_state_Unknown}
${number_of_pods_per_node}
${number_of_pods_per_ns}
${number_of_endpoints_each_service}
k8s_services,group=k8s_cluster_metrics number_of_services_total=${number_of_services_total},number_of_endpoints_total=${number_of_endpoints_total}
k8s_number_of_API_instances,group=k8s_cluster_metrics value=${number_of_API_instances}
k8s_number_of_controllers,group=k8s_cluster_metrics value=${number_of_controllers}
k8s_number_of_scheduler_instances,group=k8s_cluster_metrics value=${number_of_scheduler_instances}
k8s_cluster_resources,group=k8s_cluster_metrics cpu_total=${cluster_resources_CPU},ram_total=${cluster_resources_RAM}
EOF

# cleanup
rm -f $node_file $pods_file $endpoints_file
@@ -0,0 +1,15 @@
#!/bin/bash
export LANG=C
PS_ALL=$(ps --no-headers -A -o command | grep -vE '(sh|bash)')
M_NAME=system_openstack_list

MARIADB=$(echo "${PS_ALL}" | grep 'mariadb' | wc -l)
RABBITMQ=$(echo "${PS_ALL}" | grep 'rabbitmq' | wc -l)
KEYSTONE=$(echo "${PS_ALL}" | grep 'keystone' | wc -l)
GLANCE=$(echo "${PS_ALL}" | grep -E '(glance-api|glance-registry)' | wc -l)
CINDER=$(echo "${PS_ALL}" | grep 'cinder' | wc -l)
NOVA=$(echo "${PS_ALL}" | grep -E '(nova-api|nova-conductor|nova-consoleauth|nova-scheduler)' | wc -l)
NEUTRON=$(echo "${PS_ALL}" | grep -E '(neutron-server|neutron-metadata-agent|neutron-dhcp-agent|neutron-l3-agent|neutron-openvswitch-agent)' | wc -l)
OPENVSWITCH=$(echo "${PS_ALL}" | grep -E '(ovsdb-server|ovs-vswitchd|ovsdb-client)' | wc -l)

echo "${M_NAME} mariadb=${MARIADB},rabbitmq=${RABBITMQ},keystone=${KEYSTONE},glance=${GLANCE},cinder=${CINDER},nova=${NOVA},neutron=${NEUTRON},openvswitch=${OPENVSWITCH}"
@@ -0,0 +1,7 @@
#!/bin/bash
# Output in MB/s
# echo 0 > /proc/sys/kernel/nmi_watchdog
# modprobe msr
export LANG=C
MEM_BW=$(sudo /opt/telegraf/bin/pcm-memory-one-line.x /csv 1 2>/dev/null | tail -n 1 | awk '{print $28}')
echo "system_memory bandwidth=${MEM_BW}"
@@ -0,0 +1,13 @@
#!/bin/bash
export LANG=C
IFS='
'
SUM_RESV_Q=0
SUM_SEND_Q=0
# skip the two netstat header lines, then sum Recv-Q/Send-Q per connection
for i in $(netstat -4 -n | tail -n +3); do
    RESV_Q=$(echo $i | awk '{print $2}')
    SEND_Q=$(echo $i | awk '{print $3}')
    SUM_RESV_Q=$((${SUM_RESV_Q} + ${RESV_Q}))
    SUM_SEND_Q=$((${SUM_SEND_Q} + ${SEND_Q}))
done
echo "system_tcp_queue sum_recv=${SUM_RESV_Q},sum_send=${SUM_SEND_Q}"
@@ -0,0 +1,22 @@
#!/bin/bash
set -o nounset  # Treat unset variables as an error
#set -x
export LANG=C
if [ ! -d '/sys/devices/system/node' ]; then
    # This host does not have NUMA
    exit 44
fi
ALL_PROCESS="$(ps --no-headers -A -o pid,ucomm)"
for i in $(echo "${ALL_PROCESS}" | awk '{print $1}'); do
    if [ -f "/proc/$i/numa_maps" ]; then
        NUM_STAT=$(numastat -p $i)
        PROC_NAME=$(echo "${ALL_PROCESS}" | grep -E "( $i |^$i )" | awk '{print $2}')
        echo "${NUM_STAT}" | grep Huge | awk -v p=$i -v n=$PROC_NAME \
            '{printf "system_numa_memory_per_pid,pid="p",name="n" memory_huge="$NF","}'
        echo "${NUM_STAT}" | grep Heap | awk '{printf "memory_heap="$NF","}'
        echo "${NUM_STAT}" | grep Stack | awk '{printf "memory_stack="$NF","}'
        echo "${NUM_STAT}" | grep Private | awk '{print "memory_private="$NF}'
    fi
done
@@ -0,0 +1,215 @@
#!/bin/bash
# Variables declaration
WORKDIR="$(cd "$(dirname ${0})" && pwd)"
OS_LOG_PARSER="${WORKDIR}/glog.sh"
TMPDATADIR="${WORKDIR}/data"
TMP_METRICS="${TMPDATADIR}/allmetrics.tmp"
MODE="${MODE:-bg}"
SCRIPT_LOG_DIR="${WORKDIR}/logs"
SCRIPT_LOG_FILE="${SCRIPT_LOG_DIR}/run_results_$(date +%Y-%m-%d).log"
SCRIPT_LOG_LVL=2
K8S_NS="${K8S_NS:-ccp}"
declare -a OSCONTROLLER=(
    'cinder-api:1,2,21'
    'glance-api:1,2,22'
    'heat-api:1,2,22'
    'neutron-metadata-agent:1,2,17'
    'neutron-server:1,2,22'
    'nova-api:1,2,21'
    'keystone:4,5,11'
)
declare -a OSCOMPUTE=(
    'nova-compute:'
)
# create subfolder under the working directory
function mk_dir()
{
    local newdir="${TMPDATADIR}/${1}"
    if [ ! -d "${newdir}" ]; then
        mkdir -p ${newdir}
    fi
}
# log function
function log()
{
    local input
    local dtstamp
    input="$*"
    dtstamp="$(date +%Y-%m-%d_%H%M%S)"
    if [ ! -d "${SCRIPT_LOG_DIR}" ]; then
        mkdir -p "${SCRIPT_LOG_DIR}"
    fi
    case "${SCRIPT_LOG_LVL}" in
        3)
            if [ ! -z "${input}" ]; then
                echo "${dtstamp}: ${input}" | tee -a "${SCRIPT_LOG_FILE}"
            fi
            ;;
        2)
            if [ ! -z "${input}" ]; then
                echo "${dtstamp}: ${input}" >> "${SCRIPT_LOG_FILE}"
            fi
            ;;
        1)
            if [ ! -z "${input}" ]; then
                echo "${dtstamp}: ${input}"
            fi
            ;;
        *)
            ;;
    esac
}
# get roles according to those predefined in OSCONTROLLER & OSCOMPUTE
function get_role()
{
    local role
    local input
    local arr_name
    local arr_name_fields
    role=${1}
    shift
    input=$*
    case ${role} in
        "controller")
            for i in $(seq 0 $(( ${#OSCONTROLLER[@]} - 1)))
            do
                arr_name=$(echo ${OSCONTROLLER[${i}]} | cut -d":" -f1)
                arr_name_fields=$(echo ${OSCONTROLLER[${i}]} | cut -d":" -f2)
                if [[ "${arr_name}" == "${input}" ]]; then
                    echo "${arr_name_fields}"
                    return 0
                fi
            done
            ;;
        "compute")
            for i in $(seq 0 $(( ${#OSCOMPUTE[@]} - 1)))
            do
                arr_name=$(echo ${OSCOMPUTE[${i}]} | cut -d":" -f1)
                arr_name_fields=$(echo ${OSCOMPUTE[${i}]} | cut -d":" -f2)
                if [ "${arr_name}" == "${input}" ]; then
                    echo "${arr_name_fields}"
                    return 0
                fi
            done
            ;;
    esac
    return 1
}
# diff in seconds
function tdiff()
{
    local now
    local datetime
    local result
    datetime="$(date -d "${1}" +%s)"
    now="$(date +%s)"
    result=$(( ${now} - ${datetime} ))
    echo ${result}
}
# lock file function
function glock()
{
    local action
    local lockfile
    local accessdate
    local old_in_sec=120
    action="${1}"
    # lockfile="${TMP_METRICS}.lock"
    lockfile="${TMPDATADIR}/allmetrics.tmp.lock"
    if [[ "${action}" == "lock" && ! -e "${lockfile}" ]]; then
        touch "${lockfile}"
    elif [[ "${action}" == "lock" && -e "${lockfile}" ]]; then
        accessdate="$(stat ${lockfile} | grep Modify | cut -d' ' -f2,3)"
        if [ "$(tdiff "${accessdate}")" -ge "${old_in_sec}" ]; then
            rm "${lockfile}"
            touch "${lockfile}"
        else
            log "Lock file ${lockfile} exists!"
            return 1
        fi
    else
        rm "${lockfile}"
    fi
    return 0
}
# wait for parsers launched in background mode
function gatherchildren()
{
    local childrencount
    while true
    do
        childrencount=$(ps axf | grep ${OS_LOG_PARSER} | grep -v grep | wc -l)
        if [ "${childrencount}" -eq 0 ]; then
            return
        fi
        log "Children running ${childrencount}."
        sleep 1
    done
}
# list of running containers
function get_k8s_containers()
{
    local cont_host
    local cont_pod
    local cont_name
    local cont_id
    local os_log_fields
    local cont_tmp_dir
    local _raw_data
    glock "lock"
    if [ "$?" -ne 0 ]; then exit 1; fi
    #echo '[' > ${TMP_METRICS}
    _raw_data="${TMPDATADIR}/._raw_data"
    rm -rf ${_raw_data}
    kubectl get pods -n "${K8S_NS}" -o 'go-template={{range .items}}{{if or (ne .status.phase "Succeeded") (eq .status.phase "Running")}}{{.spec.nodeName}},{{.metadata.name}},{{range .status.containerStatuses}}{{.name}},{{.containerID}}{{end}}{{"\n"}}{{end}}{{end}}' > ${_raw_data}
    for data in $(cat ${_raw_data})
    do
        cont_host=$(echo ${data} | cut -d',' -f1)
        cont_pod=$(echo ${data} | cut -d',' -f2)
        cont_name=$(echo ${data} | cut -d',' -f3)
        cont_id=$(echo ${data} | cut -d',' -f4 | sed 's|^docker://||')
        cont_tmp_dir="${cont_host}_${cont_pod}_${cont_name}"
        os_log_fields=$(get_role "controller" "${cont_name}")
        if [ "$?" -eq 0 ]; then
            mk_dir "${cont_tmp_dir}"
            export K8S_NS=${K8S_NS}
            export TMP_DIR=${TMPDATADIR}/${cont_tmp_dir}
            # export TMP_METRICS=${TMP_METRICS}
            export TMP_METRICS="${TMPDATADIR}/results/${cont_pod}.tmp"
            export CONTID=${cont_id}
            export CONTAINER=${cont_name}
            export HOST=${cont_host}
            export POD=${cont_pod}
            export OS_LOG_FIELDS=${os_log_fields}
            log "MODE=${MODE} CONTID=${cont_id} TMP_METRICS=${TMP_METRICS} ROLE=controller HOST=${cont_host} POD=${cont_pod} CONTAINER=${cont_name} OS_LOG_FIELDS=${os_log_fields} TMP_DIR=${TMPDATADIR}/${cont_tmp_dir} K8S_NS=${K8S_NS} ${OS_LOG_PARSER}"
            if [[ "${MODE}" == "bg" ]]; then
                log "${cont_pod} ${cont_name} ${cont_id}"
                ${OS_LOG_PARSER} &
            else
                ${OS_LOG_PARSER}
            fi
            unset TMP_METRICS
            unset CONTID
            unset CONTAINER
            unset POD
            unset OS_LOG_FIELDS
            unset HOST
        fi
        # os_log_fields=$(get_role "compute" "${cont_name}")
        # if [ "$?" -eq 0 ]; then
        #     mk_dir "${cont_tmp_dir}"
        #     log "ROLE=compute HOST=${cont_host} POD=${cont_pod} CONTAINER=${cont_name} OS_LOG_FIELDS=${os_log_fields} TMP_DIR=${TMPDATADIR}/${cont_tmp_dir} K8S_NS=${K8S_NS} ${OS_LOG_PARSER}"
        # fi
    done
    gatherchildren
    if [ "$(ls ${TMPDATADIR}/results/ | wc -l)" -gt 0 ]; then
        cat ${TMPDATADIR}/results/*.tmp
        log "Resulting lines $(cat ${TMPDATADIR}/results/*.tmp | wc -l)"
        rm -rf ${TMPDATADIR}/results/*
    fi
    glock "unlock"
}
# Main logic
mk_dir
mk_dir "results"
get_k8s_containers
@@ -0,0 +1,6 @@
#!/bin/bash
export LANG=C
for i in $(ps --no-headers -A -o pid); do
    pidstat -p $i | tail -n 1 | grep -v PID | awk '{print "system_per_process_cpu_usage,process="$9" user="$4",system="$5}'
done
@@ -0,0 +1,12 @@
#!/bin/bash
#
WORKDIR="$(cd "$(dirname ${0})" && pwd)"
SCRIPT="${WORKDIR}/$(basename ${0})"
MYSQLUSER="nova"
MYSQPASSWD="password"
MYSQLHOST="mariadb.ccp"
avgdata=$(mysql -u${MYSQLUSER} -p${MYSQPASSWD} -h ${MYSQLHOST} -D nova --skip-column-names --batch -e "select diff from (select avg(unix_timestamp(launched_at) - unix_timestamp(created_at)) as diff from instances where vm_state != 'error' and launched_at >= subtime(now(),'30')) t1 where diff IS NOT NULL;" 2>/dev/null | sed 's/\t/,/g';)
if [ ! -z "${avgdata}" ]; then
    echo "vm_spawn_avg_time timediffinsec=${avgdata}"
fi
@@ -0,0 +1,116 @@
[global_tags]
  metrics_source="system_openstack"
[agent]
  interval = "10s"
  round_interval = true
  metric_batch_size = 1000
  metric_buffer_limit = 10000
  collection_jitter = "0s"
  flush_interval = "15s"
  flush_jitter = "5s"
  precision = ""
  debug = false
  quiet = false
  hostname = ""
  omit_hostname = false
[[outputs.prometheus_client]]
  listen = ":9126"
[[inputs.cpu]]
  percpu = true
  totalcpu = true
  fielddrop = ["time_*"]
[[inputs.disk]]
  ignore_fs = ["tmpfs", "devtmpfs"]
[[inputs.diskio]]
[[inputs.kernel]]
[[inputs.mem]]
[[inputs.processes]]
[[inputs.swap]]
[[inputs.system]]
[[inputs.kernel_vmstat]]
[[inputs.net]]
[[inputs.netstat]]
[[inputs.exec]]
  interval = "15s"
  commands = [
    "/opt/telegraf/bin/vmtime.sh",
  ]
  timeout = "30s"
  data_format = "influx"
[[inputs.exec]]
  interval = "30s"
  commands = [
    "/opt/telegraf/bin/osapitime.sh",
  ]
  timeout = "60s"
  data_format = "influx"
[[inputs.exec]]
  interval = "15s"
  commands = [
    "/opt/telegraf/bin/etcd_get_metrics.sh"
  ]
  timeout = "30s"
  data_format = "influx"
[[inputs.exec]]
  interval = "15s"
  commands = [
    "/opt/telegraf/bin/k8s_get_metrics.sh"
  ]
  timeout = "30s"
  data_format = "influx"
[[inputs.openstack]]
  interval = '40s'
  identity_endpoint = "http://keystone.ccp.svc.cluster.local:5000/v3"
  domain = "default"
  project = "admin"
  username = "admin"
  password = "password"
[[inputs.exec]]
  interval = "15s"
  commands = [
    "/opt/telegraf/bin/iostat_per_device.sh"
  ]
  timeout = "30s"
  data_format = "influx"
[[inputs.exec]]
  interval = "15s"
  commands = [
    "/opt/telegraf/bin/per_process_cpu_usage.sh"
  ]
  timeout = "30s"
  data_format = "influx"
[[inputs.exec]]
  interval = "15s"
  commands = [
    "/opt/telegraf/bin/entropy.sh"
  ]
  timeout = "30s"
  data_format = "influx"
[[inputs.exec]]
  interval = "60s"
  commands = [
    "/opt/telegraf/bin/numa_stat_per_pid.sh"
  ]
  timeout = "60s"
  data_format = "influx"
[[inputs.exec]]
  interval = "15s"
  commands = [
    "/opt/telegraf/bin/memory_bandwidth.sh"
  ]
  timeout = "30s"
  data_format = "influx"
[[inputs.exec]]
  interval = "15s"
  commands = [
    "/opt/telegraf/bin/list_openstack_processes.sh"
  ]
  timeout = "30s"
  data_format = "influx"
[[inputs.exec]]
  interval = "15s"
  commands = [
    "/opt/telegraf/bin/network_tcp_queue.sh"
  ]
  timeout = "30s"
  data_format = "influx"
@@ -0,0 +1,81 @@
[global_tags]
  metrics_source="system"
[agent]
  interval = "10s"
  round_interval = true
  metric_batch_size = 1000
  metric_buffer_limit = 10000
  collection_jitter = "0s"
  flush_interval = "15s"
  flush_jitter = "5s"
  precision = ""
  debug = false
  quiet = false
  hostname = ""
  omit_hostname = false
[[outputs.prometheus_client]]
  listen = ":9126"
[[inputs.cpu]]
  percpu = true
  totalcpu = true
  fielddrop = ["time_*"]
[[inputs.disk]]
  ignore_fs = ["tmpfs", "devtmpfs"]
[[inputs.diskio]]
[[inputs.kernel]]
[[inputs.mem]]
[[inputs.processes]]
[[inputs.swap]]
[[inputs.system]]
[[inputs.kernel_vmstat]]
[[inputs.net]]
[[inputs.netstat]]
[[inputs.exec]]
  interval = "15s"
  commands = [
    "/opt/telegraf/bin/iostat_per_device.sh"
  ]
  timeout = "30s"
  data_format = "influx"
[[inputs.exec]]
  interval = "15s"
  commands = [
    "/opt/telegraf/bin/per_process_cpu_usage.sh"
  ]
  timeout = "30s"
  data_format = "influx"
[[inputs.exec]]
  interval = "15s"
  commands = [
    "/opt/telegraf/bin/entropy.sh"
  ]
  timeout = "30s"
  data_format = "influx"
[[inputs.exec]]
  interval = "60s"
  commands = [
    "/opt/telegraf/bin/numa_stat_per_pid.sh"
  ]
  timeout = "60s"
  data_format = "influx"
[[inputs.exec]]
  interval = "15s"
  commands = [
    "/opt/telegraf/bin/memory_bandwidth.sh"
  ]
  timeout = "30s"
  data_format = "influx"
[[inputs.exec]]
  interval = "15s"
  commands = [
    "/opt/telegraf/bin/list_openstack_processes.sh"
  ]
  timeout = "30s"
  data_format = "influx"
[[inputs.exec]]
  interval = "15s"
  commands = [
    "/opt/telegraf/bin/network_tcp_queue.sh"
  ]
  timeout = "30s"
  data_format = "influx"
948
doc/source/methodologies/monitoring/index.rst
Normal file
@@ -0,0 +1,948 @@

.. _Methodology_for_Containerized_Openstack_Monitoring:

**************************************************
Methodology for Containerized Openstack Monitoring
**************************************************

:Abstract:

  This document describes one possible Containerized Openstack monitoring
  solution that provides a scalable and comprehensive architecture and obtains
  all crucial performance metrics on each layer of the stack.


Containerized Openstack Monitoring Architecture
===============================================

This part of the documentation describes the performance metrics required at
each distinct Containerized Openstack layer.

Containerized Openstack comprises three layers where the monitoring system
should be able to query all necessary counters:

- OS layer
- Kubernetes layer
- Openstack layer

Monitoring instruments are logically divided into two groups:

- Monitoring Server Side
- Node Client Side

Operating System Layer
----------------------

We used Ubuntu Xenial on top of bare-metal servers for both the server and the node side.

Baremetal hardware description
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

We deployed everything on a 200-server environment with the following hardware characteristics:

.. table::

   +-------+----------------+------------------------+
   |server |vendor,model    |HP,DL380 Gen9           |
   +-------+----------------+------------------------+
   |CPU    |vendor,model    |Intel,E5-2680 v3        |
   |       +----------------+------------------------+
   |       |processor_count |2                       |
   |       +----------------+------------------------+
   |       |core_count      |12                      |
   |       +----------------+------------------------+
   |       |frequency_MHz   |2500                    |
   +-------+----------------+------------------------+
   |RAM    |vendor,model    |HP,752369-081           |
   |       +----------------+------------------------+
   |       |amount_MB       |262144                  |
   +-------+----------------+------------------------+
   |NETWORK|interface_name  |p1p1                    |
   |       +----------------+------------------------+
   |       |vendor,model    |Intel,X710 Dual Port    |
   |       +----------------+------------------------+
   |       |bandwidth       |10G                     |
   +-------+----------------+------------------------+
   |STORAGE|dev_name        |/dev/sda                |
   |       +----------------+------------------------+
   |       |vendor,model    | | raid10 - HP P840     |
   |       |                | | 12 disks EH0600JEDHE |
   |       +----------------+------------------------+
   |       |SSD/HDD         |HDD                     |
   |       +----------------+------------------------+
   |       |size            |3,6TB                   |
   +-------+----------------+------------------------+

Operating system configuration
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Baremetal nodes were provisioned with Cobbler using our in-house preseed scripts.
The OS versions we used:

.. table:: Operating system versions

   +--------------------+-----------------------------------------+
   |Software            |Version                                  |
   +--------------------+-----------------------------------------+
   |Ubuntu              |Ubuntu 16.04.1 LTS                       |
   +--------------------+-----------------------------------------+
   |Kernel              |4.4.0-47-generic                         |
   +--------------------+-----------------------------------------+

You can find the /etc folder contents from one of the typical systems we used:

:download:`etc_tarball <configs/node1.tar.gz>`

Required system metrics
^^^^^^^^^^^^^^^^^^^^^^^

At this layer we must monitor the following list of processes:

.. table::

   +------------------------+-----------------------------------------+
   |List of processes       |Mariadb                                  |
   |                        +-----------------------------------------+
   |                        |Rabbitmq                                 |
   |                        +-----------------------------------------+
   |                        |Keystone                                 |
   |                        +-----------------------------------------+
   |                        |Glance                                   |
   |                        +-----------------------------------------+
   |                        |Cinder                                   |
   |                        +-----------------------------------------+
   |                        |Nova                                     |
   |                        +-----------------------------------------+
   |                        |Neutron                                  |
   |                        +-----------------------------------------+
   |                        |Openvswitch                              |
   |                        +-----------------------------------------+
   |                        |Kubernetes                               |
   +------------------------+-----------------------------------------+

And the following list of metrics:

.. table::

   +------------------------+-----------------------------------------+
   |Node load average       |1min                                     |
   |                        +-----------------------------------------+
   |                        |5min                                     |
   |                        +-----------------------------------------+
   |                        |15min                                    |
   +------------------------+-----------------------------------------+
   |Global process stats    |Running                                  |
   |                        +-----------------------------------------+
   |                        |Stopped                                  |
   |                        +-----------------------------------------+
   |                        |Waiting                                  |
   +------------------------+-----------------------------------------+
   |Global CPU Usage        |Steal                                    |
   |                        +-----------------------------------------+
   |                        |Wait                                     |
   |                        +-----------------------------------------+
   |                        |User                                     |
   |                        +-----------------------------------------+
   |                        |System                                   |
   |                        +-----------------------------------------+
   |                        |Interrupt                                |
   |                        +-----------------------------------------+
   |                        |Nice                                     |
   |                        +-----------------------------------------+
   |                        |Idle                                     |
   +------------------------+-----------------------------------------+
   |Per CPU Usage           |User                                     |
   |                        +-----------------------------------------+
   |                        |System                                   |
   +------------------------+-----------------------------------------+
   |Global memory usage     |bandwidth                                |
   |                        +-----------------------------------------+
   |                        |Cached                                   |
   |                        +-----------------------------------------+
   |                        |Buffered                                 |
   |                        +-----------------------------------------+
   |                        |Free                                     |
   |                        +-----------------------------------------+
   |                        |Used                                     |
   |                        +-----------------------------------------+
   |                        |Total                                    |
   +------------------------+-----------------------------------------+
   |Numa monitoring         |Numa_hit                                 |
   |For each node           +-----------------------------------------+
   |                        |Numa_miss                                |
   |                        +-----------------------------------------+
   |                        |Numa_foreign                             |
   |                        +-----------------------------------------+
   |                        |Local_node                               |
   |                        +-----------------------------------------+
   |                        |Other_node                               |
   +------------------------+-----------------------------------------+
   |Numa monitoring         |Huge                                     |
   |For each pid            +-----------------------------------------+
   |                        |Heap                                     |
   |                        +-----------------------------------------+
   |                        |Stack                                    |
   |                        +-----------------------------------------+
   |                        |Private                                  |
   +------------------------+-----------------------------------------+
   |Global IOSTAT \+        |Merge reads /s                           |
   |Per device IOSTAT       +-----------------------------------------+
   |                        |Merge write /s                           |
   |                        +-----------------------------------------+
   |                        |read/s                                   |
   |                        +-----------------------------------------+
   |                        |write/s                                  |
   |                        +-----------------------------------------+
   |                        |Read transfer                            |
   |                        +-----------------------------------------+
   |                        |Write transfer                           |
   |                        +-----------------------------------------+
   |                        |Read latency                             |
   |                        +-----------------------------------------+
   |                        |Write latency                            |
   |                        +-----------------------------------------+
   |                        |Queue size                               |
   |                        +-----------------------------------------+
   |                        |Await                                    |
   +------------------------+-----------------------------------------+
   |Network per interface   |Octets /s (in, out)                      |
   |                        +-----------------------------------------+
   |                        |Packet /s (in, out)                      |
   |                        +-----------------------------------------+
   |                        |Dropped /s                               |
   +------------------------+-----------------------------------------+
   |Other system metrics    |Entropy                                  |
   |                        +-----------------------------------------+
   |                        |DF per device                            |
   +------------------------+-----------------------------------------+

Kubernetes Layer
----------------

`Kargo`_ from `Fuel-CCP-installer`_ was our main tool to deploy K8S
on top of the provisioned systems (monitored nodes).

Kargo sets up Kubernetes in the following way:

- masters: Calico, Kubernetes API services
- nodes: Calico, Kubernetes minion services
- etcd: etcd service

Kargo deployment parameters
^^^^^^^^^^^^^^^^^^^^^^^^^^^

You can find the Kargo deployment script in the `Kargo deployment script`_ section.

.. code:: bash

    docker_options: "--insecure-registry 172.20.8.35:5000 -D"
    upstream_dns_servers: [172.20.8.34, 8.8.4.4]
    nameservers: [172.20.8.34, 8.8.4.4]
    kube_service_addresses: 10.224.0.0/12
    kube_pods_subnet: 10.240.0.0/12
    kube_network_node_prefix: 22
    kube_apiserver_insecure_bind_address: "0.0.0.0"
    dns_replicas: 3
    dns_cpu_limit: "100m"
    dns_memory_limit: "512Mi"
    dns_cpu_requests: "70m"
    dns_memory_requests: "70Mi"
    deploy_netchecker: false

.. table::

   +----------------------+-----------------------------------------+
   |Software              |Version                                  |
   +----------------------+-----------------------------------------+
   |`Fuel-CCP-Installer`_ |6fd81252cb2d2c804f388337aa67d4403700f094 |
   +----------------------+-----------------------------------------+
   |`Kargo`_              |2c23027794d7851ee31363c5b6594180741ee923 |
   +----------------------+-----------------------------------------+

Required K8S metrics
^^^^^^^^^^^^^^^^^^^^

Here we should get K8S health metrics and ETCD performance metrics:

.. table::

   +------------------------+-----------------------------------------+
   |ETCD performance metrics|members count / states                   |
   |                        +-----------------------------------------+
   |                        |numbers of keys in a cluster             |
   |                        +-----------------------------------------+
   |                        |Size of data set                         |
   |                        +-----------------------------------------+
   |                        |Avg. latency from leader to followers    |
   |                        +-----------------------------------------+
   |                        |Bandwidth rate, send/receive             |
   |                        +-----------------------------------------+
   |                        |Create store success/fail                |
   |                        +-----------------------------------------+
   |                        |Get success/fail                         |
   |                        +-----------------------------------------+
   |                        |Set success/fail                         |
   |                        +-----------------------------------------+
   |                        |Package rate, send/receive               |
   |                        +-----------------------------------------+
   |                        |Expire count                             |
   |                        +-----------------------------------------+
   |                        |Update success/fail                      |
   |                        +-----------------------------------------+
   |                        |Compare-and-swap success/fail            |
   |                        +-----------------------------------------+
   |                        |Watchers                                 |
   |                        +-----------------------------------------+
   |                        |Delete success/fail                      |
   |                        +-----------------------------------------+
   |                        |Compare-and-delete success/fail          |
   |                        +-----------------------------------------+
   |                        |Append req, send/receive                 |
   +------------------------+-----------------------------------------+
   |K8S health metrics      |Number of nodes in each state            |
   |                        +-----------------------------------------+
   |                        |Total number of namespaces               |
   |                        +-----------------------------------------+
   |                        |Total number of PODs per cluster,node,ns |
   |                        +-----------------------------------------+
   |                        |Total number of services                 |
   |                        +-----------------------------------------+
   |                        |Endpoints in each service                |
   |                        +-----------------------------------------+
   |                        |Number of API service instances          |
   |                        +-----------------------------------------+
   |                        |Number of controller instances           |
   |                        +-----------------------------------------+
   |                        |Number of scheduler instances            |
   |                        +-----------------------------------------+
   |                        |Cluster resources, scheduler view        |
   +------------------------+-----------------------------------------+
   |K8S API log analysis    |Number of responses (per each HTTP code) |
   |                        +-----------------------------------------+
   |                        |Response Time                            |
   +------------------------+-----------------------------------------+

For the last two metrics we should utilize a log collector to store and parse all
log records within K8S environments.
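
Many of the ETCD counters above come straight from the etcd v2 stats API and can
be checked by hand; the endpoints below are the same ones polled by the
``etcd_get_metrics.sh`` script shown in the
`Telegraf deployment and configuration files`_ section:

.. code:: bash

    # spot-check of the etcd v2 stats API on a monitored etcd node
    curl -s -k https://127.0.0.1:2379/v2/stats/leader | jq .
    curl -s -k https://127.0.0.1:2379/v2/members | jq -c '.members | length'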

Openstack Layer
---------------

CCP stands for "Containerized Control Plane". CCP aims to build, run and manage
production-ready OpenStack containers on top of a Kubernetes cluster.

.. table:: Versions of CCP-related software

   +--------------------+-----------------------------------------+
   |Software            |Version                                  |
   +--------------------+-----------------------------------------+
   |`Fuel-CCP`_         |8570d0e0e512bd16f8449f0a10b1e3900fd09b2d |
   +--------------------+-----------------------------------------+

CCP configuration
^^^^^^^^^^^^^^^^^

CCP was deployed on top of a 200-node K8S cluster in the following configuration:

.. code-block:: yaml

    node[1-3]: Kubernetes
    node([4-6])$:       # 4-6
      roles:
        - controller
        - openvswitch
    node[7-9]$:         # 7-9
      roles:
        - rabbitmq
    node10$:            # 10
      roles:
        - galera
    node11$:            # 11
      roles:
        - heat
    node(1[2-9])$:      # 12-19
      roles:
        - compute
        - openvswitch
    node[2-9][0-9]$:    # 20-99
      roles:
        - compute
        - openvswitch
    node(1[0-9][0-9])$: # 100-199
      roles:
        - compute
        - openvswitch
    node200$:
      roles:
        - backup

CCP Openstack services list ( `versions.yaml`_ ):

.. code-block:: yaml

    openstack/cinder:
      git_ref: stable/newton
      git_url: https://github.com/openstack/cinder.git
    openstack/glance:
      git_ref: stable/newton
      git_url: https://github.com/openstack/glance.git
    openstack/heat:
      git_ref: stable/newton
      git_url: https://github.com/openstack/heat.git
    openstack/horizon:
      git_ref: stable/newton
      git_url: https://github.com/openstack/horizon.git
    openstack/keystone:
      git_ref: stable/newton
      git_url: https://github.com/openstack/keystone.git
    openstack/neutron:
      git_ref: stable/newton
      git_url: https://github.com/openstack/neutron.git
    openstack/nova:
      git_ref: stable/newton
      git_url: https://github.com/openstack/nova.git
    openstack/requirements:
      git_ref: stable/newton
      git_url: https://git.openstack.org/openstack/requirements.git
    openstack/sahara-dashboard:
      git_ref: stable/newton
      git_url: https://git.openstack.org/openstack/sahara-dashboard.git

`K8S Ingress Resources`_ rules were enabled during CCP deployment to expose Openstack service
endpoints to the external routable network.
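
A quick way to verify the exposed endpoints after deployment (the ``ccp``
namespace is the one used throughout this environment):

.. code:: bash

    # list the Ingress rules created for the Openstack service endpoints
    kubectl get ingress -n ccp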

See the CCP deployment script and configuration files in the
`CCP deployment and configuration files`_ section.

Required Openstack-related metrics
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

At this layer we should get Openstack environment metrics as well as
API and resource utilization metrics.

.. table::

   +------------------------+-----------------------------------------+
   |Openstack metrics       |Total number of controller nodes         |
   |                        +-----------------------------------------+
   |                        |Total number of services                 |
   |                        +-----------------------------------------+
   |                        |Total number of compute nodes            |
   |                        +-----------------------------------------+
   |                        |Total number of nodes                    |
   |                        +-----------------------------------------+
   |                        |Total number of VMs                      |
   |                        +-----------------------------------------+
   |                        |Number of VMs per tenant, per node       |
   |                        +-----------------------------------------+
   |                        |Resource utilization per project,service |
   |                        +-----------------------------------------+
   |                        |Total number of tenants                  |
   |                        +-----------------------------------------+
   |                        |API request time                         |
   |                        +-----------------------------------------+
   |                        |Mean time to spawn VM                    |
   +------------------------+-----------------------------------------+

Implementation
==============

This part of the documentation describes the Monitoring System implementation.
Here is the software we chose to fulfil all the required tasks:

.. table::

   +-----------------------------------------+-----------------------------------------+
   |Monitoring Node Server Side              |Monitored Node Client Side               |
   +--------------------+--------------------+--------------------+--------------------+
   |Metrics server      |Log storage         |Metrics agent       |Log collector       |
   +--------------------+--------------------+--------------------+--------------------+
   | `Prometheus`_ \+   | `ElasticSearch`_   |`Telegraf`_         | `Heka`_            |
   | `Grafana`_         | \+ `Kibana`_       |                    |                    |
   +--------------------+--------------------+--------------------+--------------------+

Server Side Software
--------------------

Prometheus
^^^^^^^^^^

.. table::

   +--------------------+-----------------------------------------+
   |Software            |Version                                  |
   +--------------------+-----------------------------------------+
   |`Prometheus GitHub`_|7e369b9318a4d5d97a004586a99f10fa51a46b26 |
   +--------------------+-----------------------------------------+

Due to the high load rate we hit Prometheus performance issues at metric counts
of up to 15 million, so we split the Prometheus setup into two standalone nodes.
The first node polls API metrics from K8S-related services that are natively
available at the `/metrics` URI and exposed by the K8S API and ETCD API by default.
The second node stores all other metrics that are collected and calculated locally on environment
servers via Telegraf.
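
Both scrape sources can be spot-checked by hand. The addresses below come from
the Prometheus configuration files shown in the
`Prometheus deployment and configuration files`_ section; the Telegraf node
hostname is a placeholder:

.. code:: bash

    # K8S API metrics, polled by the first Prometheus node
    curl -sk -u kube:changeme https://172.20.8.61:443/metrics | head
    # Telegraf prometheus_client output on a monitored node (hostname is
    # illustrative), polled by the second Prometheus node
    curl -s http://node1:9126/metrics | head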

Prometheus node deployment scripts and configuration files can be found in the
`Prometheus deployment and configuration files`_ section.

Grafana
^^^^^^^

.. table::

   +--------------------+-----------------------------------------+
   |Software            |Version                                  |
   +--------------------+-----------------------------------------+
   |`Grafana`_          |v4.0.1                                   |
   +--------------------+-----------------------------------------+

Grafana was used as the metrics visualizer, with a separate individual dashboard
built for each group of metrics:

- System nodes metrics
- Kubernetes metrics
- ETCD metrics
- Openstack metrics

You can find their settings in `Grafana dashboards configuration`_.

Grafana server deployment script:

.. code-block:: bash

    #!/bin/bash
    ansible-playbook -i ./hosts ./deploy-graf-prom.yaml --tags "grafana"

It uses the same yaml configuration file `deploy-graf-prom.yaml`_ from the `Prometheus deployment and configuration files`_ section.
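
By analogy, the Prometheus nodes can be rolled out from the same playbook; the
tag name below is an assumption mirroring the "grafana" one:

.. code:: bash

    ansible-playbook -i ./hosts ./deploy-graf-prom.yaml --tags "prometheus"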

ElasticSearch
^^^^^^^^^^^^^

.. table::

   +--------------------+-----------------------------------------+
   |Software            |Version                                  |
   +--------------------+-----------------------------------------+
   |`ElasticSearch`_    |2.4.2                                    |
   +--------------------+-----------------------------------------+

ElasticSearch is a well-known, proven log storage, and we used it as a standalone
node for collecting Kubernetes API logs and all other logs from containers across the environment.
For adequate performance on the 200-node lab we increased `ES_HEAP_SIZE` from the default 1G to 10G
in the /etc/default/elasticsearch configuration file.
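
A minimal sketch of that change, assuming the Debian packaging layout mentioned
above:

.. code:: bash

    # raise the ElasticSearch heap from the default 1G to 10G and restart
    sed -i 's/^#\?ES_HEAP_SIZE=.*/ES_HEAP_SIZE=10g/' /etc/default/elasticsearch
    service elasticsearch restart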

ElasticSearch and the Kibana dashboard were installed with the
`deploy_elasticsearch_kibana.sh`_ deployment script.

Kibana
^^^^^^

.. table::

   +--------------------+-----------------------------------------+
   |Software            |Version                                  |
   +--------------------+-----------------------------------------+
   |`Kibana`_           |4.5.4                                    |
   +--------------------+-----------------------------------------+

We used Kibana as the main visualization tool for ElasticSearch, building charts
based on K8S API log analysis. Kibana was installed on a single separate node
with a single dashboard representing the K8S API response time graph.

Dashboard settings:

:download:`Kibana_dashboard.json <configs/dashboards/Kibana_dashboard.json>`

Client side Software
--------------------

Telegraf
^^^^^^^^

.. table::

   +--------------------+-----------------------------------------+
   |Software            |Version                                  |
   +--------------------+-----------------------------------------+
   |`Telegraf`_         |v1.0.0-beta2-235-gbc14ac5                |
   |                    |git: openstack_stats                     |
   |                    |bc14ac5b9475a59504b463ad8f82ed810feed3ec |
   +--------------------+-----------------------------------------+

Telegraf was chosen as the client-side metrics agent. Thanks to its plugin-driven
design, it can poll and compute data from a variety of different sources and
expose the calculated metrics in Prometheus format. We used a forked version of
Telegraf with custom patches to be able to utilize a custom Openstack input plugin:

- `GitHub Telegraf Fork`_
- `Go SDK for OpenStack`_

The following automation scripts and configuration files were used to start the Telegraf agent
across environment nodes.
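
For reference, the launch itself boils down to pointing the agent at its
configuration file; the binary and config paths below are assumptions that
mirror the ``/opt/telegraf`` layout used by the helper scripts:

.. code:: bash

    # illustrative launch command; the actual rollout is done by the
    # deployment scripts referenced below
    /opt/telegraf/bin/telegraf -config /opt/telegraf/etc/telegraf.conf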
|
||||||
|
|
||||||
|
`Telegraf deployment and configuration files`_
|
||||||
|
|
||||||
|
Below you can see which plugins were used to obtain metrics.
|
||||||
|
|
||||||
|
Standart Plugins
|
||||||
|
""""""""""""""""
|
||||||
|
|
||||||
|
.. code:: bash
|
||||||
|
|
||||||
|
inputs.cpu CPU
|
||||||
|
inputs.disk
|
||||||
|
inputs.diskio
|
||||||
|
inputs.kernel
|
||||||
|
inputs.mem
|
||||||
|
inputs.processes
|
||||||
|
inputs.swap
|
||||||
|
inputs.system
|
||||||
|
inputs.kernel_vmstat
|
||||||
|
inputs.net
|
||||||
|
inputs.netstat
|
||||||
|
inputs.exec
|
||||||
|
|
||||||
|
Openstack input plugin
|
||||||
|
""""""""""""""""""""""
|
||||||
|
`inputs.openstack` custom plugin was used to gather the most of required Openstack-related metrics.
|
||||||
|
|
||||||
|
settings:
|
||||||
|
|
||||||
|
.. code:: bash
|
||||||
|
|
||||||
|
interval = '40s'
|
||||||
|
identity_endpoint = "http://keystone.ccp.svc.cluster.local:5000/v3"
|
||||||
|
domain = "default"
|
||||||
|
project = "admin"
|
||||||
|
username = "admin"
|
||||||
|
password = "password"

`System.exec` plugin
""""""""""""""""""""

The `system.exec` plugin was used to trigger the scripts that poll
and calculate all non-standard metrics.

Common settings:

.. code:: bash

    interval = "15s"
    timeout = "30s"
    data_format = "influx"

Commands:

.. code:: bash

    "/opt/telegraf/bin/list_openstack_processes.sh"
    "/opt/telegraf/bin/per_process_cpu_usage.sh"
    "/opt/telegraf/bin/numa_stat_per_pid.sh"
    "/opt/telegraf/bin/iostat_per_device.sh"
    "/opt/telegraf/bin/memory_bandwidth.sh"
    "/opt/telegraf/bin/network_tcp_queue.sh"
    "/opt/telegraf/bin/etcd_get_metrics.sh"
    "/opt/telegraf/bin/k8s_get_metrics.sh"
    "/opt/telegraf/bin/vmtime.sh"
    "/opt/telegraf/bin/osapitime.sh"

You can see the full Telegraf configuration file and its custom input scripts in the
`Telegraf deployment and configuration files`_ section.

Heka
^^^^

.. table::

   +--------------------+-----------------------------------------+
   |Software            |Version                                  |
   +--------------------+-----------------------------------------+
   |`Heka`_             |0.10.0                                   |
   +--------------------+-----------------------------------------+

We chose Heka as the log collecting agent for its wide variety of inputs
(including the ability to feed data from the Docker socket), its filters (custom sandbox filters
written in Lua) and its ability to encode data for ElasticSearch.

With the Heka agent started across the environment servers we were able to ship container logs to
the ElasticSearch server. With a custom Lua filter we extracted the K8S API data and converted it
into a format suitable for visualizing API timing counters (average response time).
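
A minimal sketch of that pipeline in Heka's TOML configuration, assuming the standard Heka 0.10
plugin names (the ElasticSearch address and filter filename are illustrative; the real
configuration is in the `Heka deployment and configuration`_ section):

.. code:: bash

    # hekad.toml (fragment): read container logs from the Docker socket,
    # run the custom Lua filter, ship everything to ElasticSearch
    [DockerLogInput]
    endpoint = "unix:///var/run/docker.sock"

    [kubeapi_filter]
    type = "SandboxFilter"
    filename = "lua_filters/kubeapi_to_int.lua"
    message_matcher = "Type == 'DockerLog'"

    [ESJsonEncoder]
    index = "%{Type}-%{%Y.%m.%d}"

    [ElasticSearchOutput]
    server = "http://172.20.9.1:9200"
    message_matcher = "Type == 'DockerLog'"
    encoder = "ESJsonEncoder"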

Heka deployment scripts and the configuration file with the custom Lua filter are in the
`Heka deployment and configuration`_ section.

Applications
============

Kargo deployment script
-----------------------

deploy_k8s_using_kargo.sh
^^^^^^^^^^^^^^^^^^^^^^^^^

.. literalinclude:: configs/deploy_k8s_using_kargo.sh
   :language: bash

CCP deployment and configuration files
--------------------------------------

deploy-ccp.sh
^^^^^^^^^^^^^

.. literalinclude:: configs/ccp/deploy-ccp.sh
   :language: bash

ccp.yaml
^^^^^^^^

.. literalinclude:: configs/ccp/ccp.yaml
   :language: yaml

configs.yaml
^^^^^^^^^^^^

.. literalinclude:: configs/ccp/configs.yaml
   :language: yaml

topology.yaml
^^^^^^^^^^^^^

.. literalinclude:: configs/ccp/topology.yaml
   :language: yaml

repos.yaml
^^^^^^^^^^

.. literalinclude:: configs/ccp/repos.yaml
   :language: yaml

versions.yaml
^^^^^^^^^^^^^

.. literalinclude:: configs/ccp/versions.yaml
   :language: yaml

Prometheus deployment and configuration files
---------------------------------------------

Deployment scripts
^^^^^^^^^^^^^^^^^^

deploy_prometheus.sh
""""""""""""""""""""

.. literalinclude:: configs/prometheus-grafana-telegraf/deploy_prometheus.sh
   :language: bash

deploy-graf-prom.yaml
"""""""""""""""""""""

.. literalinclude:: configs/prometheus-grafana-telegraf/deploy-graf-prom.yaml
   :language: yaml

docker_prometheus.yaml
""""""""""""""""""""""

.. literalinclude:: configs/prometheus-grafana-telegraf/docker_prometheus.yaml
   :language: yaml

deploy_etcd_collect.sh
""""""""""""""""""""""

.. literalinclude:: configs/prometheus-grafana-telegraf/deploy_etcd_collect.sh
   :language: bash

Configuration files
^^^^^^^^^^^^^^^^^^^

prometheus-kuber.yml.j2
"""""""""""""""""""""""

.. literalinclude:: configs/prometheus-grafana-telegraf/prometheus/prometheus-kuber.yml.j2
   :language: bash

prometheus-system.yml.j2
""""""""""""""""""""""""

.. literalinclude:: configs/prometheus-grafana-telegraf/prometheus/prometheus-system.yml.j2
   :language: bash

targets.yml.j2
""""""""""""""

.. literalinclude:: configs/prometheus-grafana-telegraf/prometheus/targets.yml.j2
   :language: bash

Grafana dashboards configuration
--------------------------------

:download:`Systems_nodes_statistics.json <configs/dashboards/Systems_nodes_statistics.json>`

:download:`Kubernetes_statistics.json <configs/dashboards/Kubernetes_statistics.json>`

:download:`ETCD.json <configs/dashboards/ETCD.json>`

:download:`OpenStack.json <configs/dashboards/OpenStack.json>`

ElasticSearch deployment script
-------------------------------

deploy_elasticsearch_kibana.sh
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

.. literalinclude:: configs/elasticsearch-heka/deploy_elasticsearch_kibana.sh
   :language: bash

Telegraf deployment and configuration files
-------------------------------------------

deploy_telegraf.sh
^^^^^^^^^^^^^^^^^^

.. literalinclude:: configs/prometheus-grafana-telegraf/deploy_telegraf.sh
   :language: bash

deploy-telegraf.yaml
^^^^^^^^^^^^^^^^^^^^

.. literalinclude:: configs/prometheus-grafana-telegraf/deploy-telegraf.yaml
   :language: yaml

Telegraf system
^^^^^^^^^^^^^^^

telegraf-sys.conf
"""""""""""""""""

.. literalinclude:: configs/prometheus-grafana-telegraf/telegraf/telegraf-sys.conf
   :language: bash

Telegraf openstack
^^^^^^^^^^^^^^^^^^

telegraf-openstack.conf.j2
""""""""""""""""""""""""""

.. literalinclude:: configs/prometheus-grafana-telegraf/telegraf/telegraf-openstack.conf.j2
   :language: bash

Telegraf inputs scripts
^^^^^^^^^^^^^^^^^^^^^^^

list_openstack_processes.sh
"""""""""""""""""""""""""""

.. literalinclude:: configs/prometheus-grafana-telegraf/telegraf/scripts/list_openstack_processes.sh
   :language: bash

per_process_cpu_usage.sh
""""""""""""""""""""""""

.. literalinclude:: configs/prometheus-grafana-telegraf/telegraf/scripts/per_process_cpu_usage.sh
   :language: bash

numa_stat_per_pid.sh
""""""""""""""""""""

.. literalinclude:: configs/prometheus-grafana-telegraf/telegraf/scripts/numa_stat_per_pid.sh
   :language: bash

iostat_per_device.sh
""""""""""""""""""""

.. literalinclude:: configs/prometheus-grafana-telegraf/telegraf/scripts/iostat_per_device.sh
   :language: bash

memory_bandwidth.sh
"""""""""""""""""""

.. literalinclude:: configs/prometheus-grafana-telegraf/telegraf/scripts/memory_bandwidth.sh
   :language: bash

network_tcp_queue.sh
""""""""""""""""""""

.. literalinclude:: configs/prometheus-grafana-telegraf/telegraf/scripts/network_tcp_queue.sh
   :language: bash

etcd_get_metrics.sh
"""""""""""""""""""

.. literalinclude:: configs/prometheus-grafana-telegraf/telegraf/scripts/etcd_get_metrics.sh
   :language: bash

k8s_get_metrics.sh
""""""""""""""""""

.. literalinclude:: configs/prometheus-grafana-telegraf/telegraf/scripts/k8s_get_metrics.sh
   :language: bash

vmtime.sh
"""""""""

.. literalinclude:: configs/prometheus-grafana-telegraf/telegraf/scripts/vmtime.sh
   :language: bash

osapitime.sh
""""""""""""

.. literalinclude:: configs/prometheus-grafana-telegraf/telegraf/scripts/osapitime.sh
   :language: bash

Heka deployment and configuration
---------------------------------

Deployment
^^^^^^^^^^

deploy_heka.sh
""""""""""""""

.. literalinclude:: configs/elasticsearch-heka/deploy_heka.sh
   :language: bash

deploy-heka.yaml
""""""""""""""""

.. literalinclude:: configs/elasticsearch-heka/deploy-heka.yaml
   :language: yaml

Configuration
^^^^^^^^^^^^^

00-hekad.toml.j2
""""""""""""""""

.. literalinclude:: configs/elasticsearch-heka/heka/00-hekad.toml.j2
   :language: bash

kubeapi_to_int.lua.j2
"""""""""""""""""""""

.. literalinclude:: configs/elasticsearch-heka/heka/kubeapi_to_int.lua.j2
   :language: bash

.. references:

.. _Fuel-CCP-Installer: https://github.com/openstack/fuel-ccp-installer
.. _Kargo: https://github.com/kubernetes-incubator/kargo.git
.. _Fuel-CCP: https://github.com/openstack/fuel-ccp
.. _Prometheus: https://prometheus.io/
.. _Prometheus GitHub: https://github.com/prometheus/prometheus
.. _Grafana: http://grafana.org/
.. _ElasticSearch: https://www.elastic.co/products/elasticsearch
.. _Kibana: https://www.elastic.co/products/kibana
.. _Telegraf: https://www.influxdata.com/time-series-platform/telegraf/
.. _GitHub Telegraf Fork: https://github.com/spjmurray/telegraf/tree/openstack_stats/plugins/inputs/openstack
.. _Go SDK for OpenStack: https://github.com/rackspace/gophercloud/
.. _Heka: https://hekad.readthedocs.io/en/v0.10.0/
.. _K8S Ingress Resources: http://kubernetes.io/docs/user-guide/ingress/