coolify/templates/compose/signoz.yaml
2026-01-11 17:51:10 +01:00

627 lines
26 KiB
YAML

# documentation: https://signoz.io/docs/introduction/
# slogan: An observability platform native to OpenTelemetry with logs, traces and metrics.
# tags: telemetry, server, applications, interface, logs, monitoring, traces, metrics
# logo: svgs/signoz.svg
# port: 8080
services:
init-clickhouse:
image: clickhouse/clickhouse-server:25.5.6-alpine
command:
- bash
- -c
- |
version="v0.0.1"
node_os=$$(uname -s | tr '[:upper:]' '[:lower:]')
node_arch=$$(uname -m | sed s/aarch64/arm64/ | sed s/x86_64/amd64/)
echo "Fetching histogram-binary for $${node_os}/$${node_arch}"
cd /tmp
wget -O histogram-quantile.tar.gz "https://github.com/SigNoz/signoz/releases/download/histogram-quantile%2F$${version}/histogram-quantile_$${node_os}_$${node_arch}.tar.gz"
tar -xvzf histogram-quantile.tar.gz
mkdir -p /var/lib/clickhouse/user_scripts/histogramQuantile
mv histogram-quantile /var/lib/clickhouse/user_scripts/histogramQuantile
restart: on-failure
exclude_from_hc: true
logging:
options:
max-size: 50m
max-file: "3"
zookeeper:
image: signoz/zookeeper:3.9.3
user: root
healthcheck:
test:
- CMD-SHELL
- curl -s -m 2 http://localhost:8080/commands/ruok | grep error | grep null
interval: 30s
timeout: 5s
retries: 3
logging:
options:
max-size: 50m
max-file: "3"
volumes:
- zookeeper:/bitnami/zookeeper
environment:
- ALLOW_ANONYMOUS_LOGIN=${ZOO_ALLOW_ANONYMOUS_LOGIN:-yes}
- ZOO_AUTOPURGE_INTERVAL=${ZOO_AUTOPURGE_INTERVAL:-1}
- ZOO_ENABLE_PROMETHEUS_METRICS=${ZOO_ENABLE_PROMETHEUS_METRICS:-yes}
- ZOO_PROMETHEUS_METRICS_PORT_NUMBER=${ZOO_PROMETHEUS_METRICS_PORT_NUMBER:-9141}
clickhouse:
# addding non LTS version due to this fix https://github.com/ClickHouse/ClickHouse/commit/32caf8716352f45c1b617274c7508c86b7d1afab
image: clickhouse/clickhouse-server:25.5.6-alpine
tty: true
depends_on:
init-clickhouse:
condition: service_completed_successfully
zookeeper:
condition: service_healthy
healthcheck:
test:
- CMD
- wget
- --spider
- -q
- 0.0.0.0:8123/ping
interval: 30s
timeout: 5s
retries: 3
ulimits:
nproc: 65535
nofile:
soft: 262144
hard: 262144
logging:
options:
max-size: 50m
max-file: "3"
environment:
- "CLICKHOUSE_SKIP_USER_SETUP=1"
volumes:
- type: volume
source: clickhouse
target: /var/lib/clickhouse/
- type: bind
source: ./clickhouse/custom-function.xml
target: /etc/clickhouse-server/custom-function.xml
content: |
<functions>
<function>
<type>executable</type>
<name>histogramQuantile</name>
<return_type>Float64</return_type>
<argument>
<type>Array(Float64)</type>
<name>buckets</name>
</argument>
<argument>
<type>Array(Float64)</type>
<name>counts</name>
</argument>
<argument>
<type>Float64</type>
<name>quantile</name>
</argument>
<format>CSV</format>
<command>./histogramQuantile</command>
</function>
</functions>
- type: bind
source: ./clickhouse/cluster.xml
target: /etc/clickhouse-server/config.d/cluster.xml
content: |
<?xml version="1.0"?>
<clickhouse>
<!-- ZooKeeper is used to store metadata about replicas, when using Replicated tables.
Optional. If you don't use replicated tables, you could omit that.
See https://clickhouse.com/docs/en/engines/table-engines/mergetree-family/replication/
-->
<zookeeper>
<node index="1">
<host>zookeeper</host>
<port>2181</port>
</node>
</zookeeper>
<!-- Configuration of clusters that could be used in Distributed tables.
https://clickhouse.com/docs/en/operations/table_engines/distributed/
-->
<remote_servers>
<cluster>
<!-- Inter-server per-cluster secret for Distributed queries
default: no secret (no authentication will be performed)
If set, then Distributed queries will be validated on shards, so at least:
- such cluster should exist on the shard,
- such cluster should have the same secret.
And also (and which is more important), the initial_user will
be used as current user for the query.
Right now the protocol is pretty simple and it only takes into account:
- cluster name
- query
Also it will be nice if the following will be implemented:
- source hostname (see interserver_http_host), but then it will depends from DNS,
it can use IP address instead, but then the you need to get correct on the initiator node.
- target hostname / ip address (same notes as for source hostname)
- time-based security tokens
-->
<!-- <secret></secret> -->
<shard>
<!-- Optional. Whether to write data to just one of the replicas. Default: false (write data to all replicas). -->
<!-- <internal_replication>false</internal_replication> -->
<!-- Optional. Shard weight when writing data. Default: 1. -->
<!-- <weight>1</weight> -->
<replica>
<host>clickhouse</host>
<port>9000</port>
<!-- Optional. Priority of the replica for load_balancing. Default: 1 (less value has more priority). -->
<!-- <priority>1</priority> -->
</replica>
</shard>
<!-- <shard>
<replica>
<host>clickhouse-2</host>
<port>9000</port>
</replica>
</shard>
<shard>
<replica>
<host>clickhouse-3</host>
<port>9000</port>
</replica>
</shard> -->
</cluster>
</remote_servers>
</clickhouse>
- type: bind
source: ./clickhouse/users.xml
target: /etc/clickhouse-server/users.xml
content: |
<?xml version="1.0"?>
<clickhouse>
<!-- See also the files in users.d directory where the settings can be overridden. -->
<!-- Profiles of settings. -->
<profiles>
<!-- Default settings. -->
<default>
<!-- Maximum memory usage for processing single query, in bytes. -->
<max_memory_usage>10000000000</max_memory_usage>
<!-- How to choose between replicas during distributed query processing.
random - choose random replica from set of replicas with minimum number of errors
nearest_hostname - from set of replicas with minimum number of errors, choose replica
with minimum number of different symbols between replica's hostname and local hostname
(Hamming distance).
in_order - first live replica is chosen in specified order.
first_or_random - if first replica one has higher number of errors, pick a random one from replicas with minimum number of errors.
-->
<load_balancing>random</load_balancing>
</default>
<!-- Profile that allows only read queries. -->
<readonly>
<readonly>1</readonly>
</readonly>
</profiles>
<!-- Users and ACL. -->
<users>
<!-- If user name was not specified, 'default' user is used. -->
<default>
<!-- See also the files in users.d directory where the password can be overridden.
Password could be specified in plaintext or in SHA256 (in hex format).
If you want to specify password in plaintext (not recommended), place it in 'password' element.
Example: <password>qwerty</password>.
Password could be empty.
If you want to specify SHA256, place it in 'password_sha256_hex' element.
Example: <password_sha256_hex>65e84be33532fb784c48129675f9eff3a682b27168c0ea744b2cf58ee02337c5</password_sha256_hex>
Restrictions of SHA256: impossibility to connect to ClickHouse using MySQL JS client (as of July 2019).
If you want to specify double SHA1, place it in 'password_double_sha1_hex' element.
Example: <password_double_sha1_hex>e395796d6546b1b65db9d665cd43f0e858dd4303</password_double_sha1_hex>
If you want to specify a previously defined LDAP server (see 'ldap_servers' in the main config) for authentication,
place its name in 'server' element inside 'ldap' element.
Example: <ldap><server>my_ldap_server</server></ldap>
If you want to authenticate the user via Kerberos (assuming Kerberos is enabled, see 'kerberos' in the main config),
place 'kerberos' element instead of 'password' (and similar) elements.
The name part of the canonical principal name of the initiator must match the user name for authentication to succeed.
You can also place 'realm' element inside 'kerberos' element to further restrict authentication to only those requests
whose initiator's realm matches it.
Example: <kerberos />
Example: <kerberos><realm>EXAMPLE.COM</realm></kerberos>
How to generate decent password:
Execute: PASSWORD=$(base64 < /dev/urandom | head -c8); echo "$PASSWORD"; echo -n "$PASSWORD" | sha256sum | tr -d '-'
In first line will be password and in second - corresponding SHA256.
How to generate double SHA1:
Execute: PASSWORD=$(base64 < /dev/urandom | head -c8); echo "$PASSWORD"; echo -n "$PASSWORD" | sha1sum | tr -d '-' | xxd -r -p | sha1sum | tr -d '-'
In first line will be password and in second - corresponding double SHA1.
-->
<password></password>
<!-- List of networks with open access.
To open access from everywhere, specify:
<ip>::/0</ip>
To open access only from localhost, specify:
<ip>::1</ip>
<ip>127.0.0.1</ip>
Each element of list has one of the following forms:
<ip> IP-address or network mask. Examples: 213.180.204.3 or 10.0.0.1/8 or 10.0.0.1/255.255.255.0
2a02:6b8::3 or 2a02:6b8::3/64 or 2a02:6b8::3/ffff:ffff:ffff:ffff::.
<host> Hostname. Example: server01.clickhouse.com.
To check access, DNS query is performed, and all received addresses compared to peer address.
<host_regexp> Regular expression for host names. Example, ^server\d\d-\d\d-\d\.clickhouse\.com$
To check access, DNS PTR query is performed for peer address and then regexp is applied.
Then, for result of PTR query, another DNS query is performed and all received addresses compared to peer address.
Strongly recommended that regexp is ends with $
All results of DNS requests are cached till server restart.
-->
<networks>
<ip>::/0</ip>
</networks>
<!-- Settings profile for user. -->
<profile>default</profile>
<!-- Quota for user. -->
<quota>default</quota>
<!-- User can create other users and grant rights to them. -->
<!-- <access_management>1</access_management> -->
</default>
</users>
<!-- Quotas. -->
<quotas>
<!-- Name of quota. -->
<default>
<!-- Limits for time interval. You could specify many intervals with different limits. -->
<interval>
<!-- Length of interval. -->
<duration>3600</duration>
<!-- No limits. Just calculate resource usage for time interval. -->
<queries>0</queries>
<errors>0</errors>
<result_rows>0</result_rows>
<read_rows>0</read_rows>
<execution_time>0</execution_time>
</interval>
</default>
</quotas>
</clickhouse>
- type: bind
source: ./clickhouse/config.xml
target: /etc/clickhouse-server/config.xml
content: |
<?xml version="1.0"?>
<clickhouse>
<max_connections>4096</max_connections>
<keep_alive_timeout>3</keep_alive_timeout>
<max_concurrent_queries>100</max_concurrent_queries>
<mark_cache_size>5368709120</mark_cache_size>
<mmap_cache_size>1000</mmap_cache_size>
<compiled_expression_cache_size>134217728</compiled_expression_cache_size>
<compiled_expression_cache_elements_size>10000</compiled_expression_cache_elements_size>
<custom_settings_prefixes></custom_settings_prefixes>
<dictionaries_config>*_dictionary.xml</dictionaries_config>
<user_defined_executable_functions_config>*function.xml</user_defined_executable_functions_config>
<user_scripts_path>/var/lib/clickhouse/user_scripts/</user_scripts_path>
<http_port>8123</http_port>
<tcp_port>9000</tcp_port>
<mysql_port>9004</mysql_port>
<postgresql_port>9005</postgresql_port>
<interserver_http_port>9009</interserver_http_port>
<logger>
<level>information</level>
<formatting>
<type>json</type>
</formatting>
</logger>
<macros>
<shard>01</shard>
<replica>example01-01-1</replica>
</macros>
<prometheus>
<endpoint>/metrics</endpoint>
<port>9363</port>
<metrics>true</metrics>
<events>true</events>
<asynchronous_metrics>true</asynchronous_metrics>
<status_info>true</status_info>
</prometheus>
<opentelemetry_span_log>
<engine>engine MergeTree
partition by toYYYYMM(finish_date)
order by (finish_date, finish_time_us, trace_id)</engine>
</opentelemetry_span_log>
<query_masking_rules>
<rule>
<name>hide encrypt/decrypt arguments</name>
<regexp>((?:aes_)?(?:encrypt|decrypt)(?:_mysql)?)\s*\(\s*(?:'(?:\\'|.)+'|.*?)\s*\)</regexp>
<replace>\1(???)</replace>
</rule>
</query_masking_rules>
<send_crash_reports>
<enabled>false</enabled>
<anonymize>false</anonymize>
<endpoint>https://6f33034cfe684dd7a3ab9875e57b1c8d@o388870.ingest.sentry.io/5226277</endpoint>
</send_crash_reports>
<merge_tree_metadata_cache>
<lru_cache_size>268435456</lru_cache_size>
<continue_if_corrupted>true</continue_if_corrupted>
</merge_tree_metadata_cache>
<user_directories>
<users_xml>
<!-- Path to configuration file with predefined users. -->
<path>users.xml</path>
</users_xml>
<local_directory>
<!-- Path to folder where users created by SQL commands are stored. -->
<path>/var/lib/clickhouse/access/</path>
</local_directory>
</user_directories>
<default_profile>default</default_profile>
<distributed_ddl>
<!-- Path in ZooKeeper to queue with DDL queries -->
<path>/clickhouse/task_queue/ddl</path>
</distributed_ddl>
</clickhouse>
signoz:
image: signoz/signoz:v0.97.1
depends_on:
clickhouse:
condition: service_healthy
schema-migrator-sync:
condition: service_completed_successfully
logging:
options:
max-size: 50m
max-file: "3"
command:
- --config=/root/config/prometheus.yml
volumes:
- type: bind
source: ./prometheus.yml
target: /root/config/prometheus.yml
content: |
# my global config
global:
scrape_interval: 5s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
# scrape_timeout is set to the global default (10s).
# Alertmanager configuration
alerting:
alertmanagers:
- static_configs:
- targets:
- alertmanager:9093
# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files: []
# - "first_rules.yml"
# - "second_rules.yml"
# - 'alerts.yml'
# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs: []
remote_read:
- url: tcp://clickhouse:9000/signoz_metrics
- type: volume
source: sqlite
target: /var/lib/signoz/
environment:
- SERVICE_URL_SIGNOZ_8080
- SIGNOZ_JWT_SECRET=${SERVICE_REALBASE64_JWTSECRET}
- SIGNOZ_TELEMETRYSTORE_CLICKHOUSE_DSN=tcp://clickhouse:9000
- SIGNOZ_SQLSTORE_SQLITE_PATH=/var/lib/signoz/signoz.db
- DASHBOARDS_PATH=/root/config/dashboards
- STORAGE=clickhouse
- GODEBUG=netdns=go
- DEPLOYMENT_TYPE=docker-standalone-amd
- SIGNOZ_STATSREPORTER_ENABLED=${SIGNOZ_STATSREPORTER_ENABLED:-true}
- SIGNOZ_EMAILING_ENABLED=${SIGNOZ_EMAILING_ENABLED:-false}
- SIGNOZ_EMAILING_SMTP_ADDRESS=${SIGNOZ_EMAILING_SMTP_ADDRESS}
- SIGNOZ_EMAILING_SMTP_FROM=${SIGNOZ_EMAILING_SMTP_FROM}
- SIGNOZ_EMAILING_SMTP_AUTH_USERNAME=${SIGNOZ_EMAILING_SMTP_AUTH_USERNAME}
- SIGNOZ_EMAILING_SMTP_AUTH_PASSWORD=${SIGNOZ_EMAILING_SMTP_AUTH_PASSWORD}
- SIGNOZ_ALERTMANAGER_PROVIDER=signoz
- SIGNOZ_ALERTMANAGER_SIGNOZ_GLOBAL_SMTP__AUTH__PASSWORD=${SIGNOZ_ALERTMANAGER_SIGNOZ_GLOBAL_SMTP__AUTH__PASSWORD}
- SIGNOZ_ALERTMANAGER_SIGNOZ_GLOBAL_SMTP__AUTH__USERNAME=${SIGNOZ_ALERTMANAGER_SIGNOZ_GLOBAL_SMTP__AUTH__USERNAME}
- SIGNOZ_ALERTMANAGER_SIGNOZ_GLOBAL_SMTP__FROM=${SIGNOZ_ALERTMANAGER_SIGNOZ_GLOBAL_SMTP__FROM}
- SIGNOZ_ALERTMANAGER_SIGNOZ_GLOBAL_SMTP__SMARTHOST=${SIGNOZ_ALERTMANAGER_SIGNOZ_GLOBAL_SMTP__SMARTHOST}
- DOT_METRICS_ENABLED=true
healthcheck:
test:
- CMD
- wget
- --spider
- -q
- localhost:8080/api/v1/health
interval: 30s
timeout: 5s
retries: 3
otel-collector:
image: signoz/signoz-otel-collector:v0.129.7
depends_on:
clickhouse:
condition: service_healthy
schema-migrator-sync:
condition: service_completed_successfully
signoz:
condition: service_healthy
logging:
options:
max-size: 50m
max-file: "3"
command:
- --config=/etc/otel-collector-config.yaml
- --manager-config=/etc/manager-config.yaml
- --copy-path=/var/tmp/collector-config.yaml
- --feature-gates=-pkg.translator.prometheus.NormalizeName
volumes:
- type: bind
source: ./otel-collector-config.yaml
target: /etc/otel-collector-config.yaml
content: |
receivers:
otlp:
protocols:
grpc:
endpoint: 0.0.0.0:4317
http:
endpoint: 0.0.0.0:4318
prometheus:
config:
global:
scrape_interval: 60s
scrape_configs:
- job_name: otel-collector
static_configs:
- targets:
- localhost:8888
labels:
job_name: otel-collector
processors:
batch:
send_batch_size: 10000
send_batch_max_size: 11000
timeout: 10s
resourcedetection:
# Using OTEL_RESOURCE_ATTRIBUTES envvar, env detector adds custom labels.
detectors: [env, system]
timeout: 2s
signozspanmetrics/delta:
metrics_exporter: signozclickhousemetrics
metrics_flush_interval: 60s
latency_histogram_buckets: [100us, 1ms, 2ms, 6ms, 10ms, 50ms, 100ms, 250ms, 500ms, 1000ms, 1400ms, 2000ms, 5s, 10s, 20s, 40s, 60s ]
dimensions_cache_size: 100000
aggregation_temporality: AGGREGATION_TEMPORALITY_DELTA
enable_exp_histogram: true
dimensions:
- name: service.namespace
default: default
- name: deployment.environment
default: default
# This is added to ensure the uniqueness of the timeseries
# Otherwise, identical timeseries produced by multiple replicas of
# collectors result in incorrect APM metrics
- name: signoz.collector.id
- name: service.version
- name: browser.platform
- name: browser.mobile
- name: k8s.cluster.name
- name: k8s.node.name
- name: k8s.namespace.name
- name: host.name
- name: host.type
- name: container.name
extensions:
health_check:
endpoint: 0.0.0.0:13133
pprof:
endpoint: 0.0.0.0:1777
exporters:
clickhousetraces:
datasource: tcp://clickhouse:9000/signoz_traces
low_cardinal_exception_grouping: ${env:LOW_CARDINAL_EXCEPTION_GROUPING}
use_new_schema: true
signozclickhousemetrics:
dsn: tcp://clickhouse:9000/signoz_metrics
clickhouselogsexporter:
dsn: tcp://clickhouse:9000/signoz_logs
timeout: 10s
use_new_schema: true
service:
telemetry:
logs:
encoding: json
extensions:
- health_check
- pprof
pipelines:
traces:
receivers: [otlp]
processors: [signozspanmetrics/delta, batch]
exporters: [clickhousetraces]
metrics:
receivers: [otlp]
processors: [batch]
exporters: [signozclickhousemetrics]
metrics/prometheus:
receivers: [prometheus]
processors: [batch]
exporters: [signozclickhousemetrics]
logs:
receivers: [otlp]
processors: [batch]
exporters: [clickhouselogsexporter]
- type: bind
source: ./otel-collector-opamp-config.yaml
target: /etc/manager-config.yaml
content: |
server_endpoint: ws://signoz:4320/v1/opamp
environment:
- SERVICE_URL_OTELCOLLECTORHTTP_4318
- OTEL_RESOURCE_ATTRIBUTES=host.name=signoz-host,os.type=linux
- LOW_CARDINAL_EXCEPTION_GROUPING=false
healthcheck:
test: bash -c "exec 6<> /dev/tcp/localhost/13133"
interval: 30s
timeout: 5s
retries: 3
schema-migrator-sync:
image: signoz/signoz-schema-migrator:v0.129.7
command:
- sync
- --dsn=tcp://clickhouse:9000
- --up=
depends_on:
clickhouse:
condition: service_healthy
restart: on-failure
exclude_from_hc: true
logging:
options:
max-size: 50m
max-file: "3"
schema-migrator-async:
image: signoz/signoz-schema-migrator:v0.129.7
depends_on:
clickhouse:
condition: service_healthy
schema-migrator-sync:
condition: service_completed_successfully
restart: on-failure
exclude_from_hc: true
logging:
options:
max-size: 50m
max-file: "3"
command:
- async
- --dsn=tcp://clickhouse:9000
- --up=