Moved the Troubleshooting section

ydb-platform · Jan 29, 2025 · bad353c · bad353c
1 parent ff8f503
commit bad353c
Show file tree

Hide file tree

Showing 137 changed files with 232 additions and 219 deletions.
diff --git a/ydb/docs/en/core/dev/index.md b/ydb/docs/en/core/dev/index.md
@@ -27,6 +27,4 @@ Main resources:
   - [{#T}](../postgresql/intro.md)
   - [{#T}](../reference/kafka-api/index.md)
 
-- [{#T}](troubleshooting/index.md)
-
-If you're interested in developing {{ ydb-short-name }} core or satellite projects, refer to the [documentation for contributors](../contributor/index.md).
+If you're interested in developing {{ ydb-short-name }} core or satellite projects, refer to the [documentation for contributors](../contributor/index.md).
diff --git a/ydb/docs/en/core/dev/toc_p.yaml b/ydb/docs/en/core/dev/toc_p.yaml
@@ -18,11 +18,6 @@ items:
     path: primary-key/toc_p.yaml
 - name: Secondary indexes
   href: secondary-indexes.md
-- name: Troubleshooting
-  href: troubleshooting/index.md
-  include:
-    mode: link
-    path: troubleshooting/toc_p.yaml
 - name: Query plans optimization
   href: query-plans-optimization.md
 - name: Batch upload

diff --git a/ydb/docs/en/core/dev/troubleshooting/performance/hardware/cpu-bottleneck.md b/ydb/docs/en/core/dev/troubleshooting/performance/hardware/cpu-bottleneck.md
diff --git a/ydb/docs/en/core/dev/troubleshooting/performance/hardware/disk-space.md b/ydb/docs/en/core/dev/troubleshooting/performance/hardware/disk-space.md
diff --git a/ydb/docs/en/core/dev/troubleshooting/performance/schemas/splits-merges.md b/ydb/docs/en/core/dev/troubleshooting/performance/schemas/splits-merges.md
diff --git a/ydb/docs/en/core/toc_i.yaml b/ydb/docs/en/core/toc_i.yaml
@@ -37,6 +37,11 @@ items:
   include:
     mode: link
     path: recipes/toc_p.yaml
+- name: Troubleshooting
+  href: troubleshooting/index.md
+  include:
+    mode: link
+    path: troubleshooting/toc_p.yaml
 - name: Questions and answers
   href: faq/index.md
   include:

diff --git a/...docs/en/core/dev/troubleshooting/index.md → ydb/docs/en/core/troubleshooting/index.md b/...docs/en/core/dev/troubleshooting/index.md → ydb/docs/en/core/troubleshooting/index.md
diff --git a/...mance/hardware/_assets/cpu-batch-pool.png → ...mance/hardware/_assets/cpu-batch-pool.png b/...mance/hardware/_assets/cpu-batch-pool.png → ...mance/hardware/_assets/cpu-batch-pool.png
diff --git a/...formance/hardware/_assets/cpu-by-pool.png → ...formance/hardware/_assets/cpu-by-pool.png b/...formance/hardware/_assets/cpu-by-pool.png → ...formance/hardware/_assets/cpu-by-pool.png
diff --git a/...formance/hardware/_assets/cpu-ic-pool.png → ...formance/hardware/_assets/cpu-ic-pool.png b/...formance/hardware/_assets/cpu-ic-pool.png → ...formance/hardware/_assets/cpu-ic-pool.png
diff --git a/...formance/hardware/_assets/cpu-io-pool.png → ...formance/hardware/_assets/cpu-io-pool.png b/...formance/hardware/_assets/cpu-io-pool.png → ...formance/hardware/_assets/cpu-io-pool.png
diff --git a/...ware/_assets/cpu-read-only-tx-latency.png → ...ware/_assets/cpu-read-only-tx-latency.png b/...ware/_assets/cpu-read-only-tx-latency.png → ...ware/_assets/cpu-read-only-tx-latency.png
diff --git a/...ce/hardware/_assets/cpu-row-read-rows.png → ...ce/hardware/_assets/cpu-row-read-rows.png b/...ce/hardware/_assets/cpu-row-read-rows.png → ...ce/hardware/_assets/cpu-row-read-rows.png
diff --git a/...ance/hardware/_assets/cpu-system-pool.png → ...ance/hardware/_assets/cpu-system-pool.png b/...ance/hardware/_assets/cpu-system-pool.png → ...ance/hardware/_assets/cpu-system-pool.png
diff --git a/...rmance/hardware/_assets/cpu-user-pool.png → ...rmance/hardware/_assets/cpu-user-pool.png b/...rmance/hardware/_assets/cpu-user-pool.png → ...rmance/hardware/_assets/cpu-user-pool.png
diff --git a/...assets/disk-time-available--disk-cost.png → ...assets/disk-time-available--disk-cost.png b/...assets/disk-time-available--disk-cost.png → ...assets/disk-time-available--disk-cost.png
diff --git a/...e/_assets/embedded-ui-cpu-system-pool.png → ...e/_assets/embedded-ui-cpu-system-pool.png b/...e/_assets/embedded-ui-cpu-system-pool.png → ...e/_assets/embedded-ui-cpu-system-pool.png
diff --git a/...formance/hardware/_assets/microbursts.png → ...formance/hardware/_assets/microbursts.png b/...formance/hardware/_assets/microbursts.png → ...formance/hardware/_assets/microbursts.png
diff --git a/...ormance/hardware/_assets/request-size.png → ...ormance/hardware/_assets/request-size.png b/...ormance/hardware/_assets/request-size.png → ...ormance/hardware/_assets/request-size.png
diff --git a/...performance/hardware/_assets/requests.png → ...performance/hardware/_assets/requests.png b/...performance/hardware/_assets/requests.png → ...performance/hardware/_assets/requests.png
diff --git a/...rmance/hardware/_assets/response-size.png → ...rmance/hardware/_assets/response-size.png b/...rmance/hardware/_assets/response-size.png → ...rmance/hardware/_assets/response-size.png
diff --git a/...are/_assets/storage-groups-disk-space.png → ...are/_assets/storage-groups-disk-space.png b/...are/_assets/storage-groups-disk-space.png → ...are/_assets/storage-groups-disk-space.png
diff --git a/...ance/hardware/_includes/cpu-bottleneck.md → ...ance/hardware/_includes/cpu-bottleneck.md b/...ance/hardware/_includes/cpu-bottleneck.md → ...ance/hardware/_includes/cpu-bottleneck.md
@@ -1,6 +1,6 @@
-1. Use **Diagnostics** in the [Embedded UI](../../../../../reference/embedded-ui/index.md) to analyze CPU utilization in all pools:
+1. Use **Diagnostics** in the [Embedded UI](../../../../reference/embedded-ui/index.md) to analyze CPU utilization in all pools:
 
-    1. In the [Embedded UI](../../../../../reference/embedded-ui/index.md), go to the **Databases** tab and click on the database.
+    1. In the [Embedded UI](../../../../reference/embedded-ui/index.md), go to the **Databases** tab and click on the database.
 
     1. On the **Navigation** tab, ensure the required database is selected.
 
@@ -12,7 +12,7 @@
 
 1. Use Grafana charts to analyze CPU utilization in all pools:
 
-    1. Open the **[CPU](../../../../../reference/observability/metrics/grafana-dashboards.md#cpu)** dashboard in Grafana.
+    1. Open the **[CPU](../../../../reference/observability/metrics/grafana-dashboards.md#cpu)** dashboard in Grafana.
 
     1. See if the following charts show any spikes:
 

diff --git a/...rmance/hardware/_includes/io-bandwidth.md → ...rmance/hardware/_includes/io-bandwidth.md b/...rmance/hardware/_includes/io-bandwidth.md → ...rmance/hardware/_includes/io-bandwidth.md
@@ -1,4 +1,4 @@
-1. Open the **[Distributed Storage Overview](../../../../../reference/observability/metrics/grafana-dashboards.md)** dashboard in Grafana.
+1. Open the **[Distributed Storage Overview](../../../../reference/observability/metrics/grafana-dashboards.md)** dashboard in Grafana.
 
 1. On the **DiskTimeAvailable and total Cost relation** chart, see if the **Total Cost** spikes cross the **DiskTimeAvailable** level.
 

diff --git a/ydb/docs/en/core/troubleshooting/performance/hardware/cpu-bottleneck.md b/ydb/docs/en/core/troubleshooting/performance/hardware/cpu-bottleneck.md
@@ -0,0 +1,14 @@
+# CPU bottleneck
+
+High CPU usage can lead to slow query processing and increased response times. When CPU resources are constrained, the database may have difficulty handling complex queries or large transaction volumes.
+
+{{ ydb-short-name }} nodes primarily consume CPU resources for running [actors](../../../concepts/glossary.md#actor). On each node, actors are executed using multiple [actor system pools](../../../concepts/glossary.md#actor-system-pool). The resource consumption of each pool is measured separately, which makes it possible to identify what kind of activity changed its behavior.
+
+## Diagnostics
+
+<!-- The include is added to allow partial overrides in overlays  -->
+{% include notitle [#](_includes/cpu-bottleneck.md) %}
+
+## Recommendation
+
+Add additional [database nodes](../../../concepts/glossary.md#database-node) to the cluster or allocate more CPU cores to the existing nodes. If that's not possible, consider distributing CPU cores between pools differently.
diff --git a/ydb/docs/en/core/troubleshooting/performance/hardware/disk-space.md b/ydb/docs/en/core/troubleshooting/performance/hardware/disk-space.md
@@ -0,0 +1,29 @@
+# Disk space
+
+A lack of available disk space can prevent the database from storing new data, resulting in the database becoming read-only. This can also cause slowdowns as the system tries to reclaim disk space by compacting existing data more aggressively.
+
+## Diagnostics
+
+1. See if the **[DB overview > Storage](../../../reference/observability/metrics/grafana-dashboards.md#dboverview)** charts in Grafana show any spikes.
+
+1. In [Embedded UI](../../../reference/embedded-ui/index.md), on the **Storage** tab, analyze the list of available storage groups and nodes and their disk usage.
+
+    {% note tip %}
+
+    Use the **Out of Space** filter to list only the storage groups with full disks.
+
+    {% endnote %}
+
+    ![](_assets/storage-groups-disk-space.png)
+
+{% note info %}
+
+It is also recommended to use the [Healthcheck API](../../../reference/ydb-sdk/health-check-api.md) to get this information.
+
+{% endnote %}
+
+## Recommendations
+
+Add more [storage groups](../../../concepts/glossary.md#storage-group) to the database.
+
+If the cluster doesn't have spare storage groups, configure them first. Add additional [storage nodes](../../../concepts/glossary.md#storage-node), if necessary.
diff --git a/...rformance/hardware/insufficient-memory.md → ...rformance/hardware/insufficient-memory.md b/...rformance/hardware/insufficient-memory.md → ...rformance/hardware/insufficient-memory.md
@@ -18,7 +18,7 @@ Additionally, which components within the  {{ ydb-short-name }} process consume
 
 1. Determine whether any {{ ydb-short-name }} nodes recently restarted for unknown reasons. Exclude cases of {{ ydb-short-name }} version upgrades and other planned maintenance. This could reveal nodes terminated by OOM killer and restarted by `systemd`.
 
-    1. Open [Embedded UI](../../../../reference/embedded-ui/index.md).
+    1. Open [Embedded UI](../../../reference/embedded-ui/index.md).
 
     1. On the **Nodes** tab, look for nodes that have low uptime.
 
@@ -36,11 +36,11 @@ Additionally, which components within the  {{ ydb-short-name }} process consume
 
 1. Determine whether memory usage reached 100% of capacity.
 
-    1. Open the **[DB overview](../../../../reference/observability/metrics/grafana-dashboards.md#dboverview)** dashboard in Grafana.
+    1. Open the **[DB overview](../../../reference/observability/metrics/grafana-dashboards.md#dboverview)** dashboard in Grafana.
 
     1. Analyze the charts in the **Memory** section.
 
-1. Determine whether the user load on {{ ydb-short-name }} has increased. Analyze the following charts on the **[DB overview](../../../../reference/observability/metrics/grafana-dashboards.md#dboverview)** dashboard in Grafana:
+1. Determine whether the user load on {{ ydb-short-name }} has increased. Analyze the following charts on the **[DB overview](../../../reference/observability/metrics/grafana-dashboards.md#dboverview)** dashboard in Grafana:
 
     - **Requests** chart
     - **Request size** chart
@@ -54,4 +54,4 @@ Consider the following solutions for addressing insufficient memory:
 
 - If the load on {{ ydb-short-name }} has increased due to new usage patterns or increased query rate, try optimizing the application to reduce the load on {{ ydb-short-name }} or add more {{ ydb-short-name }} nodes.
 
-- If the load on {{ ydb-short-name }} has not changed but nodes are still restarting, consider adding more {{ ydb-short-name }} nodes or raising the hard memory limit for the nodes. For more information about memory management in {{ ydb-short-name }}, see [{#T}](../../../../reference/configuration/index.md#memory-controller).
+- If the load on {{ ydb-short-name }} has not changed but nodes are still restarting, consider adding more {{ ydb-short-name }} nodes or raising the hard memory limit for the nodes. For more information about memory management in {{ ydb-short-name }}, see [{#T}](../../../reference/configuration/index.md#memory-controller).
diff --git a/...ting/performance/hardware/io-bandwidth.md → ...ting/performance/hardware/io-bandwidth.md b/...ting/performance/hardware/io-bandwidth.md → ...ting/performance/hardware/io-bandwidth.md
@@ -9,7 +9,7 @@ A high rate of read and write operations can overwhelm the disk subsystem, leadi
 
 ## Recommendations
 
-Add more [storage groups](../../../../concepts/glossary.md#storage-group) to the database.
+Add more [storage groups](../../../concepts/glossary.md#storage-group) to the database.
 
 In cases of high microburst rates, balancing the load across storage groups might help.
 
diff --git a/...eshooting/performance/hardware/toc_p.yaml → ...eshooting/performance/hardware/toc_p.yaml b/...eshooting/performance/hardware/toc_p.yaml → ...eshooting/performance/hardware/toc_p.yaml
diff --git a/.../dev/troubleshooting/performance/index.md → ...core/troubleshooting/performance/index.md b/.../dev/troubleshooting/performance/index.md → ...core/troubleshooting/performance/index.md
@@ -6,15 +6,15 @@ Addressing database performance issues often requires a holistic approach, which
 
 Troubleshooting performance issues in {{ ydb-short-name }} involves the following tools:
 
-- [{{ ydb-short-name }} metrics](../../../reference/observability/metrics/index.md)
+- [{{ ydb-short-name }} metrics](../../reference/observability/metrics/index.md)
 
-    Diagnistic steps for most performance issues involve analyzing [Grafana dashboards](../../../reference/observability/metrics/grafana-dashboards.md) that use {{ ydb-short-name }} metrics collected by Prometheus. For information on installing Grafana and Prometheus, see [{#T}](../../../devops/manual/monitoring.md).
+    Diagnostic steps for most performance issues involve analyzing [Grafana dashboards](../../reference/observability/metrics/grafana-dashboards.md) that use {{ ydb-short-name }} metrics collected by Prometheus. For information on installing Grafana and Prometheus, see [{#T}](../../devops/manual/monitoring.md).
 
-- [{{ ydb-short-name }} logs](../../../devops/manual/logging.md)
-- [Tracing](../../../reference/observability/tracing/setup.md)
-- [{{ ydb-short-name }} CLI](../../../reference/ydb-cli/index.md)
-- [Embedded UI](../../../reference/embedded-ui/index.md)
-- [Query plans](../../query-plans-optimization.md)
+- [{{ ydb-short-name }} logs](../../devops/manual/logging.md)
+- [Tracing](../../reference/observability/tracing/setup.md)
+- [{{ ydb-short-name }} CLI](../../reference/ydb-cli/index.md)
+- [Embedded UI](../../reference/embedded-ui/index.md)
+- [Query plans](../../dev/query-plans-optimization.md)
 - Third-party observability tools
 
 ## Classification of {{ ydb-short-name }} performance issues
@@ -33,15 +33,15 @@ Database performance issues can be classified into several categories based on t
 
 ### Insufficient resource issues
 
-These issues refer to situations when the workload demands more physical resources — such as CPU, memory, disk space, and network bandwidth — than allocated to a database. In some cases, suboptimal allocation of resources, for example misconfigured [control groups (cgroups)](https://en.wikipedia.org/wiki/Cgroups) or [actor system pools](../../../concepts/glossary.md#actor-system-pool), may also result in insufficient resources for {{ ydb-short-name }} and increase query latencies even though physical hardware resources are still available on the database server.
+These issues refer to situations when the workload demands more physical resources — such as CPU, memory, disk space, and network bandwidth — than allocated to a database. In some cases, suboptimal allocation of resources, for example misconfigured [control groups (cgroups)](https://en.wikipedia.org/wiki/Cgroups) or [actor system pools](../../concepts/glossary.md#actor-system-pool), may also result in insufficient resources for {{ ydb-short-name }} and increase query latencies even though physical hardware resources are still available on the database server.
 
 - **[CPU bottlenecks](hardware/cpu-bottleneck.md)**. High CPU usage can result in slow query processing and increased response times. When CPU resources are limited, the database may struggle to handle complex queries or large transaction loads.
 
 - **[Insufficient disk space](hardware/disk-space.md)**. A lack of available disk space can prevent the database from storing new data, resulting in the database becoming read-only. This might also cause slowdowns as the system tries to reclaim disk space by compacting existing data more aggressively.
 
 - **[Insufficient memory (RAM)](hardware/insufficient-memory.md)**. Queries require memory to temporarily store various intermediate data during execution. A lack of available memory can negatively impact database performance in multiple ways.
 
-- **[Insufficient disk I/O bandwidth](hardware/io-bandwidth.md)**. A high rate of read/write operations can overwhelm the disk subsystem, causing increased data access latencies. When the [distributed storage](../../../concepts/glossary.md#distributed-storage) cannot read or write data quickly enough, queries requiring disk access will take longer to execute.
+- **[Insufficient disk I/O bandwidth](hardware/io-bandwidth.md)**. A high rate of read/write operations can overwhelm the disk subsystem, causing increased data access latencies. When the [distributed storage](../../concepts/glossary.md#distributed-storage) cannot read or write data quickly enough, queries requiring disk access will take longer to execute.
 
 ### Operating system issues
 

diff --git a/.../infrastructure/_assets/cluster-nodes.png → .../infrastructure/_assets/cluster-nodes.png b/.../infrastructure/_assets/cluster-nodes.png → .../infrastructure/_assets/cluster-nodes.png
diff --git a/...structure/_assets/diagnostics-network.png → ...structure/_assets/diagnostics-network.png b/...structure/_assets/diagnostics-network.png → ...structure/_assets/diagnostics-network.png
diff --git a/...nce/infrastructure/_includes/dc-outage.md → ...nce/infrastructure/_includes/dc-outage.md b/...nce/infrastructure/_includes/dc-outage.md → ...nce/infrastructure/_includes/dc-outage.md
@@ -1,8 +1,8 @@
 To determine if one of the data centers of the {{ ydb-short-name }} cluster is not available, follow these steps:
 
-1. Open [Embedded UI](../../../../../reference/embedded-ui/index.md).
+1. Open [Embedded UI](../../../../reference/embedded-ui/index.md).
 
-1. On the **Nodes** tab, analyze the [health indicators](../../../../../reference/embedded-ui/ydb-monitoring.md#colored_indicator) in the **Host** and **DC** columns.
+1. On the **Nodes** tab, analyze the [health indicators](../../../../reference/embedded-ui/ydb-monitoring.md#colored_indicator) in the **Host** and **DC** columns.
 
     ![](../_assets/cluster-nodes.png)
 

diff --git a/...mance/infrastructure/_includes/network.md → ...mance/infrastructure/_includes/network.md b/...mance/infrastructure/_includes/network.md → ...mance/infrastructure/_includes/network.md
@@ -1,6 +1,6 @@
-To diagnose network issues, use the healthcheck in the [Embedded UI](../../../../../reference/embedded-ui/index.md):
+To diagnose network issues, use the healthcheck in the [Embedded UI](../../../../reference/embedded-ui/index.md):
 
-1. Open the [Embedded UI](../../../../../reference/embedded-ui/index.md):
+1. Open the [Embedded UI](../../../../reference/embedded-ui/index.md):
 
     1. Navigate to the **Databases** tab and click on the desired database.
 

diff --git a/...g/performance/infrastructure/dc-drills.md → ...g/performance/infrastructure/dc-drills.md b/...g/performance/infrastructure/dc-drills.md → ...g/performance/infrastructure/dc-drills.md
diff --git a/...g/performance/infrastructure/dc-outage.md → ...g/performance/infrastructure/dc-outage.md b/...g/performance/infrastructure/dc-outage.md → ...g/performance/infrastructure/dc-outage.md
diff --git a/...ng/performance/infrastructure/hardware.md → ...ng/performance/infrastructure/hardware.md b/...ng/performance/infrastructure/hardware.md → ...ng/performance/infrastructure/hardware.md
@@ -6,7 +6,7 @@ Malfunctioning storage drives and network cards, until replaced, significantly i
 
 Use the hardware monitoring tools that your operating system and data center provide to diagnose hardware issues.
 
-You can also use the **Healthcheck** in [Embedded UI](../../../../reference/embedded-ui/index.md) to diagnose some hardware issues:
+You can also use the **Healthcheck** in [Embedded UI](../../../reference/embedded-ui/index.md) to diagnose some hardware issues:
 
 - **Storage issues**
 

diff --git a/...ing/performance/infrastructure/network.md → ...ing/performance/infrastructure/network.md b/...ing/performance/infrastructure/network.md → ...ing/performance/infrastructure/network.md
diff --git a/...ing/performance/infrastructure/toc_p.yaml → ...ing/performance/infrastructure/toc_p.yaml b/...ing/performance/infrastructure/toc_p.yaml → ...ing/performance/infrastructure/toc_p.yaml
diff --git a/...rformance/queries/_assets/soft-errors.png → ...rformance/queries/_assets/soft-errors.png b/...rformance/queries/_assets/soft-errors.png → ...rformance/queries/_assets/soft-errors.png
diff --git a/...ssets/transactions-locks-invalidation.png → ...ssets/transactions-locks-invalidation.png b/...ssets/transactions-locks-invalidation.png → ...ssets/transactions-locks-invalidation.png