From 14fa6cbe704659b700486869764f409fe5b15da7 Mon Sep 17 00:00:00 2001 From: Dominic Tran Date: Mon, 21 Jul 2025 11:03:40 -0500 Subject: [PATCH 01/10] initial changes --- .../GlossaryTooltip/generate-tooltips.py | 90 +++++++++++++++++++ src/components/GlossaryTooltip/glossary.json | 9 ++ src/css/custom.scss | 47 ++++++++++ 3 files changed, 146 insertions(+) create mode 100644 src/components/GlossaryTooltip/generate-tooltips.py create mode 100644 src/components/GlossaryTooltip/glossary.json diff --git a/src/components/GlossaryTooltip/generate-tooltips.py b/src/components/GlossaryTooltip/generate-tooltips.py new file mode 100644 index 00000000000..f3de40242c6 --- /dev/null +++ b/src/components/GlossaryTooltip/generate-tooltips.py @@ -0,0 +1,90 @@ +import os +import re +import json + +GLOSSARY_JSON_PATH = "/home/dtran/clickhouse-docs/src/components/GlossaryTooltip/glossary.json" +DOCS_PATH = "/home/dtran/clickhouse-docs/docs" + +IGNORE_DIRS = { + "changelog", "changelogs", "i18n", "scripts", "static", "styles", + "contribute", "about-us", "_placeholders" +} + +GLOSSARY_IMPORT = "import GlossaryTooltip from '@site/src/components/GlossaryTooltip/GlossaryTooltip.jsx';" + +def load_glossary(path): + with open(path, 'r', encoding='utf-8') as f: + return json.load(f) + +def mask_ignores(text): + placeholders = {} + patterns = { + 'codeblocks': r'```[\s\S]*?```', + 'inline_code': r'`[^`\n]+`', + 'frontmatter': r'^---[\s\S]+?---', + 'imports': r'^import .*?;$', + 'headers': r'^(#+ .*)$', + 'html_blocks': r'<(div|details|summary)[\s\S]*?<\/\1>', + 'blockquotes': r'^\s*>.*$', + 'links': r'\[([^\]]+)\]\([^)]+\)', + 'images': r'!\[[^\]]*\]\([^)]+\)', + 'comments': r'', + } + + for name, pattern in patterns.items(): + regex = re.compile(pattern, re.MULTILINE) + matches = list(regex.finditer(text)) + for i, match in enumerate(matches): + key = f"__MASKED_{name.upper()}_{i}__" + placeholders[key] = match.group(0) + text = text.replace(match.group(0), key) + + return text, placeholders + +def unmask_ignores(text, placeholders): + for key, value in placeholders.items(): + text = text.replace(key, value) + return text + +def inject_tooltips(text, glossary): + def replacement(match): + word = match.group(0) + definition = glossary.get(word) + if definition: + return f'{word}' + return word + + pattern = r'\b(' + '|'.join(re.escape(k) for k in glossary.keys()) + r')\b' + return re.sub(pattern, replacement, text) + +def process_file(path, glossary): + with open(path, 'r', encoding='utf-8') as f: + content = f.read() + + masked_text, placeholders = mask_ignores(content) + replaced = inject_tooltips(masked_text, glossary) + final_text = unmask_ignores(replaced, placeholders) + + if ' {new_path}") + else: + print(f"– Skipped (no change): {path}") + +def process_directory(base_path, glossary): + for root, dirs, files in os.walk(base_path): + dirs[:] = [d for d in dirs if d not in IGNORE_DIRS] + for file in files: + if file.endswith(".md") and not file.startswith("_"): + path = os.path.join(root, file) + process_file(path, glossary) + +if __name__ == "__main__": + glossary = load_glossary(GLOSSARY_JSON_PATH) + process_directory(DOCS_PATH, glossary) diff --git a/src/components/GlossaryTooltip/glossary.json b/src/components/GlossaryTooltip/glossary.json new file mode 100644 index 00000000000..beec836aca8 --- /dev/null +++ b/src/components/GlossaryTooltip/glossary.json @@ -0,0 +1,9 @@ +{ + "Atomicity": "Atomicity ensures that a transaction (a series of database operations) is treated as a single, indivisible unit. 
This means that either all operations within the transaction occur, or none do. An example of an atomic transaction is transferring money from one bank account to another. If either step of the transfer fails, the transaction fails, and the money stays in the first account. Atomicity ensures no money is lost or created.", + "Cluster": "A collection of nodes (servers) that work together to store and process data.", + "CMEK": "Customer-managed encryption keys (CMEK) allow customers to use their key-management service (KMS) key to encrypt the ClickHouse disk data key and protect their data at rest.", + "Dictionary": "A dictionary is a mapping of key-value pairs that is useful for various types of reference lists. It is a powerful feature that allows for the efficient use of dictionaries in queries, which is often more efficient than using a `JOIN` with reference tables.", + "Parts": "A physical file on a disk that stores a portion of the table's data. This is different from a partition, which is a logical division of a table's data that is created using a partition key.", + "Replica": "A copy of the data stored in a ClickHouse database. You can have any number of replicas of the same data for redundancy and reliability. Replicas are used in conjunction with the ReplicatedMergeTree table engine, which enables ClickHouse to keep multiple copies of data in sync across different servers.", + "Shard": "A subset of data. ClickHouse always has at least one shard for your data. If you do not split the data across multiple servers, your data will be stored in one shard. Sharding data across multiple servers can be used to divide the load if you exceed the capacity of a single server." +} \ No newline at end of file diff --git a/src/css/custom.scss b/src/css/custom.scss index 21ef6708b10..20ae148e5cc 100644 --- a/src/css/custom.scss +++ b/src/css/custom.scss @@ -1423,3 +1423,50 @@ input::-ms-input-placeholder { /* Microsoft Edge */ .DocSearch-Cancel { color: var(--docsearch-text-color) !important; } + +.tooltip { + position: relative; + cursor: help; + border-bottom: 1px dotted #aaa; + outline: none; /* remove default outline, but keep focus styling if needed */ +} + +.tooltipText { + visibility: hidden; + width: max-content; + max-width: 280px; + background-color: #333; + color: #fff; + text-align: left; + padding: 8px 12px; + border-radius: 4px; + + position: absolute; + z-index: 100; + top: 125%; /* place tooltip below the term */ + left: 50%; + transform: translateX(-50%); + + opacity: 0; + transition: opacity 0.2s ease-in-out; + + white-space: pre-wrap; + box-shadow: 0 0 8px rgba(0,0,0,0.3); + pointer-events: none; /* tooltip itself should not capture pointer */ +} + +.tooltipText.visible { + visibility: visible; + opacity: 1; +} + +.tooltip-link { + text-decoration: underline dotted; + color: inherit; /* keep text color */ +} + +.tooltip-link:hover, +.tooltip-link:focus { + text-decoration: underline solid; + outline: none; +} \ No newline at end of file From defc5f8dd4eafdb962323b5c86b526e955aa752b Mon Sep 17 00:00:00 2001 From: Dominic Tran Date: Thu, 24 Jul 2025 13:17:34 -0500 Subject: [PATCH 02/10] Adding custom tooltips for concepts and getting started docs, including script for re-use. Starting with concepts and getting started to reduce scope. 
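A minimal usage sketch for context (illustrative only): the component name and the `term`, `capitalize`, and `plural` props come from GlossaryTooltip.tsx in this patch, the example terms are existing keys in glossary.json, and no per-page import should be needed because the component is registered globally in MDXComponents.js. The surrounding sentence is made up for the example.

```mdx
{/* Hovering or focusing a wrapped term shows its glossary definition and links to /concepts/glossary#<anchor>. */}
Adding nodes to a <GlossaryTooltip term="Cluster" /> lets ClickHouse scale out, with
several <GlossaryTooltip term="Replica" plural="s" /> of the data kept in sync.
<GlossaryTooltip term="Shard" capitalize /> renders the term capitalized at the start of a sentence.
```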
--- docs/concepts/glossary.md | 2 + docs/concepts/index.md | 2 +- ...-fast.md => why-clickhouse-is-so-fast.mdx} | 6 +- docs/faq/general/index.md | 2 +- docs/faq/general/olap.md | 2 +- docs/faq/use-cases/time-series.md | 2 +- docs/getting-started/quick-start/cloud.mdx | 4 +- docs/getting-started/quick-start/oss.mdx | 3 +- docs/intro.md | 2 +- scripts/inject-glossary-tooltips.py | 131 ++++++++++++++++++ .../GlossaryTooltip/GlossaryTooltip.tsx | 49 +++++++ src/theme/MDXComponents.js | 2 + 12 files changed, 195 insertions(+), 12 deletions(-) rename docs/concepts/{why-clickhouse-is-so-fast.md => why-clickhouse-is-so-fast.mdx} (96%) create mode 100644 scripts/inject-glossary-tooltips.py create mode 100644 src/components/GlossaryTooltip/GlossaryTooltip.tsx diff --git a/docs/concepts/glossary.md b/docs/concepts/glossary.md index 4b36ca9f57f..5f3352f488d 100644 --- a/docs/concepts/glossary.md +++ b/docs/concepts/glossary.md @@ -5,6 +5,8 @@ title: 'Glossary' slug: /concepts/glossary --- + + # Glossary ## Atomicity {#atomicity} diff --git a/docs/concepts/index.md b/docs/concepts/index.md index 10a67c51f50..481a1d11ee9 100644 --- a/docs/concepts/index.md +++ b/docs/concepts/index.md @@ -11,7 +11,7 @@ In this section of the docs we'll dive into the concepts around what makes Click | Page | Description | |------------------------------------------------------------------|---------------------------------------------------------------------------------------| -| [Why is ClickHouse so Fast?](./why-clickhouse-is-so-fast.md) | Learn what makes ClickHouse so fast. +| [Why is ClickHouse so Fast?](./why-clickhouse-is-so-fast.mdx) | Learn what makes ClickHouse so fast. | [What is OLAP?](./olap.md) | Learn what Online Analytical Processing is. | [Why is ClickHouse unique?](../about-us/distinctive-features.md) | Learn what makes ClickHouse unique. | [Glossary](./glossary.md) | This page contains a glossary of terms you'll commonly encounter throughout the docs. diff --git a/docs/concepts/why-clickhouse-is-so-fast.md b/docs/concepts/why-clickhouse-is-so-fast.mdx similarity index 96% rename from docs/concepts/why-clickhouse-is-so-fast.md rename to docs/concepts/why-clickhouse-is-so-fast.mdx index 5cc9f5fe512..34c933c546b 100644 --- a/docs/concepts/why-clickhouse-is-so-fast.md +++ b/docs/concepts/why-clickhouse-is-so-fast.mdx @@ -19,7 +19,7 @@ From an architectural perspective, databases consist (at least) of a storage lay -In ClickHouse, each table consists of multiple "table parts". A [part](/parts) is created whenever a user inserts data into the table (INSERT statement). A query is always executed against all table parts that exist at the time the query starts. +In ClickHouse, each table consists of multiple "table ". A [part](/parts) is created whenever a user inserts data into the table (INSERT statement). A query is always executed against all table parts that exist at the time the query starts. To avoid that too many parts accumulate, ClickHouse runs a [merge](/merges) operation in the background which continuously combines multiple smaller parts into a single bigger part. @@ -97,7 +97,7 @@ Finally, ClickHouse uses a vectorized query processing layer that parallelizes q Modern systems have dozens of CPU cores. To utilize all cores, ClickHouse unfolds the query plan into multiple lanes, typically one per core. Each lane processes a disjoint range of the table data. That way, the performance of the database scales "vertically" with the number of available cores. 
-If a single node becomes too small to hold the table data, further nodes can be added to form a cluster. Tables can be split ("sharded") and distributed across the nodes. ClickHouse will run queries on all nodes that store table data and thereby scale "horizontally" with the number of available nodes. +If a single node becomes too small to hold the table data, further nodes can be added to form a . Tables can be split ("sharded") and distributed across the nodes. ClickHouse will run queries on all nodes that store table data and thereby scale "horizontally" with the number of available nodes. 🤿 Deep dive into this in the [Query Processing Layer](/academic_overview#4-query-processing-layer) section of the web version of our VLDB 2024 paper. @@ -143,4 +143,4 @@ You can read a [PDF of the paper](https://www.vldb.org/pvldb/vol17/p3731-schulze Alexey Milovidov, our CTO and the creator of ClickHouse, presented the paper (slides [here](https://raw.githubusercontent.com/ClickHouse/clickhouse-presentations/master/2024-vldb/VLDB_2024_presentation.pdf)), followed by a Q&A (that quickly ran out of time!). You can catch the recorded presentation here: - + \ No newline at end of file diff --git a/docs/faq/general/index.md b/docs/faq/general/index.md index de96c9ba413..d7faa5313a6 100644 --- a/docs/faq/general/index.md +++ b/docs/faq/general/index.md @@ -10,7 +10,7 @@ description: 'Index page listing general questions about ClickHouse' # General questions about ClickHouse - [What is ClickHouse?](../../intro.md) -- [Why is ClickHouse so fast?](../../concepts/why-clickhouse-is-so-fast.md) +- [Why is ClickHouse so fast?](../../concepts/why-clickhouse-is-so-fast.mdx) - [Who is using ClickHouse?](../../faq/general/who-is-using-clickhouse.md) - [What does "ClickHouse" mean?](../../faq/general/dbms-naming.md) - [What does "Не тормозит" mean?](../../faq/general/ne-tormozit.md) diff --git a/docs/faq/general/olap.md b/docs/faq/general/olap.md index 1d3c9a99c13..f63850713b8 100644 --- a/docs/faq/general/olap.md +++ b/docs/faq/general/olap.md @@ -34,7 +34,7 @@ All database management systems could be classified into two groups: OLAP (Onlin In practice OLAP and OLTP are not categories, it's more like a spectrum. Most real systems usually focus on one of them but provide some solutions or workarounds if the opposite kind of workload is also desired. This situation often forces businesses to operate multiple storage systems integrated, which might be not so big deal but having more systems make it more expensive to maintain. So the trend of recent years is HTAP (**Hybrid Transactional/Analytical Processing**) when both kinds of the workload are handled equally well by a single database management system. -Even if a DBMS started as a pure OLAP or pure OLTP, they are forced to move towards that HTAP direction to keep up with their competition. And ClickHouse is no exception, initially, it has been designed as [fast-as-possible OLAP system](../../concepts/why-clickhouse-is-so-fast.md) and it still does not have full-fledged transaction support, but some features like consistent read/writes and mutations for updating/deleting data had to be added. +Even if a DBMS started as a pure OLAP or pure OLTP, they are forced to move towards that HTAP direction to keep up with their competition. 
And ClickHouse is no exception, initially, it has been designed as [fast-as-possible OLAP system](../../concepts/why-clickhouse-is-so-fast.mdx) and it still does not have full-fledged transaction support, but some features like consistent read/writes and mutations for updating/deleting data had to be added. The fundamental trade-off between OLAP and OLTP systems remains: diff --git a/docs/faq/use-cases/time-series.md b/docs/faq/use-cases/time-series.md index 1db9ac6cba3..dc62d67c50b 100644 --- a/docs/faq/use-cases/time-series.md +++ b/docs/faq/use-cases/time-series.md @@ -10,7 +10,7 @@ description: 'Page describing how to use ClickHouse as a time-series database' _Note: Please see the blog [Working with Time series data in ClickHouse](https://clickhouse.com/blog/working-with-time-series-data-and-functions-ClickHouse) for additional examples of using ClickHouse for time series analysis._ -ClickHouse is a generic data storage solution for [OLAP](../../faq/general/olap.md) workloads, while there are many specialized [time-series database management systems](https://clickhouse.com/engineering-resources/what-is-time-series-database). Nevertheless, ClickHouse's [focus on query execution speed](../../concepts/why-clickhouse-is-so-fast.md) allows it to outperform specialized systems in many cases. There are many independent benchmarks on this topic out there, so we're not going to conduct one here. Instead, let's focus on ClickHouse features that are important to use if that's your use case. +ClickHouse is a generic data storage solution for [OLAP](../../faq/general/olap.md) workloads, while there are many specialized [time-series database management systems](https://clickhouse.com/engineering-resources/what-is-time-series-database). Nevertheless, ClickHouse's [focus on query execution speed](../../concepts/why-clickhouse-is-so-fast.mdx) allows it to outperform specialized systems in many cases. There are many independent benchmarks on this topic out there, so we're not going to conduct one here. Instead, let's focus on ClickHouse features that are important to use if that's your use case. First of all, there are **[specialized codecs](../../sql-reference/statements/create/table.md#specialized-codecs)** which make typical time-series. Either common algorithms like `DoubleDelta` and `Gorilla` or specific to ClickHouse like `T64`. diff --git a/docs/getting-started/quick-start/cloud.mdx b/docs/getting-started/quick-start/cloud.mdx index 384be55f157..4852ea7cfff 100644 --- a/docs/getting-started/quick-start/cloud.mdx +++ b/docs/getting-started/quick-start/cloud.mdx @@ -60,7 +60,7 @@ Select your desired region for deploying the service, and give your new service New ClickHouse Service
-By default, the scale tier will create 3 replicas each with 4 VCPUs and 16 GiB RAM. [Vertical autoscaling](/manage/scaling#vertical-auto-scaling) will be enabled by default in the Scale tier. +By default, the scale tier will create 3 each with 4 VCPUs and 16 GiB RAM. [Vertical autoscaling](/manage/scaling#vertical-auto-scaling) will be enabled by default in the Scale tier. Users can customize the service resources if required, specifying a minimum and maximum size for replicas to scale between. When ready, select `Create service`. @@ -329,4 +329,4 @@ Suppose we have the following text in a CSV file named `data.csv`: - Check out our 25-minute video on [Getting Started with ClickHouse](https://clickhouse.com/company/events/getting-started-with-clickhouse/) - If your data is coming from an external source, view our [collection of integration guides](/integrations/index.mdx) for connecting to message queues, databases, pipelines and more - If you are using a UI/BI visualization tool, view the [user guides for connecting a UI to ClickHouse](/integrations/data-visualization) -- The user guide on [primary keys](/guides/best-practices/sparse-primary-indexes.md) is everything you need to know about primary keys and how to define them +- The user guide on [primary keys](/guides/best-practices/sparse-primary-indexes.md) is everything you need to know about primary keys and how to define them \ No newline at end of file diff --git a/docs/getting-started/quick-start/oss.mdx b/docs/getting-started/quick-start/oss.mdx index da543b7a668..b78d39ba107 100644 --- a/docs/getting-started/quick-start/oss.mdx +++ b/docs/getting-started/quick-start/oss.mdx @@ -107,7 +107,7 @@ PRIMARY KEY (user_id, timestamp) You can use the familiar `INSERT INTO TABLE` command with ClickHouse, but it is important to understand that each insert into a `MergeTree` table causes what we -call a **part** in ClickHouse to be created in storage. These parts later get +call a **part** in ClickHouse to be created in storage. These later get merged in the background by ClickHouse. In ClickHouse, we try to bulk insert lots of rows at a time @@ -373,4 +373,3 @@ technologies that integrate with ClickHouse. - The user guide on [primary keys](/guides/best-practices/sparse-primary-indexes.md) is everything you need to know about primary keys and how to define them. - diff --git a/docs/intro.md b/docs/intro.md index 0cc5962c58c..5133bfce44d 100644 --- a/docs/intro.md +++ b/docs/intro.md @@ -88,7 +88,7 @@ ClickHouse chooses the join algorithm adaptively, it starts with fast hash joins ## Superior query performance {#superior-query-performance} ClickHouse is well known for having extremely fast query performance. -To learn why ClickHouse is so fast, see the [Why is ClickHouse fast?](/concepts/why-clickhouse-is-so-fast.md) guide. +To learn why ClickHouse is so fast, see the [Why is ClickHouse fast?](/concepts/why-clickhouse-is-so-fast.mdx) guide. 
' in content: + return content, False + + lines = content.splitlines() + inside_code_block = False + replaced_terms = set() + modified = False + output_lines = [] + + for line in lines: + stripped = line.strip() + + # Fence detection for code blocks + if stripped.startswith('```'): + inside_code_block = not inside_code_block + output_lines.append(line) + continue + + # Skip inside code or headings + if inside_code_block or stripped.startswith('#'): + output_lines.append(line) + continue + + new_line = replace_terms(line, replaced_terms) + if new_line != line: + modified = True + output_lines.append(new_line) + + return '\n'.join(output_lines), modified + +def rename_md_to_mdx(filepath): + if filepath.endswith('.md'): + new_path = filepath[:-3] + '.mdx' + os.rename(filepath, new_path) + print(f'Renamed: {filepath} → {new_path}') + return new_path + return filepath + +def walk_files(target_dir): + for root, _, files in os.walk(target_dir): + for filename in files: + if filename.endswith('.md') or filename.endswith('.mdx'): + yield os.path.join(root, filename) + +def print_diff(original, modified, path): + diff_lines = list(difflib.unified_diff( + original.splitlines(), + modified.splitlines(), + fromfile=path, + tofile=path, + lineterm='' + )) + if diff_lines: + print('\n'.join(diff_lines)) + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('--dry-run', action='store_true', help='Show diffs without writing') + args = parser.parse_args() + + for filepath in walk_files(TARGET_DIR): + with open(filepath, 'r', encoding='utf-8') as f: + original = f.read() + + modified, changed = process_markdown(original) + + if changed: + if args.dry_run: + print(f'\n--- DRY RUN: {filepath} ---') + print_diff(original, modified, filepath) + else: + with open(filepath, 'w', encoding='utf-8') as f: + f.write(modified) + print(f'✅ Updated: {filepath}') + + # Rename to .mdx if needed + if filepath.endswith('.md'): + filepath = rename_md_to_mdx(filepath) + +if __name__ == '__main__': + main() diff --git a/src/components/GlossaryTooltip/GlossaryTooltip.tsx b/src/components/GlossaryTooltip/GlossaryTooltip.tsx new file mode 100644 index 00000000000..fc1317837d9 --- /dev/null +++ b/src/components/GlossaryTooltip/GlossaryTooltip.tsx @@ -0,0 +1,49 @@ +import React, { useState } from 'react'; +import glossary from './glossary.json'; +import Link from '@docusaurus/Link'; + +const GlossaryTooltip = ({ term, capitalize = false, plural = '' }) => { + const [visible, setVisible] = useState(false); + const definition = glossary[term]; + + if (!definition) { + console.warn(`Glossary term not found: ${term}`); + const displayFallback = capitalize + ? capitalizeWord(term) + plural + : term.toLowerCase() + plural; + return {displayFallback}; + } + + const displayTerm = capitalize ? 
capitalizeWord(term) : term.toLowerCase(); + const anchorId = term.toLowerCase().replace(/\s+/g, '-'); + const glossarySlug = `/concepts/glossary#${anchorId}`; + + return ( + setVisible(true)} + onMouseLeave={() => setVisible(false)} + onFocus={() => setVisible(true)} + onBlur={() => setVisible(false)} + > + setVisible(false)} + > + {displayTerm} + {plural} + + + {definition} + + + ); +}; + +function capitalizeWord(word) { + return word.charAt(0).toUpperCase() + word.slice(1); +} + +export default GlossaryTooltip; diff --git a/src/theme/MDXComponents.js b/src/theme/MDXComponents.js index d35d0e7bfc5..0e627a2d4a9 100644 --- a/src/theme/MDXComponents.js +++ b/src/theme/MDXComponents.js @@ -5,10 +5,12 @@ import MDXComponents from '@theme-original/MDXComponents'; // Import the custom Stepper component // Make sure the path matches your project structure import VStepper from '@site/src/components/Stepper/Stepper'; +import GlossaryTooltip from '../../src/components/GlossaryTooltip/GlossaryTooltip.tsx'; // Define the enhanced components const enhancedComponents = { ...MDXComponents, + GlossaryTooltip, ul: (props) =>
    , ol: (props) =>
      , li: (props) =>
    1. , From 4881f0a194baebbfeb51e555ab96d173b3fd2761 Mon Sep 17 00:00:00 2001 From: Dominic Tran Date: Mon, 28 Jul 2025 12:12:05 -0500 Subject: [PATCH 03/10] Glossary Transformer, finds words wrapped in ^^...^^ and transforms to a GlossaryToolTip at build time. --- docs/concepts/why-clickhouse-is-so-fast.mdx | 4 +- docs/getting-started/quick-start/cloud.mdx | 2 +- docs/getting-started/quick-start/oss.mdx | 4 +- docusaurus.config.en.js | 3 +- plugins/glossary-transformer.js | 140 ++++++++++++++++++ scripts/inject-glossary-tooltips.py | 131 ---------------- .../GlossaryTooltip/GlossaryTooltip.tsx | 20 ++- src/theme/MDXComponents.js | 2 +- 8 files changed, 165 insertions(+), 141 deletions(-) create mode 100644 plugins/glossary-transformer.js delete mode 100644 scripts/inject-glossary-tooltips.py diff --git a/docs/concepts/why-clickhouse-is-so-fast.mdx b/docs/concepts/why-clickhouse-is-so-fast.mdx index 34c933c546b..56e9199f8eb 100644 --- a/docs/concepts/why-clickhouse-is-so-fast.mdx +++ b/docs/concepts/why-clickhouse-is-so-fast.mdx @@ -19,7 +19,7 @@ From an architectural perspective, databases consist (at least) of a storage lay -In ClickHouse, each table consists of multiple "table ". A [part](/parts) is created whenever a user inserts data into the table (INSERT statement). A query is always executed against all table parts that exist at the time the query starts. +In ClickHouse, each table consists of multiple "table ^^parts^^". A [part](/parts) is created whenever a user inserts data into the table (INSERT statement). A query is always executed against all table parts that exist at the time the query starts. To avoid that too many parts accumulate, ClickHouse runs a [merge](/merges) operation in the background which continuously combines multiple smaller parts into a single bigger part. @@ -97,7 +97,7 @@ Finally, ClickHouse uses a vectorized query processing layer that parallelizes q Modern systems have dozens of CPU cores. To utilize all cores, ClickHouse unfolds the query plan into multiple lanes, typically one per core. Each lane processes a disjoint range of the table data. That way, the performance of the database scales "vertically" with the number of available cores. -If a single node becomes too small to hold the table data, further nodes can be added to form a . Tables can be split ("sharded") and distributed across the nodes. ClickHouse will run queries on all nodes that store table data and thereby scale "horizontally" with the number of available nodes. +If a single node becomes too small to hold the table data, further nodes can be added to form a ^^cluster^^. Tables can be split ("sharded") and distributed across the nodes. ClickHouse will run queries on all nodes that store table data and thereby scale "horizontally" with the number of available nodes. 🤿 Deep dive into this in the [Query Processing Layer](/academic_overview#4-query-processing-layer) section of the web version of our VLDB 2024 paper. diff --git a/docs/getting-started/quick-start/cloud.mdx b/docs/getting-started/quick-start/cloud.mdx index 4852ea7cfff..bd2efc0ff31 100644 --- a/docs/getting-started/quick-start/cloud.mdx +++ b/docs/getting-started/quick-start/cloud.mdx @@ -60,7 +60,7 @@ Select your desired region for deploying the service, and give your new service New ClickHouse Service
      -By default, the scale tier will create 3 each with 4 VCPUs and 16 GiB RAM. [Vertical autoscaling](/manage/scaling#vertical-auto-scaling) will be enabled by default in the Scale tier. +By default, the scale tier will create 3 ^^replica^^s each with 4 VCPUs and 16 GiB RAM. [Vertical autoscaling](/manage/scaling#vertical-auto-scaling) will be enabled by default in the Scale tier. Users can customize the service resources if required, specifying a minimum and maximum size for replicas to scale between. When ready, select `Create service`. diff --git a/docs/getting-started/quick-start/oss.mdx b/docs/getting-started/quick-start/oss.mdx index a5c323a303e..9c87651968e 100644 --- a/docs/getting-started/quick-start/oss.mdx +++ b/docs/getting-started/quick-start/oss.mdx @@ -1,7 +1,7 @@ --- slug: /getting-started/quick-start/oss sidebar_label: 'OSS' -sidebar_position: 1 +sidebar_position: 2 keywords: ['getting started', 'quick start', 'beginner-friendly'] title: 'ClickHouse OSS quick start' description: 'ClickHouse Quick Start guide' @@ -107,7 +107,7 @@ PRIMARY KEY (user_id, timestamp) You can use the familiar `INSERT INTO TABLE` command with ClickHouse, but it is important to understand that each insert into a `MergeTree` table causes what we -call a **part** in ClickHouse to be created in storage. These later get +call a **part** in ClickHouse to be created in storage. These ^^parts^^ later get merged in the background by ClickHouse. In ClickHouse, we try to bulk insert lots of rows at a time diff --git a/docusaurus.config.en.js b/docusaurus.config.en.js index 53361075e79..1860636683b 100644 --- a/docusaurus.config.en.js +++ b/docusaurus.config.en.js @@ -13,6 +13,7 @@ const frontmatterValidator = require('./plugins/frontmatter-validation/frontmatt import pluginLlmsTxt from './plugins/llms-txt-plugin.ts' import prismLight from "./src/utils/prismLight"; import prismDark from "./src/utils/prismDark"; +import glossaryTransformer from "./plugins/glossary-transformer.js"; // Helper function to skip over index.md files. 
function skipIndex(items) { @@ -151,7 +152,7 @@ const config = { showLastUpdateTime: false, sidebarCollapsed: true, routeBasePath: "/", - remarkPlugins: [math, remarkCustomBlocks], + remarkPlugins: [math, remarkCustomBlocks, glossaryTransformer], beforeDefaultRemarkPlugins: [fixLinks], rehypePlugins: [katex], }, diff --git a/plugins/glossary-transformer.js b/plugins/glossary-transformer.js new file mode 100644 index 00000000000..69ad9c84530 --- /dev/null +++ b/plugins/glossary-transformer.js @@ -0,0 +1,140 @@ +// plugins/glossary-transformer/index.js +const { visit } = require('unist-util-visit'); +const fs = require('fs'); +const path = require('path'); + +// Cache glossary terms globally +let cachedGlossary = null; +let glossaryModTime = null; + +function createGlossaryTransformer(options = {}) { + const config = { + caseSensitive: false, + validateTerms: true, + glossaryFile: path.resolve(__dirname, '../src/components/GlossaryTooltip/glossary.json'), + skipPatterns: [], + ...options + }; + + if (!Array.isArray(config.skipPatterns)) { + config.skipPatterns = []; + } + + function loadGlossary() { + if (!fs.existsSync(config.glossaryFile)) { + console.warn(`Glossary file not found: ${config.glossaryFile}`); + return new Map(); + } + + const stats = fs.statSync(config.glossaryFile); + if (cachedGlossary && glossaryModTime && stats.mtime <= glossaryModTime) { + return cachedGlossary; + } + + try { + const glossaryData = JSON.parse(fs.readFileSync(config.glossaryFile, 'utf8')); + const glossaryMap = new Map(); + + Object.entries(glossaryData).forEach(([term, definition]) => { + glossaryMap.set(term.toLowerCase(), { originalTerm: term, definition }); + }); + + cachedGlossary = glossaryMap; + glossaryModTime = stats.mtime; + console.log(`Loaded ${glossaryMap.size} glossary terms`); + + return glossaryMap; + } catch (error) { + console.error('Error loading glossary:', error.message); + return new Map(); + } + } + + function shouldProcess(filePath, fileContent) { + return filePath?.endsWith('.mdx') && + fileContent?.includes('^^') && + !config.skipPatterns.some(pattern => + pattern instanceof RegExp ? 
pattern.test(filePath) : filePath.includes(pattern) + ); + } + + return function transformer(tree, file) { + const filePath = file.path || file.history?.[0] || ''; + const fileContent = file.value || ''; + + if (!shouldProcess(filePath, fileContent)) { + return tree; + } + + const glossary = loadGlossary(); + if (glossary.size === 0) return tree; + + let transformCount = 0; + + visit(tree, 'text', (node, index, parent) => { + if (!node.value?.includes('^^') || !parent) return; + + const pattern = /\^\^([^^\n|]+?)(?:\|([^^\n]*?))?\^\^/g; + const newNodes = []; + let lastIndex = 0; + let match; + + while ((match = pattern.exec(node.value)) !== null) { + const [fullMatch, term, plural = ''] = match; + const cleanTerm = term.trim(); + const cleanPlural = plural.trim(); + + // Add text before match + if (match.index > lastIndex) { + newNodes.push({ + type: 'text', + value: node.value.slice(lastIndex, match.index) + }); + } + + // Get original term from glossary or use as-is + const glossaryEntry = glossary.get(cleanTerm.toLowerCase()); + const originalTerm = glossaryEntry?.originalTerm || cleanTerm; + + if (!glossaryEntry && config.validateTerms) { + console.warn(`Glossary term not found: ${cleanTerm}`); + } + + // Create MDX JSX element + newNodes.push({ + type: 'mdxJsxTextElement', + name: 'GlossaryTooltip', + attributes: [ + { type: 'mdxJsxAttribute', name: 'term', value: originalTerm }, + { type: 'mdxJsxAttribute', name: 'plural', value: cleanPlural } + ], + children: [] + }); + + transformCount++; + lastIndex = match.index + fullMatch.length; + } + + // Add remaining text + if (lastIndex < node.value.length) { + newNodes.push({ + type: 'text', + value: node.value.slice(lastIndex) + }); + } + + // Replace node if we made changes + if (newNodes.length > 0) { + parent.children.splice(index, 1, ...newNodes); + } + }); + + if (transformCount > 0) { + console.log(`Processed ${transformCount} glossary terms in: ${path.basename(filePath)}`); + } + + return tree; + }; +} + +module.exports = createGlossaryTransformer; \ No newline at end of file diff --git a/scripts/inject-glossary-tooltips.py b/scripts/inject-glossary-tooltips.py deleted file mode 100644 index 1856c2a233d..00000000000 --- a/scripts/inject-glossary-tooltips.py +++ /dev/null @@ -1,131 +0,0 @@ -import os -import re -import json -import argparse -import difflib -# This script injects glossary tooltips into Markdown files based on a glossary JSON file. 
-# Path to the glossary JSON file and target directory for Markdown files -# Adjust these paths as necessary -GLOSSARY_FILE = 'src/components/GlossaryTooltip/glossary.json' -TARGET_DIR = 'docs/getting-started/quick-start' - -with open(GLOSSARY_FILE, 'r', encoding='utf-8') as f: - glossary = json.load(f) - -terms = sorted(glossary.keys(), key=len, reverse=True) - -def build_term_regex(term): - escaped = re.escape(term) - return re.compile(rf'\b({escaped})(s|es)?\b', re.IGNORECASE) - -term_regexes = {term: build_term_regex(term) for term in terms} - -def capitalize_word(word): - return word[0].upper() + word[1:] if word else word - -def replace_terms(line, replaced_terms): - for term in terms: - if term in replaced_terms: - continue - - regex = term_regexes[term] - - def _replacer(match): - if term in replaced_terms: - return match.group(0) - - base, plural = match.group(1), match.group(2) or '' - capitalize = base[0].isupper() - capital_attr = ' capitalize' if capitalize else '' - plural_attr = f' plural="{plural}"' if plural else '' - replaced_terms.add(term) - return f'' - - line, count = regex.subn(_replacer, line, count=1) - if count > 0: - break # one term per line max - - return line - -def process_markdown(content): - if '' in content: - return content, False - - lines = content.splitlines() - inside_code_block = False - replaced_terms = set() - modified = False - output_lines = [] - - for line in lines: - stripped = line.strip() - - # Fence detection for code blocks - if stripped.startswith('```'): - inside_code_block = not inside_code_block - output_lines.append(line) - continue - - # Skip inside code or headings - if inside_code_block or stripped.startswith('#'): - output_lines.append(line) - continue - - new_line = replace_terms(line, replaced_terms) - if new_line != line: - modified = True - output_lines.append(new_line) - - return '\n'.join(output_lines), modified - -def rename_md_to_mdx(filepath): - if filepath.endswith('.md'): - new_path = filepath[:-3] + '.mdx' - os.rename(filepath, new_path) - print(f'Renamed: {filepath} → {new_path}') - return new_path - return filepath - -def walk_files(target_dir): - for root, _, files in os.walk(target_dir): - for filename in files: - if filename.endswith('.md') or filename.endswith('.mdx'): - yield os.path.join(root, filename) - -def print_diff(original, modified, path): - diff_lines = list(difflib.unified_diff( - original.splitlines(), - modified.splitlines(), - fromfile=path, - tofile=path, - lineterm='' - )) - if diff_lines: - print('\n'.join(diff_lines)) - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument('--dry-run', action='store_true', help='Show diffs without writing') - args = parser.parse_args() - - for filepath in walk_files(TARGET_DIR): - with open(filepath, 'r', encoding='utf-8') as f: - original = f.read() - - modified, changed = process_markdown(original) - - if changed: - if args.dry_run: - print(f'\n--- DRY RUN: {filepath} ---') - print_diff(original, modified, filepath) - else: - with open(filepath, 'w', encoding='utf-8') as f: - f.write(modified) - print(f'✅ Updated: {filepath}') - - # Rename to .mdx if needed - if filepath.endswith('.md'): - filepath = rename_md_to_mdx(filepath) - -if __name__ == '__main__': - main() diff --git a/src/components/GlossaryTooltip/GlossaryTooltip.tsx b/src/components/GlossaryTooltip/GlossaryTooltip.tsx index fc1317837d9..b7af611e9b3 100644 --- a/src/components/GlossaryTooltip/GlossaryTooltip.tsx +++ b/src/components/GlossaryTooltip/GlossaryTooltip.tsx @@ -4,7 +4,21 
@@ import Link from '@docusaurus/Link'; const GlossaryTooltip = ({ term, capitalize = false, plural = '' }) => { const [visible, setVisible] = useState(false); - const definition = glossary[term]; + + // Case-insensitive lookup + let definition = glossary[term]; // Try exact match first + let matchedKey = term; + + if (!definition) { + // Try to find a case-insensitive match + const foundKey = Object.keys(glossary).find(key => + key.toLowerCase() === term.toLowerCase() + ); + if (foundKey) { + definition = glossary[foundKey]; + matchedKey = foundKey; + } + } if (!definition) { console.warn(`Glossary term not found: ${term}`); @@ -15,7 +29,7 @@ const GlossaryTooltip = ({ term, capitalize = false, plural = '' }) => { } const displayTerm = capitalize ? capitalizeWord(term) : term.toLowerCase(); - const anchorId = term.toLowerCase().replace(/\s+/g, '-'); + const anchorId = matchedKey.toLowerCase().replace(/\s+/g, '-'); const glossarySlug = `/concepts/glossary#${anchorId}`; return ( @@ -46,4 +60,4 @@ function capitalizeWord(word) { return word.charAt(0).toUpperCase() + word.slice(1); } -export default GlossaryTooltip; +export default GlossaryTooltip; \ No newline at end of file diff --git a/src/theme/MDXComponents.js b/src/theme/MDXComponents.js index 0e627a2d4a9..07b73d1081c 100644 --- a/src/theme/MDXComponents.js +++ b/src/theme/MDXComponents.js @@ -5,7 +5,7 @@ import MDXComponents from '@theme-original/MDXComponents'; // Import the custom Stepper component // Make sure the path matches your project structure import VStepper from '@site/src/components/Stepper/Stepper'; -import GlossaryTooltip from '../../src/components/GlossaryTooltip/GlossaryTooltip.tsx'; +import GlossaryTooltip from '@site/src/components/GlossaryTooltip/GlossaryTooltip'; // Define the enhanced components const enhancedComponents = { From 6a6d429c2e5a074838c8a808bbb853cc8d6f5692 Mon Sep 17 00:00:00 2001 From: Dominic Tran Date: Mon, 28 Jul 2025 14:36:57 -0500 Subject: [PATCH 04/10] glossary wrapper script with readme, plus files updated by the script --- docs/best-practices/partitioning_keys.mdx | 14 +- docs/concepts/why-clickhouse-is-so-fast.mdx | 10 +- docs/deployment-guides/parallel-replicas.mdx | 78 +++--- .../core-concepts/academic_overview.mdx | 46 ++-- scripts/wrap-glossary-terms/README.md | 129 ++++++++++ .../wrap-glossary-terms.py | 236 ++++++++++++++++++ 6 files changed, 439 insertions(+), 74 deletions(-) create mode 100644 scripts/wrap-glossary-terms/README.md create mode 100644 scripts/wrap-glossary-terms/wrap-glossary-terms.py diff --git a/docs/best-practices/partitioning_keys.mdx b/docs/best-practices/partitioning_keys.mdx index 262491ff0cf..0795053b16c 100644 --- a/docs/best-practices/partitioning_keys.mdx +++ b/docs/best-practices/partitioning_keys.mdx @@ -15,7 +15,7 @@ import merges_with_partitions from '@site/static/images/bestpractices/merges_wit Partitioning is primarily a data management technique and not a query optimization tool, and while it can improve performance in specific workloads, it should not be the first mechanism used to accelerate queries; the partitioning key must be chosen carefully, with a clear understanding of its implications, and only applied when it aligns with data life cycle needs or well-understood access patterns. ::: -In ClickHouse, partitioning organizes data into logical segments based on a specified key. 
This is defined using the `PARTITION BY` clause at table creation time and is commonly used to group rows by time intervals, categories, or other business-relevant dimensions. Each unique value of the partitioning expression forms its own physical partition on disk, and ClickHouse stores data in separate parts for each of these values. Partitioning improves data management, simplifies retention policies, and can help with certain query patterns. +In ClickHouse, partitioning organizes data into logical segments based on a specified key. This is defined using the `PARTITION BY` clause at table creation time and is commonly used to group rows by time intervals, categories, or other business-relevant dimensions. Each unique value of the partitioning expression forms its own physical partition on disk, and ClickHouse stores data in separate ^^parts^^ for each of these values. Partitioning improves data management, simplifies retention policies, and can help with certain query patterns. For example, consider the following UK price paid dataset table with a partitioning key of `toStartOfMonth(date)`. @@ -40,7 +40,7 @@ The ClickHouse server first splits the rows from the example insert with 4 rows For a more detailed explanation of partitioning, we recommend [this guide](/partitions). -With partitioning enabled, ClickHouse only [merges](/merges) data parts within, but not across partitions. We sketch that for our example table from above: +With partitioning enabled, ClickHouse only [merges](/merges) data ^^parts^^ within, but not across partitions. We sketch that for our example table from above: Partitions @@ -52,16 +52,16 @@ While partitioning can improve query performance for some workloads, it can also If the partitioning key is not in the primary key and you are filtering by it, users may see an improvement in query performance with partitioning. See [here](/partitions#query-optimization) for an example. -Conversely, if queries need to query across partitions performance may be negatively impacted due to a higher number of total parts. For this reason, users should understand their access patterns before considering partitioning a a query optimization technique. +Conversely, if queries need to query across partitions performance may be negatively impacted due to a higher number of total ^^parts^^. For this reason, users should understand their access patterns before considering partitioning a a query optimization technique. In summary, users should primarily think of partitioning as a data management technique. For an example of managing data, see ["Managing Data"](/observability/managing-data) from the observability use-case guide and ["What are table partitions used for?"](/partitions#data-management) from Core Concepts - Table partitions. ## Choose a low cardinality partitioning key {#choose-a-low-cardinality-partitioning-key} -Importantly, a higher number of parts will negatively affect query performance. ClickHouse will therefore respond to inserts with a [“too many parts”](/knowledgebase/exception-too-many-parts) error if the number of parts exceeds specified limits either in [total](/operations/settings/merge-tree-settings#max_parts_in_total) or [per partition](/operations/settings/merge-tree-settings#parts_to_throw_insert). +Importantly, a higher number of ^^parts^^ will negatively affect query performance. 
ClickHouse will therefore respond to inserts with a [“too many parts”](/knowledgebase/exception-too-many-parts) error if the number of ^^parts^^ exceeds specified limits either in [total](/operations/settings/merge-tree-settings#max_parts_in_total) or [per partition](/operations/settings/merge-tree-settings#parts_to_throw_insert). -Choosing the right **cardinality** for the partitioning key is critical. A high-cardinality partitioning key - where the number of distinct partition values is large - can lead to a proliferation of data parts. Since ClickHouse does not merge parts across partitions, too many partitions will result in too many unmerged parts, eventually triggering the “Too many parts” error. [Merges are essential](/merges) for reducing storage fragmentation and optimizing query speed, but with high-cardinality partitions, that merge potential is lost. +Choosing the right **cardinality** for the partitioning key is critical. A high-cardinality partitioning key - where the number of distinct partition values is large - can lead to a proliferation of data ^^parts^^. Since ClickHouse does not merge ^^parts^^ across partitions, too many partitions will result in too many unmerged ^^parts^^, eventually triggering the “Too many ^^parts^^” error. [Merges are essential](/merges) for reducing storage fragmentation and optimizing query speed, but with high-cardinality partitions, that merge potential is lost. -By contrast, a **low-cardinality partitioning key**—with fewer than 100 - 1,000 distinct values - is usually optimal. It enables efficient part merging, keeps metadata overhead low, and avoids excessive object creation in storage. In addition, ClickHouse automatically builds MinMax indexes on partition columns, which can significantly speed up queries that filter on those columns. For example, filtering by month when the table is partitioned by `toStartOfMonth(date)` allows the engine to skip irrelevant partitions and their parts entirely. +By contrast, a **low-cardinality partitioning key**—with fewer than 100 - 1,000 distinct values - is usually optimal. It enables efficient part merging, keeps metadata overhead low, and avoids excessive object creation in storage. In addition, ClickHouse automatically builds MinMax indexes on partition columns, which can significantly speed up queries that filter on those columns. For example, filtering by month when the table is partitioned by `toStartOfMonth(date)` allows the engine to skip irrelevant partitions and their ^^parts^^ entirely. -While partitioning can improve performance in some query patterns, it's primarily a data management feature. In many cases, querying across all partitions can be slower than using a non-partitioned table due to increased data fragmentation and more parts being scanned. Use partitioning judiciously, and always ensure that the chosen key is low-cardinality and aligns with your data life cycle policies (e.g., retention via TTL). If you're unsure whether partitioning is necessary, you may want to start without it and optimize later based on observed access patterns. +While partitioning can improve performance in some query patterns, it's primarily a data management feature. In many cases, querying across all partitions can be slower than using a non-partitioned table due to increased data fragmentation and more ^^parts^^ being scanned. Use partitioning judiciously, and always ensure that the chosen key is low-cardinality and aligns with your data life cycle policies (e.g., retention via TTL). 
If you're unsure whether partitioning is necessary, you may want to start without it and optimize later based on observed access patterns. diff --git a/docs/concepts/why-clickhouse-is-so-fast.mdx b/docs/concepts/why-clickhouse-is-so-fast.mdx index 56e9199f8eb..006e504920a 100644 --- a/docs/concepts/why-clickhouse-is-so-fast.mdx +++ b/docs/concepts/why-clickhouse-is-so-fast.mdx @@ -19,9 +19,9 @@ From an architectural perspective, databases consist (at least) of a storage lay -In ClickHouse, each table consists of multiple "table ^^parts^^". A [part](/parts) is created whenever a user inserts data into the table (INSERT statement). A query is always executed against all table parts that exist at the time the query starts. +In ClickHouse, each table consists of multiple "table ^^parts^^". A [part](/parts) is created whenever a user inserts data into the table (INSERT statement). A query is always executed against all table ^^parts^^ that exist at the time the query starts. -To avoid that too many parts accumulate, ClickHouse runs a [merge](/merges) operation in the background which continuously combines multiple smaller parts into a single bigger part. +To avoid that too many ^^parts^^ accumulate, ClickHouse runs a [merge](/merges) operation in the background which continuously combines multiple smaller ^^parts^^ into a single bigger part. This approach has several advantages: All data processing can be [offloaded to background part merges](/concepts/why-clickhouse-is-so-fast#storage-layer-merge-time-computation), keeping data writes lightweight and highly efficient. Individual inserts are "local" in the sense that they do not need to update global, i.e. per-table data structures. As a result, multiple simultaneous inserts need no mutual synchronization or synchronization with existing table data, and thus inserts can be performed almost at the speed of disk I/O. @@ -33,7 +33,7 @@ This approach has several advantages: All data processing can be [offloaded to b -Inserts are fully isolated from SELECT queries, and merging inserted data parts happens in the background without affecting concurrent queries. +Inserts are fully isolated from SELECT queries, and merging inserted data ^^parts^^ happens in the background without affecting concurrent queries. 🤿 Deep dive into this in the [Storage Layer](/docs/academic_overview#3-storage-layer) section of the web version of our VLDB 2024 paper. @@ -43,7 +43,7 @@ Inserts are fully isolated from SELECT queries, and merging inserted data parts Unlike other databases, ClickHouse keeps data writes lightweight and efficient by performing all additional data transformations during the [merge](/merges) background process. Examples of this include: -- **Replacing merges** which retain only the most recent version of a row in the input parts and discard all other row versions. Replacing merges can be thought of as a merge-time cleanup operation. +- **Replacing merges** which retain only the most recent version of a row in the input ^^parts^^ and discard all other row versions. Replacing merges can be thought of as a merge-time cleanup operation. - **Aggregating merges** which combine intermediate aggregation states in the input part to a new aggregation state. While this seems difficult to understand, it really actually only implements an incremental aggregation. 
@@ -53,7 +53,7 @@ The point of these transformations is to shift work (computation) from the time On the one hand, user queries may become significantly faster, sometimes by 1000x or more, if they can leverage "transformed" data, e.g. pre-aggregated data. -On the other hand, the majority of the runtime of merges is consumed by loading the input parts and saving the output part. The additional effort to transform the data during merge does usually not impact the runtime of merges too much. All of this magic is completely transparent and does not affect the result of queries (besides their performance). +On the other hand, the majority of the runtime of merges is consumed by loading the input ^^parts^^ and saving the output part. The additional effort to transform the data during merge does usually not impact the runtime of merges too much. All of this magic is completely transparent and does not affect the result of queries (besides their performance). 🤿 Deep dive into this in the [Merge-time Data Transformation](/docs/academic_overview#3-3-merge-time-data-transformation) section of the web version of our VLDB 2024 paper. diff --git a/docs/deployment-guides/parallel-replicas.mdx b/docs/deployment-guides/parallel-replicas.mdx index 4fff9cf815d..a6b3694956c 100644 --- a/docs/deployment-guides/parallel-replicas.mdx +++ b/docs/deployment-guides/parallel-replicas.mdx @@ -31,13 +31,13 @@ multiple replicas for its execution. ## Sharded architecture {#sharded-architecture} In a shared-nothing architecture, clusters are commonly split into -multiple shards, with each shard containing a subset of the overall data. A +multiple shards, with each ^^shard^^ containing a subset of the overall data. A distributed table sits on top of these shards, providing a unified view of the complete data. Reads can be sent to the local table. Query execution will occur only -on the specified shard, or it can be sent to the distributed table, and in that -case, each shard will execute the given queries. The server where the distributed +on the specified ^^shard^^, or it can be sent to the distributed table, and in that +case, each ^^shard^^ will execute the given queries. The server where the distributed table was queried will aggregate the data and respond to the client: sharded archtiecture @@ -51,12 +51,12 @@ The figure above visualizes what happens when a client queries a distributed tab by a load balancer). This node is now going to act as a coordinator.
    2. - The node will locate each shard that needs to execute the query + The node will locate each ^^shard^^ that needs to execute the query via the information specified by the distributed table, and the query is - sent to each shard. + sent to each ^^shard^^.
    3. - Each shard reads, filters, and aggregates the data locally and then + Each ^^shard^^ reads, filters, and aggregates the data locally and then sends back a mergeable state to the coordinator.
    4. @@ -66,7 +66,7 @@ The figure above visualizes what happens when a client queries a distributed tab
    When we add replicas into the mix, the process is fairly similar, with the only -difference being that only a single replica from each shard will execute the query. +difference being that only a single ^^replica^^ from each ^^shard^^ will execute the query. This means that more queries can then be processed in parallel. ## Non-sharded architecture {#non-sharded-architecture} @@ -81,14 +81,14 @@ The figure below shows the ClickHouse Cloud architecture: non sharded architecture This architecture allows us to be able to add and remove replicas nearly -instantaneously, ensuring a very high cluster scalability. The ClickHouse -Keeper cluster (shown right) ensures that we have a single source of truth +instantaneously, ensuring a very high ^^cluster^^ scalability. The ClickHouse +Keeper ^^cluster^^ (shown right) ensures that we have a single source of truth for the metadata. Replicas can fetch the metadata from the ClickHouse Keeper -cluster and all maintain the same data. The data themselves are stored in +^^cluster^^ and all maintain the same data. The data themselves are stored in object storage, and the SSD cache allows us to speed up queries. But how can we now distribute query execution across multiple servers? In a -sharded architecture, it was fairly obvious given that each shard could actually +sharded architecture, it was fairly obvious given that each ^^shard^^ could actually execute a query on a subset of the data. How does it work when there is no sharding? ## Introducing parallel replicas {#introducing-parallel-replicas} @@ -99,7 +99,7 @@ that creates the list of tasks that need to be executed, ensures they are all executed, aggregated and that the result is returned to the client. Like in most distributed systems, this will be the role of the node that receives the initial query. We also need to define the unit of work. In a sharded architecture, -the unit of work is the shard, a subset of the data. With parallel replicas we +the unit of work is the ^^shard^^, a subset of the data. With parallel replicas we will use a small portion of the table, called [granules](/docs/guides/best-practices/sparse-primary-indexes#data-is-organized-into-granules-for-parallel-data-processing), as the unit of work. @@ -115,7 +115,7 @@ With parallel replicas: balancer. This node becomes the coordinator for this query.
  • - The node analyzes the index of each part, and selects the right parts and + The node analyzes the index of each part, and selects the right ^^parts^^ and granules to process.
  • @@ -142,14 +142,14 @@ from working perfectly:
  • Replication in ClickHouse is asynchronous, some replicas might not - have the same parts at some point in time. + have the same ^^parts^^ at some point in time.
  • Tail latency between replicas needs to be handled somehow.
  • - The filesystem cache varies from replica to replica based on the - activity on each replica, meaning that a random task assignment might + The filesystem cache varies from ^^replica^^ to ^^replica^^ based on the + activity on each ^^replica^^, meaning that a random task assignment might lead to less optimal performance given the cache locality.
  • @@ -170,20 +170,20 @@ announcement. Let's visualize how it works using the figure below:
  • The coordinating node sends a request to get announcements from - all the replicas in the cluster. Replicas may have slightly different - views of the current set of parts for a table. As a result we need to + all the replicas in the ^^cluster^^. Replicas may have slightly different + views of the current set of ^^parts^^ for a table. As a result we need to collect this information to avoid incorrect scheduling decisions.
  • The coordinating node then uses the announcements to define a set of granules that can be assigned to the different replicas. Here for example, - we can see that no granules from part 3 have been assigned to replica 2 - because this replica did not provide this part in its announcement. - Also note that no tasks were assigned to replica 3 because the - replica did not provide an announcement. + we can see that no granules from part 3 have been assigned to ^^replica^^ 2 + because this ^^replica^^ did not provide this part in its announcement. + Also note that no tasks were assigned to ^^replica^^ 3 because the + ^^replica^^ did not provide an announcement.
  • - After each replica has processed the query on their subset of granules + After each ^^replica^^ has processed the query on their subset of granules and the mergeable state has been sent back to the coordinator, the coordinator merges the results and the response is sent to the client.
  • @@ -192,13 +192,13 @@ announcement. Let's visualize how it works using the figure below: ### Dynamic coordination {#dynamic-coordination} To address the issue of tail latency, we added dynamic coordination. This means -that all the granules are not sent to a replica in one request, but each replica +that all the granules are not sent to a ^^replica^^ in one request, but each ^^replica^^ will be able to request a new task (a set of granules to be processed) to the -coordinator. The coordinator will give the replica the set of granules based on +coordinator. The coordinator will give the ^^replica^^ the set of granules based on the announcement received. Let's assume that we are at the stage in the process where all replicas have sent -an announcement with all parts. +an announcement with all ^^parts^^. The figure below visualizes how dynamic coordination works: @@ -218,11 +218,11 @@ The figure below visualizes how dynamic coordination works:
    1. - Replica 1 and 2 are able to finish their task very quickly. They + ^^Replica^^ 1 and 2 are able to finish their task very quickly. They will request another task from the coordinator node.
    2. - The coordinator assigns new tasks to replica 1 and 2. + The coordinator assigns new tasks to ^^replica^^ 1 and 2.
    @@ -247,15 +247,15 @@ The figure below visualizes how dynamic coordination works: The last remaining potential issue is how we handle cache locality. If the query is executed multiple times, how can we ensure the same task gets routed to the -same replica? In the previous example, we had the following tasks assigned: +same ^^replica^^? In the previous example, we had the following tasks assigned: - - - + + + @@ -285,15 +285,15 @@ benefit from the cache, two things happen. A hash of the part + set of granules (a task) gets computed. A modulo of the number of replicas for the task assignment gets applied. -On paper this sounds good, but in reality, a sudden load on one replica, or a -network degradation, can introduce tail latency if the same replica is +On paper this sounds good, but in reality, a sudden load on one ^^replica^^, or a +network degradation, can introduce tail latency if the same ^^replica^^ is consistently used for executing certain tasks. If `max_parallel_replicas` is less than the number of replicas, then random replicas are picked for query execution. ### Task stealing {#task-stealing} -if some replica processes tasks slower than others, other replicas will try to -'steal' tasks that in principle belong to that replica by hash to reduce the +if some ^^replica^^ processes tasks slower than others, other replicas will try to +'steal' tasks that in principle belong to that ^^replica^^ by hash to reduce the tail latency. ### Limitations {#limitations} @@ -303,7 +303,7 @@ this section. :::note If you find an issue which is not one of the limitations given below, and -suspect parallel replica to be the cause, please open an issue on GitHub using +suspect parallel ^^replica^^ to be the cause, please open an issue on GitHub using the label `comp-parallel-replicas`. ::: @@ -477,7 +477,7 @@ WHERE type='type3' GROUP BY toYear(timestamp) LIMIT 10 ``` -Let's have a look at the query pipeline without parallel replica: +Let's have a look at the query pipeline without parallel ^^replica^^: ```sql title="EXPLAIN PIPELINE (without parallel replica)" EXPLAIN PIPELINE graph = 1, compact = 0 @@ -492,7 +492,7 @@ FORMAT TSV; -And now with parallel replica: +And now with parallel ^^replica^^: ```sql title="EXPLAIN PIPELINE (with parallel replica)" EXPLAIN PIPELINE graph = 1, compact = 0 diff --git a/docs/managing-data/core-concepts/academic_overview.mdx b/docs/managing-data/core-concepts/academic_overview.mdx index 7b44c728062..53f5e4a67d7 100644 --- a/docs/managing-data/core-concepts/academic_overview.mdx +++ b/docs/managing-data/core-concepts/academic_overview.mdx @@ -68,15 +68,15 @@ As shown by [Figure 2,](#page-2-0) the ClickHouse engine is split into three mai Query processing follows the traditional paradigm of parsing incoming queries, building and optimizing logical and physical query plans, and execution. ClickHouse uses a vectorized execution model similar to MonetDB/X100 [\[11\]](#page-12-0), in combination with opportunistic code compilation [\[53\]](#page-13-0). Queries can be written in a feature-rich SQL dialect, PRQL [\[76\]](#page-13-1), or Kusto's KQL [\[50\]](#page-13-2). -The storage layer consists of different table engines that encapsulate the format and location of table data. Table engines fall into three categories: The first category is the MergeTree* family of table engines which represent the primary persistence format in ClickHouse. 
Based on the idea of LSM trees [\[60\]](#page-13-3), tables are split into horizontal, sorted parts, which are continuously merged by a background process. Individual MergeTree* table engines differ in the way the merge combines the rows from its input parts. For example, rows can be aggregated or replaced, if outdated. +The storage layer consists of different table engines that encapsulate the format and location of table data. Table engines fall into three categories: The first category is the MergeTree* family of table engines which represent the primary persistence format in ClickHouse. Based on the idea of LSM trees [\[60\]](#page-13-3), tables are split into horizontal, sorted ^^parts^^, which are continuously merged by a background process. Individual MergeTree* table engines differ in the way the merge combines the rows from its input ^^parts^^. For example, rows can be aggregated or replaced, if outdated. The second category are special-purpose table engines, which are used to speed up or distribute query execution. This category includes in-memory key-value table engines called dictionaries. A [dictionary](https://clickhou.se/dictionaries) caches the result of a query periodically executed against an internal or external data source. This significantly reduces access latencies in scenarios, where a degree of data staleness can be tolerated. Other examples of special-purpose table engines include a pure in-memory engine used for temporary tables and the Distributed table engine for transparent data sharding (see below). The third category of table engines are virtual table engines for bidirectional data exchange with external systems such as relational databases (e.g. PostgreSQL, MySQL), publish/subscribe systems (e.g. Kafka, RabbitMQ [\[24\]](#page-12-1)), or key/value stores (e.g. Redis). Virtual engines can also interact with data lakes (e.g. Iceberg, DeltaLake, Hudi [\[36\]](#page-12-2)) or files in object storage (e.g. AWS S3, Google GCP). -ClickHouse supports sharding and replication of tables across multiple cluster nodes for scalability and availability. Sharding partitions a table into a set of table shards according to a sharding expression. The individual shards are mutually independent tables and typically located on different nodes. Clients can read and write shards directly, i.e. treat them as separate tables, or use the Distributed special table engine, which provides a global view of all table shards. The main purpose of sharding is to process data sets which exceed the capacity of individual nodes (typically, a few dozens terabytes of data). Another use of sharding is to distribute the read-write load for a table over multiple nodes, i.e., load balancing. Orthogonal to that, a shard can be replicated across multiple nodes for tolerance against node failures. To that end, each Merge-Tree* table engine has a corresponding ReplicatedMergeTree* engine which uses a multi-master coordination scheme based on Raft consensus [\[59\]](#page-13-4) (implemented by [Keeper](https://clickhou.se/keeper), a drop-in replacement for Apache Zookeeper written in C++) to guarantee that every shard has, at all times, a configurable number of replicas. Section [3.6](#page-5-0) discusses the replication mechanism in detail. As an example, [Figure 2](#page-2-0) shows a table with two shards, each replicated to two nodes. +ClickHouse supports sharding and replication of tables across multiple ^^cluster^^ nodes for scalability and availability. 
Sharding partitions a table into a set of table shards according to a sharding expression. The individual shards are mutually independent tables and typically located on different nodes. Clients can read and write shards directly, i.e. treat them as separate tables, or use the Distributed special table engine, which provides a global view of all table shards. The main purpose of sharding is to process data sets which exceed the capacity of individual nodes (typically, a few dozens terabytes of data). Another use of sharding is to distribute the read-write load for a table over multiple nodes, i.e., load balancing. Orthogonal to that, a ^^shard^^ can be replicated across multiple nodes for tolerance against node failures. To that end, each Merge-Tree* table engine has a corresponding ReplicatedMergeTree* engine which uses a multi-master coordination scheme based on Raft consensus [\[59\]](#page-13-4) (implemented by [Keeper](https://clickhou.se/keeper), a drop-in replacement for Apache Zookeeper written in C++) to guarantee that every ^^shard^^ has, at all times, a configurable number of replicas. Section [3.6](#page-5-0) discusses the replication mechanism in detail. As an example, [Figure 2](#page-2-0) shows a table with two shards, each replicated to two nodes. -Finally, the ClickHouse database engine can be operated in on-premise, cloud, standalone, or in-process modes. In the on-premise mode, users set up ClickHouse locally as a single server or multinode cluster with sharding and/or replication. Clients communicate with the database over the native, MySQL's, PostgreSQL's binary wire protocols, or an HTTP REST API. The cloud mode is represented by ClickHouse Cloud, a fully managed and autoscaling DBaaS offering. While this paper focuses on the on-premise mode, we plan to describe the architecture of ClickHouse Cloud in a follow-up publication. The [standalone mode](https://clickhou.se/local-fastest-tool) turns ClickHouse into a command line utility for analyzing and transforming files, making it a SQL-based alternative to Unix tools like cat and grep. While this requires no prior configuration, the standalone mode is restricted to a single server. Recently, an in-process mode called chDB [\[15\]](#page-12-3) has been developed for interactive data analysis use cases like Jupyter notebooks [\[37\]](#page-12-4) with Pandas dataframes [\[61\]](#page-13-5). Inspired by DuckDB [\[67\]](#page-13-6), [chDB](https://clickhou.se/chdb-rocket-engine) embeds ClickHouse as a high-performance OLAP engine into a host process. Compared to the other modes, this allows to pass source and result data between the database engine and the application efficiently without copying as they run in the same address space. +Finally, the ClickHouse database engine can be operated in on-premise, cloud, standalone, or in-process modes. In the on-premise mode, users set up ClickHouse locally as a single server or multinode ^^cluster^^ with sharding and/or replication. Clients communicate with the database over the native, MySQL's, PostgreSQL's binary wire protocols, or an HTTP REST API. The cloud mode is represented by ClickHouse Cloud, a fully managed and autoscaling DBaaS offering. While this paper focuses on the on-premise mode, we plan to describe the architecture of ClickHouse Cloud in a follow-up publication. 
The [standalone mode](https://clickhou.se/local-fastest-tool) turns ClickHouse into a command line utility for analyzing and transforming files, making it a SQL-based alternative to Unix tools like cat and grep. While this requires no prior configuration, the standalone mode is restricted to a single server. Recently, an in-process mode called chDB [\[15\]](#page-12-3) has been developed for interactive data analysis use cases like Jupyter notebooks [\[37\]](#page-12-4) with Pandas dataframes [\[61\]](#page-13-5). Inspired by DuckDB [\[67\]](#page-13-6), [chDB](https://clickhou.se/chdb-rocket-engine) embeds ClickHouse as a high-performance OLAP engine into a host process. Compared to the other modes, this allows to pass source and result data between the database engine and the application efficiently without copying as they run in the same address space. ## 3 STORAGE LAYER {#3-storage-layer} @@ -84,7 +84,7 @@ This section discusses MergeTree* table engines as ClickHouse's native storage f ### 3.1 On-Disk Format {#3-1-on-disk-format} -Each table in the MergeTree* table engine is organized as a collection of immutable table parts. A part is created whenever a set of rows is inserted into the table. Parts are self-contained in the sense that they include all metadata required to interpret their content without additional lookups to a central catalog. To keep the number of parts per table low, a background merge job periodically combines multiple smaller parts into a larger part until a configurable part size is reached (150 GB by default). Since parts are sorted by the table's primary key columns (see Section [3.2)](#page-3-0), efficient k-way merge sort [\[40\]](#page-12-5) is used for merging. The source parts are marked as inactive and eventually deleted as soon as their reference count drops to zero, i.e. no further queries read from them. +Each table in the MergeTree* table engine is organized as a collection of immutable table ^^parts^^. A part is created whenever a set of rows is inserted into the table. ^^Parts^^ are self-contained in the sense that they include all metadata required to interpret their content without additional lookups to a central catalog. To keep the number of ^^parts^^ per table low, a background merge job periodically combines multiple smaller ^^parts^^ into a larger part until a configurable part size is reached (150 GB by default). Since ^^parts^^ are sorted by the table's primary key columns (see Section [3.2)](#page-3-0), efficient k-way merge sort [\[40\]](#page-12-5) is used for merging. The source ^^parts^^ are marked as inactive and eventually deleted as soon as their reference count drops to zero, i.e. no further queries read from them. Rows can be inserted in two modes: In synchronous insert mode, each INSERT statement creates a new part and appends it to the table. To minimize the overhead of merges, database clients are encouraged to insert tuples in bulk, e.g. 20,000 rows at once. However, delays caused by client-side batching are often unacceptable if the data should be analyzed in real-time. For example, observability use cases frequently involve thousands of monitoring agents continuously sending small amounts of event and metrics data. Such scenarios can utilize the asynchronous insert mode, in which ClickHouse buffers rows from multiple incoming INSERTs into the same table and creates a new part only after the buffer size exceeds a configurable threshold or a timeout expires. 
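
The batching trade-off described in the paragraph above can be sketched in a few lines of Python. This is only an illustration of the client-side buffering that synchronous inserts rely on, not ClickHouse code: the `send_insert` callback, the 20,000-row threshold, and the one-second timeout are assumptions chosen to mirror the text, and asynchronous inserts essentially move this buffering onto the server instead.

```python
import time

class InsertBuffer:
    """Minimal sketch of client-side batching for synchronous inserts (illustrative only)."""

    def __init__(self, send_insert, max_rows=20_000, max_delay_s=1.0):
        self.send_insert = send_insert   # assumed: a callable that ships a list of rows to the server
        self.max_rows = max_rows         # flush threshold, mirroring the ~20,000-row bulk advice
        self.max_delay_s = max_delay_s   # upper bound on how long rows sit in the client buffer
        self.rows = []
        self.first_row_at = None

    def add(self, row):
        # Buffer the row; remember when the current batch started.
        if not self.rows:
            self.first_row_at = time.monotonic()
        self.rows.append(row)
        # Flush when the batch is large enough or has waited too long.
        if len(self.rows) >= self.max_rows or self._too_old():
            self.flush()

    def _too_old(self):
        return (self.first_row_at is not None
                and time.monotonic() - self.first_row_at >= self.max_delay_s)

    def flush(self):
        if self.rows:
            self.send_insert(self.rows)  # one bulk INSERT -> one new part on the server
            self.rows = []
            self.first_row_at = None

# Usage sketch: call buffer.add(row) per incoming event and buffer.flush() on shutdown.
```
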
@@ -92,13 +92,13 @@ Rows can be inserted in two modes: In synchronous insert mode, each INSERT state Figure 3: Inserts and merges for MergeTree*-engine tables. -[Figure 3](#page-2-1) illustrates four synchronous and two asynchronous inserts into a MergeTree*-engine table. Two merges reduced the number of active parts from initially fve to two. +[Figure 3](#page-2-1) illustrates four synchronous and two asynchronous inserts into a MergeTree*-engine table. Two merges reduced the number of active ^^parts^^ from initially fve to two. -Compared to LSM trees [\[58\]](#page-13-7) and their implementation in various databases [\[13,](#page-12-6) [26,](#page-12-7) [56\]](#page-13-8), ClickHouse treats all parts as equal instead of arranging them in a hierarchy. As a result, merges are no longer limited to parts in the same level. Since this also forgoes the implicit chronological ordering of parts, alternative mechanisms for updates and deletes not based on tombstones are required (see Section [3.4)](#page-4-0). ClickHouse writes inserts directly to disk while other LSM-treebased stores typically use write-ahead logging (see Section [3.7)](#page-5-1). +Compared to LSM trees [\[58\]](#page-13-7) and their implementation in various databases [\[13,](#page-12-6) [26,](#page-12-7) [56\]](#page-13-8), ClickHouse treats all ^^parts^^ as equal instead of arranging them in a hierarchy. As a result, merges are no longer limited to ^^parts^^ in the same level. Since this also forgoes the implicit chronological ordering of ^^parts^^, alternative mechanisms for updates and deletes not based on tombstones are required (see Section [3.4)](#page-4-0). ClickHouse writes inserts directly to disk while other LSM-treebased stores typically use write-ahead logging (see Section [3.7)](#page-5-1). A part corresponds to a directory on disk, containing one file for each column. As an optimization, the columns of a small part (smaller than 10 MB by default) are stored consecutively in a single file to increase the spatial locality for reads and writes. The rows of a part are further logically divided into groups of 8192 records, called granules. A granule represents the smallest indivisible data unit processed by the scan and index lookup operators in ClickHouse. Reads and writes of on-disk data are, however, not performed at the granule level but at the granularity of blocks, which combine multiple neighboring granules within a column. New blocks are formed based on a configurable byte size per block (by default 1 MB), i.e., the number of granules in a block is variable and depends on the column's data type and distribution. Blocks are furthermore compressed to reduce their size and I/O costs. By default, ClickHouse employs LZ4 [\[75\]](#page-13-9) as a general-purpose compression algorithm, but users can also specify specialized codecs like Gorilla [\[63\]](#page-13-10) or FPC [\[12\]](#page-12-8) for floating-point data. Compression algorithms can also be chained. For example, it is possible to first reduce logical redundancy in numeric values using delta coding [\[23\]](#page-12-9), then perform heavy-weight compression, and finally encrypt the data using an AES codec. Blocks are decompressed on-the-fy when they are loaded from disk into memory. 
To enable fast random access to individual granules despite compression, ClickHouse additionally stores for each column a mapping that associates every granule id with the offset of its containing compressed block in the column file and the offset of the granule in the uncompressed block. -Columns can further be dictionary-encoded [\[2,](#page-12-10) [77,](#page-13-11) [81\]](#page-13-12) or made nullable using two special wrapper data types: LowCardinality(T) replaces the original column values by integer ids and thus significantly reduces the storage overhead for data with few unique values. Nullable(T) adds an internal bitmap to column T, representing whether column values are NULL or not. +Columns can further be ^^dictionary^^-encoded [\[2,](#page-12-10) [77,](#page-13-11) [81\]](#page-13-12) or made nullable using two special wrapper data types: LowCardinality(T) replaces the original column values by integer ids and thus significantly reduces the storage overhead for data with few unique values. Nullable(T) adds an internal bitmap to column T, representing whether column values are NULL or not. Finally, tables can be range, hash, or round-robin partitioned using arbitrary partitioning expressions. To enable partition pruning, ClickHouse additionally stores the partitioning expression's minimum and maximum values for each partition. Users can optionally create more advanced column statistics (e.g., HyperLogLog [\[30\]](#page-12-11) or t-digest [\[28\]](#page-12-12) statistics) that also provide cardinality estimates. @@ -114,7 +114,7 @@ First, users can define a **primary key index** for a table. The primary key col Figure 4: Evaluating filters with a primary key index. -Second, users can create **table projections**, i.e., alternative versions of a table that contain the same rows sorted by a different primary key [\[71\]](#page-13-13). Projections allow to speed up queries that filter on columns different than the main table's primary key at the cost of an increased overhead for inserts, merges, and space consumption. By default, projections are populated lazily only from parts newly inserted into the main table but not from existing parts unless the user materializes the projection in full. The query optimizer chooses between reading from the main table or a projection based on estimated I/O costs. If no projection exists for a part, query execution falls back to the corresponding main table part. +Second, users can create **table projections**, i.e., alternative versions of a table that contain the same rows sorted by a different primary key [\[71\]](#page-13-13). Projections allow to speed up queries that filter on columns different than the main table's primary key at the cost of an increased overhead for inserts, merges, and space consumption. By default, projections are populated lazily only from ^^parts^^ newly inserted into the main table but not from existing ^^parts^^ unless the user materializes the projection in full. The query optimizer chooses between reading from the main table or a projection based on estimated I/O costs. If no projection exists for a part, query execution falls back to the corresponding main table part. Third, **skipping indices** provide a lightweight alternative to projections. The idea of skipping indices is to store small amounts of metadata at the level of multiple consecutive granules which allows to avoid scanning irrelevant rows. Skipping indices can be created for arbitrary index expressions and using a configurable granularity, i.e. 
number of granules in a skipping index block. Available skipping index types include: 1. Min-max indices [\[51\]](#page-13-14), storing the minimum and maximum values of the index expression for each index block. This index type works well for locally clustered data with small absolute ranges, e.g. loosely sorted data. 2. Set indices, storing a configurable number of unique index block values. These indexes are best used with data with a small local cardinality, i.e. "clumped together" values. 3. Bloom filter indices [\[9\]](#page-12-14) build for row, token, or n-gram values with a configurable false positive rate. These indices support text search [\[73\]](#page-13-15), but unlike min-max and set indices, they cannot be used for range or negative predicates. @@ -126,7 +126,7 @@ Business intelligence and observability use cases often need to handle data gene **Aggregating merges** collapse rows with equal primary key column values into an aggregated row. Non-primary key columns must be of a partial aggregation state that holds the summary values. Two partial aggregation states, e.g. a sum and a count for avg(), are combined into a new partial aggregation state. Aggregating merges are typically used in materialized views instead of normal tables. Materialized views are populated based on a transformation query against a source table. Unlike other databases, ClickHouse does not refresh materialized views periodically with the entire content of the source table. Materialized views are rather updated incrementally with the result of the transformation query when a new part is inserted into the source table. -[Figure 5](#page-4-1) shows a materialized view defined on a table with page impression statistics. For new parts inserted into the source table, the transformation query computes the maximum and average latencies, grouped by region, and inserts the result into a materialized view. Aggregation functions avg() and max() with extension -State return partial aggregation states instead of actual results. An aggregating merge defined for the materialized view continuously combines partial aggregation states in different parts. To obtain the final result, users consolidate the partial aggregation states in the materialized view using avg() and max()) with -Merge extension. +[Figure 5](#page-4-1) shows a materialized view defined on a table with page impression statistics. For new ^^parts^^ inserted into the source table, the transformation query computes the maximum and average latencies, grouped by region, and inserts the result into a materialized view. Aggregation functions avg() and max() with extension -State return partial aggregation states instead of actual results. An aggregating merge defined for the materialized view continuously combines partial aggregation states in different ^^parts^^. To obtain the final result, users consolidate the partial aggregation states in the materialized view using avg() and max()) with -Merge extension. @@ -134,7 +134,7 @@ Figure 5: Aggregating merges in materialized views. **TTL (time-to-live) merges** provide aging for historical data. Unlike deleting and aggregating merges, TTL merges process only one part at a time. TTL merges are defined in terms of rules with triggers and actions. A trigger is an expression computing a timestamp for every row, which is compared against the time at which the TTL merge runs. 
While this allows users to control actions at row granularity, we found it sufficient to check whether all rows satisfy a given condition and run the action on the entire part. Possible actions include 1. move the part to another volume (e.g. cheaper and slower storage), 2. re-compress the part (e.g. with a more heavy-weight codec), 3. delete the part, and 4. roll-up, i.e. aggregate the rows using a grouping key and aggregate functions. -As an example, consider the logging table definition in [Listing 1.](#page-4-2) ClickHouse will move parts with timestamp column values older than one week to slow but inexpensive S3 object storage. +As an example, consider the logging table definition in [Listing 1.](#page-4-2) ClickHouse will move ^^parts^^ with timestamp column values older than one week to slow but inexpensive S3 object storage. ``` 1 CREATE TABLE tab ( ts DateTime , msg String ) @@ -147,7 +147,7 @@ Listing 1: Move part to object storage after one week. The design of the MergeTree* table engines favors append-only workloads, yet some use cases require to modify existing data occasionally, e.g. for regulatory compliance. Two approaches for updating or deleting data exist, neither of which block parallel inserts. -**Mutations** rewrite all parts of a table in-place. To prevent a table (delete) or column (update) from doubling temporarily in size, this operation is non-atomic, i.e. parallel SELECT statements may read mutated and non-mutated parts. Mutations guarantee that the data is physically changed at the end of the operation. Delete mutations are still expensive as they rewrite all columns in all parts. +**Mutations** rewrite all ^^parts^^ of a table in-place. To prevent a table (delete) or column (update) from doubling temporarily in size, this operation is non-atomic, i.e. parallel SELECT statements may read mutated and non-mutated ^^parts^^. Mutations guarantee that the data is physically changed at the end of the operation. Delete mutations are still expensive as they rewrite all columns in all ^^parts^^. As an alternative, **lightweight deletes** only update an internal bitmap column, indicating if a row is deleted or not. ClickHouse amends SELECT queries with an additional filter on the bitmap column to exclude deleted rows from the result. Deleted rows are physically removed only by regular merges at an unspecified time in future. Depending on the column count, lightweight deletes can be much faster than mutations, at the cost of slower SELECTs. @@ -157,27 +157,27 @@ Update and delete operations performed on the same table are expected to be rare A problem that frequently occurs in practice is how clients should handle connection timeouts after sending data to the server for insertion into a table. In this situation, it is difficult for clients to distinguish between whether the data was successfully inserted or not. The problem is traditionally solved by re-sending the data from the client to the server and relying on primary key or unique constraints to reject duplicate inserts. Databases perform the required point lookups quickly using index structures based on binary trees [\[39,](#page-12-15) [68\]](#page-13-16), radix trees [\[45\]](#page-13-17), or hash tables [\[29\]](#page-12-16). Since these data structures index every tuple, their space and update overhead becomes prohibitive for large data sets and high ingest rates. -ClickHouse provides a more light-weight alternative based on the fact that each insert eventually creates a part. 
More specifically, the server maintains hashes of the N last inserted parts (e.g. N=100) and ignores re-inserts of parts with a known hash. Hashes for non-replicated and replicated tables are stored locally, respectively, in Keeper. As a result, inserts become idempotent, i.e. clients can simply re-send the same batch of rows after a timeout and assume that the server takes care of deduplication. For more control over the deduplication process, clients can optionally provide an insert token that acts as a part hash. While hash-based deduplication incurs an overhead associated with hashing the new rows, the cost of storing and comparing hashes is negligible. +ClickHouse provides a more light-weight alternative based on the fact that each insert eventually creates a part. More specifically, the server maintains hashes of the N last inserted ^^parts^^ (e.g. N=100) and ignores re-inserts of ^^parts^^ with a known hash. Hashes for non-replicated and replicated tables are stored locally, respectively, in Keeper. As a result, inserts become idempotent, i.e. clients can simply re-send the same batch of rows after a timeout and assume that the server takes care of deduplication. For more control over the deduplication process, clients can optionally provide an insert token that acts as a part hash. While hash-based deduplication incurs an overhead associated with hashing the new rows, the cost of storing and comparing hashes is negligible. ### 3.6 Data Replication {#3-6-data-replication} -Replication is a prerequisite for high availability (tolerance against node failures), but also used for load balancing and zero-downtime upgrades [\[14\]](#page-12-17). In ClickHouse, replication is based on the notion of table states which consist of a set of table parts (Section [3.1)](#page-2-2) and table metadata, such as column names and types. Nodes advance the state of a table using three operations: 1. Inserts add a new part to the state, 2. merges add a new part and delete existing parts to/from the state, 3. mutations and DDL statements add parts, and/or delete parts, and/or change table metadata, depending on the concrete operation. Operations are performed locally on a single node and recorded as a sequence of state transition in a global replication log. +Replication is a prerequisite for high availability (tolerance against node failures), but also used for load balancing and zero-downtime upgrades [\[14\]](#page-12-17). In ClickHouse, replication is based on the notion of table states which consist of a set of table ^^parts^^ (Section [3.1)](#page-2-2) and table metadata, such as column names and types. Nodes advance the state of a table using three operations: 1. Inserts add a new part to the state, 2. merges add a new part and delete existing ^^parts^^ to/from the state, 3. mutations and DDL statements add ^^parts^^, and/or delete ^^parts^^, and/or change table metadata, depending on the concrete operation. Operations are performed locally on a single node and recorded as a sequence of state transition in a global replication log. -The replication log is maintained by an ensemble of typically three ClickHouse Keeper processes which use the Raft consensus algorithm [\[59\]](#page-13-4) to provide a distributed and fault-tolerant coordination layer for a cluster of ClickHouse nodes. All cluster nodes initially point to the same position in the replication log. 
While the nodes execute local inserts, merges, mutations, and DDL statements, the replication log is replayed asynchronously on all other nodes. As a result, replicated tables are only eventually consistent, i.e. nodes can temporarily read old table states while converging towards the latest state. Most aforementioned operations can alternatively be executed synchronously until a quorum of nodes (e.g. a majority of nodes or all nodes) adopted the new state. +The replication log is maintained by an ensemble of typically three ClickHouse Keeper processes which use the Raft consensus algorithm [\[59\]](#page-13-4) to provide a distributed and fault-tolerant coordination layer for a ^^cluster^^ of ClickHouse nodes. All ^^cluster^^ nodes initially point to the same position in the replication log. While the nodes execute local inserts, merges, mutations, and DDL statements, the replication log is replayed asynchronously on all other nodes. As a result, replicated tables are only eventually consistent, i.e. nodes can temporarily read old table states while converging towards the latest state. Most aforementioned operations can alternatively be executed synchronously until a quorum of nodes (e.g. a majority of nodes or all nodes) adopted the new state. -As an example, [Figure 6](#page-5-3) shows an initially empty replicated table in a cluster of three ClickHouse nodes. Node 1 first receives two insert statements and records them ( 1 2 ) in the replication log stored in the Keeper ensemble. Next, Node 2 replays the first log entry by fetching it ( 3 ) and downloading the new part from Node 1 ( 4 ), whereas Node 3 replays both log entries ( 3 4 5 6 ). Finally, Node 3 merges both parts to a new part, deletes the input parts, and records a merge entry in the replication log ( 7 ). +As an example, [Figure 6](#page-5-3) shows an initially empty replicated table in a ^^cluster^^ of three ClickHouse nodes. Node 1 first receives two insert statements and records them ( 1 2 ) in the replication log stored in the Keeper ensemble. Next, Node 2 replays the first log entry by fetching it ( 3 ) and downloading the new part from Node 1 ( 4 ), whereas Node 3 replays both log entries ( 3 4 5 6 ). Finally, Node 3 merges both ^^parts^^ to a new part, deletes the input ^^parts^^, and records a merge entry in the replication log ( 7 ). -Figure 6: Replication in a cluster of three nodes. +Figure 6: Replication in a ^^cluster^^ of three nodes. -Three optimizations to speed up synchronization exist: First, new nodes added to the cluster do replay the replication log from scratch, instead they simply copy the state of the node which wrote the last replication log entry. Second, merges are replayed by repeating them locally or by fetching the result part from another node. The exact behavior is configurable and allows to balance CPU consumption and network I/O. For example, cross-data-center replication typically prefers local merges to minimize operating costs. Third, nodes replay mutually independent replication log entries in parallel. This includes, for example, fetches of new parts inserted consecutively into the same table, or operations on different tables. +Three optimizations to speed up synchronization exist: First, new nodes added to the ^^cluster^^ do replay the replication log from scratch, instead they simply copy the state of the node which wrote the last replication log entry. Second, merges are replayed by repeating them locally or by fetching the result part from another node. 
The exact behavior is configurable and allows to balance CPU consumption and network I/O. For example, cross-data-center replication typically prefers local merges to minimize operating costs. Third, nodes replay mutually independent replication log entries in parallel. This includes, for example, fetches of new ^^parts^^ inserted consecutively into the same table, or operations on different tables. ### 3.7 ACID Compliance {#3-7-acid-compliance} -To maximize the performance of concurrent read and write operations, ClickHouse avoids latching as much as possible. Queries are executed against a snapshot of all parts in all involved tables created at the beginning of the query. This ensures that new parts inserted by parallel INSERTs or merges (Section [3.1)](#page-2-2) do not participate in execution. To prevent parts from being modified or removed simultaneously (Section [3.4)](#page-4-0), the reference count of the processed parts is incremented for the duration of the query. Formally, this corresponds to snapshot isolation realized by an MVCC variant [\[6\]](#page-12-18) based on versioned parts. As a result, statements are generally not ACID-compliant except for the rare case that concurrent writes at the time the snapshot is taken each affect only a single part. +To maximize the performance of concurrent read and write operations, ClickHouse avoids latching as much as possible. Queries are executed against a snapshot of all ^^parts^^ in all involved tables created at the beginning of the query. This ensures that new ^^parts^^ inserted by parallel INSERTs or merges (Section [3.1)](#page-2-2) do not participate in execution. To prevent ^^parts^^ from being modified or removed simultaneously (Section [3.4)](#page-4-0), the reference count of the processed ^^parts^^ is incremented for the duration of the query. Formally, this corresponds to snapshot isolation realized by an MVCC variant [\[6\]](#page-12-18) based on versioned ^^parts^^. As a result, statements are generally not ACID-compliant except for the rare case that concurrent writes at the time the snapshot is taken each affect only a single part. -In practice, most of ClickHouse's write-heavy decision making use cases even tolerate a small risk of losing new data in case of a power outage. The database takes advantage of this by not forcing a commit (fsync) of newly inserted parts to disk by default, allowing the kernel to batch writes at the cost of forgoing atomicity. +In practice, most of ClickHouse's write-heavy decision making use cases even tolerate a small risk of losing new data in case of a power outage. The database takes advantage of this by not forcing a commit (fsync) of newly inserted ^^parts^^ to disk by default, allowing the kernel to batch writes at the cost of forgoing ^^atomicity^^. ## 4 QUERY PROCESSING LAYER {#4-query-processing-layer} @@ -187,7 +187,7 @@ Figure 7: Parallelization across SIMD units, cores and nodes. As illustrated by [Figure 7,](#page-6-1) ClickHouse parallelizes queries at the level of data elements, data chunks, and table shards. Multiple data elements can be processed within operators at once using SIMD instructions. On a single node, the query engine executes operators simultaneously in multiple threads. ClickHouse uses the same vectorization model as MonetDB/X100 [\[11\]](#page-12-0), i.e. operators produce, pass, and consume multiple rows (data chunks) instead of single rows to minimize the overhead of virtual function calls. 
If a source table is split into disjoint table shards, multiple nodes can scan the shards simultaneously. As a result, all hardware resources are fully utilized, and query processing can be scaled horizontally by adding nodes and vertically by adding cores. -The rest of this section first describes parallel processing at data element, data chunk, and shard granularity in more detail. We then present selected key optimizations to maximize query performance. Finally, we discuss how ClickHouse manages shared system resources in the presence of simultaneous queries. +The rest of this section first describes parallel processing at data element, data chunk, and ^^shard^^ granularity in more detail. We then present selected key optimizations to maximize query performance. Finally, we discuss how ClickHouse manages shared system resources in the presence of simultaneous queries. ### 4.1 SIMD Parallelization {#4-1-simd-parallelization} @@ -334,9 +334,9 @@ Figure 12: Hot runtimes (in seconds) for TPC-H queries. Analytical databases have been of great academic and commercial interest in recent decades [\[1\]](#page-12-35). Early systems like Sybase IQ [\[48\]](#page-13-28), Teradata [\[72\]](#page-13-29), Vertica [\[42\]](#page-12-36), and Greenplum [\[47\]](#page-13-30) were characterized by expensive batch ETL jobs and limited elasticity due to their on-premise nature. In the early 2010s, the advent of cloud-native data warehouses and database-as-a-service offerings (DBaaS) such as Snowfake [\[22\]](#page-12-37), BigQuery [\[49\]](#page-13-31), and Redshift [\[4\]](#page-12-38) dramatically reduced the cost and complexity of analytics for organizations, while benefiting from high availability and automatic resource scaling. More recently, analytical execution kernels (e.g. Photon [\[5\]](#page-12-39) and Velox [\[62\]](#page-13-32)) offer co-modified data processing for use in different analytical, streaming, and machine learning applications. -The most similar databases to ClickHouse, in terms of goals and design principles, are Druid [\[78\]](#page-13-33) and Pinot [\[34\]](#page-12-40). Both systems target real-time analytics with high data ingestion rates. Like ClickHouse, tables are split into horizontal parts called segments. While ClickHouse continuously merges smaller parts and optionally reduces data volumes using the techniques in Section [3.3,](#page-4-3) parts remain forever immutable in Druid and Pinot. Also, Druid and Pinot require specialized nodes to create, mutate, and search tables, whereas ClickHouse uses a monolithic binary for these tasks. +The most similar databases to ClickHouse, in terms of goals and design principles, are Druid [\[78\]](#page-13-33) and Pinot [\[34\]](#page-12-40). Both systems target real-time analytics with high data ingestion rates. Like ClickHouse, tables are split into horizontal ^^parts^^ called segments. While ClickHouse continuously merges smaller ^^parts^^ and optionally reduces data volumes using the techniques in Section [3.3,](#page-4-3) ^^parts^^ remain forever immutable in Druid and Pinot. Also, Druid and Pinot require specialized nodes to create, mutate, and search tables, whereas ClickHouse uses a monolithic binary for these tasks. -Snowfake [\[22\]](#page-12-37) is a popular proprietary cloud data warehouse based on a shared-disk architecture. Its approach of dividing tables into micro-partitions is similar to the concept of parts in ClickHouse. 
Snowfake uses hybrid PAX pages [\[3\]](#page-12-41) for persistence, whereas ClickHouse's storage format is strictly columnar. Snowfake also emphasizes local caching and data pruning using automatically created lightweight indexes [\[31,](#page-12-13) [51\]](#page-13-14) as a source for good performance. Similar to primary keys in ClickHouse, users may optionally create clustered indexes to co-locate data with the same values. +Snowfake [\[22\]](#page-12-37) is a popular proprietary cloud data warehouse based on a shared-disk architecture. Its approach of dividing tables into micro-partitions is similar to the concept of ^^parts^^ in ClickHouse. Snowfake uses hybrid PAX pages [\[3\]](#page-12-41) for persistence, whereas ClickHouse's storage format is strictly columnar. Snowfake also emphasizes local caching and data pruning using automatically created lightweight indexes [\[31,](#page-12-13) [51\]](#page-13-14) as a source for good performance. Similar to primary keys in ClickHouse, users may optionally create clustered indexes to co-locate data with the same values. Photon [\[5\]](#page-12-39) and Velox [\[62\]](#page-13-32) are query execution engines designed to be used as components in complex data management systems. Both systems are passed query plans as input, which are then executed on the local node over Parquet (Photon) or Arrow (Velox) files [\[46\]](#page-13-34). ClickHouse is able to consume and generate data in these generic formats but prefers its native file format for storage. While Velox and Photon do not optimize the query plan (Velox performs basic expression optimizations), they utilize runtime adaptivity techniques, such as dynamically switching compute kernels depending on the data characteristics. Similarly, plan operators in ClickHouse @@ -346,7 +346,7 @@ DuckDB [\[67\]](#page-13-6) is also meant to be embedded by a host process, but ## 8 CONCLUSION AND OUTLOOK {#8-conclusion-and-outlook} -We presented the architecture of ClickHouse, an open-source, highperformance OLAP database. With a write-optimized storage layer and a state-of-the-art vectorized query engine at its foundation, ClickHouse enables real-time analytics over petabyte-scale data sets with high ingestion rates. By merging and transforming data asynchronously in the background, ClickHouse efficiently decouples data maintenance and parallel inserts. Its storage layer enables aggressive data pruning using sparse primary indexes, skipping indexes, and projection tables. We described ClickHouse's implementation of updates and deletes, idempotent inserts, and data replication across nodes for high availability. The query processing layer optimizes queries using a wealth of techniques, and parallelizes execution across all server and cluster resources. Integration table engines and functions provide a convenient way to interact with other data management systems and data formats seamlessly. Through benchmarks, we demonstrate that ClickHouse is amongst the fastest analytical databases on the market, and we showed significant improvements in the performance of typical queries in real-world deployments of ClickHouse throughout the years. +We presented the architecture of ClickHouse, an open-source, highperformance OLAP database. With a write-optimized storage layer and a state-of-the-art vectorized query engine at its foundation, ClickHouse enables real-time analytics over petabyte-scale data sets with high ingestion rates. 
By merging and transforming data asynchronously in the background, ClickHouse efficiently decouples data maintenance and parallel inserts. Its storage layer enables aggressive data pruning using sparse primary indexes, skipping indexes, and projection tables. We described ClickHouse's implementation of updates and deletes, idempotent inserts, and data replication across nodes for high availability. The query processing layer optimizes queries using a wealth of techniques, and parallelizes execution across all server and ^^cluster^^ resources. Integration table engines and functions provide a convenient way to interact with other data management systems and data formats seamlessly. Through benchmarks, we demonstrate that ClickHouse is amongst the fastest analytical databases on the market, and we showed significant improvements in the performance of typical queries in real-world deployments of ClickHouse throughout the years. All features and enhancements planned for 2024 can be found on the public roadmap [\[18\]](#page-12-33). Planned improvements include support for user transactions, PromQL [\[69\]](#page-13-36) as an alternative query language, a new datatype for semi-structured data (e.g. JSON), better plan-level optimizations of joins, as well as an implementation of light-weight updates to complement light-weight deletes. diff --git a/scripts/wrap-glossary-terms/README.md b/scripts/wrap-glossary-terms/README.md new file mode 100644 index 00000000000..7ee78f0e750 --- /dev/null +++ b/scripts/wrap-glossary-terms/README.md @@ -0,0 +1,129 @@ +# Glossary Term Wrapper + +A Python script that automatically finds and wraps glossary terms in MDX files with the `^^term^^` syntax for use with a glossary tooltip component. + +## Usage + +### Basic Usage +```bash +python3 wrap_glossary_terms.py +``` + +### Options +```bash +python3 wrap_glossary_terms.py [OPTIONS] + +Options: + --docs-dir PATH Documentation directory (default: ./docs) + --glossary PATH Glossary JSON file (default: ./src/components/GlossaryTooltip/glossary.json) + --dry-run Show changes without writing files + --force Process files even if they already have glossary syntax +``` + +### Examples + +```bash +# Dry run to see what would change +python3 wrap_glossary_terms.py --dry-run + +# Process all files, even those with existing glossary terms +python3 wrap_glossary_terms.py --force + +# Use custom paths +python3 wrap_glossary_terms.py --docs-dir ./my-docs --glossary ./my-glossary.json + +# Force process with dry run +python3 wrap_glossary_terms.py --force --dry-run +``` + +## Glossary JSON Format + +The script expects a JSON file with term definitions: + +```json +{ + "Cluster": "A collection of nodes (servers) that work together to store and process data.", + "Replica": "A copy of the data stored in a ClickHouse database.", + "Shard": "A subset of data. ClickHouse always has at least one shard for your data." 
+} +``` + +## Protected Areas + +The script will **NOT** wrap glossary terms in these areas: + +- **Frontmatter** (`---...---`) +- **Code blocks** (````...````) +- **Inline code** (`` `...` ``) +- **Markdown links** (`[text](url)`) +- **Markdown tables** (`| column | column |`) +- **JSX components** (`...`) + +## File Filtering + +The script automatically skips files in these directories: +- `_snippets`, `snippets` +- `examples`, `example-data`, `sample-data` +- `changelog`, `changelogs`, `release-notes`, `releases` +- `_partials`, `partials`, `_includes`, `includes` + +## Output Examples + +``` +🚀 Starting Glossary Term Wrapper... + +📚 Loaded 7 glossary terms +📁 Found 22 MDX files, processing 15 files +⏭️ Skipped 7 files based on skip patterns + +✅ Modified managing-data/core-concepts/academic_overview.mdx (52 terms) +✅ Modified best-practices/partitioning_keys.mdx (11 terms) +➖ No changes needed for about-us/intro.mdx +⏭️ Skipped getting-started/quick-start/oss.mdx (already has glossary syntax) + +📊 Summary: + Files processed: 15 + Files modified: 4 + Files skipped: 3 + Terms wrapped: 111 +``` + +## How It Works + +1. **Loads glossary terms** from JSON file and sorts by length (longest first) +2. **Finds MDX files** recursively in the specified directory +3. **Filters out** files matching skip patterns +4. **For each file**: + - Identifies protected areas (code, links, JSX, etc.) + - Searches for glossary terms using word boundaries + - Wraps unprotected terms with `^^term^^` syntax + - Avoids double-wrapping existing terms + +## Integration with Docusaurus + +This script is designed to work with a Docusaurus site that has: +- A glossary tooltip component that processes `^^term^^` syntax +- MDX support enabled +- A remark plugin that transforms `^^term^^` to `` + +## Troubleshooting + +### "No changes needed" for files that should have terms +- Check if terms are in protected areas (code blocks, links, etc.) 
+- Verify the terms exist in your `glossary.json` file +- Use `--force` if the file already has some glossary syntax + +### Terms wrapped in wrong places +- The script protects common areas, but you may need to manually fix edge cases +- Consider updating the protected areas logic if needed + +### Script skips too many files +- Check the skip patterns in `should_skip_file()` function +- Adjust patterns if your documentation structure is different + +## Contributing + +To modify the script: +- **Add new protected areas**: Update `extract_protected_ranges()` function +- **Change skip patterns**: Modify `should_skip_file()` function +- **Adjust term matching**: Update the regex in `wrap_terms_in_content()` \ No newline at end of file diff --git a/scripts/wrap-glossary-terms/wrap-glossary-terms.py b/scripts/wrap-glossary-terms/wrap-glossary-terms.py new file mode 100644 index 00000000000..4dda1c8ab1e --- /dev/null +++ b/scripts/wrap-glossary-terms/wrap-glossary-terms.py @@ -0,0 +1,236 @@ +#!/usr/bin/env python3 + +import json +import re +import os +import glob +import argparse + +def load_glossary(file_path): + """Load glossary terms from JSON file""" + with open(file_path, 'r') as f: + glossary = json.load(f) + + # Sort by length (longest first) to avoid partial matches + terms = sorted(glossary.keys(), key=len, reverse=True) + print(f"📚 Loaded {len(terms)} glossary terms") + return terms + +def should_skip_file(file_path): + """Check if file should be skipped based on path patterns""" + skip_patterns = [ + '_snippets', 'snippets', 'examples', 'example-data', 'sample-data', + 'changelog', 'changelogs', 'release-notes', 'releases', + '_partials', 'partials', '_includes', 'includes' + ] + + return any(pattern in file_path for pattern in skip_patterns) + +def extract_text_content(content, force_process=False): + """Extract text content, avoiding code blocks and JSX""" + + # Skip if already has glossary syntax (unless forced) + if not force_process and '^^' in content: + return None + + return content + +def extract_protected_ranges(content): + """Find all ranges that should be protected from glossary wrapping""" + protected_ranges = [] + + # Find frontmatter + frontmatter_match = re.match(r'^(---.*?---)', content, re.DOTALL) + if frontmatter_match: + protected_ranges.append((0, frontmatter_match.end())) + + # Find code blocks + for match in re.finditer(r'```[\s\S]*?```', content): + protected_ranges.append((match.start(), match.end())) + + # Find inline code + for match in re.finditer(r'`[^`]*`', content): + protected_ranges.append((match.start(), match.end())) + + # Find markdown links - protect the entire link [text](url) + for match in re.finditer(r'\[[^\]]*\]\([^)]*\)', content): + protected_ranges.append((match.start(), match.end())) + + # Find markdown tables - protect entire table blocks + # Tables start with | and have multiple lines with | + lines = content.split('\n') + in_table = False + table_start = 0 + + for i, line in enumerate(lines): + line_start = sum(len(lines[j]) + 1 for j in range(i)) # +1 for \n + + if '|' in line and line.strip(): + if not in_table: + in_table = True + table_start = line_start + else: + if in_table: + # End of table + table_end = line_start - 1 # Don't include the newline + protected_ranges.append((table_start, table_end)) + in_table = False + + # Handle table at end of file + if in_table: + protected_ranges.append((table_start, len(content))) + + # Find JSX component tags + for match in re.finditer(r'<[A-Z][^>]*/?>', content): + 
protected_ranges.append((match.start(), match.end())) + + # Sort and merge overlapping ranges + protected_ranges.sort() + merged = [] + for start, end in protected_ranges: + if merged and start <= merged[-1][1]: + merged[-1] = (merged[-1][0], max(merged[-1][1], end)) + else: + merged.append((start, end)) + + return merged + +def is_position_protected(pos, protected_ranges): + """Check if a position falls within any protected range""" + for start, end in protected_ranges: + if start <= pos < end: + return True + return False + +def wrap_terms_in_content(original_content, terms): + """Wrap glossary terms with ^^ syntax, avoiding protected areas""" + + # Find all protected ranges + protected_ranges = extract_protected_ranges(original_content) + + content = original_content + changes = 0 + + for term in terms: + # Use word boundaries for exact matches + pattern = r'\b' + re.escape(term) + r'\b' + matches = list(re.finditer(pattern, content, re.IGNORECASE)) + + if matches: + # Replace from end to start to maintain positions + for match in reversed(matches): + start, end = match.span() + + # Check if this match is in a protected area + if is_position_protected(start, protected_ranges): + continue + + matched_term = content[start:end] + + # Check if not already wrapped + before = content[:start] + if not (before.endswith('^^') or '^^' in before[-10:]): + content = content[:start] + f'^^{matched_term}^^' + content[end:] + changes += 1 + + # Update protected ranges since we modified the content + adjustment = len(f'^^{matched_term}^^') - len(matched_term) + protected_ranges = [(s + adjustment if s > start else s, + e + adjustment if e > start else e) + for s, e in protected_ranges] + + return content, changes + +def process_file(file_path, terms, dry_run=False, force_process=False): + """Process a single MDX file""" + try: + with open(file_path, 'r', encoding='utf-8') as f: + original_content = f.read() + + # Check if should skip + text_content = extract_text_content(original_content, force_process) + if text_content is None: + return 'skipped', 0 + + # Wrap terms + new_content, changes = wrap_terms_in_content(original_content, terms) + + if changes > 0: + if not dry_run: + with open(file_path, 'w', encoding='utf-8') as f: + f.write(new_content) + return 'modified', changes + else: + return 'unchanged', 0 + + except Exception as e: + print(f"❌ Error processing {file_path}: {e}") + return 'error', 0 + +def main(): + parser = argparse.ArgumentParser(description='Wrap glossary terms in MDX files') + parser.add_argument('--docs-dir', default='./docs', help='Documentation directory') + parser.add_argument('--glossary', default='./src/components/GlossaryTooltip/glossary.json', help='Glossary JSON file') + parser.add_argument('--dry-run', action='store_true', help='Show changes without writing files') + parser.add_argument('--force', action='store_true', help='Process files even if they already have glossary syntax') + + args = parser.parse_args() + + print("🚀 Starting Glossary Term Wrapper...\n") + + # Load glossary + if not os.path.exists(args.glossary): + print(f"❌ Glossary file not found: {args.glossary}") + return + + terms = load_glossary(args.glossary) + + # Find MDX files + pattern = os.path.join(args.docs_dir, '**/*.mdx') + all_files = glob.glob(pattern, recursive=True) + + # Filter out skip patterns + files = [f for f in all_files if not should_skip_file(f)] + + print(f"📁 Found {len(all_files)} MDX files, processing {len(files)} files") + print(f"⏭️ Skipped {len(all_files) - len(files)} files 
based on skip patterns") + + if args.force: + print("💪 FORCE MODE - Processing files even with existing glossary syntax") + + if args.dry_run: + print("🔍 DRY RUN MODE - No files will be modified") + + print() + + # Process files + stats = {'modified': 0, 'unchanged': 0, 'skipped': 0, 'error': 0, 'terms_wrapped': 0} + + for file_path in files: + rel_path = os.path.relpath(file_path, args.docs_dir) + status, changes = process_file(file_path, terms, args.dry_run, args.force) + + if status == 'modified': + print(f"✅ Modified {rel_path} ({changes} terms)") + elif status == 'unchanged': + print(f"➖ No changes needed for {rel_path}") + elif status == 'skipped': + print(f"⏭️ Skipped {rel_path} (already has glossary syntax)") + + stats[status] += 1 + stats['terms_wrapped'] += changes + + # Print summary + print(f"\n📊 Summary:") + print(f" Files processed: {stats['modified'] + stats['unchanged']}") + print(f" Files modified: {stats['modified']}") + print(f" Files skipped: {stats['skipped']}") + print(f" Terms wrapped: {stats['terms_wrapped']}") + + if args.dry_run: + print("\n💡 Run without --dry-run to apply changes") + if not args.force and stats['skipped'] > 0: + print("💡 Use --force to process files with existing glossary syntax") + +if __name__ == '__main__': + main() \ No newline at end of file From 719db3c7d922fa54e31c6aa1386369defbf5122c Mon Sep 17 00:00:00 2001 From: Dominic Tran Date: Mon, 28 Jul 2025 15:16:29 -0500 Subject: [PATCH 05/10] Automatic checking of glossary terms on CI. Warning for now --- .github/workflows/check-build.yml | 13 +++- .../README.md | 0 scripts/glossary/extract-glossary-terms.py | 74 ++++++++++++++++++ .../wrap-glossary-terms.py | 78 ++++++++++++------- 4 files changed, 134 insertions(+), 31 deletions(-) rename scripts/{wrap-glossary-terms => glossary}/README.md (100%) create mode 100644 scripts/glossary/extract-glossary-terms.py rename scripts/{wrap-glossary-terms => glossary}/wrap-glossary-terms.py (74%) diff --git a/.github/workflows/check-build.yml b/.github/workflows/check-build.yml index 58e003c4a5a..51246ea4cd4 100644 --- a/.github/workflows/check-build.yml +++ b/.github/workflows/check-build.yml @@ -16,7 +16,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - check_type: [spellcheck, kbcheck, md-lint] + check_type: [spellcheck, kbcheck, md-lint, glossary-check] steps: # Add setup steps per check here - uses: actions/checkout@v4 @@ -24,7 +24,7 @@ jobs: if: matrix.check_type == 'spellcheck' run: sudo apt-get update && sudo apt-get install -y aspell aspell-en - name: Set up Python - if: matrix.check_type == 'kbcheck' + if: matrix.check_type == 'kbcheck' || matrix.check_type == 'glossary-check' run: | curl -Ls https://astral.sh/uv/install.sh | sh uv clean @@ -51,6 +51,12 @@ jobs: elif [[ "${{ matrix.check_type }}" == "md-lint" ]]; then yarn check-markdown exit_code=$? + elif [[ "${{ matrix.check_type }}" == "glossary-check" ]]; then + echo "Extracting glossary from markdown..." + python3 scripts/glossary/extract_glossary.py + echo "Checking glossary coverage..." + python3 scripts/glossary/wrap_glossary_terms.py --check || echo "::warning::Glossary check found unwrapped terms (non-blocking)" + exit_code=0 # Always succeed for glossary check fi if [[ $exit_code -ne 0 ]]; then @@ -74,5 +80,4 @@ jobs: if: needs.stylecheck.result != 'success' run: | echo "::error::One or more checks of the style check failed." 
- exit 1 - + exit 1 \ No newline at end of file diff --git a/scripts/wrap-glossary-terms/README.md b/scripts/glossary/README.md similarity index 100% rename from scripts/wrap-glossary-terms/README.md rename to scripts/glossary/README.md diff --git a/scripts/glossary/extract-glossary-terms.py b/scripts/glossary/extract-glossary-terms.py new file mode 100644 index 00000000000..ecdf57afaa9 --- /dev/null +++ b/scripts/glossary/extract-glossary-terms.py @@ -0,0 +1,74 @@ +#!/usr/bin/env python3 + +import re +import json +import argparse + +def extract_glossary_terms(markdown_content): + """Extract terms and definitions from ClickHouse glossary markdown""" + + # Pattern to match: ## Term {#anchor} followed by definition paragraph(s) + pattern = r'^## ([^{]+?)\s*\{#[^}]+\}\s*\n\n(.*?)(?=\n## |\Z)' + + matches = re.findall(pattern, markdown_content, re.MULTILINE | re.DOTALL) + + glossary = {} + + for term, definition in matches: + # Clean up the term + term = term.strip() + + # Clean up the definition + definition = definition.strip() + + # Remove extra whitespace and normalize line breaks + definition = re.sub(r'\n+', ' ', definition) + definition = re.sub(r'\s+', ' ', definition) + + glossary[term] = definition + + return glossary + +def main(): + parser = argparse.ArgumentParser(description='Convert ClickHouse glossary.md to JSON') + parser.add_argument('--input', '-i', default='./docs/concepts/glossary.md', + help='Input markdown file') + parser.add_argument('--output', '-o', default='./src/components/GlossaryTooltip/glossary.json', + help='Output JSON file') + + args = parser.parse_args() + + # Read the markdown file + try: + with open(args.input, 'r', encoding='utf-8') as f: + content = f.read() + except FileNotFoundError: + print(f"❌ Input file not found: {args.input}") + return + except Exception as e: + print(f"❌ Error reading file: {e}") + return + + # Extract glossary terms + glossary = extract_glossary_terms(content) + + if not glossary: + print("❌ No glossary terms found") + return + + print(f"✅ Extracted {len(glossary)} terms:") + for term in sorted(glossary.keys()): + print(f" - {term}") + + # Write JSON file + try: + with open(args.output, 'w', encoding='utf-8') as f: + json.dump(glossary, f, indent=2, ensure_ascii=False) + + print(f"💾 Saved to: {args.output}") + + except Exception as e: + print(f"❌ Error writing JSON file: {e}") + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/scripts/wrap-glossary-terms/wrap-glossary-terms.py b/scripts/glossary/wrap-glossary-terms.py similarity index 74% rename from scripts/wrap-glossary-terms/wrap-glossary-terms.py rename to scripts/glossary/wrap-glossary-terms.py index 4dda1c8ab1e..1370b11750f 100644 --- a/scripts/wrap-glossary-terms/wrap-glossary-terms.py +++ b/scripts/glossary/wrap-glossary-terms.py @@ -173,10 +173,12 @@ def main(): parser.add_argument('--glossary', default='./src/components/GlossaryTooltip/glossary.json', help='Glossary JSON file') parser.add_argument('--dry-run', action='store_true', help='Show changes without writing files') parser.add_argument('--force', action='store_true', help='Process files even if they already have glossary syntax') + parser.add_argument('--check', action='store_true', help='Check for unwrapped terms and show warnings (non-blocking)') args = parser.parse_args() - print("🚀 Starting Glossary Term Wrapper...\n") + if not args.check: + print("🚀 Starting Glossary Term Wrapper...\n") # Load glossary if not os.path.exists(args.glossary): @@ -192,45 +194,67 @@ def main(): # 
Filter out skip patterns files = [f for f in all_files if not should_skip_file(f)] - print(f"📁 Found {len(all_files)} MDX files, processing {len(files)} files") - print(f"⏭️ Skipped {len(all_files) - len(files)} files based on skip patterns") - - if args.force: - print("💪 FORCE MODE - Processing files even with existing glossary syntax") - - if args.dry_run: - print("🔍 DRY RUN MODE - No files will be modified") - - print() + if not args.check: + print(f"📁 Found {len(all_files)} MDX files, processing {len(files)} files") + print(f"⏭️ Skipped {len(all_files) - len(files)} files based on skip patterns") + + if args.force: + print("💪 FORCE MODE - Processing files even with existing glossary syntax") + + if args.dry_run: + print("🔍 DRY RUN MODE - No files will be modified") + + print() # Process files stats = {'modified': 0, 'unchanged': 0, 'skipped': 0, 'error': 0, 'terms_wrapped': 0} + file_details = [] # Track which files had terms for warning display for file_path in files: rel_path = os.path.relpath(file_path, args.docs_dir) - status, changes = process_file(file_path, terms, args.dry_run, args.force) + # For check mode, always use dry_run=True to avoid writing files + status, changes = process_file(file_path, terms, args.dry_run or args.check, args.force) - if status == 'modified': - print(f"✅ Modified {rel_path} ({changes} terms)") - elif status == 'unchanged': + if status == 'modified' and changes > 0: + file_details.append((rel_path, changes)) + if not args.check: + print(f"✅ Modified {rel_path} ({changes} terms)") + elif status == 'unchanged' and not args.check: print(f"➖ No changes needed for {rel_path}") - elif status == 'skipped': + elif status == 'skipped' and not args.check: print(f"⏭️ Skipped {rel_path} (already has glossary syntax)") stats[status] += 1 stats['terms_wrapped'] += changes - # Print summary - print(f"\n📊 Summary:") - print(f" Files processed: {stats['modified'] + stats['unchanged']}") - print(f" Files modified: {stats['modified']}") - print(f" Files skipped: {stats['skipped']}") - print(f" Terms wrapped: {stats['terms_wrapped']}") - - if args.dry_run: - print("\n💡 Run without --dry-run to apply changes") - if not args.force and stats['skipped'] > 0: - print("💡 Use --force to process files with existing glossary syntax") + # Show results + if args.check: + # Check mode: show warning if terms found + if stats['terms_wrapped'] > 0: + print(f"⚠️ GLOSSARY WARNING: Found {stats['terms_wrapped']} unwrapped glossary terms in {len(file_details)} files") + print("💡 Run 'python3 scripts/wrap_glossary_terms.py' to add glossary tooltips") + + # Show files with opportunities (limit to top 10 to avoid spam) + if file_details: + print(" Files with unwrapped terms:") + for rel_path, count in sorted(file_details, key=lambda x: x[1], reverse=True)[:10]: + print(f" - {rel_path} ({count} terms)") + if len(file_details) > 10: + print(f" ... 
and {len(file_details) - 10} more files") + else: + print("✅ All glossary terms are properly wrapped") + else: + # Normal mode: show detailed summary + print(f"\n📊 Summary:") + print(f" Files processed: {stats['modified'] + stats['unchanged']}") + print(f" Files modified: {stats['modified']}") + print(f" Files skipped: {stats['skipped']}") + print(f" Terms wrapped: {stats['terms_wrapped']}") + + if args.dry_run: + print("\n💡 Run without --dry-run to apply changes") + if not args.force and stats['skipped'] > 0: + print("💡 Use --force to process files with existing glossary syntax") if __name__ == '__main__': main() \ No newline at end of file From ddabedd07f59c534b8c745fbdbe4ba0d567d1c12 Mon Sep 17 00:00:00 2001 From: Dominic Tran Date: Mon, 28 Jul 2025 15:19:40 -0500 Subject: [PATCH 06/10] fixing ci --- .github/workflows/check-build.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/check-build.yml b/.github/workflows/check-build.yml index 51246ea4cd4..98e90c83dc1 100644 --- a/.github/workflows/check-build.yml +++ b/.github/workflows/check-build.yml @@ -53,9 +53,9 @@ jobs: exit_code=$? elif [[ "${{ matrix.check_type }}" == "glossary-check" ]]; then echo "Extracting glossary from markdown..." - python3 scripts/glossary/extract_glossary.py + python3 scripts/glossary/extract-glossary.py echo "Checking glossary coverage..." - python3 scripts/glossary/wrap_glossary_terms.py --check || echo "::warning::Glossary check found unwrapped terms (non-blocking)" + python3 scripts/glossary/wrap-glossary-terms.py --check || echo "::warning::Glossary check found unwrapped terms (non-blocking)" exit_code=0 # Always succeed for glossary check fi From 0f4df42baa8fa51f1c86cec42b60942e38ded3fb Mon Sep 17 00:00:00 2001 From: Dominic Tran Date: Mon, 28 Jul 2025 15:21:04 -0500 Subject: [PATCH 07/10] fixing ci --- .github/workflows/check-build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/check-build.yml b/.github/workflows/check-build.yml index 98e90c83dc1..8690a8fc4bb 100644 --- a/.github/workflows/check-build.yml +++ b/.github/workflows/check-build.yml @@ -53,7 +53,7 @@ jobs: exit_code=$? elif [[ "${{ matrix.check_type }}" == "glossary-check" ]]; then echo "Extracting glossary from markdown..." - python3 scripts/glossary/extract-glossary.py + python3 scripts/glossary/extract-glossary-terms.py echo "Checking glossary coverage..." python3 scripts/glossary/wrap-glossary-terms.py --check || echo "::warning::Glossary check found unwrapped terms (non-blocking)" exit_code=0 # Always succeed for glossary check From b49e0fa5668a7c467d5279a3307ab42867338381 Mon Sep 17 00:00:00 2001 From: Dominic Tran Date: Mon, 28 Jul 2025 15:29:05 -0500 Subject: [PATCH 08/10] fixing ci --- .github/workflows/check-build.yml | 48 ++++++++++++------------------- 1 file changed, 18 insertions(+), 30 deletions(-) diff --git a/.github/workflows/check-build.yml b/.github/workflows/check-build.yml index 8690a8fc4bb..b0a689c5aee 100644 --- a/.github/workflows/check-build.yml +++ b/.github/workflows/check-build.yml @@ -39,37 +39,25 @@ jobs: run: yarn add -D markdownlint-cli2 # Run the checks here - - name: Run checks - id: check_step - run: | - if [[ "${{ matrix.check_type }}" == "spellcheck" ]]; then - yarn check-spelling - exit_code=$? - elif [[ "${{ matrix.check_type }}" == "kbcheck" ]]; then - yarn check-kb - exit_code=$? - elif [[ "${{ matrix.check_type }}" == "md-lint" ]]; then - yarn check-markdown - exit_code=$? 
- elif [[ "${{ matrix.check_type }}" == "glossary-check" ]]; then - echo "Extracting glossary from markdown..." - python3 scripts/glossary/extract-glossary-terms.py - echo "Checking glossary coverage..." - python3 scripts/glossary/wrap-glossary-terms.py --check || echo "::warning::Glossary check found unwrapped terms (non-blocking)" - exit_code=0 # Always succeed for glossary check - fi - - if [[ $exit_code -ne 0 ]]; then - echo "::error::${{ matrix.check_type }} check failed. See logs for details." - exit 1 - fi + - name: Run spellcheck + if: matrix.check_type == 'spellcheck' + run: yarn check-spelling - - name: Set check status - if: steps.check_step.outcome != 'success' - uses: actions/github-script@v6 - with: - script: | - core.setFailed('${{ matrix.check_type }} check failed.'); + - name: Run KB check + if: matrix.check_type == 'kbcheck' + run: yarn check-kb + + - name: Run markdown lint + if: matrix.check_type == 'md-lint' + run: yarn check-markdown + + - name: Run glossary check + if: matrix.check_type == 'glossary-check' + run: | + echo "Extracting glossary from markdown..." + python3 scripts/glossary/extract-glossary-terms.py + echo "Checking glossary coverage..." + python3 scripts/glossary/wrap-glossary-terms.py --check || echo "::warning::Glossary check found unwrapped terms (non-blocking)" check_overall_status: needs: stylecheck From db83dd94e56811aec53b2a64e0abac2d973c8a30 Mon Sep 17 00:00:00 2001 From: Dominic Tran Date: Fri, 1 Aug 2025 22:31:06 -0500 Subject: [PATCH 09/10] Adding additional terms, applying tooltips to core concepts, converting to mdx --- docs/best-practices/partitioning_keys.mdx | 16 ++-- docs/concepts/glossary.md | 78 ++++++++++++++++++- docs/concepts/why-clickhouse-is-so-fast.mdx | 6 +- docs/deployment-guides/parallel-replicas.mdx | 10 +-- docs/getting-started/quick-start/cloud.mdx | 12 +-- .../core-concepts/academic_overview.mdx | 46 +++++------ docs/managing-data/core-concepts/index.md | 12 +-- .../core-concepts/{merges.md => merges.mdx} | 44 +++++------ .../{partitions.md => partitions.mdx} | 2 +- .../core-concepts/{parts.md => parts.mdx} | 28 +++---- ...primary-indexes.md => primary-indexes.mdx} | 22 +++--- .../core-concepts/{shards.md => shards.mdx} | 36 ++++----- ...lete_mutations.md => delete_mutations.mdx} | 4 +- docs/managing-data/deleting-data/index.md | 8 +- .../{overview.md => overview.mdx} | 8 +- .../{drop_partition.md => drop_partition.mdx} | 2 +- .../{overview.md => overview.mdx} | 14 ++-- ...date_mutations.md => update_mutations.mdx} | 4 +- plugins/glossary-transformer.js | 17 ++-- .../GlossaryTooltip/GlossaryTooltip.tsx | 36 +++------ src/components/GlossaryTooltip/glossary.json | 20 ++++- 21 files changed, 246 insertions(+), 179 deletions(-) rename docs/managing-data/core-concepts/{merges.md => merges.mdx} (65%) rename docs/managing-data/core-concepts/{partitions.md => partitions.mdx} (98%) rename docs/managing-data/core-concepts/{parts.md => parts.mdx} (55%) rename docs/managing-data/core-concepts/{primary-indexes.md => primary-indexes.mdx} (83%) rename docs/managing-data/core-concepts/{shards.md => shards.mdx} (54%) rename docs/managing-data/deleting-data/{delete_mutations.md => delete_mutations.mdx} (75%) rename docs/managing-data/deleting-data/{overview.md => overview.mdx} (77%) rename docs/managing-data/{drop_partition.md => drop_partition.mdx} (86%) rename docs/managing-data/updating-data/{overview.md => overview.mdx} (83%) rename docs/managing-data/updating-data/{update_mutations.md => update_mutations.mdx} (75%) diff --git 
a/docs/best-practices/partitioning_keys.mdx b/docs/best-practices/partitioning_keys.mdx index 0795053b16c..548fbe08f1f 100644 --- a/docs/best-practices/partitioning_keys.mdx +++ b/docs/best-practices/partitioning_keys.mdx @@ -12,12 +12,12 @@ import partitions from '@site/static/images/bestpractices/partitions.png'; import merges_with_partitions from '@site/static/images/bestpractices/merges_with_partitions.png'; :::note A data management technique -Partitioning is primarily a data management technique and not a query optimization tool, and while it can improve performance in specific workloads, it should not be the first mechanism used to accelerate queries; the partitioning key must be chosen carefully, with a clear understanding of its implications, and only applied when it aligns with data life cycle needs or well-understood access patterns. +Partitioning is primarily a data management technique and not a query optimization tool, and while it can improve performance in specific workloads, it should not be the first mechanism used to accelerate queries; the ^^partitioning key^^ must be chosen carefully, with a clear understanding of its implications, and only applied when it aligns with data life cycle needs or well-understood access patterns. ::: In ClickHouse, partitioning organizes data into logical segments based on a specified key. This is defined using the `PARTITION BY` clause at table creation time and is commonly used to group rows by time intervals, categories, or other business-relevant dimensions. Each unique value of the partitioning expression forms its own physical partition on disk, and ClickHouse stores data in separate ^^parts^^ for each of these values. Partitioning improves data management, simplifies retention policies, and can help with certain query patterns. -For example, consider the following UK price paid dataset table with a partitioning key of `toStartOfMonth(date)`. +For example, consider the following UK price paid dataset table with a ^^partitioning key^^ of `toStartOfMonth(date)`. ```sql CREATE TABLE uk.uk_price_paid_simple_partitioned @@ -46,22 +46,22 @@ With partitioning enabled, ClickHouse only [merges](/merges) data ^^parts^^ with ## Applications of partitioning {#applications-of-partitioning} -Partitioning is a powerful tool for managing large datasets in ClickHouse, especially in observability and analytics use cases. It enables efficient data life cycle operations by allowing entire partitions, often aligned with time or business logic, to be dropped, moved, or archived in a single metadata operation. This is significantly faster and less resource-intensive than row-level delete or copy operations. Partitioning also integrates cleanly with ClickHouse features like TTL and tiered storage, making it possible to implement retention policies or hot/cold storage strategies without custom orchestration. For example, recent data can be kept on fast SSD-backed storage, while older partitions are automatically moved to cheaper object storage. +Partitioning is a powerful tool for managing large datasets in ClickHouse, especially in observability and analytics use cases. It enables efficient data life cycle operations by allowing entire partitions, often aligned with time or business logic, to be dropped, moved, or archived in a single metadata operation. This is significantly faster and less resource-intensive than row-level delete or copy operations. 
Partitioning also integrates cleanly with ClickHouse features like ^^TTL^^ and tiered storage, making it possible to implement retention policies or hot/cold storage strategies without custom orchestration. For example, recent data can be kept on fast SSD-backed storage, while older partitions are automatically moved to cheaper object storage. While partitioning can improve query performance for some workloads, it can also negatively impact response time. -If the partitioning key is not in the primary key and you are filtering by it, users may see an improvement in query performance with partitioning. See [here](/partitions#query-optimization) for an example. +If the ^^partitioning key^^ is not in the ^^primary key^^ and you are filtering by it, users may see an improvement in query performance with partitioning. See [here](/partitions#query-optimization) for an example. Conversely, if queries need to query across partitions performance may be negatively impacted due to a higher number of total ^^parts^^. For this reason, users should understand their access patterns before considering partitioning a a query optimization technique. In summary, users should primarily think of partitioning as a data management technique. For an example of managing data, see ["Managing Data"](/observability/managing-data) from the observability use-case guide and ["What are table partitions used for?"](/partitions#data-management) from Core Concepts - Table partitions. -## Choose a low cardinality partitioning key {#choose-a-low-cardinality-partitioning-key} +## Choose a low cardinality ^^partitioning key^^ {#choose-a-low-cardinality-partitioning-key} Importantly, a higher number of ^^parts^^ will negatively affect query performance. ClickHouse will therefore respond to inserts with a [“too many parts”](/knowledgebase/exception-too-many-parts) error if the number of ^^parts^^ exceeds specified limits either in [total](/operations/settings/merge-tree-settings#max_parts_in_total) or [per partition](/operations/settings/merge-tree-settings#parts_to_throw_insert). -Choosing the right **cardinality** for the partitioning key is critical. A high-cardinality partitioning key - where the number of distinct partition values is large - can lead to a proliferation of data ^^parts^^. Since ClickHouse does not merge ^^parts^^ across partitions, too many partitions will result in too many unmerged ^^parts^^, eventually triggering the “Too many ^^parts^^” error. [Merges are essential](/merges) for reducing storage fragmentation and optimizing query speed, but with high-cardinality partitions, that merge potential is lost. +Choosing the right **cardinality** for the ^^partitioning key^^ is critical. A high-cardinality ^^partitioning key^^ - where the number of distinct partition values is large - can lead to a proliferation of data ^^parts^^. Since ClickHouse does not merge ^^parts^^ across partitions, too many partitions will result in too many unmerged ^^parts^^, eventually triggering the “Too many ^^parts^^” error. [Merges are essential](/merges) for reducing storage fragmentation and optimizing query speed, but with high-cardinality partitions, that merge potential is lost. -By contrast, a **low-cardinality partitioning key**—with fewer than 100 - 1,000 distinct values - is usually optimal. It enables efficient part merging, keeps metadata overhead low, and avoids excessive object creation in storage. 
In addition, ClickHouse automatically builds MinMax indexes on partition columns, which can significantly speed up queries that filter on those columns. For example, filtering by month when the table is partitioned by `toStartOfMonth(date)` allows the engine to skip irrelevant partitions and their ^^parts^^ entirely. +By contrast, a **low-cardinality ^^partitioning key^^**—with fewer than 100 - 1,000 distinct values - is usually optimal. It enables efficient part merging, keeps metadata overhead low, and avoids excessive object creation in storage. In addition, ClickHouse automatically builds MinMax indexes on partition columns, which can significantly speed up queries that filter on those columns. For example, filtering by month when the table is partitioned by `toStartOfMonth(date)` allows the engine to skip irrelevant partitions and their ^^parts^^ entirely. -While partitioning can improve performance in some query patterns, it's primarily a data management feature. In many cases, querying across all partitions can be slower than using a non-partitioned table due to increased data fragmentation and more ^^parts^^ being scanned. Use partitioning judiciously, and always ensure that the chosen key is low-cardinality and aligns with your data life cycle policies (e.g., retention via TTL). If you're unsure whether partitioning is necessary, you may want to start without it and optimize later based on observed access patterns. +While partitioning can improve performance in some query patterns, it's primarily a data management feature. In many cases, querying across all partitions can be slower than using a non-partitioned table due to increased data fragmentation and more ^^parts^^ being scanned. Use partitioning judiciously, and always ensure that the chosen key is low-cardinality and aligns with your data life cycle policies (e.g., retention via ^^TTL^^). If you're unsure whether partitioning is necessary, you may want to start without it and optimize later based on observed access patterns. diff --git a/docs/concepts/glossary.md b/docs/concepts/glossary.md index 5f3352f488d..d4ec252617b 100644 --- a/docs/concepts/glossary.md +++ b/docs/concepts/glossary.md @@ -11,7 +11,11 @@ slug: /concepts/glossary ## Atomicity {#atomicity} -Atomicity ensures that a transaction (a series of database operations) is treated as a single, indivisible unit. This means that either all operations within the transaction occur, or none do. An example of an atomic transaction is transferring money from one bank account to another. If either step of the transfer fails, the transaction fails, and the money stays in the first account. Atomicity ensures no money is lost or created. +Atomicity ensures that a transaction (a series of database operations) is treated as a single, indivisible unit. This means that either all operations within the transaction occur, or none do. An example of an atomic transaction is transferring money from one bank account to another. If either step of the transfer fails, the transaction fails, and the money stays in the first account. Atomicity ensures no money is lost or created. + +## Block {#block} + +A block is a logical unit for organizing data processing and storage. Each block contains columnar data which is processed together to enhance performance during query execution. By processing data in blocks, ClickHouse utilizes CPU cores efficiently by minimizing cache misses and facilitating vectorized execution. 
ClickHouse uses various compression algorithms, such as LZ4, ZSTD, and Delta, to compress data in blocks. ## Cluster {#cluster} @@ -19,20 +23,88 @@ A collection of nodes (servers) that work together to store and process data. ## CMEK {#cmek} -Customer-managed encryption keys (CMEK) allow customers to use their key-management service (KMS) key to encrypt the ClickHouse disk data key and protect their data at rest. +Customer-managed encryption keys (CMEK) allow customers to use their key-management service (KMS) key to encrypt the ClickHouse disk data key and protect their data at rest. ## Dictionary {#dictionary} A dictionary is a mapping of key-value pairs that is useful for various types of reference lists. It is a powerful feature that allows for the efficient use of dictionaries in queries, which is often more efficient than using a `JOIN` with reference tables. +## Distributed table {#distributed-table} + +A distributed table in ClickHouse is a special type of table that does not store data itself but provides a unified view for distributed query processing across multiple servers in a cluster. + +## Granule {#granule} + +A granule is a batch of rows in an uncompressed block. When reading data, ClickHouse accesses granules, but not individual rows, which enables faster data processing in analytical workloads. A granule contains 8192 rows by default. The primary index contains one entry per granule. + +## Incremental materialized view {#incremental-materialized-view} + +In ClickHouse, an incremental materialized view is a type of materialized view that processes and aggregates data at insert time. When new data is inserted into the source table, the materialized view executes a predefined SQL aggregation query only on the newly inserted blocks and writes the aggregated results to a target table. + +## Lightweight update {#lightweight-update} + +A lightweight update in ClickHouse is an experimental feature that allows you to update rows in a table using standard SQL UPDATE syntax, but instead of rewriting entire columns or data parts (as with traditional mutations), it creates "patch parts" containing only the updated columns and rows. These updates are immediately visible in SELECT queries through patch application, but the physical data is only updated during subsequent merges. + +## Materialized view {#materialized-view} + +A materialized view in ClickHouse is a mechanism that automatically runs a query on data as it is inserted into a source table, storing the transformed or aggregated results in a separate target table for faster querying. + +## MergeTree {#mergetree} + +A MergeTree in ClickHouse is a table engine designed for high data ingest rates and large data volumes. It is the core storage engine in ClickHouse, providing features such as columnar storage, custom partitioning, sparse primary indexes, and support for background data merges. + +## Mutation {#mutation} + +A mutation in ClickHouse refers to an operation that modifies or deletes existing data in a table, typically using commands like ALTER TABLE ... UPDATE or ALTER TABLE ... DELETE. Mutations are implemented as asynchronous background processes that rewrite entire data parts affected by the change, rather than modifying rows in place. + +## On-the-fly mutation {#on-the-fly-mutation} + +On-the-fly mutations in ClickHouse are a mechanism that allows updates or deletes to be visible in subsequent SELECT queries immediately after the mutation is submitted, without waiting for the background mutation process to finish.
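+As a rough illustration only (the `events` table and its columns are hypothetical, not part of this glossary), both kinds of change are issued with standard `ALTER TABLE` syntax:
+
+```sql
+-- A classic mutation: affected data parts are rewritten in the background
+ALTER TABLE events DELETE WHERE event_date < '2024-01-01';
+
+-- An update mutation: with on-the-fly mutations enabled, the change is
+-- visible to SELECT queries before the background rewrite finishes
+ALTER TABLE events UPDATE status = 'archived' WHERE user_id = 42;
+```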
+ ## Parts {#parts} A physical file on a disk that stores a portion of the table's data. This is different from a partition, which is a logical division of a table's data that is created using a partition key. +## Partitioning key {#partitioning-key} + +A partitioning key in ClickHouse is a SQL expression defined in the PARTITION BY clause when creating a table. It determines how data is logically grouped into partitions on disk. Each unique value of the partitioning key forms its own physical partition, allowing for efficient data management operations such as dropping, moving, or archiving entire partitions. + +## Primary key {#primary-key} + +In ClickHouse, a primary key determines the order in which data is stored on disk and is used to build a sparse index that speeds up query filtering. Unlike traditional databases, the primary key in ClickHouse does not enforce uniqueness—multiple rows can have the same primary key value. + +## Projection {#projection} + +A projection in ClickHouse is a hidden, automatically maintained table that stores data in a different order or with precomputed aggregations to speed up queries, especially those filtering on columns not in the main primary key. + +## Refreshable materialized view {#refreshable-materialized-view} + +Refreshable materialized view is a type of materialized view that periodically re-executes its query over the full dataset and stores the result in a target table. Unlike incremental materialized views, refreshable materialized views are updated on a schedule and can support complex queries, including JOINs and UNIONs, without restrictions. + ## Replica {#replica} A copy of the data stored in a ClickHouse database. You can have any number of replicas of the same data for redundancy and reliability. Replicas are used in conjunction with the ReplicatedMergeTree table engine, which enables ClickHouse to keep multiple copies of data in sync across different servers. ## Shard {#shard} -A subset of data. ClickHouse always has at least one shard for your data. If you do not split the data across multiple servers, your data will be stored in one shard. Sharding data across multiple servers can be used to divide the load if you exceed the capacity of a single server. +A subset of data. ClickHouse always has at least one shard for your data. If you do not split the data across multiple servers, your data will be stored in one shard. Sharding data across multiple servers can be used to divide the load if you exceed the capacity of a single server. + +## Skipping index {#skipping-index} + +Skipping indices are used to store small amounts of metadata at the level of multiple consecutive granules which allows ClickHouse to avoid scanning irrelevant rows. Skipping indices provide a lightweight alternative to projections. + +## Sorting key {#sorting-key} + +In ClickHouse, a sorting key defines the physical order of rows on disk. If you do not specify a primary key, ClickHouse uses the sorting key as the primary key. If you specify both, the primary key must be a prefix of the sorting key. + +## Sparse index {#sparse-index} + +A type of indexing when the primary index contains one entry for a group of rows, rather than a single row. The entry that corresponds to a group of rows is referred to as a mark. With sparse indexes, ClickHouse first identifies groups of rows that potentially match the query and then processes them separately to find a match. Because of this, the primary index is small enough to be loaded into the memory. 
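+A minimal sketch of how these keys fit together (the `user_events` table below is hypothetical, for illustration only):
+
+```sql
+CREATE TABLE user_events
+(
+    user_id UInt64,
+    ts      DateTime,
+    url     String
+)
+ENGINE = MergeTree
+ORDER BY (user_id, ts)  -- sorting key: physical row order on disk
+PRIMARY KEY (user_id);  -- optional; must be a prefix of the sorting key
+```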
+ +## Table engine {#table-engine} + +Table engines in ClickHouse determine how data is written, stored and accessed. MergeTree is the most common table engine, and allows quick insertion of large amounts of data which get processed in the background. + +## TTL {#ttl} + +Time To Live (TTL) is A ClickHouse feature that automatically moves, deletes, or rolls up columns or rows after a certain time period. This allows you to manage storage more efficiently because you can delete, move, or archive the data that you no longer need to access frequently. \ No newline at end of file diff --git a/docs/concepts/why-clickhouse-is-so-fast.mdx b/docs/concepts/why-clickhouse-is-so-fast.mdx index 006e504920a..26b71db8b62 100644 --- a/docs/concepts/why-clickhouse-is-so-fast.mdx +++ b/docs/concepts/why-clickhouse-is-so-fast.mdx @@ -47,7 +47,7 @@ Unlike other databases, ClickHouse keeps data writes lightweight and efficient b - **Aggregating merges** which combine intermediate aggregation states in the input part to a new aggregation state. While this seems difficult to understand, it really actually only implements an incremental aggregation. -- **TTL (time-to-live) merges** compress, move, or delete rows based on certain time-based rules. +- **^^TTL^^ (time-to-live) merges** compress, move, or delete rows based on certain time-based rules. The point of these transformations is to shift work (computation) from the time user queries run to merge time. This is important for two reasons: @@ -63,9 +63,9 @@ On the other hand, the majority of the runtime of merges is consumed by loading In practice, many queries are repetitive, i.e., run unchanged or only with slight modifications (e.g. different parameter values) in periodic intervals. Running the same or similar queries again and again allows adding indexes or re-organize the data in a way that frequent queries can access it faster. This approach is also known as "data pruning" and ClickHouse provides three techniques for that: -1. [Primary key indexes](/guides/best-practices/sparse-primary-indexes#clickhouse-index-design) which define the sort order of the table data. A well-chosen primary key allows to evaluate filters (like the WHERE clauses in the above query) using fast binary searches instead of full-column scans. In more technical terms, the runtime of scans becomes logarithmic instead of linear in the data size. +1. [Primary key indexes](/guides/best-practices/sparse-primary-indexes#clickhouse-index-design) which define the sort order of the table data. A well-chosen ^^primary key^^ allows to evaluate filters (like the WHERE clauses in the above query) using fast binary searches instead of full-column scans. In more technical terms, the runtime of scans becomes logarithmic instead of linear in the data size. -2. [Table projections](/sql-reference/statements/alter/projection) as alternative, internal versions of a table, storing the same data but sorted by a different primary key. Projections can be useful when there is more than one frequent filter condition. +2. [Table projections](/sql-reference/statements/alter/projection) as alternative, internal versions of a table, storing the same data but sorted by a different ^^primary key^^. Projections can be useful when there is more than one frequent filter condition. 3. [Skipping indexes](/optimize/skipping-indexes) that embed additional data statistics into columns, e.g. the minimum and maximum column value, the set of unique values, etc. 
Skipping indexes are orthogonal to primary keys and table projections, and depending on the data distribution in the column, they can greatly speed up the evaluation of filters. diff --git a/docs/deployment-guides/parallel-replicas.mdx b/docs/deployment-guides/parallel-replicas.mdx index a6b3694956c..7d7d33a436f 100644 --- a/docs/deployment-guides/parallel-replicas.mdx +++ b/docs/deployment-guides/parallel-replicas.mdx @@ -32,27 +32,27 @@ multiple replicas for its execution. In a shared-nothing architecture, clusters are commonly split into multiple shards, with each ^^shard^^ containing a subset of the overall data. A -distributed table sits on top of these shards, providing a unified view of the +^^distributed table^^ sits on top of these shards, providing a unified view of the complete data. Reads can be sent to the local table. Query execution will occur only -on the specified ^^shard^^, or it can be sent to the distributed table, and in that +on the specified ^^shard^^, or it can be sent to the ^^distributed table^^, and in that case, each ^^shard^^ will execute the given queries. The server where the distributed table was queried will aggregate the data and respond to the client: -The figure above visualizes what happens when a client queries a distributed table: +The figure above visualizes what happens when a client queries a ^^distributed table^^:
    1. - The select query is sent to a distributed table on a node arbitrarily + The select query is sent to a ^^distributed table^^ on a node arbitrarily (via a round-robin strategy or after being routed to a specific server by a load balancer). This node is now going to act as a coordinator.
    2. The node will locate each ^^shard^^ that needs to execute the query - via the information specified by the distributed table, and the query is + via the information specified by the ^^distributed table^^, and the query is sent to each ^^shard^^.
    3. diff --git a/docs/getting-started/quick-start/cloud.mdx b/docs/getting-started/quick-start/cloud.mdx index bd2efc0ff31..3a56046b7f7 100644 --- a/docs/getting-started/quick-start/cloud.mdx +++ b/docs/getting-started/quick-start/cloud.mdx @@ -168,15 +168,15 @@ of primary keys might seem unexpected!): - primary keys in ClickHouse are **_not unique_** for each row in a table -The primary key of a ClickHouse table determines how the data is sorted when written to disk. Every 8,192 rows or 10MB of -data (referred to as the **index granularity**) creates an entry in the primary key index file. This granularity concept -creates a **sparse index** that can easily fit in memory, and the granules represent a stripe of the smallest amount of +The ^^primary key^^ of a ClickHouse table determines how the data is sorted when written to disk. Every 8,192 rows or 10MB of +data (referred to as the **index granularity**) creates an entry in the ^^primary key^^ index file. This granularity concept +creates a **^^sparse index^^** that can easily fit in memory, and the granules represent a stripe of the smallest amount of column data that gets processed during `SELECT` queries. -The primary key can be defined using the `PRIMARY KEY` parameter. If you define a table without a `PRIMARY KEY` specified, -then the key becomes the tuple specified in the `ORDER BY` clause. If you specify both a `PRIMARY KEY` and an `ORDER BY`, the primary key must be a subset of the sort order. +The ^^primary key^^ can be defined using the `PRIMARY KEY` parameter. If you define a table without a `PRIMARY KEY` specified, +then the key becomes the tuple specified in the `ORDER BY` clause. If you specify both a `PRIMARY KEY` and an `ORDER BY`, the ^^primary key^^ must be a subset of the sort order. -The primary key is also the sorting key, which is a tuple of `(user_id, timestamp)`. Therefore, the data stored in each +The ^^primary key^^ is also the ^^sorting key^^, which is a tuple of `(user_id, timestamp)`. Therefore, the data stored in each column file will be sorted by `user_id`, then `timestamp`. For a deep dive into core ClickHouse concepts, see ["Core Concepts"](../../managing-data/core-concepts/index.md). diff --git a/docs/managing-data/core-concepts/academic_overview.mdx b/docs/managing-data/core-concepts/academic_overview.mdx index 53f5e4a67d7..e854ee7ba37 100644 --- a/docs/managing-data/core-concepts/academic_overview.mdx +++ b/docs/managing-data/core-concepts/academic_overview.mdx @@ -36,7 +36,7 @@ This is the web version of our [VLDB 2024 scientific paper](https://www.vldb.org Over the past several decades, the amount of data being stored and analyzed has increased exponentially. Businesses across industries and sectors have begun relying on this data to improve products, evaluate performance, and make business-critical decisions. However, as data volumes have increasingly become internetscale, businesses have needed to manage historical and new data in a cost-effective and scalable manner, while analyzing it using a high number of concurrent queries and an expectation of real-time latencies (e.g. less than one second, depending on the use case). -This paper presents an overview of ClickHouse, a popular opensource OLAP database designed for high-performance analytics over petabyte-scale data sets with high ingestion rates. Its storage layer combines a data format based on traditional log-structured merge (LSM) trees with novel techniques for continuous transformation (e.g. 
aggregation, archiving) of historical data in the background. Queries are written in a convenient SQL dialect and processed by a state-of-the-art vectorized query execution engine with optional code compilation. ClickHouse makes aggressive use of pruning techniques to avoid evaluating irrelevant data in queries. Other data management systems can be integrated at the table function, table engine, or database engine level. Real-world benchmarks demonstrate that ClickHouse is amongst the fastest analytical databases on the market. +This paper presents an overview of ClickHouse, a popular opensource OLAP database designed for high-performance analytics over petabyte-scale data sets with high ingestion rates. Its storage layer combines a data format based on traditional log-structured merge (LSM) trees with novel techniques for continuous transformation (e.g. aggregation, archiving) of historical data in the background. Queries are written in a convenient SQL dialect and processed by a state-of-the-art vectorized query execution engine with optional code compilation. ClickHouse makes aggressive use of pruning techniques to avoid evaluating irrelevant data in queries. Other data management systems can be integrated at the table function, ^^table engine^^, or database engine level. Real-world benchmarks demonstrate that ClickHouse is amongst the fastest analytical databases on the market. ## 1 INTRODUCTION {#1-introduction} @@ -68,35 +68,35 @@ As shown by [Figure 2,](#page-2-0) the ClickHouse engine is split into three mai Query processing follows the traditional paradigm of parsing incoming queries, building and optimizing logical and physical query plans, and execution. ClickHouse uses a vectorized execution model similar to MonetDB/X100 [\[11\]](#page-12-0), in combination with opportunistic code compilation [\[53\]](#page-13-0). Queries can be written in a feature-rich SQL dialect, PRQL [\[76\]](#page-13-1), or Kusto's KQL [\[50\]](#page-13-2). -The storage layer consists of different table engines that encapsulate the format and location of table data. Table engines fall into three categories: The first category is the MergeTree* family of table engines which represent the primary persistence format in ClickHouse. Based on the idea of LSM trees [\[60\]](#page-13-3), tables are split into horizontal, sorted ^^parts^^, which are continuously merged by a background process. Individual MergeTree* table engines differ in the way the merge combines the rows from its input ^^parts^^. For example, rows can be aggregated or replaced, if outdated. +The storage layer consists of different table engines that encapsulate the format and location of table data. Table engines fall into three categories: The first category is the ^^MergeTree^^* family of table engines which represent the primary persistence format in ClickHouse. Based on the idea of LSM trees [\[60\]](#page-13-3), tables are split into horizontal, sorted ^^parts^^, which are continuously merged by a background process. Individual ^^MergeTree^^* table engines differ in the way the merge combines the rows from its input ^^parts^^. For example, rows can be aggregated or replaced, if outdated. -The second category are special-purpose table engines, which are used to speed up or distribute query execution. This category includes in-memory key-value table engines called dictionaries. A [dictionary](https://clickhou.se/dictionaries) caches the result of a query periodically executed against an internal or external data source. 
This significantly reduces access latencies in scenarios, where a degree of data staleness can be tolerated. Other examples of special-purpose table engines include a pure in-memory engine used for temporary tables and the Distributed table engine for transparent data sharding (see below). +The second category are special-purpose table engines, which are used to speed up or distribute query execution. This category includes in-memory key-value table engines called dictionaries. A [dictionary](https://clickhou.se/dictionaries) caches the result of a query periodically executed against an internal or external data source. This significantly reduces access latencies in scenarios, where a degree of data staleness can be tolerated. Other examples of special-purpose table engines include a pure in-memory engine used for temporary tables and the ^^Distributed table^^ engine for transparent data sharding (see below). The third category of table engines are virtual table engines for bidirectional data exchange with external systems such as relational databases (e.g. PostgreSQL, MySQL), publish/subscribe systems (e.g. Kafka, RabbitMQ [\[24\]](#page-12-1)), or key/value stores (e.g. Redis). Virtual engines can also interact with data lakes (e.g. Iceberg, DeltaLake, Hudi [\[36\]](#page-12-2)) or files in object storage (e.g. AWS S3, Google GCP). -ClickHouse supports sharding and replication of tables across multiple ^^cluster^^ nodes for scalability and availability. Sharding partitions a table into a set of table shards according to a sharding expression. The individual shards are mutually independent tables and typically located on different nodes. Clients can read and write shards directly, i.e. treat them as separate tables, or use the Distributed special table engine, which provides a global view of all table shards. The main purpose of sharding is to process data sets which exceed the capacity of individual nodes (typically, a few dozens terabytes of data). Another use of sharding is to distribute the read-write load for a table over multiple nodes, i.e., load balancing. Orthogonal to that, a ^^shard^^ can be replicated across multiple nodes for tolerance against node failures. To that end, each Merge-Tree* table engine has a corresponding ReplicatedMergeTree* engine which uses a multi-master coordination scheme based on Raft consensus [\[59\]](#page-13-4) (implemented by [Keeper](https://clickhou.se/keeper), a drop-in replacement for Apache Zookeeper written in C++) to guarantee that every ^^shard^^ has, at all times, a configurable number of replicas. Section [3.6](#page-5-0) discusses the replication mechanism in detail. As an example, [Figure 2](#page-2-0) shows a table with two shards, each replicated to two nodes. +ClickHouse supports sharding and replication of tables across multiple ^^cluster^^ nodes for scalability and availability. Sharding partitions a table into a set of table shards according to a sharding expression. The individual shards are mutually independent tables and typically located on different nodes. Clients can read and write shards directly, i.e. treat them as separate tables, or use the Distributed special ^^table engine^^, which provides a global view of all table shards. The main purpose of sharding is to process data sets which exceed the capacity of individual nodes (typically, a few dozens terabytes of data). Another use of sharding is to distribute the read-write load for a table over multiple nodes, i.e., load balancing. 
Orthogonal to that, a ^^shard^^ can be replicated across multiple nodes for tolerance against node failures. To that end, each Merge-Tree* ^^table engine^^ has a corresponding ReplicatedMergeTree* engine which uses a multi-master coordination scheme based on Raft consensus [\[59\]](#page-13-4) (implemented by [Keeper](https://clickhou.se/keeper), a drop-in replacement for Apache Zookeeper written in C++) to guarantee that every ^^shard^^ has, at all times, a configurable number of replicas. Section [3.6](#page-5-0) discusses the replication mechanism in detail. As an example, [Figure 2](#page-2-0) shows a table with two shards, each replicated to two nodes. Finally, the ClickHouse database engine can be operated in on-premise, cloud, standalone, or in-process modes. In the on-premise mode, users set up ClickHouse locally as a single server or multinode ^^cluster^^ with sharding and/or replication. Clients communicate with the database over the native, MySQL's, PostgreSQL's binary wire protocols, or an HTTP REST API. The cloud mode is represented by ClickHouse Cloud, a fully managed and autoscaling DBaaS offering. While this paper focuses on the on-premise mode, we plan to describe the architecture of ClickHouse Cloud in a follow-up publication. The [standalone mode](https://clickhou.se/local-fastest-tool) turns ClickHouse into a command line utility for analyzing and transforming files, making it a SQL-based alternative to Unix tools like cat and grep. While this requires no prior configuration, the standalone mode is restricted to a single server. Recently, an in-process mode called chDB [\[15\]](#page-12-3) has been developed for interactive data analysis use cases like Jupyter notebooks [\[37\]](#page-12-4) with Pandas dataframes [\[61\]](#page-13-5). Inspired by DuckDB [\[67\]](#page-13-6), [chDB](https://clickhou.se/chdb-rocket-engine) embeds ClickHouse as a high-performance OLAP engine into a host process. Compared to the other modes, this allows to pass source and result data between the database engine and the application efficiently without copying as they run in the same address space. ## 3 STORAGE LAYER {#3-storage-layer} -This section discusses MergeTree* table engines as ClickHouse's native storage format. We describe their on-disk representation and discuss three data pruning techniques in ClickHouse. Afterwards, we present merge strategies which continuously transform data without impacting simultaneous inserts. Finally, we explain how updates and deletes are implemented, as well as data deduplication, data replication, and ACID compliance. +This section discusses ^^MergeTree^^* table engines as ClickHouse's native storage format. We describe their on-disk representation and discuss three data pruning techniques in ClickHouse. Afterwards, we present merge strategies which continuously transform data without impacting simultaneous inserts. Finally, we explain how updates and deletes are implemented, as well as data deduplication, data replication, and ACID compliance. ### 3.1 On-Disk Format {#3-1-on-disk-format} -Each table in the MergeTree* table engine is organized as a collection of immutable table ^^parts^^. A part is created whenever a set of rows is inserted into the table. ^^Parts^^ are self-contained in the sense that they include all metadata required to interpret their content without additional lookups to a central catalog. 
To keep the number of ^^parts^^ per table low, a background merge job periodically combines multiple smaller ^^parts^^ into a larger part until a configurable part size is reached (150 GB by default). Since ^^parts^^ are sorted by the table's primary key columns (see Section [3.2)](#page-3-0), efficient k-way merge sort [\[40\]](#page-12-5) is used for merging. The source ^^parts^^ are marked as inactive and eventually deleted as soon as their reference count drops to zero, i.e. no further queries read from them. +Each table in the ^^MergeTree^^* ^^table engine^^ is organized as a collection of immutable table ^^parts^^. A part is created whenever a set of rows is inserted into the table. ^^Parts^^ are self-contained in the sense that they include all metadata required to interpret their content without additional lookups to a central catalog. To keep the number of ^^parts^^ per table low, a background merge job periodically combines multiple smaller ^^parts^^ into a larger part until a configurable part size is reached (150 GB by default). Since ^^parts^^ are sorted by the table's ^^primary key^^ columns (see Section [3.2)](#page-3-0), efficient k-way merge sort [\[40\]](#page-12-5) is used for merging. The source ^^parts^^ are marked as inactive and eventually deleted as soon as their reference count drops to zero, i.e. no further queries read from them. Rows can be inserted in two modes: In synchronous insert mode, each INSERT statement creates a new part and appends it to the table. To minimize the overhead of merges, database clients are encouraged to insert tuples in bulk, e.g. 20,000 rows at once. However, delays caused by client-side batching are often unacceptable if the data should be analyzed in real-time. For example, observability use cases frequently involve thousands of monitoring agents continuously sending small amounts of event and metrics data. Such scenarios can utilize the asynchronous insert mode, in which ClickHouse buffers rows from multiple incoming INSERTs into the same table and creates a new part only after the buffer size exceeds a configurable threshold or a timeout expires. -Figure 3: Inserts and merges for MergeTree*-engine tables. +Figure 3: Inserts and merges for ^^MergeTree^^*-engine tables. -[Figure 3](#page-2-1) illustrates four synchronous and two asynchronous inserts into a MergeTree*-engine table. Two merges reduced the number of active ^^parts^^ from initially fve to two. +[Figure 3](#page-2-1) illustrates four synchronous and two asynchronous inserts into a ^^MergeTree^^*-engine table. Two merges reduced the number of active ^^parts^^ from initially fve to two. Compared to LSM trees [\[58\]](#page-13-7) and their implementation in various databases [\[13,](#page-12-6) [26,](#page-12-7) [56\]](#page-13-8), ClickHouse treats all ^^parts^^ as equal instead of arranging them in a hierarchy. As a result, merges are no longer limited to ^^parts^^ in the same level. Since this also forgoes the implicit chronological ordering of ^^parts^^, alternative mechanisms for updates and deletes not based on tombstones are required (see Section [3.4)](#page-4-0). ClickHouse writes inserts directly to disk while other LSM-treebased stores typically use write-ahead logging (see Section [3.7)](#page-5-1). -A part corresponds to a directory on disk, containing one file for each column. As an optimization, the columns of a small part (smaller than 10 MB by default) are stored consecutively in a single file to increase the spatial locality for reads and writes. 
The rows of a part are further logically divided into groups of 8192 records, called granules. A granule represents the smallest indivisible data unit processed by the scan and index lookup operators in ClickHouse. Reads and writes of on-disk data are, however, not performed at the granule level but at the granularity of blocks, which combine multiple neighboring granules within a column. New blocks are formed based on a configurable byte size per block (by default 1 MB), i.e., the number of granules in a block is variable and depends on the column's data type and distribution. Blocks are furthermore compressed to reduce their size and I/O costs. By default, ClickHouse employs LZ4 [\[75\]](#page-13-9) as a general-purpose compression algorithm, but users can also specify specialized codecs like Gorilla [\[63\]](#page-13-10) or FPC [\[12\]](#page-12-8) for floating-point data. Compression algorithms can also be chained. For example, it is possible to first reduce logical redundancy in numeric values using delta coding [\[23\]](#page-12-9), then perform heavy-weight compression, and finally encrypt the data using an AES codec. Blocks are decompressed on-the-fy when they are loaded from disk into memory. To enable fast random access to individual granules despite compression, ClickHouse additionally stores for each column a mapping that associates every granule id with the offset of its containing compressed block in the column file and the offset of the granule in the uncompressed block. +A part corresponds to a directory on disk, containing one file for each column. As an optimization, the columns of a small part (smaller than 10 MB by default) are stored consecutively in a single file to increase the spatial locality for reads and writes. The rows of a part are further logically divided into groups of 8192 records, called granules. A ^^granule^^ represents the smallest indivisible data unit processed by the scan and index lookup operators in ClickHouse. Reads and writes of on-disk data are, however, not performed at the ^^granule^^ level but at the granularity of blocks, which combine multiple neighboring granules within a column. New blocks are formed based on a configurable byte size per ^^block^^ (by default 1 MB), i.e., the number of granules in a ^^block^^ is variable and depends on the column's data type and distribution. Blocks are furthermore compressed to reduce their size and I/O costs. By default, ClickHouse employs LZ4 [\[75\]](#page-13-9) as a general-purpose compression algorithm, but users can also specify specialized codecs like Gorilla [\[63\]](#page-13-10) or FPC [\[12\]](#page-12-8) for floating-point data. Compression algorithms can also be chained. For example, it is possible to first reduce logical redundancy in numeric values using delta coding [\[23\]](#page-12-9), then perform heavy-weight compression, and finally encrypt the data using an AES codec. Blocks are decompressed on-the-fly when they are loaded from disk into memory. To enable fast random access to individual granules despite compression, ClickHouse additionally stores for each column a mapping that associates every ^^granule^^ id with the offset of its containing compressed ^^block^^ in the column file and the offset of the ^^granule^^ in the uncompressed ^^block^^. 
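A rough sketch of how such a codec chain can be declared (the table and column names below are invented for illustration, and the AES codec additionally requires encryption keys to be configured on the server):

```sql
CREATE TABLE codec_example
(
    ts      DateTime,
    value   Float64 CODEC(Gorilla, ZSTD),                -- specialized float codec, then general-purpose compression
    counter UInt64  CODEC(Delta, ZSTD, AES_128_GCM_SIV)  -- delta coding, heavy-weight compression, then encryption
)
ENGINE = MergeTree
ORDER BY ts;
```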
Columns can further be ^^dictionary^^-encoded [\[2,](#page-12-10) [77,](#page-13-11) [81\]](#page-13-12) or made nullable using two special wrapper data types: LowCardinality(T) replaces the original column values by integer ids and thus significantly reduces the storage overhead for data with few unique values. Nullable(T) adds an internal bitmap to column T, representing whether column values are NULL or not. @@ -106,33 +106,33 @@ Finally, tables can be range, hash, or round-robin partitioned using arbitrary p In most use cases, scanning petabytes of data just to answer a single query is too slow and expensive. ClickHouse supports three data pruning techniques that allow skipping the majority of rows during searches and therefore speed up queries significantly. -First, users can define a **primary key index** for a table. The primary key columns determine the sort order of the rows within each part, i.e. the index is locally clustered. ClickHouse additionally stores, for every part, a mapping from the primary key column values of each granule's first row to the granule's id, i.e. the index is sparse [\[31\]](#page-12-13). The resulting data structure is typically small enough to remain fully in-memory, e.g., only 1000 entries are required to index 8.1 million rows. The main purpose of a primary key is to evaluate equality and range predicates for frequently filtered columns using binary search instead of sequential scans (Section [4.4)](#page-7-0). The local sorting can furthermore be exploited for part merges and query optimization, e.g. sort-based aggregation or to remove sorting operators from the physical execution plan when the primary key columns form a prefix of the sorting columns. +First, users can define a **^^primary key^^ index** for a table. The ^^primary key^^ columns determine the sort order of the rows within each part, i.e. the index is locally clustered. ClickHouse additionally stores, for every part, a mapping from the ^^primary key^^ column values of each ^^granule^^'s first row to the ^^granule^^'s id, i.e. the index is sparse [\[31\]](#page-12-13). The resulting data structure is typically small enough to remain fully in-memory, e.g., only 1000 entries are required to index 8.1 million rows. The main purpose of a ^^primary key^^ is to evaluate equality and range predicates for frequently filtered columns using binary search instead of sequential scans (Section [4.4)](#page-7-0). The local sorting can furthermore be exploited for part merges and query optimization, e.g. sort-based aggregation or to remove sorting operators from the physical execution plan when the ^^primary key^^ columns form a prefix of the sorting columns. -[Figure 4](#page-3-1) shows a primary key index on column EventTime for a table with page impression statistics. Granules that match the range predicate in the query can be found by binary searching the primary key index instead of scanning EventTime sequentially. +[Figure 4](#page-3-1) shows a ^^primary key^^ index on column EventTime for a table with page impression statistics. Granules that match the range predicate in the query can be found by binary searching the ^^primary key^^ index instead of scanning EventTime sequentially. -Figure 4: Evaluating filters with a primary key index. +Figure 4: Evaluating filters with a ^^primary key^^ index. -Second, users can create **table projections**, i.e., alternative versions of a table that contain the same rows sorted by a different primary key [\[71\]](#page-13-13). 
Projections allow to speed up queries that filter on columns different than the main table's primary key at the cost of an increased overhead for inserts, merges, and space consumption. By default, projections are populated lazily only from ^^parts^^ newly inserted into the main table but not from existing ^^parts^^ unless the user materializes the projection in full. The query optimizer chooses between reading from the main table or a projection based on estimated I/O costs. If no projection exists for a part, query execution falls back to the corresponding main table part. +Second, users can create **table projections**, i.e., alternative versions of a table that contain the same rows sorted by a different ^^primary key^^ [\[71\]](#page-13-13). Projections make it possible to speed up queries that filter on columns other than the main table's ^^primary key^^, at the cost of an increased overhead for inserts, merges, and space consumption. By default, projections are populated lazily only from ^^parts^^ newly inserted into the main table but not from existing ^^parts^^ unless the user materializes the ^^projection^^ in full. The query optimizer chooses between reading from the main table or a ^^projection^^ based on estimated I/O costs. If no ^^projection^^ exists for a part, query execution falls back to the corresponding main table part. -Third, **skipping indices** provide a lightweight alternative to projections. The idea of skipping indices is to store small amounts of metadata at the level of multiple consecutive granules which allows to avoid scanning irrelevant rows. Skipping indices can be created for arbitrary index expressions and using a configurable granularity, i.e. number of granules in a skipping index block. Available skipping index types include: 1. Min-max indices [\[51\]](#page-13-14), storing the minimum and maximum values of the index expression for each index block. This index type works well for locally clustered data with small absolute ranges, e.g. loosely sorted data. 2. Set indices, storing a configurable number of unique index block values. These indexes are best used with data with a small local cardinality, i.e. "clumped together" values. 3. Bloom filter indices [\[9\]](#page-12-14) build for row, token, or n-gram values with a configurable false positive rate. These indices support text search [\[73\]](#page-13-15), but unlike min-max and set indices, they cannot be used for range or negative predicates. +Third, **skipping indices** provide a lightweight alternative to projections. The idea of skipping indices is to store small amounts of metadata at the level of multiple consecutive granules, which makes it possible to avoid scanning irrelevant rows. Skipping indices can be created for arbitrary index expressions and using a configurable granularity, i.e. number of granules in a ^^skipping index^^ block. Available ^^skipping index^^ types include: 1. Min-max indices [\[51\]](#page-13-14), storing the minimum and maximum values of the index expression for each index ^^block^^. This index type works well for locally clustered data with small absolute ranges, e.g. loosely sorted data. 2. Set indices, storing a configurable number of unique index ^^block^^ values. These indexes are best used with data with a small local cardinality, i.e. "clumped together" values. 3. Bloom filter indices [\[9\]](#page-12-14) built for row, token, or n-gram values with a configurable false positive rate. 
These indices support text search [\[73\]](#page-13-15), but unlike min-max and set indices, they cannot be used for range or negative predicates. ### 3.3 Merge-time Data Transformation {#3-3-merge-time-data-transformation} Business intelligence and observability use cases often need to handle data generated at constantly high rates or in bursts. Also, recently generated data is typically more relevant for meaningful real-time insights than historical data. Such use cases require databases to sustain high data ingestion rates while continuously reducing the volume of historical data through techniques like aggregation or data aging. ClickHouse allows a continuous incremental transformation of existing data using different merge strategies. Merge-time data transformation does not compromise the performance of INSERT statements, but it cannot guarantee that tables never contain unwanted (e.g. outdated or non-aggregated) values. If necessary, all merge-time transformations can be applied at query time by specifying the keyword FINAL in SELECT statements. -**Replacing merges** retain only the most recently inserted version of a tuple based on the creation timestamp of its containing part, older versions are deleted. Tuples are considered equivalent if they have the same primary key column values. For explicit control over which tuple is preserved, it is also possible to specify a special version column for comparison. Replacing merges are commonly used as a merge-time update mechanism (normally in use cases where updates are frequent), or as an alternative to insert-time data deduplication (Section [3.5)](#page-5-2). +**Replacing merges** retain only the most recently inserted version of a tuple based on the creation timestamp of its containing part, older versions are deleted. Tuples are considered equivalent if they have the same ^^primary key^^ column values. For explicit control over which tuple is preserved, it is also possible to specify a special version column for comparison. Replacing merges are commonly used as a merge-time update mechanism (normally in use cases where updates are frequent), or as an alternative to insert-time data deduplication (Section [3.5)](#page-5-2). -**Aggregating merges** collapse rows with equal primary key column values into an aggregated row. Non-primary key columns must be of a partial aggregation state that holds the summary values. Two partial aggregation states, e.g. a sum and a count for avg(), are combined into a new partial aggregation state. Aggregating merges are typically used in materialized views instead of normal tables. Materialized views are populated based on a transformation query against a source table. Unlike other databases, ClickHouse does not refresh materialized views periodically with the entire content of the source table. Materialized views are rather updated incrementally with the result of the transformation query when a new part is inserted into the source table. +**Aggregating merges** collapse rows with equal ^^primary key^^ column values into an aggregated row. Non-^^primary key^^ columns must be of a partial aggregation state that holds the summary values. Two partial aggregation states, e.g. a sum and a count for avg(), are combined into a new partial aggregation state. Aggregating merges are typically used in materialized views instead of normal tables. Materialized views are populated based on a transformation query against a source table. 
Unlike other databases, ClickHouse does not refresh materialized views periodically with the entire content of the source table. Materialized views are rather updated incrementally with the result of the transformation query when a new part is inserted into the source table. -[Figure 5](#page-4-1) shows a materialized view defined on a table with page impression statistics. For new ^^parts^^ inserted into the source table, the transformation query computes the maximum and average latencies, grouped by region, and inserts the result into a materialized view. Aggregation functions avg() and max() with extension -State return partial aggregation states instead of actual results. An aggregating merge defined for the materialized view continuously combines partial aggregation states in different ^^parts^^. To obtain the final result, users consolidate the partial aggregation states in the materialized view using avg() and max()) with -Merge extension. +[Figure 5](#page-4-1) shows a ^^materialized view^^ defined on a table with page impression statistics. For new ^^parts^^ inserted into the source table, the transformation query computes the maximum and average latencies, grouped by region, and inserts the result into a ^^materialized view^^. Aggregation functions avg() and max() with extension -State return partial aggregation states instead of actual results. An aggregating merge defined for the ^^materialized view^^ continuously combines partial aggregation states in different ^^parts^^. To obtain the final result, users consolidate the partial aggregation states in the ^^materialized view^^ using avg() and max()) with -Merge extension. Figure 5: Aggregating merges in materialized views. -**TTL (time-to-live) merges** provide aging for historical data. Unlike deleting and aggregating merges, TTL merges process only one part at a time. TTL merges are defined in terms of rules with triggers and actions. A trigger is an expression computing a timestamp for every row, which is compared against the time at which the TTL merge runs. While this allows users to control actions at row granularity, we found it sufficient to check whether all rows satisfy a given condition and run the action on the entire part. Possible actions include 1. move the part to another volume (e.g. cheaper and slower storage), 2. re-compress the part (e.g. with a more heavy-weight codec), 3. delete the part, and 4. roll-up, i.e. aggregate the rows using a grouping key and aggregate functions. +**^^TTL^^ (time-to-live) merges** provide aging for historical data. Unlike deleting and aggregating merges, ^^TTL^^ merges process only one part at a time. ^^TTL^^ merges are defined in terms of rules with triggers and actions. A trigger is an expression computing a timestamp for every row, which is compared against the time at which the ^^TTL^^ merge runs. While this allows users to control actions at row granularity, we found it sufficient to check whether all rows satisfy a given condition and run the action on the entire part. Possible actions include 1. move the part to another volume (e.g. cheaper and slower storage), 2. re-compress the part (e.g. with a more heavy-weight codec), 3. delete the part, and 4. roll-up, i.e. aggregate the rows using a grouping key and aggregate functions. As an example, consider the logging table definition in [Listing 1.](#page-4-2) ClickHouse will move ^^parts^^ with timestamp column values older than one week to slow but inexpensive S3 object storage. 
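A rough sketch of what such a rule can look like (hypothetical table, column, volume, and policy names; a storage policy with an S3-backed volume is assumed to be configured on the server):

```sql
CREATE TABLE logs
(
    timestamp DateTime,
    message   String
)
ENGINE = MergeTree
ORDER BY timestamp
TTL timestamp + INTERVAL 1 WEEK TO VOLUME 'slow_s3'  -- move parts older than one week to the S3-backed volume
SETTINGS storage_policy = 'tiered';
```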
@@ -145,7 +145,7 @@ Listing 1: Move part to object storage after one week. ### 3.4 Updates and Deletes {#3-4-updates-and-deletes} -The design of the MergeTree* table engines favors append-only workloads, yet some use cases require to modify existing data occasionally, e.g. for regulatory compliance. Two approaches for updating or deleting data exist, neither of which block parallel inserts. +The design of the ^^MergeTree^^* table engines favors append-only workloads, yet some use cases occasionally require modifying existing data, e.g. for regulatory compliance. Two approaches for updating or deleting data exist, neither of which blocks parallel inserts. **Mutations** rewrite all ^^parts^^ of a table in-place. To prevent a table (delete) or column (update) from doubling temporarily in size, this operation is non-atomic, i.e. parallel SELECT statements may read mutated and non-mutated ^^parts^^. Mutations guarantee that the data is physically changed at the end of the operation. Delete mutations are still expensive as they rewrite all columns in all ^^parts^^. @@ -155,7 +155,7 @@ Update and delete operations performed on the same table are expected to be rare ### 3.5 Idempotent Inserts {#3-5-idempotent-inserts} -A problem that frequently occurs in practice is how clients should handle connection timeouts after sending data to the server for insertion into a table. In this situation, it is difficult for clients to distinguish between whether the data was successfully inserted or not. The problem is traditionally solved by re-sending the data from the client to the server and relying on primary key or unique constraints to reject duplicate inserts. Databases perform the required point lookups quickly using index structures based on binary trees [\[39,](#page-12-15) [68\]](#page-13-16), radix trees [\[45\]](#page-13-17), or hash tables [\[29\]](#page-12-16). Since these data structures index every tuple, their space and update overhead becomes prohibitive for large data sets and high ingest rates. +A problem that frequently occurs in practice is how clients should handle connection timeouts after sending data to the server for insertion into a table. In this situation, it is difficult for clients to determine whether the data was successfully inserted or not. The problem is traditionally solved by re-sending the data from the client to the server and relying on ^^primary key^^ or unique constraints to reject duplicate inserts. Databases perform the required point lookups quickly using index structures based on binary trees [\[39,](#page-12-15) [68\]](#page-13-16), radix trees [\[45\]](#page-13-17), or hash tables [\[29\]](#page-12-16). Since these data structures index every tuple, their space and update overhead becomes prohibitive for large data sets and high ingest rates. ClickHouse provides a more light-weight alternative based on the fact that each insert eventually creates a part. More specifically, the server maintains hashes of the N last inserted ^^parts^^ (e.g. N=100) and ignores re-inserts of ^^parts^^ with a known hash. Hashes for non-replicated tables are stored locally, while hashes for replicated tables are stored in Keeper. As a result, inserts become idempotent, i.e. clients can simply re-send the same batch of rows after a timeout and assume that the server takes care of deduplication. For more control over the deduplication process, clients can optionally provide an insert token that acts as a part hash. 
While hash-based deduplication incurs an overhead associated with hashing the new rows, the cost of storing and comparing hashes is negligible. @@ -221,11 +221,11 @@ Node 2 ... N in [Figure 8](#page-7-1) show plan fragments executed on other node This section presents selected key performance optimizations applied to different stages of query execution. -**Query optimization**. The first set of optimizations is applied on top of a semantic query representation obtained from the query's AST. Examples of such optimizations include constant folding (e.g. concat(lower('a'),upper('b')) becomes 'aB'), extracting scalars from certain aggregation functions (e.g. sum(a*2) becomes 2 * sum(a)), common subexpression elimination, and transforming disjunctions of equality filters to IN-lists (e.g. x=c OR x=d becomes x IN (c,d)). The optimized semantic query representation is subsequently transformed to a logical operator plan. Optimizations on top of the logical plan include filter pushdown, reordering function evaluation and sorting steps, depending on which one is estimated to be more expensive. Finally, the logical query plan is transformed into a physical operator plan. This transformation can exploit the particularities of the involved table engines. For example, in the case of a MergeTree*-table engine, if the ORDER BY columns form a prefix of the primary key, the data can be read in disk order, and sorting operators can be removed from the plan. Also, if the grouping columns in an aggregation form a prefix of the primary key, ClickHouse can use sort aggregation [\[33\]](#page-12-21), i.e. aggregate runs of the same value in the pre-sorted inputs directly. Compared to hash aggregation, sort aggregation is significantly less memory-intensive, and the aggregate value can be passed to the next operator immediately after a run has been processed. +**Query optimization**. The first set of optimizations is applied on top of a semantic query representation obtained from the query's AST. Examples of such optimizations include constant folding (e.g. concat(lower('a'),upper('b')) becomes 'aB'), extracting scalars from certain aggregation functions (e.g. sum(a*2) becomes 2 * sum(a)), common subexpression elimination, and transforming disjunctions of equality filters to IN-lists (e.g. x=c OR x=d becomes x IN (c,d)). The optimized semantic query representation is subsequently transformed to a logical operator plan. Optimizations on top of the logical plan include filter pushdown, reordering function evaluation and sorting steps, depending on which one is estimated to be more expensive. Finally, the logical query plan is transformed into a physical operator plan. This transformation can exploit the particularities of the involved table engines. For example, in the case of a ^^MergeTree^^*-^^table engine^^, if the ORDER BY columns form a prefix of the ^^primary key^^, the data can be read in disk order, and sorting operators can be removed from the plan. Also, if the grouping columns in an aggregation form a prefix of the ^^primary key^^, ClickHouse can use sort aggregation [\[33\]](#page-12-21), i.e. aggregate runs of the same value in the pre-sorted inputs directly. Compared to hash aggregation, sort aggregation is significantly less memory-intensive, and the aggregate value can be passed to the next operator immediately after a run has been processed. **Query compilation**. 
ClickHouse employs [query compilation based on LLVM](https://clickhou.se/jit) to dynamically fuse adjacent plan operators [\[38,](#page-12-22) [53\]](#page-13-0). For example, the expression a * b + c + 1 can be combined into a single operator instead of three operators. Besides expressions, ClickHouse also employs compilation to evaluate multiple aggregation functions at once (i.e. for GROUP BY) and for sorting with more than one sort key. Query compilation decreases the number of virtual calls, keeps data in registers or CPU caches, and helps the branch predictor as less code needs to execute. Additionally, runtime compilation enables a rich set of optimizations, such as logical optimizations and peephole optimizations implemented in compilers, and gives access to the fastest locally available CPU instructions. The compilation is initiated only when the same regular, aggregation, or sorting expression is executed by different queries more than a configurable number of times. Compiled query operators are cached and can be reused by future queries.[7] -**Primary key index evaluation**. ClickHouse evaluates WHERE conditions using the primary key index if a subset of filter clauses in the condition's conjunctive normal form constitutes a prefix of the primary key columns. The primary key index is analyzed left-to-right on lexicographically sorted ranges of key values. Filter clauses corresponding to a primary key column are evaluated using ternary logic - they are all true, all false, or mixed true/false for the values in the range. In the latter case, the range is split into sub-ranges which are analyzed recursively. Additional optimizations exist for functions in filter conditions. First, functions have traits describing their monotonicity, e.g, toDayOfMonth(date) is piecewise monotonic within a month. Monotonicity traits allow to infer if a function produces sorted results on sorted input key value ranges. Second, some functions can compute the preimage of a given function result. This is used to replace comparisons of constants with function calls on the key columns by comparing the key column value with the preimage. For example, toYear(k) = 2024 can be replaced by k >= 2024-01-01 && k < 2025-01-01. +**^^Primary key^^ index evaluation**. ClickHouse evaluates WHERE conditions using the ^^primary key^^ index if a subset of filter clauses in the condition's conjunctive normal form constitutes a prefix of the ^^primary key^^ columns. The ^^primary key^^ index is analyzed left-to-right on lexicographically sorted ranges of key values. Filter clauses corresponding to a ^^primary key^^ column are evaluated using ternary logic - they are all true, all false, or mixed true/false for the values in the range. In the latter case, the range is split into sub-ranges which are analyzed recursively. Additional optimizations exist for functions in filter conditions. First, functions have traits describing their monotonicity, e.g, toDayOfMonth(date) is piecewise monotonic within a month. Monotonicity traits allow to infer if a function produces sorted results on sorted input key value ranges. Second, some functions can compute the preimage of a given function result. This is used to replace comparisons of constants with function calls on the key columns by comparing the key column value with the preimage. For example, toYear(k) = 2024 can be replaced by k >= 2024-01-01 && k < 2025-01-01. **Data skipping**. 
ClickHouse tries to avoid data reads at query runtime using the data structures presented in Section [3.2.](#page-3-0) Additionally, filters on different columns are evaluated sequentially in order of descending estimated selectivity based on heuristics and (optional) column statistics. Only data chunks that contain at least one matching row are passed to the next predicate. This gradually decreases the amount of read data and the number of computations to be performed from predicate to predicate. The optimization is only applied when at least one highly selective predicate is present; otherwise, the latency of the query would deteriorate compared to an evaluation of all predicates in parallel. @@ -346,7 +346,7 @@ DuckDB [\[67\]](#page-13-6) is also meant to be embedded by a host process, but ## 8 CONCLUSION AND OUTLOOK {#8-conclusion-and-outlook} -We presented the architecture of ClickHouse, an open-source, highperformance OLAP database. With a write-optimized storage layer and a state-of-the-art vectorized query engine at its foundation, ClickHouse enables real-time analytics over petabyte-scale data sets with high ingestion rates. By merging and transforming data asynchronously in the background, ClickHouse efficiently decouples data maintenance and parallel inserts. Its storage layer enables aggressive data pruning using sparse primary indexes, skipping indexes, and projection tables. We described ClickHouse's implementation of updates and deletes, idempotent inserts, and data replication across nodes for high availability. The query processing layer optimizes queries using a wealth of techniques, and parallelizes execution across all server and ^^cluster^^ resources. Integration table engines and functions provide a convenient way to interact with other data management systems and data formats seamlessly. Through benchmarks, we demonstrate that ClickHouse is amongst the fastest analytical databases on the market, and we showed significant improvements in the performance of typical queries in real-world deployments of ClickHouse throughout the years. +We presented the architecture of ClickHouse, an open-source, high-performance OLAP database. With a write-optimized storage layer and a state-of-the-art vectorized query engine at its foundation, ClickHouse enables real-time analytics over petabyte-scale data sets with high ingestion rates. By merging and transforming data asynchronously in the background, ClickHouse efficiently decouples data maintenance and parallel inserts. Its storage layer enables aggressive data pruning using sparse primary indexes, skipping indexes, and ^^projection^^ tables. We described ClickHouse's implementation of updates and deletes, idempotent inserts, and data replication across nodes for high availability. The query processing layer optimizes queries using a wealth of techniques, and parallelizes execution across all server and ^^cluster^^ resources. Integration table engines and functions provide a convenient way to interact with other data management systems and data formats seamlessly. Through benchmarks, we demonstrate that ClickHouse is amongst the fastest analytical databases on the market, and we showed significant improvements in the performance of typical queries in real-world deployments of ClickHouse throughout the years. All features and enhancements planned for 2024 can be found on the public roadmap [\[18\]](#page-12-33). 
Planned improvements include support for user transactions, PromQL [\[69\]](#page-13-36) as an alternative query language, a new datatype for semi-structured data (e.g. JSON), better plan-level optimizations of joins, as well as an implementation of light-weight updates to complement light-weight deletes. diff --git a/docs/managing-data/core-concepts/index.md b/docs/managing-data/core-concepts/index.md index 326345cf1cb..4404a163dbe 100644 --- a/docs/managing-data/core-concepts/index.md +++ b/docs/managing-data/core-concepts/index.md @@ -10,9 +10,9 @@ you will learn some of the core concepts of how ClickHouse works. | Page | Description | |----------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| [Table parts](/parts) | Learn what table parts are in ClickHouse. | -| [Table partitions](/partitions) | Learn what table partitions are and what they are used for. | -| [Table part merges](/merges) | Learn what table part merges are and what they are used for. | -| [Table shards and replicas](/shards) | Learn what table shards and replicas are and what they are used for. | -| [Primary indexes](/primary-indexes) | Introduces ClickHouse's sparse primary index and how it helps efficiently skip unnecessary data during query execution. Explains how the index is built and used, with examples and tools for observing its effect. Links to a deep dive for advanced use cases and best practices. | -| [Architectural Overview](/academic_overview) | A concise academic overview of all components of the ClickHouse architecture, based on our VLDB 2024 scientific paper. | +| [Table parts](./parts.mdx) | Learn what table parts are in ClickHouse. | +| [Table partitions](./partitions.mdx) | Learn what table partitions are and what they are used for. | +| [Table part merges](./merges.mdx) | Learn what table part merges are and what they are used for. | +| [Table shards and replicas](./shards.mdx) | Learn what table shards and replicas are and what they are used for. | +| [Primary indexes](./primary-indexes.mdx) | Introduces ClickHouse's sparse primary index and how it helps efficiently skip unnecessary data during query execution. Explains how the index is built and used, with examples and tools for observing its effect. Links to a deep dive for advanced use cases and best practices. | +| [Architectural Overview](./academic_overview.mdx) | A concise academic overview of all components of the ClickHouse architecture, based on our VLDB 2024 scientific paper. | diff --git a/docs/managing-data/core-concepts/merges.md b/docs/managing-data/core-concepts/merges.mdx similarity index 65% rename from docs/managing-data/core-concepts/merges.md rename to docs/managing-data/core-concepts/merges.mdx index f1086bf6ff7..1aa05502085 100644 --- a/docs/managing-data/core-concepts/merges.md +++ b/docs/managing-data/core-concepts/merges.mdx @@ -27,7 +27,7 @@ ClickHouse [is fast](/concepts/why-clickhouse-is-so-fast) not just for queries b This makes data writes lightweight and [highly efficient](/concepts/why-clickhouse-is-so-fast#storage-layer-concurrent-inserts-are-isolated-from-each-other). 
-To control the number of parts per table and implement ② above, ClickHouse continuously merges ([per partition](/partitions#per-partition-merges)) smaller parts into larger ones in the background until they reach a compressed size of approximately [~150 GB](/operations/settings/merge-tree-settings#max_bytes_to_merge_at_max_space_in_pool). +To control the number of ^^parts^^ per table and implement ② above, ClickHouse continuously merges ([per partition](/partitions#per-partition-merges)) smaller ^^parts^^ into larger ones in the background until they reach a compressed size of approximately [~150 GB](/operations/settings/merge-tree-settings#max_bytes_to_merge_at_max_space_in_pool). The following diagram sketches this background merge process: @@ -35,11 +35,11 @@ The following diagram sketches this background merge process:
      -The `merge level` of a part is incremented by one with each additional merge. A level of `0` means the part is new and has not been merged yet. Parts that were merged into larger parts are marked as [inactive](/operations/system-tables/parts) and finally deleted after a [configurable](/operations/settings/merge-tree-settings#old_parts_lifetime) time (8 minutes by default). Over time, this creates a **tree** of merged parts. Hence the name [merge tree](/engines/table-engines/mergetree-family) table. +The `merge level` of a part is incremented by one with each additional merge. A level of `0` means the part is new and has not been merged yet. ^^Parts^^ that were merged into larger ^^parts^^ are marked as [inactive](/operations/system-tables/parts) and finally deleted after a [configurable](/operations/settings/merge-tree-settings#old_parts_lifetime) time (8 minutes by default). Over time, this creates a **tree** of merged ^^parts^^. Hence the name [merge tree](/engines/table-engines/mergetree-family) table. ## Monitoring merges {#monitoring-merges} -In the [what are table parts](/parts) example, we [showed](/parts#monitoring-table-parts) that ClickHouse tracks all table parts in the [parts](/operations/system-tables/parts) system table. We used the following query to retrieve the merge level and the number of stored rows per active part of the example table: +In the [what are table parts](/parts) example, we [showed](/parts#monitoring-table-parts) that ClickHouse tracks all table ^^parts^^ in the [parts](/operations/system-tables/parts) system table. We used the following query to retrieve the merge level and the number of stored rows per active part of the example table: ```sql SELECT name, @@ -50,7 +50,7 @@ WHERE (database = 'uk') AND (`table` = 'uk_price_paid_simple') AND active ORDER BY name ASC; ``` -The [previously documented](/parts#monitoring-table-parts) query result shows that the example table had four active parts, each created from a single merge of the initially inserted parts: +The [previously documented](/parts#monitoring-table-parts) query result shows that the example table had four active ^^parts^^, each created from a single merge of the initially inserted ^^parts^^: ```response ┌─name────────┬─level─┬────rows─┐ 1. 
│ all_0_5_1 │ 1 │ 6368414 │ @@ -60,7 +60,7 @@ The [previously documented](/parts#monitoring-table-parts) query result shows th └─────────────┴───────┴─────────┘ ``` -[Running](https://sql.clickhouse.com/?query=U0VMRUNUCiAgICBuYW1lLAogICAgbGV2ZWwsCiAgICByb3dzCkZST00gc3lzdGVtLnBhcnRzCldIRVJFIChkYXRhYmFzZSA9ICd1aycpIEFORCAoYHRhYmxlYCA9ICd1a19wcmljZV9wYWlkX3NpbXBsZScpIEFORCBhY3RpdmUKT1JERVIgQlkgbmFtZSBBU0M7&run_query=true&tab=results) the query now shows that the four parts have since merged into a single final part (as long as there are no further inserts into the table): +[Running](https://sql.clickhouse.com/?query=U0VMRUNUCiAgICBuYW1lLAogICAgbGV2ZWwsCiAgICByb3dzCkZST00gc3lzdGVtLnBhcnRzCldIRVJFIChkYXRhYmFzZSA9ICd1aycpIEFORCAoYHRhYmxlYCA9ICd1a19wcmljZV9wYWlkX3NpbXBsZScpIEFORCBhY3RpdmUKT1JERVIgQlkgbmFtZSBBU0M7&run_query=true&tab=results) the query now shows that the four ^^parts^^ have since merged into a single final part (as long as there are no further inserts into the table): ```response ┌─name───────┬─level─┬─────rows─┐ @@ -76,7 +76,7 @@ In ClickHouse 24.10, a new [merges dashboard](https://presentations.clickhouse.c The recorded dashboard above captures the entire process, from the initial data inserts to the final merge into a single part: -① Number of active parts. +① Number of active ^^parts^^. ② Part merges, visually represented with boxes (size reflects part size). @@ -92,9 +92,9 @@ A single ClickHouse server uses several background [merge threads](/operations/s Each merge thread executes a loop: -① Decide which parts to merge next, and load these parts into memory. +① Decide which ^^parts^^ to merge next, and load these ^^parts^^ into memory. -② Merge the parts in memory into a larger part. +② Merge the ^^parts^^ in memory into a larger part. ③ Write the merged part to disk. @@ -104,11 +104,11 @@ Note that increasing the number of CPU cores and the size of RAM allows to incre ## Memory optimized merges {#memory-optimized-merges} -ClickHouse does not necessarily load all parts to be merged into memory at once, as sketched in the [previous example](/merges#concurrent-merges). Based on several [factors](https://github.com/ClickHouse/ClickHouse/blob/bf37120c925ed846ae5cd72cd51e6340bebd2918/src/Storages/MergeTree/MergeTreeSettings.cpp#L210), and to reduce memory consumption (sacrificing merge speed), so-called [vertical merging](https://github.com/ClickHouse/ClickHouse/blob/bf37120c925ed846ae5cd72cd51e6340bebd2918/src/Storages/MergeTree/MergeTreeSettings.cpp#L209) loads and merges parts by chunks of blocks instead of in one go. +ClickHouse does not necessarily load all ^^parts^^ to be merged into memory at once, as sketched in the [previous example](/merges#concurrent-merges). Based on several [factors](https://github.com/ClickHouse/ClickHouse/blob/bf37120c925ed846ae5cd72cd51e6340bebd2918/src/Storages/MergeTree/MergeTreeSettings.cpp#L210), and to reduce memory consumption (sacrificing merge speed), so-called [vertical merging](https://github.com/ClickHouse/ClickHouse/blob/bf37120c925ed846ae5cd72cd51e6340bebd2918/src/Storages/MergeTree/MergeTreeSettings.cpp#L209) loads and merges ^^parts^^ by chunks of blocks instead of in one go. 
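To see the thresholds that govern this behavior on a given server, one option (a sketch; the exact set of settings returned depends on the ClickHouse version) is to query the merge tree settings:

```sql
-- list the settings related to vertical merging, together with their descriptions
SELECT name, value, description
FROM system.merge_tree_settings
WHERE name LIKE '%vertical_merge%';
```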
## Merge mechanics {#merge-mechanics} -The diagram below illustrates how a single background [merge thread](/merges#concurrent-merges) in ClickHouse merges parts (by default, without [vertical merging](/merges#memory-optimized-merges)): +The diagram below illustrates how a single background [merge thread](/merges#concurrent-merges) in ClickHouse merges ^^parts^^ (by default, without [vertical merging](/merges#memory-optimized-merges)): @@ -116,7 +116,7 @@ The diagram below illustrates how a single background [merge thread](/merges#con The part merging is performed in several steps: -**① Decompression & Loading**: The [compressed binary column files](/parts#what-are-table-parts-in-clickhouse) from the parts to be merged are decompressed and loaded into memory. +**① Decompression & Loading**: The [compressed binary column files](/parts#what-are-table-parts-in-clickhouse) from the ^^parts^^ to be merged are decompressed and loaded into memory. **② Merging**: The data is merged into larger column files. @@ -128,19 +128,19 @@ Additional [metadata in data parts](/parts), such as secondary data skipping ind The mechanics of step ② depend on the specific [MergeTree engine](/engines/table-engines/mergetree-family) used, as different engines handle merging differently. For example, rows may be aggregated or replaced if outdated. As mentioned earlier, this approach **offloads all data processing to background merges**, enabling **super-fast inserts** by keeping write operations lightweight and efficient. -Next, we will briefly outline the merge mechanics of specific engines in the MergeTree family. +Next, we will briefly outline the merge mechanics of specific engines in the ^^MergeTree^^ family. ### Standard merges {#standard-merges} -The diagram below illustrates how parts in a standard [MergeTree](/engines/table-engines/mergetree-family/mergetree) table are merged: +The diagram below illustrates how ^^parts^^ in a standard [MergeTree](/engines/table-engines/mergetree-family/mergetree) table are merged:
      -The DDL statement in the diagram above creates a `MergeTree` table with a sorting key `(town, street)`, [meaning](/parts#what-are-table-parts-in-clickhouse) data on disk is sorted by these columns, and a sparse primary index is generated accordingly. +The DDL statement in the diagram above creates a `MergeTree` table with a ^^sorting key^^ `(town, street)`, [meaning](/parts#what-are-table-parts-in-clickhouse) data on disk is sorted by these columns, and a sparse primary index is generated accordingly. -The ① decompressed, pre-sorted table columns are ② merged while preserving the table's global sorting order defined by the table's sorting key, ③ a new sparse primary index is generated, and ④ the merged column files and index are compressed and stored as a new data part on disk. +The ① decompressed, pre-sorted table columns are ② merged while preserving the table's global sorting order defined by the table's ^^sorting key^^, ③ a new sparse primary index is generated, and ④ the merged column files and index are compressed and stored as a new data part on disk. ### Replacing merges {#replacing-merges} @@ -150,25 +150,25 @@ Part merges in a [ReplacingMergeTree](/engines/table-engines/mergetree-family/re
      -The DDL statement in the diagram above creates a `ReplacingMergeTree` table with a sorting key `(town, street, id)`, meaning data on disk is sorted by these columns, with a sparse primary index generated accordingly. +The DDL statement in the diagram above creates a `ReplacingMergeTree` table with a ^^sorting key^^ `(town, street, id)`, meaning data on disk is sorted by these columns, with a sparse primary index generated accordingly. The ② merging works similarly to a standard `MergeTree` table, combining decompressed, pre-sorted columns while preserving the global sorting order. -However, the `ReplacingMergeTree` removes duplicate rows with the same sorting key, keeping only the most recent row based on the creation timestamp of its containing part. +However, the `ReplacingMergeTree` removes duplicate rows with the same ^^sorting key^^, keeping only the most recent row based on the creation timestamp of its containing part.
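A minimal sketch of a table definition along the lines of the diagram above (the table name and column types are assumed for illustration):

```sql
CREATE TABLE uk_price_paid_replacing
(
    town   String,
    street String,
    id     UInt64,
    price  UInt32
)
ENGINE = ReplacingMergeTree
ORDER BY (town, street, id); -- rows with the same sorting key are deduplicated during merges
```

Until a merge has actually run, duplicates may still be present; adding the `FINAL` keyword to a `SELECT` applies the replacing logic at query time.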
      ### Summing merges {#summing-merges} -Numeric data is automatically summarized during merges of parts from a [SummingMergeTree](/engines/table-engines/mergetree-family/summingmergetree) table: +Numeric data is automatically summarized during merges of ^^parts^^ from a [SummingMergeTree](/engines/table-engines/mergetree-family/summingmergetree) table:
      -The DDL statement in the diagram above defines a `SummingMergeTree` table with `town` as the sorting key, meaning that data on disk is sorted by this column and a sparse primary index is created accordingly. +The DDL statement in the diagram above defines a `SummingMergeTree` table with `town` as the ^^sorting key^^, meaning that data on disk is sorted by this column and a sparse primary index is created accordingly. -In the ② merging step, ClickHouse replaces all rows with the same sorting key with a single row, summing the values of numeric columns. +In the ② merging step, ClickHouse replaces all rows with the same ^^sorting key^^ with a single row, summing the values of numeric columns. ### Aggregating merges {#aggregating-merges} @@ -178,6 +178,6 @@ The `SummingMergeTree` table example from above is a specialized variant of the
      -The DDL statement in the diagram above creates an `AggregatingMergeTree` table with `town` as the sorting key, ensuring data is ordered by this column on disk and a corresponding sparse primary index is generated. +The DDL statement in the diagram above creates an `AggregatingMergeTree` table with `town` as the ^^sorting key^^, ensuring data is ordered by this column on disk and a corresponding sparse primary index is generated. -During ② merging, ClickHouse replaces all rows with the same sorting key with a single row storing [partial aggregation states](https://clickhouse.com/blog/clickhouse_vs_elasticsearch_mechanics_of_count_aggregations#-multi-core-parallelization) (e.g. a `sum` and a `count` for `avg()`). These states ensure accurate results through incremental background merges. +During ② merging, ClickHouse replaces all rows with the same ^^sorting key^^ with a single row storing [partial aggregation states](https://clickhouse.com/blog/clickhouse_vs_elasticsearch_mechanics_of_count_aggregations#-multi-core-parallelization) (e.g. a `sum` and a `count` for `avg()`). These states ensure accurate results through incremental background merges. diff --git a/docs/managing-data/core-concepts/partitions.md b/docs/managing-data/core-concepts/partitions.mdx similarity index 98% rename from docs/managing-data/core-concepts/partitions.md rename to docs/managing-data/core-concepts/partitions.mdx index 65bfa17c3bf..bc831882f29 100644 --- a/docs/managing-data/core-concepts/partitions.md +++ b/docs/managing-data/core-concepts/partitions.mdx @@ -58,7 +58,7 @@ With partitioning enabled, ClickHouse only [merges](/merges) data parts within,
      -As sketched in the diagram above, parts belonging to different partitions are never merged. If a partition key with high cardinality is chosen, then parts spread across thousands of partitions will never be merge candidates - exceeding preconfigured limits and causing the dreaded `Too many parts` error. Addressing this problem is simple: choose a sensible partition key with [cardinality under 1000..10000](https://github.com/ClickHouse/ClickHouse/blob/ffc5b2c56160b53cf9e5b16cfb73ba1d956f7ce4/src/Storages/MergeTree/MergeTreeDataWriter.cpp#L121). +As sketched in the diagram above, parts belonging to different partitions are never merged. If a partition key with high cardinality is chosen, then parts spread across thousands of partitions will never be merge candidates - exceeding preconfigured limits and causing the dreaded `Too many ^^parts^^` error. Addressing this problem is simple: choose a sensible partition key with [cardinality under 1000..10000](https://github.com/ClickHouse/ClickHouse/blob/ffc5b2c56160b53cf9e5b16cfb73ba1d956f7ce4/src/Storages/MergeTree/MergeTreeDataWriter.cpp#L121). ## Monitoring partitions {#monitoring-partitions} diff --git a/docs/managing-data/core-concepts/parts.md b/docs/managing-data/core-concepts/parts.mdx similarity index 55% rename from docs/managing-data/core-concepts/parts.md rename to docs/managing-data/core-concepts/parts.mdx index 0c3c2f224e7..99a2791f26e 100644 --- a/docs/managing-data/core-concepts/parts.md +++ b/docs/managing-data/core-concepts/parts.mdx @@ -9,9 +9,9 @@ import merges from '@site/static/images/managing-data/core-concepts/merges.png'; import part from '@site/static/images/managing-data/core-concepts/part.png'; import Image from '@theme/IdealImage'; -## What are table parts in ClickHouse? {#what-are-table-parts-in-clickhouse} +## What are table ^^parts^^ in ClickHouse? {#what-are-table-parts-in-clickhouse} -
      +
      The data from each table in the ClickHouse [MergeTree engine family](/engines/table-engines/mergetree-family) is organized on disk as a collection of immutable `data parts`. @@ -33,13 +33,13 @@ You can [query this table](https://sql.clickhouse.com/?query=U0VMRUNUICogRlJPTSB A data part is created whenever a set of rows is inserted into the table. The following diagram sketches this: - + -
      +
      When a ClickHouse server processes the example insert with 4 rows (e.g., via an [INSERT INTO statement](/sql-reference/statements/insert-into)) sketched in the diagram above, it performs several steps: -① **Sorting**: The rows are sorted by the table's sorting key `(town, street)`, and a [sparse primary index](/guides/best-practices/sparse-primary-indexes) is generated for the sorted rows. +① **Sorting**: The rows are sorted by the table's ^^sorting key^^ `(town, street)`, and a [sparse primary index](/guides/best-practices/sparse-primary-indexes) is generated for the sorted rows. ② **Splitting**: The sorted data is split into columns. @@ -49,21 +49,21 @@ When a ClickHouse server processes the example insert with 4 rows (e.g., via an Depending on the table's specific engine, additional transformations [may](/operations/settings/settings) take place alongside sorting. -Data parts are self-contained, including all metadata needed to interpret their contents without requiring a central catalog. Beyond the sparse primary index, parts contain additional metadata, such as secondary [data skipping indexes](/optimize/skipping-indexes), [column statistics](https://clickhouse.com/blog/clickhouse-release-23-11#column-statistics-for-prewhere), checksums, min-max indexes (if [partitioning](/partitions) is used), and [more](https://github.com/ClickHouse/ClickHouse/blob/a065b11d591f22b5dd50cb6224fab2ca557b4989/src/Storages/MergeTree/MergeTreeData.h#L104). +Data ^^parts^^ are self-contained, including all metadata needed to interpret their contents without requiring a central catalog. Beyond the sparse primary index, ^^parts^^ contain additional metadata, such as secondary [data skipping indexes](/optimize/skipping-indexes), [column statistics](https://clickhouse.com/blog/clickhouse-release-23-11#column-statistics-for-prewhere), checksums, min-max indexes (if [partitioning](/partitions) is used), and [more](https://github.com/ClickHouse/ClickHouse/blob/a065b11d591f22b5dd50cb6224fab2ca557b4989/src/Storages/MergeTree/MergeTreeData.h#L104). ## Part merges {#part-merges} -To manage the number of parts per table, a [background merge](/merges) job periodically combines smaller parts into larger ones until they reach a [configurable](/operations/settings/merge-tree-settings#max_bytes_to_merge_at_max_space_in_pool) compressed size (typically ~150 GB). Merged parts are marked as inactive and deleted after a [configurable](/operations/settings/merge-tree-settings#old_parts_lifetime) time interval. Over time, this process creates a hierarchical structure of merged parts, which is why it's called a MergeTree table: +To manage the number of ^^parts^^ per table, a [background merge](/merges) job periodically combines smaller ^^parts^^ into larger ones until they reach a [configurable](/operations/settings/merge-tree-settings#max_bytes_to_merge_at_max_space_in_pool) compressed size (typically ~150 GB). Merged ^^parts^^ are marked as inactive and deleted after a [configurable](/operations/settings/merge-tree-settings#old_parts_lifetime) time interval. Over time, this process creates a hierarchical structure of merged ^^parts^^, which is why it's called a ^^MergeTree^^ table: - + -
      +
      -To minimize the number of initial parts and the overhead of merges, database clients are [encouraged](https://clickhouse.com/blog/asynchronous-data-inserts-in-clickhouse#data-needs-to-be-batched-for-optimal-performance) to either insert tuples in bulk, e.g. 20,000 rows at once, or to use the [asynchronous insert mode](https://clickhouse.com/blog/asynchronous-data-inserts-in-clickhouse), in which ClickHouse buffers rows from multiple incoming INSERTs into the same table and creates a new part only after the buffer size exceeds a configurable threshold, or a timeout expires. +To minimize the number of initial ^^parts^^ and the overhead of merges, database clients are [encouraged](https://clickhouse.com/blog/asynchronous-data-inserts-in-clickhouse#data-needs-to-be-batched-for-optimal-performance) to either insert tuples in bulk, e.g. 20,000 rows at once, or to use the [asynchronous insert mode](https://clickhouse.com/blog/asynchronous-data-inserts-in-clickhouse), in which ClickHouse buffers rows from multiple incoming INSERTs into the same table and creates a new part only after the buffer size exceeds a configurable threshold, or a timeout expires. -## Monitoring table parts {#monitoring-table-parts} +## Monitoring table ^^parts^^ {#monitoring-table-parts} -You can [query](https://sql.clickhouse.com/?query=U0VMRUNUIF9wYXJ0CkZST00gdWsudWtfcHJpY2VfcGFpZF9zaW1wbGUKR1JPVVAgQlkgX3BhcnQKT1JERVIgQlkgX3BhcnQgQVNDOw&run_query=true&tab=results) the list of all currently existing active parts of our example table by using the [virtual column](/engines/table-engines#table_engines-virtual_columns) `_part`: +You can [query](https://sql.clickhouse.com/?query=U0VMRUNUIF9wYXJ0CkZST00gdWsudWtfcHJpY2VfcGFpZF9zaW1wbGUKR1JPVVAgQlkgX3BhcnQKT1JERVIgQlkgX3BhcnQgQVNDOw&run_query=true&tab=results) the list of all currently existing active ^^parts^^ of our example table by using the [virtual column](/engines/table-engines#table_engines-virtual_columns) `_part`: ```sql SELECT _part @@ -80,7 +80,7 @@ ORDER BY _part ASC; ``` The query above retrieves the names of directories on disk, with each directory representing an active data part of the table. The components of these directory names have specific meanings, which are documented [here](https://github.com/ClickHouse/ClickHouse/blob/f90551824bb90ade2d8a1d8edd7b0a3c0a459617/src/Storages/MergeTree/MergeTreeData.h#L130) for those interested in exploring further. 
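One way to see the metadata these name components are derived from is to query the [system.parts](/operations/system-tables/parts) system table directly (a sketch against the example table used on this page):

```sql
SELECT name, partition_id, min_block_number, max_block_number, level
FROM system.parts
WHERE (database = 'uk') AND (`table` = 'uk_price_paid_simple') AND active
ORDER BY name ASC;
```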
-Alternatively, ClickHouse tracks info for all parts of all tables in the [system.parts](/operations/system-tables/parts) system table, and the following query [returns](https://sql.clickhouse.com/?query=U0VMRUNUCiAgICBuYW1lLAogICAgbGV2ZWwsCiAgICByb3dzCkZST00gc3lzdGVtLnBhcnRzCldIRVJFIChkYXRhYmFzZSA9ICd1aycpIEFORCAoYHRhYmxlYCA9ICd1a19wcmljZV9wYWlkX3NpbXBsZScpIEFORCBhY3RpdmUKT1JERVIgQlkgbmFtZSBBU0M7&run_query=true&tab=results) for our example table above the list of all currently active parts, their merge level, and the number of rows stored in these parts: +Alternatively, ClickHouse tracks info for all ^^parts^^ of all tables in the [system.parts](/operations/system-tables/parts) system table, and the following query [returns](https://sql.clickhouse.com/?query=U0VMRUNUCiAgICBuYW1lLAogICAgbGV2ZWwsCiAgICByb3dzCkZST00gc3lzdGVtLnBhcnRzCldIRVJFIChkYXRhYmFzZSA9ICd1aycpIEFORCAoYHRhYmxlYCA9ICd1a19wcmljZV9wYWlkX3NpbXBsZScpIEFORCBhY3RpdmUKT1JERVIgQlkgbmFtZSBBU0M7&run_query=true&tab=results) for our example table above the list of all currently active ^^parts^^, their merge level, and the number of rows stored in these ^^parts^^: ```sql SELECT @@ -98,4 +98,4 @@ ORDER BY name ASC; 4. │ all_6_11_1 │ 1 │ 6459763 │ └─────────────┴───────┴─────────┘ ``` -The merge level is incremented by one with each additional merge on the part. A level of 0 indicates this is a new part that has not been merged yet. +The merge level is incremented by one with each additional merge on the part. A level of 0 indicates this is a new part that has not been merged yet. \ No newline at end of file diff --git a/docs/managing-data/core-concepts/primary-indexes.md b/docs/managing-data/core-concepts/primary-indexes.mdx similarity index 83% rename from docs/managing-data/core-concepts/primary-indexes.md rename to docs/managing-data/core-concepts/primary-indexes.mdx index 06eea10b266..5879f54cc3c 100644 --- a/docs/managing-data/core-concepts/primary-indexes.md +++ b/docs/managing-data/core-concepts/primary-indexes.mdx @@ -21,13 +21,13 @@ For advanced indexing strategies and deeper technical detail, see the [primary i
      -The sparse primary index in ClickHouse helps efficiently identify [granules](https://clickhouse.com/docs/guides/best-practices/sparse-primary-indexes#data-is-organized-into-granules-for-parallel-data-processing)—blocks of rows—that might contain data matching a query's condition on the table's primary key columns. In the next section, we explain how this index is constructed from the values in those columns. +The sparse primary index in ClickHouse helps efficiently identify [granules](https://clickhouse.com/docs/guides/best-practices/sparse-primary-indexes#data-is-organized-into-granules-for-parallel-data-processing)—blocks of rows—that might contain data matching a query's condition on the table's ^^primary key^^ columns. In the next section, we explain how this index is constructed from the values in those columns. ### Sparse primary index creation {#sparse-primary-index-creation} To illustrate how the sparse primary index is built, we use the [uk_price_paid_simple](https://clickhouse.com/docs/parts) table along with some animations. -As a [reminder](https://clickhouse.com/docs/parts), in our ① example table with the primary key (town, street), ② inserted data is ③ stored on disk, sorted by the primary key column values, and compressed, in separate files for each column: +As a [reminder](https://clickhouse.com/docs/parts), in our ① example table with the ^^primary key^^ (town, street), ② inserted data is ③ stored on disk, sorted by the ^^primary key^^ column values, and compressed, in separate files for each column: @@ -35,13 +35,13 @@ As a [reminder](https://clickhouse.com/docs/parts), in our ① example table wit For processing, each column's data is ④ logically divided into granules—each covering 8,192 rows—which are the smallest units ClickHouse's data processing mechanics work with. -This granule structure is also what makes the primary index **sparse**: instead of indexing every row, ClickHouse stores ⑤ the primary key values from just one row per granule—specifically, the first row. This results in one index entry per granule: +This ^^granule^^ structure is also what makes the primary index **sparse**: instead of indexing every row, ClickHouse stores ⑤ the ^^primary key^^ values from just one row per ^^granule^^—specifically, the first row. This results in one index entry per ^^granule^^:

      -Thanks to its sparseness, the primary index is small enough to fit entirely in memory, enabling fast filtering for queries with predicates on primary key columns. In the next section, we show how it helps accelerate such queries. +Thanks to its sparseness, the primary index is small enough to fit entirely in memory, enabling fast filtering for queries with predicates on ^^primary key^^ columns. In the next section, we show how it helps accelerate such queries. ### Primary index usage {#primary-index-usage} @@ -51,7 +51,7 @@ We sketch how the sparse primary index is used for query acceleration with anoth

      -① The example query includes a predicate on both primary key columns: `town = 'LONDON' AND street = 'OXFORD STREET'`. +① The example query includes a predicate on both ^^primary key^^ columns: `town = 'LONDON' AND street = 'OXFORD STREET'`. ② To accelerate the query, ClickHouse loads the table's primary index into memory. @@ -81,7 +81,7 @@ GROUP BY part_name; └───────────┴─────────┘ ``` -This query shows the first 10 entries from the primary index of one of the current data parts. Note that these parts are continuously [merged](/merges) in the background into larger parts: +This query shows the first 10 entries from the primary index of one of the current data ^^parts^^. Note that these ^^parts^^ are continuously [merged](/merges) in the background into larger ^^parts^^: ```sql SELECT @@ -109,7 +109,7 @@ LIMIT 10; └───────┴────────────────┴──────────────────┘ ``` -Lastly, we use the [EXPLAIN](/sql-reference/statements/explain) clause to see how the primary indexes of all data parts are used to skip granules that can't possibly contain rows matching the example query's predicates. These granules are excluded from loading and processing: +Lastly, we use the [EXPLAIN](/sql-reference/statements/explain) clause to see how the primary indexes of all data ^^parts^^ are used to skip granules that can't possibly contain rows matching the example query's predicates. These granules are excluded from loading and processing: ```sql EXPLAIN indexes = 1 SELECT @@ -138,7 +138,7 @@ WHERE └────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ ``` -Note how row 13 of the EXPLAIN output above shows that only 3 out of 3,609 granules across all data parts were selected by the primary index analysis for processing. The remaining granules were skipped entirely. +Note how row 13 of the EXPLAIN output above shows that only 3 out of 3,609 granules across all data ^^parts^^ were selected by the primary index analysis for processing. The remaining granules were skipped entirely. We can also observe that most of the data was skipped by simply running the query: ```sql @@ -169,11 +169,11 @@ SELECT count() FROM uk.uk_price_paid_simple; ## Key takeaways {#key-takeaways} -* **Sparse primary indexes** help ClickHouse skip unnecessary data by identifying which granules might contain rows matching query conditions on primary key columns. +* **Sparse primary indexes** help ClickHouse skip unnecessary data by identifying which granules might contain rows matching query conditions on ^^primary key^^ columns. -* Each index stores only the primary key values from the **first row of every granule** (a granule has 8,192 rows by default), making it compact enough to fit in memory. +* Each index stores only the ^^primary key^^ values from the **first row of every ^^granule^^** (a ^^granule^^ has 8,192 rows by default), making it compact enough to fit in memory. -* **Each data part** in a MergeTree table has its **own primary index**, which is used independently during query execution. +* **Each data part** in a ^^MergeTree^^ table has its **own primary index**, which is used independently during query execution. * During queries, the index allows ClickHouse to **skip granules**, reducing I/O and memory usage while accelerating performance. 
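As a quick way to see the one-entry-per-granule relationship from the takeaways above, the `marks` column of `system.parts` can be compared with each part's row count. This is a sketch against the example `uk.uk_price_paid_simple` table; the exact part names and counts depend on the current merge state of your data.

```sql
-- Roughly one mark (primary index entry) is expected per 8,192-row granule in each part.
-- Counts are illustrative and change as parts are merged in the background.
SELECT
    name,
    rows,
    marks,
    round(rows / 8192) AS approx_granules
FROM system.parts
WHERE (database = 'uk') AND (`table` = 'uk_price_paid_simple') AND active
ORDER BY name ASC;
```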
diff --git a/docs/managing-data/core-concepts/shards.md b/docs/managing-data/core-concepts/shards.mdx similarity index 54% rename from docs/managing-data/core-concepts/shards.md rename to docs/managing-data/core-concepts/shards.mdx index 214febee990..a4ee754da7a 100644 --- a/docs/managing-data/core-concepts/shards.md +++ b/docs/managing-data/core-concepts/shards.mdx @@ -31,11 +31,11 @@ In such a case the data can be split over multiple ClickHouse servers in the for
      -Each shard holds a subset of the data and functions as a regular ClickHouse table that can be queried independently. However, queries will only process that subset, which may be a valid use case depending on data distribution. Typically, a [distributed table](/docs/engines/table-engines/special/distributed) (often per server) provides a unified view of the full dataset. It doesn't store data itself but forwards **SELECT** queries to all shards, assembles the results, and routes **INSERTS** to distribute data evenly. +Each ^^shard^^ holds a subset of the data and functions as a regular ClickHouse table that can be queried independently. However, queries will only process that subset, which may be a valid use case depending on data distribution. Typically, a [distributed table](/docs/engines/table-engines/special/distributed) (often per server) provides a unified view of the full dataset. It doesn't store data itself but forwards **SELECT** queries to all shards, assembles the results, and routes **INSERTS** to distribute data evenly. -## Distributed table creation {#distributed-table-creation} +## ^^Distributed table^^ creation {#distributed-table-creation} -To illustrate **SELECT** query forwarding and **INSERT** routing, we consider the [What are table parts](/parts) example table split across two shards on two ClickHouse servers. First, we show the DDL statement for creating a corresponding **Distributed table** for this setup: +To illustrate **SELECT** query forwarding and **INSERT** routing, we consider the [What are table parts](/parts) example table split across two shards on two ClickHouse servers. First, we show the DDL statement for creating a corresponding **^^Distributed table^^** for this setup: ```sql CREATE TABLE uk.uk_price_paid_simple_dist ON CLUSTER test_cluster @@ -50,59 +50,59 @@ ENGINE = Distributed('test_cluster', 'uk', 'uk_price_paid_simple', rand()) The `ON CLUSTER` clause makes the DDL statement a [distributed DDL statement](/docs/sql-reference/distributed-ddl), instructing ClickHouse to create the table on all servers listed in the `test_cluster` [cluster definition](/architecture/replication/#configure-clickhouse-servers). Distributed DDL requires an additional [Keeper](https://clickhouse.com/clickhouse/keeper) component in the [cluster architecture](/architecture/horizontal-scaling). -For the [distributed engine parameters](/docs/engines/table-engines/special/distributed#distributed-parameters), we specify the cluster name (`test_cluster`), the database name (`uk`) for the sharded target table, the sharded target table's name (`uk_price_paid_simple`), and the **sharding key** for INSERT routing. In this example, we use the [rand](/sql-reference/functions/random-functions#rand) function to randomly assign rows to shards. However, any expression—even complex ones—can be used as a sharding key, depending on the use case. The next section illustrates how INSERT routing works. +For the [distributed engine parameters](/docs/engines/table-engines/special/distributed#distributed-parameters), we specify the ^^cluster^^ name (`test_cluster`), the database name (`uk`) for the sharded target table, the sharded target table's name (`uk_price_paid_simple`), and the **sharding key** for INSERT routing. In this example, we use the [rand](/sql-reference/functions/random-functions#rand) function to randomly assign rows to shards. However, any expression—even complex ones—can be used as a sharding key, depending on the use case. 
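As an example of a non-random sharding key, the sketch below creates a second distributed table that hashes the `town` column, so all rows for a given town land on the same shard. It assumes the same `test_cluster` and target table as above; the table name `uk_price_paid_simple_dist_by_town` is made up for illustration.

```sql
-- Sketch: a deterministic sharding key (cityHash64 of the town column) instead of rand().
-- Rows with the same town hash to the same shard; the table name here is illustrative only.
CREATE TABLE uk.uk_price_paid_simple_dist_by_town ON CLUSTER test_cluster
AS uk.uk_price_paid_simple
ENGINE = Distributed('test_cluster', 'uk', 'uk_price_paid_simple', cityHash64(town));
```

A key like this trades the even spread of `rand()` for data locality, which can help queries that filter or aggregate by town.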
The next section illustrates how INSERT routing works. ## INSERT routing {#insert-routing} -The diagram below illustrates how INSERTs into a distributed table are processed in ClickHouse: +The diagram below illustrates how INSERTs into a ^^distributed table^^ are processed in ClickHouse:
      -① An INSERT (with a single row) targeting the distributed table is sent to a ClickHouse server hosting the table, either directly or via a load balancer. +① An INSERT (with a single row) targeting the ^^distributed table^^ is sent to a ClickHouse server hosting the table, either directly or via a load balancer. -② For each row from the INSERT (just one in our example), ClickHouse evaluates the sharding key (here, rand()), takes the result modulo the number of shard servers, and uses that as the target server ID (IDs start from 0 and increment by 1). The row is then forwarded and ③ inserted into the corresponding server's table shard. +② For each row from the INSERT (just one in our example), ClickHouse evaluates the sharding key (here, rand()), takes the result modulo the number of ^^shard^^ servers, and uses that as the target server ID (IDs start from 0 and increment by 1). The row is then forwarded and ③ inserted into the corresponding server's table ^^shard^^. The next section explains how SELECT forwarding works. ## SELECT forwarding {#select-forwarding} -This diagram shows how SELECT queries are processed with a distributed table in ClickHouse: +This diagram shows how SELECT queries are processed with a ^^distributed table^^ in ClickHouse:
      -① A SELECT aggregation query targeting the distributed table is sent to corresponding ClickHouse server, either directly or via a load balancer. +① A SELECT aggregation query targeting the ^^distributed table^^ is sent to corresponding ClickHouse server, either directly or via a load balancer. -② The Distributed table forwards the query to all servers hosting shards of the target table, where each ClickHouse server computes its local aggregation result **in parallel**. +② The ^^Distributed table^^ forwards the query to all servers hosting shards of the target table, where each ClickHouse server computes its local aggregation result **in parallel**. -Then, the ClickHouse server hosting the initially targeted distributed table ③ collects all local results, ④ merges them into the final global result, and ⑤ returns it to the query sender. +Then, the ClickHouse server hosting the initially targeted ^^distributed table^^ ③ collects all local results, ④ merges them into the final global result, and ⑤ returns it to the query sender. ## What are table replicas in ClickHouse? {#what-are-table-replicas-in-clickhouse} -Replication in ClickHouse ensures **data integrity** and **failover** by maintaining **copies of shard data** across multiple servers. Since hardware failures are inevitable, replication prevents data loss by ensuring that each shard has multiple replicas. Writes can be directed to any replica, either directly or via a [distributed table](#distributed-table-creation), which selects a replica for the operation. Changes are automatically propagated to other replicas. In case of a failure or maintenance, data remains available on other replicas, and once a failed host recovers, it synchronizes automatically to stay up to date. +Replication in ClickHouse ensures **data integrity** and **failover** by maintaining **copies of ^^shard^^ data** across multiple servers. Since hardware failures are inevitable, replication prevents data loss by ensuring that each ^^shard^^ has multiple replicas. Writes can be directed to any ^^replica^^, either directly or via a [distributed table](#distributed-table-creation), which selects a ^^replica^^ for the operation. Changes are automatically propagated to other replicas. In case of a failure or maintenance, data remains available on other replicas, and once a failed host recovers, it synchronizes automatically to stay up to date. Note that replication requires a [Keeper](https://clickhouse.com/clickhouse/keeper) component in the [cluster architecture](/architecture/horizontal-scaling). -The following diagram illustrates a ClickHouse cluster with six servers, where the two table shards `Shard-1` and `Shard-2` introduced earlier each have three replicas. A query is sent to this cluster: +The following diagram illustrates a ClickHouse ^^cluster^^ with six servers, where the two table shards `Shard-1` and `Shard-2` introduced earlier each have three replicas. A query is sent to this ^^cluster^^:
      -Query processing works similarly to setups without replicas, with only a single replica from each shard executing the query. +Query processing works similarly to setups without replicas, with only a single ^^replica^^ from each ^^shard^^ executing the query. > Replicas not only ensure data integrity and failover but also improve query processing throughput by allowing multiple queries to run in parallel across different replicas. -① A query targeting the distributed table is sent to corresponding ClickHouse server, either directly or via a load balancer. +① A query targeting the ^^distributed table^^ is sent to corresponding ClickHouse server, either directly or via a load balancer. -② The Distributed table forwards the query to one replica from each shard, where each ClickHouse server hosting the selected replica computes its local query result in parallel. +② The ^^Distributed table^^ forwards the query to one ^^replica^^ from each ^^shard^^, where each ClickHouse server hosting the selected ^^replica^^ computes its local query result in parallel. -The rest works the [same](#select-forwarding) as in setups without replicas and is not shown in the diagram above. The ClickHouse server hosting the initially targeted distributed table collects all local results, merges them into the final global result, and returns it to the query sender. +The rest works the [same](#select-forwarding) as in setups without replicas and is not shown in the diagram above. The ClickHouse server hosting the initially targeted ^^distributed table^^ collects all local results, merges them into the final global result, and returns it to the query sender. -Note that ClickHouse allows configuring the query forwarding strategy for ②. By default—unlike in the diagram above—the distributed table [prefers](/docs/operations/settings/settings#prefer_localhost_replica) a local replica if available, but other load balancing [strategies](/docs/operations/settings/settings#load_balancing) can be used. +Note that ClickHouse allows configuring the query forwarding strategy for ②. By default—unlike in the diagram above—the ^^distributed table^^ [prefers](/docs/operations/settings/settings#prefer_localhost_replica) a local ^^replica^^ if available, but other load balancing [strategies](/docs/operations/settings/settings#load_balancing) can be used. ## Where to find more information {#where-to-find-more-information} diff --git a/docs/managing-data/deleting-data/delete_mutations.md b/docs/managing-data/deleting-data/delete_mutations.mdx similarity index 75% rename from docs/managing-data/deleting-data/delete_mutations.md rename to docs/managing-data/deleting-data/delete_mutations.mdx index 6e7e0f4eb0f..b9459e42b01 100644 --- a/docs/managing-data/deleting-data/delete_mutations.md +++ b/docs/managing-data/deleting-data/delete_mutations.mdx @@ -6,10 +6,10 @@ hide_title: false description: 'Page describing delete mutations - ALTER queries that manipulate table data through deletes' --- -Delete mutations refers to `ALTER` queries that manipulate table data through delete. Most notably they are queries like `ALTER TABLE DELETE`, etc. Performing such queries will produce new mutated versions of the data parts. This means that such statements would trigger a rewrite of whole data parts for all data that was inserted before the mutation, translating to a large amount of write requests. +Delete mutations refers to `ALTER` queries that manipulate table data through delete. Most notably they are queries like `ALTER TABLE DELETE`, etc. 
Performing such queries will produce new mutated versions of the data ^^parts^^. This means that such statements would trigger a rewrite of whole data ^^parts^^ for all data that was inserted before the ^^mutation^^, translating to a large amount of write requests. :::info -For deletes, you can avoid these large amounts of write requests by using specialised table engines like [ReplacingMergeTree](/guides/replacing-merge-tree) or [CollapsingMergeTree](/engines/table-engines/mergetree-family/collapsingmergetree) instead of the default MergeTree table engine. +For deletes, you can avoid these large amounts of write requests by using specialised table engines like [ReplacingMergeTree](/guides/replacing-merge-tree) or [CollapsingMergeTree](/engines/table-engines/mergetree-family/collapsingmergetree) instead of the default ^^MergeTree^^ ^^table engine^^. ::: import DeleteMutations from '@site/docs/sql-reference/statements/alter/delete.md'; diff --git a/docs/managing-data/deleting-data/index.md b/docs/managing-data/deleting-data/index.md index 3103a9cedbb..a02d3dae796 100644 --- a/docs/managing-data/deleting-data/index.md +++ b/docs/managing-data/deleting-data/index.md @@ -10,8 +10,8 @@ we will explore how to delete data in ClickHouse. | Page | Description | |-------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------| -| [Overview](/deletes/overview) | Provides an overview of the various ways to delete data in ClickHouse. | +| [Overview](./overview) | Provides an overview of the various ways to delete data in ClickHouse. | | [Lightweight deletes](/guides/developer/lightweight-delete) | Learn how to use the Lightweight Delete to delete data. | -| [Delete mutations](/managing-data/delete_mutations) | Learn about Delete Mutations. | -| [Truncate table](/managing-data/truncate) | Learn about how to use Truncate, which allows the data in a table or database to be removed, while preserving its existence. | -| [Drop partitions](/managing-data/drop_partition) | Learn about Dropping Partitions in ClickHouse. | +| [Delete mutations](/managing-data/delete_mutations) |Learn about Delete Mutations. | +| [Truncate table](../truncate) | Learn about how to use Truncate, which allows the data in a table or database to be removed, while preserving its existence. | +| [Drop partitions](../drop_partition) | Learn about Dropping Partitions in ClickHouse. | \ No newline at end of file diff --git a/docs/managing-data/deleting-data/overview.md b/docs/managing-data/deleting-data/overview.mdx similarity index 77% rename from docs/managing-data/deleting-data/overview.md rename to docs/managing-data/deleting-data/overview.mdx index ea6e1e09f61..98c05b9748a 100644 --- a/docs/managing-data/deleting-data/overview.md +++ b/docs/managing-data/deleting-data/overview.mdx @@ -18,7 +18,7 @@ Here is a summary of the different ways to delete data in ClickHouse: ## Lightweight deletes {#lightweight-deletes} -Lightweight deletes cause rows to be immediately marked as deleted such that they can be automatically filtered out of all subsequent `SELECT` queries. Subsequent removal of these deleted rows occurs during natural merge cycles and thus incurs less I/O. As a result, it is possible that for an unspecified period, data is not actually deleted from storage and is only marked as deleted. If you need to guarantee that data is deleted, consider the above mutation command. 
+Lightweight deletes cause rows to be immediately marked as deleted such that they can be automatically filtered out of all subsequent `SELECT` queries. Subsequent removal of these deleted rows occurs during natural merge cycles and thus incurs less I/O. As a result, it is possible that for an unspecified period, data is not actually deleted from storage and is only marked as deleted. If you need to guarantee that data is deleted, consider the above ^^mutation^^ command. ```sql -- delete all data from 2018 with a lightweight delete. Not recommended. @@ -27,7 +27,7 @@ DELETE FROM posts WHERE toYear(CreationDate) = 2018 Deleting large volumes of data with the lightweight `DELETE` statement can also negatively affect `SELECT` query performance. The command is also not compatible with tables with projections. -Note that a mutation is used in the operation to [mark the deleted rows](/sql-reference/statements/delete#how-lightweight-deletes-work-internally-in-clickhouse) (adding a `_row_exists` column), thus incurring some I/O. +Note that a ^^mutation^^ is used in the operation to [mark the deleted rows](/sql-reference/statements/delete#how-lightweight-deletes-work-internally-in-clickhouse) (adding a `_row_exists` column), thus incurring some I/O. In general, lightweight deletes should be preferred over mutations if the existence of the deleted data on disk can be tolerated (e.g. in non-compliance cases). This approach should still be avoided if all data needs to be deleted. @@ -42,7 +42,7 @@ Delete mutations can be issued through a `ALTER TABLE ... DELETE` command e.g. ALTER TABLE posts DELETE WHERE toYear(CreationDate) = 2018 ``` -These can be executed either synchronously (by default if non-replicated) or asynchronously (determined by the [mutations_sync](/operations/settings/settings#mutations_sync) setting). These are extremely IO-heavy, rewriting all the parts that match the `WHERE` expression. There is no atomicity to this process - parts are substituted for mutated parts as soon as they are ready, and a `SELECT` query that starts executing during a mutation will see data from parts that have already been mutated along with data from parts that have not been mutated yet. Users can track the state of the progress via the [systems.mutations](/operations/system-tables/mutations#monitoring-mutations) table. These are I/O intense operations and should be used sparingly as they can impact cluster `SELECT` performance. +These can be executed either synchronously (by default if non-replicated) or asynchronously (determined by the [mutations_sync](/operations/settings/settings#mutations_sync) setting). These are extremely IO-heavy, rewriting all the ^^parts^^ that match the `WHERE` expression. There is no ^^atomicity^^ to this process - ^^parts^^ are substituted for mutated ^^parts^^ as soon as they are ready, and a `SELECT` query that starts executing during a ^^mutation^^ will see data from ^^parts^^ that have already been mutated along with data from ^^parts^^ that have not been mutated yet. Users can track the state of the progress via the [systems.mutations](/operations/system-tables/mutations#monitoring-mutations) table. These are I/O intense operations and should be used sparingly as they can impact ^^cluster^^ `SELECT` performance. Read more about [delete mutations](/sql-reference/statements/alter/delete). @@ -58,7 +58,7 @@ Read more about [TRUNCATE TABLE](/sql-reference/statements/truncate). 
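Before moving on to partition-based deletion: the progress of a delete mutation such as the `ALTER TABLE posts DELETE` example above can be followed in the `system.mutations` table mentioned earlier. A minimal sketch, assuming the same `posts` table:

```sql
-- Sketch: check whether recent mutations on the posts table have finished.
-- parts_to_do drops to 0 and is_done becomes 1 once all affected parts are rewritten.
SELECT
    mutation_id,
    command,
    parts_to_do,
    is_done
FROM system.mutations
WHERE `table` = 'posts'
ORDER BY create_time DESC;
```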
## Drop partition {#drop-partition} -If you have specified a custom partitioning key for your data, partitions can be efficiently dropped. Avoid high cardinality partitioning. +If you have specified a custom ^^partitioning key^^ for your data, partitions can be efficiently dropped. Avoid high cardinality partitioning. ```sql ALTER TABLE posts (DROP PARTITION '2008') diff --git a/docs/managing-data/drop_partition.md b/docs/managing-data/drop_partition.mdx similarity index 86% rename from docs/managing-data/drop_partition.md rename to docs/managing-data/drop_partition.mdx index d3476ac4280..9cd976b3165 100644 --- a/docs/managing-data/drop_partition.md +++ b/docs/managing-data/drop_partition.mdx @@ -10,7 +10,7 @@ description: 'Page describing drop partitions' Partitioning is specified on a table when it is initially defined via the `PARTITION BY` clause. This clause can contain a SQL expression on any columns, the results of which will define which partition a row is sent to. -The data parts are logically associated with each partition on disk and can be queried in isolation. For the example below, we partition the `posts` table by year using the expression `toYear(CreationDate)`. As rows are inserted into ClickHouse, this expression will be evaluated against each row and routed to the resulting partition if it exists (if the row is the first for a year, the partition will be created). +The data ^^parts^^ are logically associated with each partition on disk and can be queried in isolation. For the example below, we partition the `posts` table by year using the expression `toYear(CreationDate)`. As rows are inserted into ClickHouse, this expression will be evaluated against each row and routed to the resulting partition if it exists (if the row is the first for a year, the partition will be created). ```sql CREATE TABLE posts diff --git a/docs/managing-data/updating-data/overview.md b/docs/managing-data/updating-data/overview.mdx similarity index 83% rename from docs/managing-data/updating-data/overview.md rename to docs/managing-data/updating-data/overview.mdx index a1a02783e1e..77231f2ee31 100644 --- a/docs/managing-data/updating-data/overview.md +++ b/docs/managing-data/updating-data/overview.mdx @@ -34,16 +34,16 @@ Update mutations can be issued through a `ALTER TABLE ... UPDATE` command e.g. ALTER TABLE posts_temp (UPDATE AnswerCount = AnswerCount + 1 WHERE AnswerCount = 0) ``` -These are extremely IO-heavy, rewriting all the parts that match the `WHERE` expression. There is no atomicity to this process - parts are substituted for mutated parts as soon as they are ready, and a `SELECT` query that starts executing during a mutation will see data from parts that have already been mutated along with data from parts that have not been mutated yet. Users can track the state of the progress via the [systems.mutations](/operations/system-tables/mutations) table. These are I/O intense operations and should be used sparingly as they can impact cluster `SELECT` performance. +These are extremely IO-heavy, rewriting all the ^^parts^^ that match the `WHERE` expression. There is no ^^atomicity^^ to this process - ^^parts^^ are substituted for mutated ^^parts^^ as soon as they are ready, and a `SELECT` query that starts executing during a ^^mutation^^ will see data from ^^parts^^ that have already been mutated along with data from ^^parts^^ that have not been mutated yet. Users can track the state of the progress via the [systems.mutations](/operations/system-tables/mutations) table. 
These are I/O intense operations and should be used sparingly as they can impact ^^cluster^^ `SELECT` performance. Read more about [update mutations](/sql-reference/statements/alter/update). ## Lightweight updates {#lightweight-updates} -Lightweight updates are a ClickHouse feature that updates rows using "patch parts" - special data parts containing only the updated columns and rows, rather than rewriting entire columns like traditional mutations. The Lightweight UPDATE +Lightweight updates are a ClickHouse feature that updates rows using "patch ^^parts^^" - special data ^^parts^^ containing only the updated columns and rows, rather than rewriting entire columns like traditional mutations. The ^^Lightweight UPDATE^^ Key characteristics: -- Uses the standard `UPDATE` syntax and creates patch parts immediately without waiting for merges +- Uses the standard `UPDATE` syntax and creates patch ^^parts^^ immediately without waiting for merges - Updated values are immediately visible in `SELECT` queries through patch application, but physically materialized only during subsequent merges - Designed for small updates (up to ~10% of table) with predictable latency - Adds overhead to `SELECT` queries that need to apply patches, but avoids rewriting entire columns @@ -52,7 +52,7 @@ For more details see ["The Lightweight UPDATE Statement"](/sql-reference/stateme ## On-the-fly Updates {#on-the-fly-updates} -On-the-fly updates provide a mechanism to update rows such that they are updated immediately, and subsequent `SELECT` queries will automatically return with the changed values (this incurs an overhead and will slow queries). This effectively addresses the atomicity limitation of normal mutations. We show an example below: +On-the-fly updates provide a mechanism to update rows such that they are updated immediately, and subsequent `SELECT` queries will automatically return with the changed values (this incurs an overhead and will slow queries). This effectively addresses the ^^atomicity^^ limitation of normal mutations. We show an example below: ```sql SET apply_mutations_on_fly = 1; @@ -83,19 +83,19 @@ WHERE Id = 404346 1 row in set. Elapsed: 0.149 sec. Processed 59.55 million rows, 259.91 MB (399.99 million rows/s., 1.75 GB/s.) ``` -Note that for on-the-fly updates, a mutation is still used to update the data; it is just not materialized immediately and applied during `SELECT` queries. It will still be applied in the background as an asynchronous process and incurs the same heavy overhead as a mutation and thus is an I/O intense operation that should be used sparingly. The expressions that can be used with this operation are also limited (see here for [details](/guides/developer/on-the-fly-mutations#support-for-subqueries-and-non-deterministic-functions)). +Note that for on-the-fly updates, a ^^mutation^^ is still used to update the data; it is just not materialized immediately and applied during `SELECT` queries. It will still be applied in the background as an asynchronous process and incurs the same heavy overhead as a ^^mutation^^ and thus is an I/O intense operation that should be used sparingly. The expressions that can be used with this operation are also limited (see here for [details](/guides/developer/on-the-fly-mutations#support-for-subqueries-and-non-deterministic-functions)). Read more about [on-the-fly updates](/guides/developer/on-the-fly-mutations). 
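To round off the update options above with a concrete statement: the lightweight-update path described earlier uses plain `UPDATE` syntax rather than `ALTER TABLE ... UPDATE`. The sketch below reuses the `posts` example; whether it executes as a lightweight update (writing patch parts instead of rewriting whole parts) depends on your server version and the feature being enabled, as covered in the linked statement reference.

```sql
-- Sketch: standard UPDATE syntax; with lightweight updates enabled, only small
-- patch parts for the touched rows and columns are written, not whole parts.
UPDATE posts SET AnswerCount = AnswerCount + 1 WHERE AnswerCount = 0;
```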
## `CollapsingMergeTree` {#collapsing-merge-tree} Stemming from the idea that updates are expensive but inserts can be leveraged to perform updates, -the [`CollapsingMergeTree`](/engines/table-engines/mergetree-family/collapsingmergetree) table engine +the [`CollapsingMergeTree`](/engines/table-engines/mergetree-family/collapsingmergetree) ^^table engine^^ can be used together with a `sign` column as a way to tell ClickHouse to update a specific row by collapsing (deleting) a pair of rows with sign `1` and `-1`. If `-1` is inserted for the `sign` column, the whole row will be deleted. If `1` is inserted for the `sign` column, ClickHouse will keep the row. -Rows to update are identified based on the sorting key used in the `ORDER BY ()` statement when creating the table. +Rows to update are identified based on the ^^sorting key^^ used in the `ORDER BY ()` statement when creating the table. ```sql CREATE TABLE UAct diff --git a/docs/managing-data/updating-data/update_mutations.md b/docs/managing-data/updating-data/update_mutations.mdx similarity index 75% rename from docs/managing-data/updating-data/update_mutations.md rename to docs/managing-data/updating-data/update_mutations.mdx index 123505a9440..eab58eb865c 100644 --- a/docs/managing-data/updating-data/update_mutations.md +++ b/docs/managing-data/updating-data/update_mutations.mdx @@ -6,10 +6,10 @@ hide_title: false description: 'Page describing update mutations - ALTER queries that manipulate table data through updates' --- -Update mutations refers to `ALTER` queries that manipulate table data through updates. Most notably they are queries like `ALTER TABLE UPDATE`, etc. Performing such queries will produce new mutated versions of the data parts. This means that such statements would trigger a rewrite of whole data parts for all data that was inserted before the mutation, translating to a large amount of write requests. +Update mutations refers to `ALTER` queries that manipulate table data through updates. Most notably they are queries like `ALTER TABLE UPDATE`, etc. Performing such queries will produce new mutated versions of the data ^^parts^^. This means that such statements would trigger a rewrite of whole data ^^parts^^ for all data that was inserted before the ^^mutation^^, translating to a large amount of write requests. :::info -For updates, you can avoid these large amounts of write requests by using specialised table engines like [ReplacingMergeTree](/guides/replacing-merge-tree) or [CollapsingMergeTree](/engines/table-engines/mergetree-family/collapsingmergetree) instead of the default MergeTree table engine. +For updates, you can avoid these large amounts of write requests by using specialised table engines like [ReplacingMergeTree](/guides/replacing-merge-tree) or [CollapsingMergeTree](/engines/table-engines/mergetree-family/collapsingmergetree) instead of the default ^^MergeTree^^ ^^table engine^^. 
::: import UpdateMutations from '@site/docs/sql-reference/statements/alter/update.md'; diff --git a/plugins/glossary-transformer.js b/plugins/glossary-transformer.js index 69ad9c84530..84ad8fe768e 100644 --- a/plugins/glossary-transformer.js +++ b/plugins/glossary-transformer.js @@ -3,7 +3,6 @@ const { visit } = require('unist-util-visit'); const fs = require('fs'); const path = require('path'); -// Cache glossary terms globally let cachedGlossary = null; let glossaryModTime = null; @@ -36,7 +35,7 @@ function createGlossaryTransformer(options = {}) { const glossaryMap = new Map(); Object.entries(glossaryData).forEach(([term, definition]) => { - glossaryMap.set(term.toLowerCase(), { originalTerm: term, definition }); + glossaryMap.set(term.toLowerCase(), definition); }); cachedGlossary = glossaryMap; @@ -84,7 +83,6 @@ function createGlossaryTransformer(options = {}) { const cleanTerm = term.trim(); const cleanPlural = plural.trim(); - // Add text before match if (match.index > lastIndex) { newNodes.push({ type: 'text', @@ -92,20 +90,17 @@ function createGlossaryTransformer(options = {}) { }); } - // Get original term from glossary or use as-is - const glossaryEntry = glossary.get(cleanTerm.toLowerCase()); - const originalTerm = glossaryEntry?.originalTerm || cleanTerm; + const definition = glossary.get(cleanTerm.toLowerCase()); - if (!glossaryEntry && config.validateTerms) { + if (!definition && config.validateTerms) { console.warn(`Glossary term not found: ${cleanTerm}`); } - // Create MDX JSX element newNodes.push({ type: 'mdxJsxTextElement', name: 'GlossaryTooltip', attributes: [ - { type: 'mdxJsxAttribute', name: 'term', value: originalTerm }, + { type: 'mdxJsxAttribute', name: 'term', value: cleanTerm }, { type: 'mdxJsxAttribute', name: 'plural', value: cleanPlural } ], children: [] @@ -114,8 +109,7 @@ function createGlossaryTransformer(options = {}) { transformCount++; lastIndex = match.index + fullMatch.length; } - - // Add remaining text + if (lastIndex < node.value.length) { newNodes.push({ type: 'text', @@ -123,7 +117,6 @@ function createGlossaryTransformer(options = {}) { }); } - // Replace node if we made changes if (newNodes.length > 0) { parent.children.splice(index, 1, ...newNodes); } diff --git a/src/components/GlossaryTooltip/GlossaryTooltip.tsx b/src/components/GlossaryTooltip/GlossaryTooltip.tsx index b7af611e9b3..d14dc66be1d 100644 --- a/src/components/GlossaryTooltip/GlossaryTooltip.tsx +++ b/src/components/GlossaryTooltip/GlossaryTooltip.tsx @@ -2,34 +2,22 @@ import React, { useState } from 'react'; import glossary from './glossary.json'; import Link from '@docusaurus/Link'; -const GlossaryTooltip = ({ term, capitalize = false, plural = '' }) => { +const GlossaryTooltip = ({ term, plural = '' }) => { const [visible, setVisible] = useState(false); - // Case-insensitive lookup - let definition = glossary[term]; // Try exact match first - let matchedKey = term; + // Always do case-insensitive lookup + const foundKey = Object.keys(glossary).find(key => + key.toLowerCase() === term.toLowerCase() + ); - if (!definition) { - // Try to find a case-insensitive match - const foundKey = Object.keys(glossary).find(key => - key.toLowerCase() === term.toLowerCase() - ); - if (foundKey) { - definition = glossary[foundKey]; - matchedKey = foundKey; - } - } - - if (!definition) { + if (!foundKey) { console.warn(`Glossary term not found: ${term}`); - const displayFallback = capitalize - ? 
capitalizeWord(term) + plural - : term.toLowerCase() + plural; - return {displayFallback}; + return {term}{plural}; } - const displayTerm = capitalize ? capitalizeWord(term) : term.toLowerCase(); - const anchorId = matchedKey.toLowerCase().replace(/\s+/g, '-'); + const definition = glossary[foundKey]; + const displayTerm = term; // Preserve original casing + const anchorId = foundKey.toLowerCase().replace(/\s+/g, '-'); const glossarySlug = `/concepts/glossary#${anchorId}`; return ( @@ -56,8 +44,4 @@ const GlossaryTooltip = ({ term, capitalize = false, plural = '' }) => { ); }; -function capitalizeWord(word) { - return word.charAt(0).toUpperCase() + word.slice(1); -} - export default GlossaryTooltip; \ No newline at end of file diff --git a/src/components/GlossaryTooltip/glossary.json b/src/components/GlossaryTooltip/glossary.json index beec836aca8..7818b8f01e2 100644 --- a/src/components/GlossaryTooltip/glossary.json +++ b/src/components/GlossaryTooltip/glossary.json @@ -1,9 +1,27 @@ { "Atomicity": "Atomicity ensures that a transaction (a series of database operations) is treated as a single, indivisible unit. This means that either all operations within the transaction occur, or none do. An example of an atomic transaction is transferring money from one bank account to another. If either step of the transfer fails, the transaction fails, and the money stays in the first account. Atomicity ensures no money is lost or created.", + "Block": "A block is a logical unit for organizing data processing and storage. Each block contains columnar data which is processed together to enhance performance during query execution. By processing data in blocks, ClickHouse utilizes CPU cores efficiently by minimizing cache misses and facilitating vectorized execution. ClickHouse uses various compression algorithms, such as LZ4, ZSTD, and Delta, to compress data in blocks.", "Cluster": "A collection of nodes (servers) that work together to store and process data.", "CMEK": "Customer-managed encryption keys (CMEK) allow customers to use their key-management service (KMS) key to encrypt the ClickHouse disk data key and protect their data at rest.", "Dictionary": "A dictionary is a mapping of key-value pairs that is useful for various types of reference lists. It is a powerful feature that allows for the efficient use of dictionaries in queries, which is often more efficient than using a `JOIN` with reference tables.", + "Distributed table": "A distributed table in ClickHouse is a special type of table that does not store data itself but provides a unified view for distributed query processing across multiple servers in a cluster.", + "Granule": "A granule is a batch of rows in an uncompressed block. When reading data, ClickHouse accesses granules, but not individual rows, which enables faster data processing in analytical workloads. A granule contains 8192 rows by default. The primary index contains one entry per granule.", + "Incremental materialized view": "In ClickHouse is a type of materialized view that processes and aggregates data at insert time. 
When new data is inserted into the source table, the materialized view executes a predefined SQL aggregation query only on the newly inserted blocks and writes the aggregated results to a target table.", + "Lightweight update": "A lightweight update in ClickHouse is an experimental feature that allows you to update rows in a table using standard SQL UPDATE syntax, but instead of rewriting entire columns or data parts (as with traditional mutations), it creates \"patch parts\" containing only the updated columns and rows. These updates are immediately visible in SELECT queries through patch application, but the physical data is only updated during subsequent merges.", + "Materialized view": "A materialized view in ClickHouse is a mechanism that automatically runs a query on data as it is inserted into a source table, storing the transformed or aggregated results in a separate target table for faster querying.", + "MergeTree": "A MergeTree in ClickHouse is a table engine designed for high data ingest rates and large data volumes. It is the core storage engine in ClickHouse, providing features such as columnar storage, custom partitioning, sparse primary indexes, and support for background data merges.", + "Mutation": "A mutation in ClickHouse refers to an operation that modifies or deletes existing data in a table, typically using commands like ALTER TABLE ... UPDATE or ALTER TABLE ... DELETE. Mutations are implemented as asynchronous background processes that rewrite entire data parts affected by the change, rather than modifying rows in place.", + "On-the-fly mutation": "On-the-fly mutations in ClickHouse are a mechanism that allows updates or deletes to be visible in subsequent SELECT queries immediately after the mutation is submitted, without waiting for the background mutation process to finish.", "Parts": "A physical file on a disk that stores a portion of the table's data. This is different from a partition, which is a logical division of a table's data that is created using a partition key.", + "Partitioning key": "A partitioning key in ClickHouse is a SQL expression defined in the PARTITION BY clause when creating a table. It determines how data is logically grouped into partitions on disk. Each unique value of the partitioning key forms its own physical partition, allowing for efficient data management operations such as dropping, moving, or archiving entire partitions.", + "Primary key": "In ClickHouse, a primary key determines the order in which data is stored on disk and is used to build a sparse index that speeds up query filtering. Unlike traditional databases, the primary key in ClickHouse does not enforce uniqueness—multiple rows can have the same primary key value.", + "Projection": "A projection in ClickHouse is a hidden, automatically maintained table that stores data in a different order or with precomputed aggregations to speed up queries, especially those filtering on columns not in the main primary key.", + "Refreshable materialized view": "Refreshable materialized view is a type of materialized view that periodically re-executes its query over the full dataset and stores the result in a target table. Unlike incremental materialized views, refreshable materialized views are updated on a schedule and can support complex queries, including JOINs and UNIONs, without restrictions.", "Replica": "A copy of the data stored in a ClickHouse database. You can have any number of replicas of the same data for redundancy and reliability. 
Replicas are used in conjunction with the ReplicatedMergeTree table engine, which enables ClickHouse to keep multiple copies of data in sync across different servers.", - "Shard": "A subset of data. ClickHouse always has at least one shard for your data. If you do not split the data across multiple servers, your data will be stored in one shard. Sharding data across multiple servers can be used to divide the load if you exceed the capacity of a single server." + "Shard": "A subset of data. ClickHouse always has at least one shard for your data. If you do not split the data across multiple servers, your data will be stored in one shard. Sharding data across multiple servers can be used to divide the load if you exceed the capacity of a single server.", + "Skipping index": "Skipping indices are used to store small amounts of metadata at the level of multiple consecutive granules which allows ClickHouse to avoid scanning irrelevant rows. Skipping indices provide a lightweight alternative to projections.", + "Sorting key": "In ClickHouse, a sorting key defines the physical order of rows on disk. If you do not specify a primary key, ClickHouse uses the sorting key as the primary key. If you specify both, the primary key must be a prefix of the sorting key.", + "Sparse index": "A type of indexing when the primary index contains one entry for a group of rows, rather than a single row. The entry that corresponds to a group of rows is referred to as a mark. With sparse indexes, ClickHouse first identifies groups of rows that potentially match the query and then processes them separately to find a match. Because of this, the primary index is small enough to be loaded into the memory.", + "Table engine": "Table engines in ClickHouse determine how data is written, stored and accessed. MergeTree is the most common table engine, and allows quick insertion of large amounts of data which get processed in the background.", + "TTL": "Time To Live (TTL) is A ClickHouse feature that automatically moves, deletes, or rolls up columns or rows after a certain time period. This allows you to manage storage more efficiently because you can delete, move, or archive the data that you no longer need to access frequently." } \ No newline at end of file From 8efa08de3d39c4359b525abb524d85fe8a34564e Mon Sep 17 00:00:00 2001 From: Shaun Struwig <41984034+Blargian@users.noreply.github.com> Date: Sat, 2 Aug 2025 12:46:29 +0200 Subject: [PATCH 10/10] Update glossary.json --- src/components/GlossaryTooltip/glossary.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/components/GlossaryTooltip/glossary.json b/src/components/GlossaryTooltip/glossary.json index 7818b8f01e2..ca6643d965f 100644 --- a/src/components/GlossaryTooltip/glossary.json +++ b/src/components/GlossaryTooltip/glossary.json @@ -12,7 +12,7 @@ "MergeTree": "A MergeTree in ClickHouse is a table engine designed for high data ingest rates and large data volumes. It is the core storage engine in ClickHouse, providing features such as columnar storage, custom partitioning, sparse primary indexes, and support for background data merges.", "Mutation": "A mutation in ClickHouse refers to an operation that modifies or deletes existing data in a table, typically using commands like ALTER TABLE ... UPDATE or ALTER TABLE ... DELETE. 
Mutations are implemented as asynchronous background processes that rewrite entire data parts affected by the change, rather than modifying rows in place.", "On-the-fly mutation": "On-the-fly mutations in ClickHouse are a mechanism that allows updates or deletes to be visible in subsequent SELECT queries immediately after the mutation is submitted, without waiting for the background mutation process to finish.", - "Parts": "A physical file on a disk that stores a portion of the table's data. This is different from a partition, which is a logical division of a table's data that is created using a partition key.", + "Parts": "A physical file (or directory) on disk that stores a portion of the table's data. This is different from a partition, which is a logical division of a table's data that is created using a partition key.", "Partitioning key": "A partitioning key in ClickHouse is a SQL expression defined in the PARTITION BY clause when creating a table. It determines how data is logically grouped into partitions on disk. Each unique value of the partitioning key forms its own physical partition, allowing for efficient data management operations such as dropping, moving, or archiving entire partitions.", "Primary key": "In ClickHouse, a primary key determines the order in which data is stored on disk and is used to build a sparse index that speeds up query filtering. Unlike traditional databases, the primary key in ClickHouse does not enforce uniqueness—multiple rows can have the same primary key value.", "Projection": "A projection in ClickHouse is a hidden, automatically maintained table that stores data in a different order or with precomputed aggregations to speed up queries, especially those filtering on columns not in the main primary key.", @@ -24,4 +24,4 @@ "Sparse index": "A type of indexing when the primary index contains one entry for a group of rows, rather than a single row. The entry that corresponds to a group of rows is referred to as a mark. With sparse indexes, ClickHouse first identifies groups of rows that potentially match the query and then processes them separately to find a match. Because of this, the primary index is small enough to be loaded into the memory.", "Table engine": "Table engines in ClickHouse determine how data is written, stored and accessed. MergeTree is the most common table engine, and allows quick insertion of large amounts of data which get processed in the background.", "TTL": "Time To Live (TTL) is A ClickHouse feature that automatically moves, deletes, or rolls up columns or rows after a certain time period. This allows you to manage storage more efficiently because you can delete, move, or archive the data that you no longer need to access frequently." -} \ No newline at end of file +}
[Residual diagram label text: "Replica 1", "Replica 2", "Replica 3" updated to "^^Replica^^ 1", "^^Replica^^ 2", "^^Replica^^ 3"]