From 22cbeae9cb2d19a0cfa0cd406bfed2c8f6ec1d36 Mon Sep 17 00:00:00 2001 From: Shaun Struwig <41984034+Blargian@users.noreply.github.com> Date: Thu, 28 Aug 2025 12:38:41 +0200 Subject: [PATCH] Upgrade Algolia to v4 and discover search regression --- scripts/search/README.md | 29 +++++++++---- scripts/search/index_pages.py | 12 +----- scripts/search/requirements.txt | 74 ++++++++++++++++----------------- scripts/search/results.csv | 2 +- 4 files changed, 62 insertions(+), 55 deletions(-) diff --git a/scripts/search/README.md b/scripts/search/README.md index aeb43601a6f..9f71ab83f3f 100644 --- a/scripts/search/README.md +++ b/scripts/search/README.md @@ -7,6 +7,7 @@ ```bash pip install -r requirements.txt ``` + ### Running ```bash @@ -33,7 +34,7 @@ options: []()## Search scripts -We use these to evaluate search performance. `results.csv` contains a list of authoriative search results for 200 terms. +We use these to evaluate search performance. `results.csv` contains a list of authoritative search results for 200 terms. We use this to compute an average nDCG. @@ -47,6 +48,19 @@ pip install -r requirements.txt ### Running +You need to comment out either Dev or Prod depending on what you want to test. +The API key is the public search key, don't worry. 
+ +```python +# dev details +# ALGOLIA_APP_ID = "7AL1W7YVZK" +# ALGOLIA_API_KEY = "43bd50d4617a97c9b60042a2e8a348f9" + +# Prod details +ALGOLIA_APP_ID = "5H9UG7CX5W" +ALGOLIA_API_KEY = "4a7bf25cf3edbef29d78d5e1eecfdca5" +``` + ```bash python compute_ndcg.py -d ``` @@ -67,12 +81,13 @@ options: ### Results -| **Date** | **Average nDCG** | **Results** | **Changes** | -|------------|------------------|--------------------------------------------------------------------------------------------------------|--------------------------------------------------| -| 20/01/2024 | 0.4700 | [View Results](https://pastila.nl/?008231f5/bc107912f8a5074d70201e27b1a66c6c#cB/yJOsZPOWi9h8xAkuTUQ==) | Baseline | -| 21/01/2024 | 0.5021 | [View Results](https://pastila.nl/?00bb2c2f/936a9a3af62a9bdda186af5f37f55782#m7Hg0i9F1YCesMW6ot25yA==) | Index `_` character and move language to English | -| 24/01/2024 | 0.7072 | [View Results](https://pastila.nl/?065e3e67/e4ad889d0c166226118e6160b4ee53ff#x1NPd2R7hU90CZvvrE4nhg==) | Process markdown, and tune settings. | -| 24/01/2024 | 0.7412 | [View Results](https://pastila.nl/?0020013d/e69b33aaae82e49bc71c5ee2cea9ad46#pqq3VtRd4eP4JM5/izcBcA==) | Include manual promotions for ambigious terms. 
| +| **Date** | **Average nDCG** | **Results** | **Changes** | +|------------|------------------|---------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------| +| 20/01/2025 | 0.4700 | [View Results](https://pastila.nl/?008231f5/bc107912f8a5074d70201e27b1a66c6c#cB/yJOsZPOWi9h8xAkuTUQ==) | Baseline | +| 21/01/2025 | 0.5021 | [View Results](https://pastila.nl/?00bb2c2f/936a9a3af62a9bdda186af5f37f55782#m7Hg0i9F1YCesMW6ot25yA==) | Index `_` character and move language to English | +| 24/01/2025 | 0.7072 | [View Results](https://pastila.nl/?065e3e67/e4ad889d0c166226118e6160b4ee53ff#x1NPd2R7hU90CZvvrE4nhg==) | Process markdown, and tune settings. | +| 24/01/2025 | 0.7412 | [View Results](https://pastila.nl/?0020013d/e69b33aaae82e49bc71c5ee2cea9ad46#pqq3VtRd4eP4JM5/izcBcA==) | Include manual promotions for ambiguous terms. | +| 28/08/2025 | 0.5729 | [View Results](https://pastila.nl/?00ab66a7/9eb511690e3b2f53ac7ae95e3f42113c#tK6gf8G9W7mbAQd3aD5f4Q==) | This was unfortunately not run or recorded for search improvements which were made recently | Note: exact scores may vary due to constant content changes. 
diff --git a/scripts/search/index_pages.py b/scripts/search/index_pages.py index e0d3b36a723..49dcf4dcea4 100644 --- a/scripts/search/index_pages.py +++ b/scripts/search/index_pages.py @@ -382,9 +382,7 @@ def process_markdown_directory(directory, base_directory, base_url): def send_to_algolia(client, index_name, records): """Send records to Algolia.""" if records: - client.batch(index_name=index_name, batch_write_params={ - "requests": [{"action": "addObject", "body": record} for record in records], - }) + client.save_objects(index_name, records) print(f"Successfully sent {len(records)} records to Algolia.") else: print("No records to send to Algolia.") @@ -449,13 +447,7 @@ def main(base_directory, algolia_app_id, algolia_api_key, algolia_index_name, print(f'total {'processed' if dry_run else 'indexed'} {t} records') if not dry_run: print('switching temporary index...', end='') - client.operation_index( - index_name=temp_index_name, - operation_index_params={ - "operation": "move", - "destination": algolia_index_name - }, - ) + client.operation_index(temp_index_name, {"operation": "move", "destination": algolia_index_name}) print('done') diff --git a/scripts/search/requirements.txt b/scripts/search/requirements.txt index 7c6c15c3284..f3c9d4e6f1a 100644 --- a/scripts/search/requirements.txt +++ b/scripts/search/requirements.txt @@ -1,37 +1,37 @@ -aiohappyeyeballs==2.4.4 -aiohttp==3.12.14 -aiosignal==1.3.2 -algoliasearch==4.12.0 -annotated-types==0.7.0 -async-timeout==5.0.1 -attrs==24.3.0 -certifi==2024.12.14 -charset-normalizer==3.4.1 -Deprecated==1.2.15 -frozenlist==1.5.0 -idna==3.10 -jaconv==0.4.0 -Markdown==3.7 -multidict==6.1.0 -networkx==3.4.2 -numpy==2.2.2 -propcache==0.2.1 -pydantic==2.10.5 -pydantic_core==2.27.2 -pykakasi==2.3.0 -python-dateutil==2.9.0.post0 -python-slugify==8.0.4 -PyYAML==6.0.2 -remember==0.1 -requests==2.32.4 -ruamel.yaml==0.18.10 -ruamel.yaml.clib==0.2.12 -scipy==1.15.1 -six==1.17.0 -slugger==0.2.2 -text-unidecode==1.3 
-typing_extensions==4.12.2 -Unihandecode==0.81 -urllib3==2.5.0 -wrapt==1.17.2 -yarl==1.18.3 +aiohappyeyeballs +aiohttp +aiosignal +algoliasearch>=4.25.0 +annotated-types +async-timeout +attrs +certifi +charset-normalizer +Deprecated +frozenlist +idna +jaconv +Markdown +multidict +networkx +numpy +propcache +pydantic +pydantic_core +pykakasi +python-dateutil +python-slugify +PyYAML +remember +requests +ruamel.yaml +ruamel.yaml.clib +scipy +six +slugger +text-unidecode +typing_extensions +Unihandecode +urllib3 +wrapt +yarl diff --git a/scripts/search/results.csv b/scripts/search/results.csv index 305ee8db064..4ad85113eef 100644 --- a/scripts/search/results.csv +++ b/scripts/search/results.csv @@ -82,7 +82,7 @@ sum,https://clickhouse.com/docs/sql-reference/aggregate-functions/reference/sum, keeper,https://clickhouse.com/docs/guides/sre/keeper/clickhouse-keeper,https://clickhouse.com/docs/knowledgebase/why_recommend_clickhouse_keeper_over_zookeeper, type,https://clickhouse.com/docs/sql-reference/data-types,https://clickhouse.com/docs/sql-reference/functions/type-conversion-functions, nullable,https://clickhouse.com/docs/sql-reference/data-types/nullable,https://clickhouse.com/docs/cloud/bestpractices/avoid-nullable-columns,https://clickhouse.com/docs/sql-reference/functions/functions-for-nulls -projection,https://clickhouse.com/docs/sql-reference/statements/alter/projection,https://clickhouse.com/docs/engines/table-engines/mergetree-family/mergetree#projections,https://clickhouse.com/docs/knowledgebase/projection_example +projection,https://clickhouse.com/docs/data-modeling/projections,https://clickhouse.com/docs/sql-reference/statements/alter/projection,https://clickhouse.com/docs/engines/table-engines/mergetree-family/mergetree#projections jdbc,https://clickhouse.com/docs/interfaces/jdbc,https://clickhouse.com/docs/integrations/language-clients/java/jdbc,https://clickhouse.com/docs/engines/table-engines/integrations/jdbc 
ifnull,https://clickhouse.com/docs/sql-reference/functions/functions-for-nulls#ifnull,https://clickhouse.com/docs/sql-reference/functions/conditional-functions, any,https://clickhouse.com/docs/sql-reference/aggregate-functions/reference/any,https://clickhouse.com/docs/sql-reference/aggregate-functions/reference/first_value,