diff --git a/README.md b/README.md index 4fdb7583..f152a8de 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,10 @@ This repository demonstrates how to build a powerful semantic search engine using Couchbase as the backend database, combined with various AI-powered embedding and language model providers such as OpenAI, Azure OpenAI, Anthropic (Claude), Cohere, Hugging Face, Jina AI, Mistral AI, and Voyage AI. +Each example provides two distinct approaches: +- **FTS (Full Text Search)**: Uses Couchbase's vector search capabilities with pre-created search indices +- **GSI (Global Secondary Index)**: Leverages Couchbase's native SQL++ queries with vector similarity functions + Semantic search goes beyond simple keyword matching by understanding the context and meaning behind the words in a query, making it essential for applications that require intelligent information retrieval. ## Features @@ -27,10 +31,13 @@ Semantic search goes beyond simple keyword matching by understanding the context cd vector-search-cookbook ``` -### 2. Set up the Couchbase Vector Search Index: +### 2. Choose Your Approach: + +#### For FTS (Full Text Search) Examples: +Use the provided `{model}_index.json` index definition file in each model's `fts/` directory to create a new vector search index in your Couchbase cluster. -Use the provided `{model}_index.json` index definition file in each model's directory to create a new index in your Couchbase cluster. -The index supports separate properties for each embedding model. +#### For GSI (Global Secondary Index) Examples: +No additional setup required. GSI index will be created in each model's example. ### 3. Run the notebook file @@ -68,9 +75,9 @@ Each notebook implements a semantic search function that performs similarity sea The system implements caching functionality using `CouchbaseCache` to improve performance for repeated queries. 
-## Couchbase Vector Search Index +## Couchbase Vector Search Index (FTS Approach Only) -For more information on creating a vector search index, please follow the [instructions](https://docs.couchbase.com/cloud/vector-search/create-vector-search-index-ui.html). The following is an example for Azure OpenAI Model. +For FTS examples, you'll need to create a vector search index using the provided JSON configuration files. For more information on creating a vector search index, please follow the [instructions](https://docs.couchbase.com/cloud/vector-search/create-vector-search-index-ui.html). The following is an example for Azure OpenAI Model. ```json { @@ -146,12 +153,4 @@ For more information on creating a vector search index, please follow the [instr }, "sourceParams": {} } -``` - -## Contributing - -Contributions are welcome! Please feel free to submit a pull request or open an issue for any bugs or feature requests. - -## License - -This project is licensed under the MIT License. See the [LICENSE](LICENSE) file for details. \ No newline at end of file +``` \ No newline at end of file diff --git a/awsbedrock/.env.sample b/awsbedrock/fts/.env.sample similarity index 100% rename from awsbedrock/.env.sample rename to awsbedrock/fts/.env.sample diff --git a/awsbedrock/RAG_with_Couchbase_and_Bedrock.ipynb b/awsbedrock/fts/RAG_with_Couchbase_and_Bedrock.ipynb similarity index 99% rename from awsbedrock/RAG_with_Couchbase_and_Bedrock.ipynb rename to awsbedrock/fts/RAG_with_Couchbase_and_Bedrock.ipynb index c861af62..3bde9949 100644 --- a/awsbedrock/RAG_with_Couchbase_and_Bedrock.ipynb +++ b/awsbedrock/fts/RAG_with_Couchbase_and_Bedrock.ipynb @@ -6,7 +6,7 @@ "source": [ "# Introduction\n", "\n", - "In this guide, we will walk you through building a powerful semantic search engine using Couchbase as the backend database and [Amazon Bedrock](https://aws.amazon.com/bedrock/) as both the embedding and language model provider. 
Semantic search goes beyond simple keyword matching by understanding the context and meaning behind the words in a query, making it an essential tool for applications that require intelligent information retrieval. This tutorial is designed to be beginner-friendly, with clear, step-by-step instructions that will equip you with the knowledge to create a fully functional semantic search system from scratch." + "In this guide, we will walk you through building a powerful semantic search engine using Couchbase as the backend database and [Amazon Bedrock](https://aws.amazon.com/bedrock/) as both the embedding and language model provider. Semantic search goes beyond simple keyword matching by understanding the context and meaning behind the words in a query, making it an essential tool for applications that require intelligent information retrieval. This tutorial is designed to be beginner-friendly, with clear, step-by-step instructions that will equip you with the knowledge to create a fully functional semantic search system using the FTS service from scratch. 
Alternatively, if you want to perform semantic search using the GSI index, please take a look at [this tutorial](https://developer.couchbase.com/tutorial-aws-bedrock-couchbase-rag-with-global-secondary-index/)." ] }, { diff --git a/awsbedrock/frontmatter.md b/awsbedrock/fts/frontmatter.md similarity index 75% rename from awsbedrock/frontmatter.md rename to awsbedrock/fts/frontmatter.md index 90b354d8..9ca0cef2 100644 --- a/awsbedrock/frontmatter.md +++ b/awsbedrock/fts/frontmatter.md @@ -1,10 +1,10 @@ --- # frontmatter -path: "/tutorial-aws-bedrock-couchbase-rag" -title: Retrieval-Augmented Generation (RAG) with Couchbase and Amazon Bedrock -short_title: RAG with Couchbase and Amazon Bedrock +path: "/tutorial-aws-bedrock-couchbase-rag-with-fts" +title: Retrieval-Augmented Generation (RAG) with Couchbase and Amazon Bedrock using FTS service +short_title: RAG with Couchbase and Amazon Bedrock using FTS service description: - - Learn how to build a semantic search engine using Couchbase and Amazon Bedrock. + - Learn how to build a semantic search engine with Couchbase and Amazon Bedrock using the FTS service. - This tutorial demonstrates how to integrate Couchbase's vector search capabilities with Amazon Bedrock's Titan embeddings and Claude language model. - You'll understand how to perform Retrieval-Augmented Generation (RAG) using LangChain and Couchbase. 
content_type: tutorial diff --git a/awsbedrock/gsi/.env.sample b/awsbedrock/gsi/.env.sample new file mode 100644 index 00000000..496ee6fa --- /dev/null +++ b/awsbedrock/gsi/.env.sample @@ -0,0 +1,14 @@ +# AWS Credentials +AWS_ACCESS_KEY_ID= +AWS_SECRET_ACCESS_KEY= +AWS_REGION= + +# Couchbase Settings +CB_HOST= +CB_USERNAME= +CB_PASSWORD= +CB_BUCKET_NAME= + +SCOPE_NAME= +COLLECTION_NAME= +CACHE_COLLECTION= diff --git a/awsbedrock/gsi/RAG_with_Couchbase_and_Bedrock.ipynb b/awsbedrock/gsi/RAG_with_Couchbase_and_Bedrock.ipynb new file mode 100644 index 00000000..17769574 --- /dev/null +++ b/awsbedrock/gsi/RAG_with_Couchbase_and_Bedrock.ipynb @@ -0,0 +1,1241 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Introduction\n", + "\n", + "In this guide, we will walk you through building a powerful semantic search engine using Couchbase as the backend database and [Amazon Bedrock](https://aws.amazon.com/bedrock/) as both the embedding and language model provider. Semantic search goes beyond simple keyword matching by understanding the context and meaning behind the words in a query, making it an essential tool for applications that require intelligent information retrieval. This tutorial is designed to be beginner-friendly, with clear, step-by-step instructions that will equip you with the knowledge to create a fully functional semantic search system using GSI (Global Secondary Index) from scratch. Alternatively, if you want to perform semantic search using the FTS index, please take a look at [this tutorial](https://developer.couchbase.com/tutorial-aws-bedrock-couchbase-rag-with-fts/)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# How to run this tutorial\n", + "\n", + "This tutorial is available as a Jupyter Notebook (`.ipynb` file) that you can run interactively. 
You can access the original notebook [here](https://github.com/couchbase-examples/vector-search-cookbook/blob/main/awsbedrock/gsi/RAG_with_Couchbase_and_Bedrock.ipynb).\n", + "\n", + "You can either download the notebook file and run it on [Google Colab](https://colab.research.google.com/) or run it on your system by setting up the Python environment." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Before you start\n", + "\n", + "## Get Credentials for AWS Bedrock\n", + "* Please follow the [instructions](https://docs.aws.amazon.com/bedrock/latest/userguide/getting-started.html) to set up AWS Bedrock and generate credentials.\n", + "* Ensure you have the necessary IAM permissions to access Bedrock services.\n", + "\n", + "## Create and Deploy Your Free Tier Operational cluster on Capella\n", + "\n", + "To get started with Couchbase Capella, create an account and use it to deploy a forever free tier operational cluster. This account provides you with an environment where you can explore and learn about Capella with no time constraint.\n", + "\n", + "To know more, please follow the [instructions](https://docs.couchbase.com/cloud/get-started/create-account.html).\n", + "\n", + "Note: To run this tutorial, you will need Capella with Couchbase Server version 8.0 or above, as GSI-based vector search is supported only from version 8.0.\n", + "\n", + "### Couchbase Capella Configuration\n", + "\n", + "When running Couchbase using [Capella](https://cloud.couchbase.com/sign-in), the following prerequisites need to be met.\n", + "\n", + "* Create the [database credentials](https://docs.couchbase.com/cloud/clusters/manage-database-users.html) to access the bucket (Read and Write) used in the application.\n", + "* [Allow access](https://docs.couchbase.com/cloud/clusters/allow-ip-address.html) to the Cluster from the IP on which the application is running." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Setting the Stage: Installing Necessary Libraries\n", + "\n", + "To build our semantic search engine, we need a robust set of tools. The libraries we install handle everything from connecting to databases to performing complex machine learning tasks." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m24.3.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m25.2\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n", + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], + "source": [ + "%pip install --quiet datasets==3.5.0 langchain-couchbase==0.5.0rc1 langchain-aws boto3==1.37.35 python-dotenv==1.1.0\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Importing Necessary Libraries\n", + "\n", + "The script starts by importing a series of libraries required for various tasks, including handling JSON, logging, time tracking, Couchbase connections, embedding generation, and dataset loading." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "import getpass\n", + "import json\n", + "import logging\n", + "import os\n", + "import time\n", + "from datetime import timedelta\n", + "\n", + "import boto3\n", + "from couchbase.auth import PasswordAuthenticator\n", + "from couchbase.cluster import Cluster\n", + "from couchbase.exceptions import (CouchbaseException,\n", + " InternalServerFailureException,\n", + " QueryIndexAlreadyExistsException,ServiceUnavailableException)\n", + "from couchbase.management.buckets import CreateBucketSettings\n", + "from couchbase.management.search import SearchIndex\n", + "from couchbase.options import ClusterOptions\n", + "from datasets import load_dataset\n", + "from dotenv import load_dotenv\n", + "from langchain_aws import BedrockEmbeddings, ChatBedrock\n", + "from langchain_core.globals import set_llm_cache\n", + "from langchain_core.output_parsers import StrOutputParser\n", + "from langchain_core.prompts.chat import ChatPromptTemplate\n", + "from langchain_core.runnables import RunnablePassthrough\n", + "from langchain_couchbase.cache import CouchbaseCache\n", + "from langchain_couchbase.vectorstores import CouchbaseQueryVectorStore\n", + "from langchain_couchbase.vectorstores import DistanceStrategy\n", + "from tqdm import tqdm" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Setup Logging\n", + "\n", + "Logging is configured to track the progress of the script and capture any errors or warnings." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', force=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Loading Sensitive Information\n", + "In this section, we prompt the user to input essential configuration settings needed. 
These settings include sensitive information like AWS credentials, database credentials, and specific configuration names. Instead of hardcoding these details into the script, we request the user to provide them at runtime, ensuring flexibility and security.\n", + "\n", + "The project includes an `.env.sample` file that lists all the environment variables. To get started:\n", + "\n", + "1. Create a `.env` file in the same directory as this notebook\n", + "2. Copy the contents from `.env.sample` to your `.env` file\n", + "3. Fill in the required credentials\n", + "\n", + "The script also validates that all required inputs are provided, raising an error if any crucial information is missing. This approach ensures that your integration is both secure and correctly configured without hardcoding sensitive information, enhancing the overall security and maintainability of your code." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "# Load environment variables from .env file if it exists\n", + "load_dotenv(override=True)\n", + "\n", + "# AWS Credentials\n", + "AWS_ACCESS_KEY_ID = os.getenv('AWS_ACCESS_KEY_ID') or input('Enter your AWS Access Key ID: ')\n", + "AWS_SECRET_ACCESS_KEY = os.getenv('AWS_SECRET_ACCESS_KEY') or getpass.getpass('Enter your AWS Secret Access Key: ')\n", + "AWS_REGION = os.getenv('AWS_REGION') or input('Enter your AWS region (default: us-east-1): ') or 'us-east-1'\n", + "\n", + "# Couchbase Settings\n", + "CB_HOST = os.getenv('CB_HOST') or input('Enter your Couchbase host (default: couchbase://localhost): ') or 'couchbase://localhost'\n", + "CB_USERNAME = os.getenv('CB_USERNAME') or input('Enter your Couchbase username (default: Administrator): ') or 'Administrator'\n", + "CB_PASSWORD = os.getenv('CB_PASSWORD') or getpass.getpass('Enter your Couchbase password (default: password): ') or 'password'\n", + "CB_BUCKET_NAME = os.getenv('CB_BUCKET_NAME') or input('Enter your Couchbase 
bucket name (default: query-vector-search-testing): ') or 'query-vector-search-testing'\n", + "SCOPE_NAME = os.getenv('SCOPE_NAME') or input('Enter your scope name (default: shared): ') or 'shared'\n", + "COLLECTION_NAME = os.getenv('COLLECTION_NAME') or input('Enter your collection name (default: bedrock): ') or 'bedrock'\n", + "CACHE_COLLECTION = os.getenv('CACHE_COLLECTION') or input('Enter your cache collection name (default: cache): ') or 'cache'\n", + "\n", + "# Check if required credentials are set\n", + "for cred_name, cred_value in {\n", + " 'AWS_ACCESS_KEY_ID': AWS_ACCESS_KEY_ID,\n", + " 'AWS_SECRET_ACCESS_KEY': AWS_SECRET_ACCESS_KEY, \n", + " 'CB_HOST': CB_HOST,\n", + " 'CB_USERNAME': CB_USERNAME,\n", + " 'CB_PASSWORD': CB_PASSWORD,\n", + " 'CB_BUCKET_NAME': CB_BUCKET_NAME\n", + "}.items():\n", + " if not cred_value:\n", + " raise ValueError(f\"{cred_name} is not set\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Connecting to the Couchbase Cluster\n", + "Connecting to a Couchbase cluster is the foundation of our project. Couchbase will serve as our primary data store, handling all the storage and retrieval operations required for our semantic search engine. By establishing this connection, we enable our application to interact with the database, allowing us to perform operations such as storing embeddings, querying data, and managing collections. 
This connection is the gateway through which all data will flow, so ensuring it's set up correctly is paramount.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025-09-02 12:21:07,348 - INFO - Successfully connected to Couchbase\n" + ] + } + ], + "source": [ + "try:\n", + " auth = PasswordAuthenticator(CB_USERNAME, CB_PASSWORD)\n", + " options = ClusterOptions(auth)\n", + " cluster = Cluster(CB_HOST, options)\n", + " cluster.wait_until_ready(timedelta(seconds=5))\n", + " logging.info(\"Successfully connected to Couchbase\")\n", + "except Exception as e:\n", + " raise ConnectionError(f\"Failed to connect to Couchbase: {str(e)}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setting Up Collections in Couchbase\n", + "\n", + "The setup_collection() function handles creating and configuring the hierarchical data organization in Couchbase:\n", + "\n", + "1. Bucket Creation:\n", + " - Checks if specified bucket exists, creates it if not\n", + " - Sets bucket properties like RAM quota (1024MB) and replication (disabled)\n", + " - Note: You will not be able to create a bucket on Capella\n", + "\n", + "2. Scope Management: \n", + " - Verifies if requested scope exists within bucket\n", + " - Creates new scope if needed (unless it's the default \"_default\" scope)\n", + "\n", + "3. Collection Setup:\n", + " - Checks for collection existence within scope\n", + " - Creates collection if it doesn't exist\n", + " - Waits 2 seconds for collection to be ready\n", + "\n", + "Additional Tasks:\n", + "- Clears any existing documents for clean state\n", + "- Implements comprehensive error handling and logging\n", + "\n", + "The function is called twice to set up:\n", + "1. Main collection for vector embeddings\n", + "2. 
Cache collection for storing results\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025-08-29 13:03:42,591 - INFO - Bucket 'query-vector-search-testing' does not exist. Creating it...\n", + "2025-08-29 13:03:44,657 - INFO - Bucket 'query-vector-search-testing' created successfully.\n", + "2025-08-29 13:03:44,663 - INFO - Scope 'shared' does not exist. Creating it...\n", + "2025-08-29 13:03:44,704 - INFO - Scope 'shared' created successfully.\n", + "2025-08-29 13:03:44,714 - INFO - Collection 'bedrock' does not exist. Creating it...\n", + "2025-08-29 13:03:44,770 - INFO - Collection 'bedrock' created successfully.\n", + "2025-08-29 13:03:46,953 - INFO - All documents cleared from the collection.\n", + "2025-08-29 13:03:46,954 - INFO - Bucket 'query-vector-search-testing' exists.\n", + "2025-08-29 13:03:46,969 - INFO - Collection 'cache' does not exist. Creating it...\n", + "2025-08-29 13:03:47,025 - INFO - Collection 'cache' created successfully.\n", + "2025-08-29 13:03:49,183 - INFO - All documents cleared from the collection.\n" + ] + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def setup_collection(cluster, bucket_name, scope_name, collection_name):\n", + " try:\n", + " # Check if bucket exists, create if it doesn't\n", + " try:\n", + " bucket = cluster.bucket(bucket_name)\n", + " logging.info(f\"Bucket '{bucket_name}' exists.\")\n", + " except Exception as e:\n", + " logging.info(f\"Bucket '{bucket_name}' does not exist. 
Creating it...\")\n", + " bucket_settings = CreateBucketSettings(\n", + " name=bucket_name,\n", + " bucket_type='couchbase',\n", + " ram_quota_mb=1024,\n", + " flush_enabled=True,\n", + " num_replicas=0\n", + " )\n", + " cluster.buckets().create_bucket(bucket_settings)\n", + " time.sleep(2) # Wait for bucket creation to complete and become available\n", + " bucket = cluster.bucket(bucket_name)\n", + " logging.info(f\"Bucket '{bucket_name}' created successfully.\")\n", + "\n", + " bucket_manager = bucket.collections()\n", + "\n", + " # Check if scope exists, create if it doesn't\n", + " scopes = bucket_manager.get_all_scopes()\n", + " scope_exists = any(scope.name == scope_name for scope in scopes)\n", + " \n", + " if not scope_exists and scope_name != \"_default\":\n", + " logging.info(f\"Scope '{scope_name}' does not exist. Creating it...\")\n", + " bucket_manager.create_scope(scope_name)\n", + " logging.info(f\"Scope '{scope_name}' created successfully.\")\n", + "\n", + " # Check if collection exists, create if it doesn't\n", + " collections = bucket_manager.get_all_scopes()\n", + " collection_exists = any(\n", + " scope.name == scope_name and collection_name in [col.name for col in scope.collections]\n", + " for scope in collections\n", + " )\n", + "\n", + " if not collection_exists:\n", + " logging.info(f\"Collection '{collection_name}' does not exist. Creating it...\")\n", + " bucket_manager.create_collection(scope_name, collection_name)\n", + " logging.info(f\"Collection '{collection_name}' created successfully.\")\n", + " else:\n", + " logging.info(f\"Collection '{collection_name}' already exists. 
Skipping creation.\")\n", + "\n", + " # Wait for collection to be ready\n", + " collection = bucket.scope(scope_name).collection(collection_name)\n", + " time.sleep(2) # Give the collection time to be ready for queries\n", + "\n", + " # Clear all documents in the collection\n", + " try:\n", + " query = f\"DELETE FROM `{bucket_name}`.`{scope_name}`.`{collection_name}`\"\n", + " cluster.query(query).execute()\n", + " logging.info(\"All documents cleared from the collection.\")\n", + " except Exception as e:\n", + " logging.warning(f\"Error while clearing documents: {str(e)}. The collection might be empty.\")\n", + "\n", + " return collection\n", + " except Exception as e:\n", + " raise RuntimeError(f\"Error setting up collection: {str(e)}\")\n", + " \n", + "setup_collection(cluster, CB_BUCKET_NAME, SCOPE_NAME, COLLECTION_NAME)\n", + "setup_collection(cluster, CB_BUCKET_NAME, SCOPE_NAME, CACHE_COLLECTION)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Creating Amazon Bedrock Client and Embeddings\n", + "\n", + "Embeddings are at the heart of semantic search. They are numerical representations of text that capture the semantic meaning of the words and phrases. We'll use Amazon Bedrock's Titan embedding model for embeddings.\n", + "\n", + "## Using Amazon Bedrock's Titan Model\n", + "\n", + "Language models are AI systems that are trained to understand and generate human language. We'll be using Amazon Bedrock's Titan model to process user queries and generate meaningful responses. 
The Titan model family includes both embedding models for converting text into vector representations and text generation models for producing human-like responses.\n", + "\n", + "Key features of Amazon Bedrock's Titan models:\n", + "- Titan Embeddings model for embedding vector generation\n", + "- Titan Text model for natural language understanding and generation\n", + "- Seamless integration with AWS infrastructure\n", + "- Enterprise-grade security and scalability" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025-09-02 12:21:15,663 - INFO - Successfully created Bedrock embeddings client\n" + ] + } + ], + "source": [ + "try:\n", + " bedrock_client = boto3.client(\n", + " service_name='bedrock-runtime',\n", + " region_name=AWS_REGION,\n", + " aws_access_key_id=AWS_ACCESS_KEY_ID,\n", + " aws_secret_access_key=AWS_SECRET_ACCESS_KEY\n", + " )\n", + " \n", + " embeddings = BedrockEmbeddings(\n", + " client=bedrock_client,\n", + " model_id=\"amazon.titan-embed-text-v2:0\"\n", + " )\n", + " logging.info(\"Successfully created Bedrock embeddings client\")\n", + "except Exception as e:\n", + " raise ValueError(f\"Error creating Bedrock embeddings client: {str(e)}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Setting Up the Couchbase Query Vector Store\n", + "A vector store is where we'll keep our embeddings. The query vector store is specifically designed to handle embeddings and perform similarity searches. When a user inputs a query, the embedding model converts the query into an embedding, which is then compared against the embeddings stored in the vector store. This allows the engine to find documents that are semantically similar to the query, even if they don't contain the exact same words. 
By setting up the vector store in Couchbase, we create a powerful tool that enables us to understand and retrieve information based on the meaning and context of the query, rather than just the specific words used.\n", + "\n", + "The vector store requires a distance metric to determine how similarity between vectors is calculated. This is crucial for accurate semantic search results as different distance metrics can yield different similarity rankings. Some of the supported Distance strategies are dot, l2, euclidean, cosine, l2_squared, euclidean_squared. In our implementation we will use cosine which is particularly effective for text embeddings." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025-09-02 12:22:15,979 - INFO - Successfully created vector store\n" + ] + } + ], + "source": [ + "try:\n", + " vector_store = CouchbaseQueryVectorStore(\n", + " cluster=cluster,\n", + " bucket_name=CB_BUCKET_NAME,\n", + " scope_name=SCOPE_NAME,\n", + " collection_name=COLLECTION_NAME,\n", + " embedding = embeddings,\n", + " distance_metric=DistanceStrategy.COSINE\n", + " )\n", + " logging.info(\"Successfully created vector store\")\n", + "except Exception as e:\n", + " raise ValueError(f\"Failed to create vector store: {str(e)}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Load the BBC News Dataset\n", + "To build a search engine, we need data to search through. We use the BBC News dataset from RealTimeData, which provides real-world news articles. This dataset contains news articles from BBC covering various topics and time periods. Loading the dataset is a crucial step because it provides the raw material that our search engine will work with. 
The quality and diversity of the news articles make it an excellent choice for testing and refining our search engine, ensuring it can handle real-world news content effectively.\n", + "\n", + "The BBC News dataset allows us to work with authentic news articles, enabling us to build and test a search engine that can effectively process and retrieve relevant news content. The dataset is loaded using the Hugging Face datasets library, specifically accessing the \"RealTimeData/bbc_news_alltime\" dataset with the \"2024-12\" version." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025-09-02 12:21:31,880 - INFO - Successfully loaded the BBC News dataset with 2687 rows.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Loaded the BBC News dataset with 2687 rows\n" + ] + } + ], + "source": [ + "try:\n", + " news_dataset = load_dataset(\n", + " \"RealTimeData/bbc_news_alltime\", \"2024-12\", split=\"train\"\n", + " )\n", + " print(f\"Loaded the BBC News dataset with {len(news_dataset)} rows\")\n", + " logging.info(f\"Successfully loaded the BBC News dataset with {len(news_dataset)} rows.\")\n", + "except Exception as e:\n", + " raise ValueError(f\"Error loading the BBC News dataset: {str(e)}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Cleaning up the Data\n", + "We will use the content of the news articles for our RAG system.\n", + "\n", + "The dataset contains a few duplicate records. We are removing them to avoid duplicate results in the retrieval stage of our RAG system." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "We have 1749 unique articles in our database.\n" + ] + } + ], + "source": [ + "news_articles = news_dataset[\"content\"]\n", + "unique_articles = set()\n", + "for article in news_articles:\n", + " if article:\n", + " unique_articles.add(article)\n", + "unique_news_articles = list(unique_articles)\n", + "print(f\"We have {len(unique_news_articles)} unique articles in our database.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Saving Data to the Vector Store\n", + "To efficiently handle the large number of articles, we process them in batches of 50 articles at a time. This batch processing approach helps manage memory usage and provides better control over the ingestion process.\n", + "\n", + "We first filter out any articles that exceed 50,000 characters to avoid potential issues with token limits. Then, using the vector store's add_texts method, we add the filtered articles to our vector database. The batch_size parameter controls how many articles are processed in each iteration.\n", + "\n", + "This approach offers several benefits:\n", + "1. Memory Efficiency: Processing in smaller batches prevents memory overload\n", + "2. Error Handling: If an error occurs, only the current batch is affected\n", + "3. Progress Tracking: Easier to monitor and track the ingestion progress\n", + "4. 
Resource Management: Better control over CPU and network resource utilization\n", + "\n", + "We use a conservative batch size of 50 to ensure reliable operation.\n", + "The optimal batch size depends on many factors including:\n", + "- Document sizes being inserted\n", + "- Available system resources\n", + "- Network conditions\n", + "- Concurrent workload\n", + "\n", + "Consider measuring performance with your specific workload before adjusting.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025-08-20 14:05:53,302 - INFO - Document ingestion completed successfully.\n" + ] + } + ], + "source": [ + "batch_size = 50\n", + "\n", + "# Automatic Batch Processing\n", + "articles = [article for article in unique_news_articles if article and len(article) <= 50000]\n", + "\n", + "try:\n", + " vector_store.add_texts(\n", + " texts=articles,\n", + " batch_size=batch_size\n", + " )\n", + " logging.info(\"Document ingestion completed successfully.\")\n", + "except Exception as e:\n", + " raise ValueError(f\"Failed to save documents to vector store: {str(e)}\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Setting Up a Couchbase Cache\n", + "To further optimize our system, we set up a Couchbase-based cache. A cache is a temporary storage layer that holds data that is frequently accessed, speeding up operations by reducing the need to repeatedly retrieve the same information from the database. In our setup, the cache will help us accelerate repetitive tasks, such as looking up similar documents. By implementing a cache, we enhance the overall performance of our search engine, ensuring that it can handle high query volumes and deliver results quickly.\n", + "\n", + "Caching is particularly valuable in scenarios where users may submit similar queries multiple times or where certain pieces of information are frequently requested. 
By storing these in a cache, we can significantly reduce the time it takes to respond to these queries, improving the user experience.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025-09-02 12:22:20,978 - INFO - Successfully created cache\n" + ] + } + ], + "source": [ + "try:\n", + "    cache = CouchbaseCache(\n", + "        cluster=cluster,\n", + "        bucket_name=CB_BUCKET_NAME,\n", + "        scope_name=SCOPE_NAME,\n", + "        collection_name=CACHE_COLLECTION,\n", + "    )\n", + "    logging.info(\"Successfully created cache\")\n", + "    set_llm_cache(cache)  # install it as the active LLM cache\n", + "except Exception as e:\n", + "    raise ValueError(f\"Failed to create cache: {str(e)}\") from e  # keep the root cause attached" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Using Amazon Bedrock's Titan Text Express v1 Model\n", + "\n", + "Amazon Bedrock's Titan Text Express v1 is a state-of-the-art foundation model designed for fast and efficient text generation tasks. This model excels at:\n", + "\n", + "- Text generation and completion\n", + "- Question answering \n", + "- Summarization\n", + "- Content rewriting\n", + "- Analysis and extraction\n", + "\n", + "Key features of Titan Text Express v1:\n", + "\n", + "- Optimized for low-latency responses while maintaining high quality output\n", + "- Supports up to 8K tokens context window\n", + "- Built-in content filtering and safety controls\n", + "- Cost-effective compared to larger models\n", + "- Seamlessly integrates with AWS services\n", + "\n", + "The model uses a temperature parameter (0-1) to control randomness in responses:\n", + "- Lower values (e.g. 0) produce more focused, deterministic outputs\n", + "- Higher values introduce more creativity and variation\n", + "\n", + "We'll be using this model through Amazon Bedrock's API to process user queries and generate contextually relevant responses based on our vector database content." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025-09-02 12:22:24,513 - INFO - Successfully created Bedrock LLM client\n" + ] + } + ], + "source": [ + "# Create the Bedrock chat model used to answer queries.\n", + "try:\n", + "    # temperature 0 -> focused, deterministic output\n", + "    llm = ChatBedrock(client=bedrock_client, model_id=\"amazon.titan-text-express-v1\", model_kwargs={\"temperature\": 0})\n", + "except Exception as e:\n", + "    # Log a troubleshooting hint, then re-raise the original exception unchanged\n", + "    logging.error(f\"Error creating Bedrock LLM client: {str(e)}. Please check your AWS credentials and Bedrock access.\")\n", + "    raise\n", + "else:\n", + "    logging.info(\"Successfully created Bedrock LLM client\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Perform Semantic Search\n", + "Semantic search in Couchbase involves converting queries and documents into vector representations using an embeddings model. These vectors capture the semantic meaning of the text and are stored directly in Couchbase. When a query is made, Couchbase performs a similarity search by comparing the query vector against the stored document vectors. The similarity metric used for this comparison is configurable, allowing flexibility in how the relevance of documents is determined. Common metrics include cosine similarity, Euclidean distance, or dot product, but other metrics can be implemented based on specific use cases. Different embedding models like BERT, Word2Vec, or GloVe can also be used depending on the application's needs, with the vectors generated by these models stored and searched within Couchbase itself.\n", + "\n", + "In the provided code, the search process begins by recording the start time, followed by executing the `similarity_search_with_score` method of the `CouchbaseQueryVectorStore`. This method searches Couchbase for the most relevant documents based on the vector similarity to the query. 
The search results include the document content and the distance that reflects how closely each document aligns with the query in the defined semantic space. The time taken to perform this search is then calculated and logged, and the results are displayed, showing the most relevant documents along with their distances (a lower distance means a closer match). This approach leverages Couchbase as both a storage and retrieval engine for vector data, enabling efficient and scalable semantic searches. The integration of vector storage and search capabilities within Couchbase allows for sophisticated semantic search operations without relying on external services for vector storage or comparison." + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025-09-02 12:23:51,477 - INFO - Semantic search completed in 1.29 seconds\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Semantic Search Results (completed in 1.29 seconds):\n", + "--------------------------------------------------------------------------------\n", + "Distance: 0.3512, Text: Luke Littler has risen from 164th to fourth in the rankings in a year\n", + "\n", + "A tearful Luke Littler hit a tournament record 140.91 set average as he started his bid for the PDC World Championship title with a dramatic 3-1 win over Ryan Meikle. The 17-year-old made headlines around the world when he reached the tournament final in January, where he lost to Luke Humphries. Starting this campaign on Saturday, Littler was millimetres away from a nine-darter when he missed double 12 as he blew Meikle away in the fourth and final set of the second-round match. Littler was overcome with emotion at the end, cutting short his on-stage interview. \"It was probably the toughest game I've ever played. I had to fight until the end,\" he said later in a news conference. 
\"As soon as the question came on stage and then boom, the tears came. It was just a bit too much to speak on stage. \"It is the worst game I have played. I have never felt anything like that tonight.\" Admitting to nerves during the match, he told Sky Sports: \"Yes, probably the biggest time it's hit me. Coming into it I was fine, but as soon as [referee] George Noble said 'game on', I couldn't throw them.\" Littler started slowly against Meikle, who had two darts for the opening set, but he took the lead by twice hitting double 20. Meikle did not look overawed against his fellow Englishman and levelled, but Littler won the third set and exploded into life in the fourth. The tournament favourite hit four maximum 180s as he clinched three straight legs in 11, 10 and 11 darts for a record set average, and 100.85 overall. Meanwhile, two seeds crashed out on Saturday night – five-time world champion Raymond van Barneveld lost to Welshman Nick Kenny, while England's Ryan Joyce beat Danny Noppert. Australian Damon Heta was another to narrowly miss out on a nine-darter, just failing on double 12 when throwing for the match in a 3-1 win over Connor Scutt. Ninth seed Heta hit four 100-plus checkouts to come from a set down against Scutt in a match in which both men averaged more than 97.\n", + "\n", + "Littler was hugged by his parents after victory over Meikle\n", + "\n", + "Littler returned to Alexandra Palace to a boisterous reception from more than 3,000 spectators and delivered an astonishing display in the fourth set. He was on for a nine-darter after his opening two throws in both of the first two legs and completed the set in 32 darts - the minimum possible is 27. The teenager will next play after Christmas against European Championship winner Ritchie Edhouse, the 29th seed, or Ian White, and is seeded to meet Humphries in the semi-finals. 
Having entered last year's event ranked 164th, Littler is up to fourth in the world and will go to number two if he reaches the final again this time. He has won 10 titles in his debut professional year, including the Premier League and Grand Slam of Darts. After reaching the World Championship final as a debutant aged just 16, Littler's life has been transformed and interest in darts has rocketed. Google say he was the most searched-for athlete online in the UK during 2024. This Christmas, more than 100,000 children are expected to be opening Littler-branded magnetic dartboards as presents. His impact has helped double the number of junior academies and has prompted plans to expand the World Championship. Littler was named BBC Young Sports Personality of the Year on Tuesday and was runner-up to athlete Keely Hodgkinson for the main award.\n", + "\n", + "Nick Kenny will play world champion Luke Humphries in round three after Christmas\n", + "\n", + "Barneveld was shocked 3-1 by world number 76 Kenny, who was in tears after a famous victory. Kenny, 32, will face Humphries in round three after defeating the Dutchman, who won the BDO world title four times and the PDC crown in 2007. Van Barneveld, ranked 32nd, became the sixth seed to exit in the second round. His compatriot Noppert, the 13th seed, was stunned 3-1 by Joyce, who will face Ryan Searle or Matt Campbell next, with the winner of that tie potentially meeting Littler in the last 16. Elsewhere, 15th seed Chris Dobey booked his place in the third round with a 3-1 win over Alexander Merkx. Englishman Dobey concluded an afternoon session which started with a trio of 3-0 scorelines. Northern Ireland's Brendan Dolan beat Lok Yin Lee to set up a meeting with three-time champion Michael van Gerwen after Christmas. 
In the final two first-round matches of the 2025 competition, Wales' Rhys Griffin beat Karel Sedlacek of the Czech Republic before Asia number one Alexis Toylo cruised past Richard Veenstra.\n", + "--------------------------------------------------------------------------------\n", + "Distance: 0.4124, Text: The Littler effect - how darts hit the bullseye\n", + "\n", + "Teenager Luke Littler began his bid to win the 2025 PDC World Darts Championship with a second-round win against Ryan Meikle. Here we assess Littler's impact after a remarkable rise which saw him named BBC Young Sports Personality of the Year and runner-up in the main award to athlete Keely Hodgkinson.\n", + "\n", + "One year ago, he was barely a household name in his own home. Now he is a sporting phenomenon. After emerging from obscurity aged 16 to reach the World Championship final, the life of Luke Littler and the sport he loves has been transformed. Viewing figures, ticket sales and social media interest have rocketed. Darts has hit the bullseye. This Christmas more than 100,000 children are expected to be opening Littler-branded magnetic dartboards as presents. His impact has helped double the number of junior academies, prompted plans to expand the World Championship and generated interest in darts from Saudi Arabian backers.\n", + "\n", + "Just months after taking his GCSE exams and ranked 164th in the world, Littler beat former champions Raymond van Barneveld and Rob Cross en route to the PDC World Championship final in January, before his run ended with a 7-4 loss to Luke Humphries. With his nickname 'The Nuke' on his purple and yellow shirt and the Alexandra Palace crowd belting out his walk-on song, Pitbull's tune Greenlight, he became an instant hit. Electric on the stage, calm off it. The down-to-earth teenager celebrated with a kebab and computer games. \"We've been watching his progress since he was about seven. He was on our radar, but we never anticipated what would happen. 
The next thing we know 'Littlermania' is spreading everywhere,\" PDC president Barry Hearn told BBC Sport. A peak TV audience of 3.7 million people watched the final - easily Sky's biggest figure for a non-football sporting event. The teenager from Warrington in Cheshire was too young to legally drive or drink alcohol, but earned £200,000 for finishing second - part of £1m prize money in his first year as a professional - and an invitation to the elite Premier League competition. He turned 17 later in January but was he too young for the demanding event over 17 Thursday nights in 17 locations? He ended up winning the whole thing, and hit a nine-dart finish against Humphries in the final. From Bahrain to Wolverhampton, Littler claimed 10 titles in 2024 and is now eyeing the World Championship.\n", + "\n", + "As he progressed at the Ally Pally, the Manchester United fan was sent a good luck message by the club's former midfielder and ex-England captain David Beckham. In 12 months, Littler's Instagram followers have risen from 4,000 to 1.3m. Commercial backers include a clothing range, cereal firm and train company and he will appear in a reboot of the TV darts show Bullseye. Google say he was the most searched-for athlete online in the UK during 2024. On the back of his success, Littler darts, boards, cabinets, shirts are being snapped up in big numbers. \"This Christmas the junior magnetic dartboard is selling out, we're talking over 100,000. They're 20 quid and a great introduction for young children,\" said Garry Plummer, the boss of sponsors Target Darts, who first signed a deal with Littler's family when he was aged 12. \"All the toy shops want it, they all want him - 17, clean, doesn't drink, wonderful.\"\n", + "\n", + "Littler beat Luke Humphries to win the Premier League title in May\n", + "\n", + "The number of academies for children under the age of 16 has doubled in the last year, says Junior Darts Corporation chairman Steve Brown. 
There are 115 dedicated groups offering youngsters equipment, tournaments and a place to develop, with bases including Australia, Bulgaria, Greece, Norway, USA and Mongolia. \"We've seen so many inquiries from around the world, it's been such a boom. It took us 14 years to get 1,600 members and within 12 months we have over 3,000, and waiting lists,\" said Brown. \"When I played darts as a child, I was quite embarrassed to tell my friends what my hobby was. All these kids playing darts now are pretty popular at school. It's a bit rock 'n roll and recognised as a cool thing to do.\" Plans are being hatched to extend the World Championship by four days and increase the number of players from 96 to 128. That will boost the number of tickets available by 25,000 to 115,000 but Hearn reckons he could sell three times as many. He says Saudi Arabia wants to host a tournament, which is likely to happen if no-alcohol regulations are relaxed. \"They will change their rules in the next 12 months probably for certain areas having alcohol, and we'll take darts there and have a party in Saudi,\" he said. \"When I got involved in darts, the total prize money was something like £300,000 for the year. This year it will go to £20m. I expect in five years' time, we'll be playing for £40m.\"\n", + "\n", + "Former electrician Cross charged to the 2018 world title in his first full season, while Adrian Lewis and Michael van Gerwen were multiple victors in their 20s and 16-time champion Phil ‘The Power’ Taylor is widely considered the greatest of all time. Littler is currently fourth in the world rankings, although that is based on a two-year Order of Merit. There have been suggestions from others the spotlight on the teenager means world number one Humphries, 29, has been denied the coverage he deserves, but no darts player has made a mark at such a young age as Littler. \"Luke Humphries is another fabulous player who is going to be around for years. Sport is a very brutal world. 
It is about winning and claiming the high ground. There will be envy around,\" Hearn said. \"Luke Littler is the next Tiger Woods for darts so they better get used to it, and the only way to compete is to get better.\" World number 38 Martin Lukeman was awestruck as he described facing a peak Littler after being crushed 16-3 in the Grand Slam final, with the teenager winning 15 consecutive legs. \"I can't compete with that, it was like Godly. He was relentless, he is so good it's ridiculous,\" he said. Lukeman can still see the benefits he brings, adding: \"What he's done for the sport is brilliant. If it wasn't for him, our wages wouldn't be going up. There's more sponsors, more money coming in, all good.\" Hearn feels future competition may come from players even younger than Littler. \"I watched a 10-year-old a few months ago who averaged 104.89 and checked out a 4-3 win with a 136 finish. They smell the money, the fame and put the hard work in,\" he said. How much better Littler can get is guesswork, although Plummer believes he wants to reach new heights. \"He never says 'how good was I?' But I think he wants to break records and beat Phil Taylor's 16 World Championships and 16 World Matchplay titles,\" he said. \"He's young enough to do it.\" A version of this article was originally published on 29 November.\n", + "• None Know a lot about Littler? 
Take our quiz\n", + "--------------------------------------------------------------------------------\n", + "Distance: 0.4317, Text: Luke Littler is one of six contenders for the 2024 BBC Sports Personality of the Year award.\n", + "\n", + "Here BBC Sport takes a look at the darts player's year in five photos.\n", + "--------------------------------------------------------------------------------\n", + "Distance: 0.4817, Text: Littler is Young Sports Personality of the Year\n", + "\n", + "This video can not be played To play this video you need to enable JavaScript in your browser.\n", + "\n", + "Darts player Luke Littler has been named BBC Young Sports Personality of the Year 2024. The 17-year-old has enjoyed a breakthrough year after finishing runner-up at the 2024 PDC World Darts Championship in January. The Englishman, who has won 10 senior titles on the Professional Darts Corporation tour this year, is the first darts player to claim the award. \"It shows how well I have done this year, not only for myself, but I have changed the sport of darts,\" Littler told BBC One. \"I know the amount of academies that have been brought up in different locations, tickets selling out at Ally Pally in hours and the Premier League selling out - it just shows how much I have changed it.\"\n", + "\n", + "He was presented with the trophy by Harry Aikines-Aryeetey - a former sprinter who won the award in 2005 - and ex-rugby union player Jodie Ounsley, both of whom are stars of the BBC television show Gladiators. Skateboarder Sky Brown, 16, and Para-swimmer William Ellard, 18, were also shortlisted for the award. Littler became a household name at the start of 2024 by reaching the World Championship final aged just 16 years and 347 days. That achievement was just the start of a trophy-laden year, with Littler winning the Premier League Darts, Grand Slam and World Series of Darts Finals among his haul of titles. 
Littler has gone from 164th to fourth in the world rankings and earned more than £1m in prize money in 2024. The judging panel for Young Sports Personality of the Year included Paralympic gold medallist Sammi Kinghorn, Olympic silver medal-winning BMX freestyler Keiran Reilly, television presenter Qasa Alom and Radio 1 DJ Jeremiah Asiamah, as well as representatives from the Youth Sport Trust, Blue Peter and BBC Sport.\n", + "--------------------------------------------------------------------------------\n", + "Distance: 0.4823, Text: Wright is the 17th seed at the World Championship\n", + "\n", + "Two-time champion Peter Wright won his opening game at the PDC World Championship, while Ryan Meikle edged out Fallon Sherrock to set up a match against teenage prodigy Luke Littler. Scotland's Wright, the 2020 and 2022 winner, has been out of form this year, but overcame Wesley Plaisier 3-1 in the second round at Alexandra Palace in London. \"It was this crowd that got me through, they wanted me to win. I thank you all,\" said Wright. Meikle came from a set down to claim a 3-2 victory in his first-round match against Sherrock, who was the first woman to win matches at the tournament five years ago. The 28-year-old will now play on Saturday against Littler, who was named BBC Young Sports Personality of the Year and runner-up in the main award to athlete Keely Hodgkinson on Tuesday night. Littler, 17, will be competing on the Ally Pally stage for the first time since his rise to stardom when finishing runner-up in January's world final to Luke Humphries. Earlier on Tuesday, World Grand Prix champion Mike de Decker – the 24th seed - suffered a surprise defeat to Luke Woodhouse in the second round. He is the second seed to exit following 16th seed James Wade's defeat on Monday to Jermaine Wattimena, who meets Wright in round three. 
Kevin Doets recovered from a set down to win 3-1 against Noa-Lynn van Leuven, who was making history as the first transgender woman to compete in the tournament.\n", + "\n", + "Sherrock drew level at 2-2 but lost the final set to Meikle\n", + "\n", + "The 54-year-old Wright only averaged 89.63 to his opponent's 93.77, but did enough to progress. Sporting a purple mohawk and festive outfit, crowd favourite 'Snakebite' showed glimpses of his best to win the first set and survived eight set darts to go 2-0 ahead. He lost the next but Dutchman Plaisier missed two more set darts in the fourth and Wright seized his opportunity. \"Wesley had his chances but he missed them and I took them,\" he said. \"He's got his tour card and he's going to be a dangerous player next year for all the players playing against him.\" Sherrock, 30, fought back from 2-1 down to force a decider against her English compatriot Meikle. She then narrowly missed the bull to take out 170 in the fourth leg before left-hander Meikle held his nerve to hit double 18 for a 96 finish to seal a hard-fought success. \"I felt under pressure from the start and to come through feels unbelievable,\" said Meikle. \"It's an unbelievable prize to play Luke here on this stage. It's the biggest stage of them all. I'm so happy.\" World number 81 Jeffrey de Graaf, who was born in the Netherlands but now represents Sweden, looked in trouble against Rashad Sweeting before prevailing 3-1. 
Sweeting, who was making history as the first player from the Bahamas to compete in the tournament, took the first set, but De Graaf fought back to clinch a second-round meeting with two-time champion Gary Anderson Germany's Ricardo Pietreczko, ranked 34, beat China's Xiaochen Zong 3-1 and will face Gian van Veen next.\n", + "--------------------------------------------------------------------------------\n", + "Distance: 0.4823, Text: Wright is the 17th seed at the World Championship\n", + "\n", + "Two-time champion Peter Wright won his opening game at the PDC World Championship, while Ryan Meikle edged out Fallon Sherrock to set up a match against teenage prodigy Luke Littler. Scotland's Wright, the 2020 and 2022 winner, has been out of form this year, but overcame Wesley Plaisier 3-1 in the second round at Alexandra Palace in London. \"It was this crowd that got me through, they wanted me to win. I thank you all,\" said Wright. Meikle came from a set down to claim a 3-2 victory in his first-round match against Sherrock, who was the first woman to win matches at the tournament five years ago. The 28-year-old will now play on Saturday against Littler, who was named BBC Young Sports Personality of the Year and runner-up in the main award to athlete Keely Hodgkinson on Tuesday night. Littler, 17, will be competing on the Ally Pally stage for the first time since his rise to stardom when finishing runner-up in January's world final to Luke Humphries. Earlier on Tuesday, World Grand Prix champion Mike de Decker – the 24th seed - suffered a surprise defeat to Luke Woodhouse in the second round. He is the second seed to exit following 16th seed James Wade's defeat on Monday to Jermaine Wattimena, who meets Wright in round three. 
Kevin Doets recovered from a set down to win 3-1 against Noa-Lynn van Leuven, who was making history as the first transgender woman to compete in the tournament.\n", + "\n", + "Sherrock drew level at 2-2 but lost the final set to Meikle\n", + "\n", + "The 54-year-old Wright only averaged 89.63 to his opponent's 93.77, but did enough to progress. Sporting a purple mohawk and festive outfit, crowd favourite 'Snakebite' showed glimpses of his best to win the first set and survived eight set darts to go 2-0 ahead. He lost the next but Dutchman Plaisier missed two more set darts in the fourth and Wright seized his opportunity. \"Wesley had his chances but he missed them and I took them,\" he said. \"He's got his tour card and he's going to be a dangerous player next year for all the players playing against him.\" Sherrock, 30, fought back from 2-1 down to force a decider against her English compatriot Meikle. She then narrowly missed the bull to take out 170 in the fourth leg before left-hander Meikle held his nerve to hit double 18 for a 96 finish to seal a hard-fought success. \"I felt under pressure from the start and to come through feels unbelievable,\" said Meikle. \"It's an unbelievable prize to play Luke here on this stage. It's the biggest stage of them all. I'm so happy.\" World number 81 Jeffrey de Graaf, who was born in the Netherlands but now represents Sweden, looked in trouble against Rashad Sweeting before prevailing 3-1. 
Sweeting, who was making history as the first player from the Bahamas to compete in the tournament, took the first set, but De Graaf fought back to clinch a second-round meeting with two-time champion Gary Anderson Germany's Ricardo Pietreczko, ranked 34, beat China's Xiaochen Zong 3-1 and will face Gian van Veen next.\n", + "--------------------------------------------------------------------------------\n", + "Distance: 0.5302, Text: Luke Littler trends higher than PM on Google in 2024\n", + "\n", + "Luke Littler shot to fame when he became the youngest player to reach the World Darts Championship final in January\n", + "\n", + "Dart sensation Luke Littler has said he \"can't quite believe\" he has trended higher than the prime minister and the King in Google's most searched for lists in 2024. The 17-year-old star was an unknown when he came to prominence as the youngest player to reach the World Darts Championship final in January. He has subsequently risen to fourth in the world rankings and his fame has led him to lie behind only Catherine, Princess of Wales, and US president elect Donald Trump as Google's most searched for person in the UK in 2024. He has also taken the top slot as the most searched for athlete on the search engine, which he said was \"a proud moment\" in what had been \"an amazing year\".\n", + "\n", + "A peak TV audience of 3.7m watched the then-16-year-old's appearance in the final. He lost by seven sets to four to world number one Luke Humphries, but earned £200,000 as the runner-up. He beat Michael van Gerwen later in the same month to win the Bahrain Darts Masters and secure his first Professional Darts Corporation (PDC) senior title. 
The event also saw he become the youngest person to make a nine-dart finish on live television, which is considered one of the sport's highest achievements and sees a player score the required 501 in the lowest number of darts possible.\n", + "\n", + "Luke Littler said the award was a \"huge honour\"\n", + "\n", + "In May, Littler won the 2024 Premier League Darts, his first major PDC title, and in November, Littler won the Grand Slam of Darts for his first major ranking title. The corporation's statistics showed that after each win, there was increased interest in Littler online, with even his first round exit on his World Grand Prix debut in October appearing in searches.\n", + "\n", + "Littler, who plays under the nickname of The Nuke, said it had been \"an amazing year for me personally, and for the sport of darts as a whole\". \"To be recognised in two Year in Search lists is a huge honour,\" he said. \"I can't quite believe I'm trending higher than both the prime minister and the King in the 'People' category—and in a year of such great sporting achievements, it's a proud moment for me to be the top trending athlete in 2024.\"\n", + "\n", + "Google's most searched people in UK in 2024 Google's most searched for athletes in UK in 2024\n", + "\n", + "Google's Year in Search lists, external were also impacted by the announced return of rock superstars Oasis for a 2025 tour. The Mancunian legends topped the list of most searched for musicians, ahead of Sabrina Carpenter, One Direction, Dave Grohl and Raye, while \"how to get Oasis tickets\" was second only to \"how to vote in the UK\" in the list of searched questions. Matt Cooke from the Google News Initiative said 2024 had been \"a year of comebacks, curiosity, and community\". \"Whether it's fans reuniting for Oasis, young sports stars like Luke Littler making waves, or Brits voting in everything from elections to Eurovision, these searches show a nation full of passion and interest,\" he said. 
\"It's amazing to see what captivated the UK, and it's always a privilege to highlight these moments in our Year in Search.\"\n", + "--------------------------------------------------------------------------------\n", + "Distance: 0.6582, Text: Cross loses as record number of seeds out of Worlds\n", + "\n", + "Rob Cross has suffered three second-round exits in his eight World Championships\n", + "\n", + "Former champion Rob Cross became the latest high-profile casualty as a record-breaking 14th seed exited the PDC World Darts Championship in the second round. The number five seed was beaten 3-1 by close friend Scott Williams, who will face Germany's Ricardo Pietreczko in round three. Cross, who won the event on his debut in 2018, took the opening set but failed to reach anywhere near his best as he suffered his third second-round exit. He was joined by number six seed David Chisnall, who was beaten 3-2 in a sudden-death leg by Ricky Evans, who came into the tournament 46th in the PDC's Order of Merit. The 2021 semi-finalist won the opening set, but then found himself 2-1 down to an inspired Evans, who was cheered on relentlessly by the Alexandra Palace crowd. He forced the game into a deciding set and faced match dart but Evans missed bullseye by the width of the wire. Chisnall then missed his own match dart on double tops, before he made a miscalculation when attempting to checkout 139 at 5-4 down. No real harm was done with a sudden-death leg forced but he was unable to hold off Evans, who reaches the third round for the third time in the last five years. \"It's not even what it is, again I've played a world-class darts player. I've played quite well and won,\" Evans told Sky Sports. \"Look at this [the crowd], wow. I don't understand it, why are they cheering me on? \"I don't get this reception in my household. Thank you very much. 
You've made a very fat guy very happy.\" Evans will face unseeded Welshman Robert Owen when the third round starts after the three-day Christmas break.\n", + "\n", + "World youth champion Gian van Veen had become the 12th seed to be knocked out when he lost 3-1 to Pietreczko. The 28th seed lost the opening set, having missed nine darts at double, but levelled. However, the Dutchman was unable to match Pietreczko, who closed out a comfortable win with a checkout percentage of 55.6%. Pietreczko said: \"I am over the moon to win. It is very important for me to be in the third round after Christmas. I love the big stage.\" The 26th seed trailed 1-0 and 2-1, and both players went on to miss match darts, before Gurney won the final set 3-1 on legs.\n", + "\n", + "Jonny Clayton is into the third round of the PDC World Darts Championship for a sixth consecutive year\n", + "\n", + "In the afternoon session, Welsh number seven seed Jonny Clayton also needed sudden death to pull off a sensational final-set comeback against Mickey Mansell in. He was a leg away from defeat twice to his Northern Irish opponent, but came from behind to win the final set 6-5 in a sudden-death leg to win 3-2. Clayton, who will play Gurney in round three, lost the opening set of the match, but fought back to lead 2-1, before being pegged back again by 51-year-old Mansell, who then missed match darts on double tops in the deciding set. \"I was very emotional. I've got to be honest, that meant a lot,\" said Clayton, who is in the favourable half of the draw following shock second-round exits for former world champions Michael Smith and Gary Anderson. \"I had chances before and Mickey definitely had chances before. It wasn't great to play in, not the best - I wouldn't wish that on my worst enemy. \"There is a lot of weight off my shoulders after that. 
I know there is another gear or two in the bank, but I'll be honest that meant a lot to me, it is a tester and will try and make me believe again.\" Clayton was 2-0 down in the fifth set after consecutive 136 and 154 checkouts from Mansell, but won three legs on the trot in 15, 12 and 10 darts to wrestle a 3-2 lead. He missed three darts for the match, before his unseeded opponent held and broke Clayton's throw to lead 4-3. Mansell missed a match dart at double 20, before Clayton won on double five after two missed checkouts. Elsewhere, Northern Ireland's Josh Rock booked his place in the third round against England's Chris Dobey with a 3-0 win over Wales' Rhys Griffin. Martin Lukeman, runner-up to Luke Littler at the Grand Slam of Darts last month, is out after a 3-1 loss to number 21 seed Andrew Gilding. The final day before the Christmas break started with Poland's number 31 seed Krzysztof Ratajski recording a 3-1 win over Alexis Toylo of the Philippines.\n", + "\n", + "All times are GMT and subject to change. Two fourth-round matches will also be played\n", + "--------------------------------------------------------------------------------\n", + "Distance: 0.6872, Text: Michael van Gerwen has made just one major ranking event final in 2024\n", + "\n", + "Michael van Gerwen enjoyed a comfortable 3-0 victory over English debutant James Hurrell in his opening match of the PDC World Darts Championship. The three-time world champion has had a tough year by his standards, having fallen behind Luke Littler and Luke Humphries, so a relatively stress-free opening match at Alexandra Palace was just what was needed. Hurrell, 40, offered some resistance early on when taking the opening leg of the match, but he would win just two more as Van Gerwen proved far too strong. The third-seeded Dutchman averaged 94.85, took out two three-figure checkouts and hit 50% of his doubles - with six of his nine misses coming in one scrappy leg. 
Van Gerwen, 35, will now face either Brendan Dolan or Lok Yin Lee in the third round.\n", + "\n", + "\"I think I played OK,\" Van Gerwen told Sky Sports after his match. \"Of course, I was a bit nervous. Like everyone knows it's been a tough year for me. \"Overall, it was a good performance. I was confident. I won the game, that's the main thing.\" Also on Friday night, Germany's Florian Hempel showed why he loves playing on the Alexandra Palace stage with a thrilling 3-1 victory in a high-quality contest against Jeffrey de Zwaan. Both men hit seven 180s in a match played at a fast and furious pace, but 34-year-old Hempel's superior doubles gave him a fourth straight first-round victory in the competition. Hempel moves on to a tie with 26th seed Daryl Gurney but it was a damaging loss for De Zwaan, 28, who came through a late qualifier in November and needed a good run here to keep his PDC tour card for next season. Mickey Mansell earned a second-round date with world number seven Jonny Clayton after a scrappy 3-1 win over Japan's Tomoya Goto, while Dylan Slevin came through an all-Irish tie against William O'Connor to progress to a meeting with Dimitri van den Bergh.\n", + "\n", + "Stephen Bunting is in the third round of the PDC World Darts Championship for a third consecutive year\n", + "\n", + "In the afternoon session, Stephen Bunting came from behind to beat Kai Gotthardt 3-1 and book his place in the third round. Englishman Bunting, ranked eighth in the world, dropped the first set and almost went 2-0 down in the match before staging an impressive recovery. Tournament debutant Gotthardt missed three darts at double eight to win the second set, allowing Bunting to take out double 10 to level the match before powering away to victory by winning the third and fourth sets without losing a leg. Victory for \"The Bullet\" sets up a last 32 meeting with the winner of Dirk van Duijvenbode's meeting with Madars Razma after Christmas. 
Should Bunting progress further, he is seeded to face world number one and defending world champion Luke Humphries in the quarter-finals on New Year's Day. Elsewhere in Friday afternoon's session, the Dutch duo of Alexander Merkx and Wessel Nijman advanced to the second round with wins over Stephen Burton and Cameron Carolissen respectively. England's Ian White was handed a walkover victory against Sandro Eric Sosing of the Philippines. Sosing withdrew from the competition on medical grounds and was taken to hospital following chest pains.\n", + "--------------------------------------------------------------------------------\n", + "Distance: 0.7012, Text: Christian Kist was sealing his first televised nine-darter\n", + "\n", + "Christian Kist hit a nine-darter but lost his PDC World Championship first-round match to Madars Razma. The Dutchman became the first player to seal a perfect leg in the tournament since Michael Smith did so on the way to beating Michael van Gerwen in the 2023 final. Kist, the 2012 BDO world champion at Lakeside, collects £60,000 for the feat, with the same amount being awarded by sponsors to a charity and to one spectator inside Alexandra Palace in London. The 38-year-old's brilliant finish sealed the opening set, but his Latvian opponent bounced back to win 3-1. Darts is one of the few sports that can measure perfection; snooker has the 147 maximum break, golf has the hole-in-one, darts has the nine-dart finish. Kist scored two maximum 180s to leave a 141 checkout which he completed with a double 12, to the delight of more than 3,000 spectators. The English 12th seed, who has been troubled by wrist and back injuries, could next play Andrew Gilding in the third round - which begins on 27 December - should Gilding beat the winner of Martin Lukeman's match against qualifier Nitin Kumar. 
Aspinall faces a tough task to reach the last four again, with 2018 champion Rob Cross and 2024 runner-up Luke Littler both in his side of the draw.\n", + "\n", + "Kist - who was knocked out of last year's tournament by teenager Littler - will still earn a bigger cheque than he would have got for a routine run to the quarter-finals. His nine-darter was the 15th in the history of the championship and first since the greatest leg in darts history when Smith struck, moments after Van Gerwen just missed his attempt. Darts fan Kris, a railway worker from Sutton in south London, was the random spectator picked out to receive £60,000, with Prostate Cancer UK getting the same sum from tournament sponsors Paddy Power. \"I'm speechless to be honest. I didn't expect it to happen to me,\" Kris said. \"This was a birthday present so it makes it even better. My grandad got me tickets. It was just a normal day - I came here after work.\" Kist said: \"Hitting the double 12 felt amazing. It was a lovely moment for everyone and I hope Kris enjoys the money. Maybe I will go on vacation next month.\" Earlier, Jim Williams was favourite against Paolo Nebrida but lost 3-2 in an epic lasting more than an hour. The Filipino took a surprise 2-1 lead and Williams only went ahead for the first time in the opening leg of the deciding set. The Welshman looked on course for victory but missed five match darts. 
UK Open semi-finalist Ricky Evans set up a second-round match against Dave Chisnall, checking out on 109 to edge past Gordon Mathers 3-2.\n", + "--------------------------------------------------------------------------------\n" + ] + } + ], + "source": [ + "query = \"What were Luke Littler's key achievements and records in his recent PDC World Championship match?\"\n", + "\n", + "try:\n", + " # Perform the semantic search\n", + " start_time = time.time()\n", + " search_results = vector_store.similarity_search_with_score(query, k=10)\n", + " search_elapsed_time = time.time() - start_time\n", + "\n", + " logging.info(f\"Semantic search completed in {search_elapsed_time:.2f} seconds\")\n", + "\n", + " # Display search results\n", + " print(f\"\\nSemantic Search Results (completed in {search_elapsed_time:.2f} seconds):\")\n", + " print(\"-\" * 80)\n", + "\n", + " for doc, score in search_results:\n", + " print(f\"Distance: {score:.4f}, Text: {doc.page_content}\")\n", + " print(\"-\" * 80)\n", + "\n", + "except CouchbaseException as e:\n", + " raise RuntimeError(f\"Error performing semantic search: {str(e)}\")\n", + "except Exception as e:\n", + " raise RuntimeError(f\"Unexpected error: {str(e)}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Optimizing Vector Search with Global Secondary Index (GSI)\n", + "\n", + "While the above semantic search using similarity_search_with_score works effectively, we can significantly improve query performance by leveraging Global Secondary Index (GSI) in Couchbase.\n", + "\n", + "Couchbase offers three types of vector indexes, but for GSI-based vector search we focus on two main types:\n", + "\n", + "Hyperscale Vector Indexes (BHIVE)\n", + "- Best for pure vector searches - content discovery, recommendations, semantic search\n", + "- High performance with low memory footprint - designed to scale to billions of vectors\n", + "- Optimized for concurrent operations - supports simultaneous searches and 
inserts\n", + "- Use when: You primarily perform vector-only queries without complex scalar filtering\n", + "- Ideal for: Large-scale semantic search, recommendation systems, content discovery\n", + "\n", + "Composite Vector Indexes \n", + "- Best for filtered vector searches - combines vector search with scalar value filtering\n", + "- Efficient pre-filtering - scalar attributes reduce the vector comparison scope\n", + "- Use when: Your queries combine vector similarity with scalar filters that eliminate large portions of data\n", + "- Ideal for: Compliance-based filtering, user-specific searches, time-bounded queries\n", + "\n", + "Choosing the Right Index Type\n", + "- Start with Hyperscale Vector Index for pure vector searches and large datasets\n", + "- Use Composite Vector Index when scalar filters significantly reduce your search space\n", + "- Consider your dataset size: Hyperscale scales to billions, Composite works well for tens of millions to billions\n", + "\n", + "For more details, see the [Couchbase Vector Index documentation](https://preview.docs-test.couchbase.com/docs-server-DOC-12565_vector_search_concepts/server/current/vector-index/use-vector-indexes.html).\n", + "\n", + "\n", + "## Understanding Index Configuration (Couchbase 8.0 Feature)\n", + "\n", + "The index_description parameter controls how Couchbase optimizes vector storage and search performance through centroids and quantization:\n", + "\n", + "Format: `'IVF[<centroids>],{PQ|SQ}<settings>'`\n", + "\n", + "Centroids (IVF - Inverted File):\n", + "- Controls how the dataset is subdivided for faster searches\n", + "- More centroids = faster search, slower training \n", + "- Fewer centroids = slower search, faster training\n", + "- If omitted (like IVF,SQ8), Couchbase auto-selects based on dataset size\n", + "\n", + "Quantization Options:\n", + "- SQ (Scalar Quantization): SQ4, SQ6, SQ8 (4, 6, or 8 bits per dimension)\n", + "- PQ (Product Quantization): `PQ<subquantizers>x<bits>` (e.g., PQ32x8)\n", + "- Higher values = better 
accuracy, larger index size\n", + "\n", + "Common Examples:\n", + "- IVF,SQ8 - Auto centroids, 8-bit scalar quantization (good default)\n", + "- IVF1000,SQ6 - 1000 centroids, 6-bit scalar quantization \n", + "- IVF,PQ32x8 - Auto centroids, 32 subquantizers with 8 bits\n", + "\n", + "For detailed configuration options, see the [Quantization & Centroid Settings](https://preview.docs-test.couchbase.com/docs-server-DOC-12565_vector_search_concepts/server/current/vector-index/hyperscale-vector-index.html#algo_settings).\n", + "\n", + "In the code below, we demonstrate creating a BHIVE index. This method takes an index type (BHIVE or COMPOSITE) and a description parameter for optimization settings. Alternatively, GSI indexes can be created manually from the Couchbase UI." + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain_couchbase.vectorstores import IndexType\n", + "vector_store.create_index(index_type=IndexType.BHIVE, index_name=\"bedrock_bhive_index\",index_description=\"IVF,SQ8\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The example below shows running the same similarity search, but now using the BHIVE GSI index we created above. You'll notice improved performance as the index efficiently retrieves data.\n", + "\n", + "**Important**: When using Composite indexes, scalar filters take precedence over vector similarity, which can improve performance for filtered searches but may miss some semantically relevant results that don't match the scalar criteria.\n", + "\n", + "Note: In GSI vector search, the distance represents the vector distance between the query and document embeddings. A lower distance indicates higher similarity, while a higher distance indicates lower similarity." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025-09-02 12:24:54,503 - INFO - Semantic search completed in 0.36 seconds\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Semantic Search Results (completed in 0.36 seconds):\n", + "--------------------------------------------------------------------------------\n", + "Distance: 0.3512, Text: Luke Littler has risen from 164th to fourth in the rankings in a year\n", + "\n", + "A tearful Luke Littler hit a tournament record 140.91 set average as he started his bid for the PDC World Championship title with a dramatic 3-1 win over Ryan Meikle. The 17-year-old made headlines around the world when he reached the tournament final in January, where he lost to Luke Humphries. Starting this campaign on Saturday, Littler was millimetres away from a nine-darter when he missed double 12 as he blew Meikle away in the fourth and final set of the second-round match. Littler was overcome with emotion at the end, cutting short his on-stage interview. \"It was probably the toughest game I've ever played. I had to fight until the end,\" he said later in a news conference. \"As soon as the question came on stage and then boom, the tears came. It was just a bit too much to speak on stage. \"It is the worst game I have played. I have never felt anything like that tonight.\" Admitting to nerves during the match, he told Sky Sports: \"Yes, probably the biggest time it's hit me. Coming into it I was fine, but as soon as [referee] George Noble said 'game on', I couldn't throw them.\" Littler started slowly against Meikle, who had two darts for the opening set, but he took the lead by twice hitting double 20. Meikle did not look overawed against his fellow Englishman and levelled, but Littler won the third set and exploded into life in the fourth. 
The tournament favourite hit four maximum 180s as he clinched three straight legs in 11, 10 and 11 darts for a record set average, and 100.85 overall. Meanwhile, two seeds crashed out on Saturday night – five-time world champion Raymond van Barneveld lost to Welshman Nick Kenny, while England's Ryan Joyce beat Danny Noppert. Australian Damon Heta was another to narrowly miss out on a nine-darter, just failing on double 12 when throwing for the match in a 3-1 win over Connor Scutt. Ninth seed Heta hit four 100-plus checkouts to come from a set down against Scutt in a match in which both men averaged more than 97.\n", + "\n", + "Littler was hugged by his parents after victory over Meikle\n", + "\n", + "Littler returned to Alexandra Palace to a boisterous reception from more than 3,000 spectators and delivered an astonishing display in the fourth set. He was on for a nine-darter after his opening two throws in both of the first two legs and completed the set in 32 darts - the minimum possible is 27. The teenager will next play after Christmas against European Championship winner Ritchie Edhouse, the 29th seed, or Ian White, and is seeded to meet Humphries in the semi-finals. Having entered last year's event ranked 164th, Littler is up to fourth in the world and will go to number two if he reaches the final again this time. He has won 10 titles in his debut professional year, including the Premier League and Grand Slam of Darts. After reaching the World Championship final as a debutant aged just 16, Littler's life has been transformed and interest in darts has rocketed. Google say he was the most searched-for athlete online in the UK during 2024. This Christmas, more than 100,000 children are expected to be opening Littler-branded magnetic dartboards as presents. His impact has helped double the number of junior academies and has prompted plans to expand the World Championship. 
Littler was named BBC Young Sports Personality of the Year on Tuesday and was runner-up to athlete Keely Hodgkinson for the main award.\n", + "\n", + "Nick Kenny will play world champion Luke Humphries in round three after Christmas\n", + "\n", + "Barneveld was shocked 3-1 by world number 76 Kenny, who was in tears after a famous victory. Kenny, 32, will face Humphries in round three after defeating the Dutchman, who won the BDO world title four times and the PDC crown in 2007. Van Barneveld, ranked 32nd, became the sixth seed to exit in the second round. His compatriot Noppert, the 13th seed, was stunned 3-1 by Joyce, who will face Ryan Searle or Matt Campbell next, with the winner of that tie potentially meeting Littler in the last 16. Elsewhere, 15th seed Chris Dobey booked his place in the third round with a 3-1 win over Alexander Merkx. Englishman Dobey concluded an afternoon session which started with a trio of 3-0 scorelines. Northern Ireland's Brendan Dolan beat Lok Yin Lee to set up a meeting with three-time champion Michael van Gerwen after Christmas. In the final two first-round matches of the 2025 competition, Wales' Rhys Griffin beat Karel Sedlacek of the Czech Republic before Asia number one Alexis Toylo cruised past Richard Veenstra.\n", + "--------------------------------------------------------------------------------\n", + "Distance: 0.4124, Text: The Littler effect - how darts hit the bullseye\n", + "\n", + "Teenager Luke Littler began his bid to win the 2025 PDC World Darts Championship with a second-round win against Ryan Meikle. Here we assess Littler's impact after a remarkable rise which saw him named BBC Young Sports Personality of the Year and runner-up in the main award to athlete Keely Hodgkinson.\n", + "\n", + "One year ago, he was barely a household name in his own home. Now he is a sporting phenomenon. 
After emerging from obscurity aged 16 to reach the World Championship final, the life of Luke Littler and the sport he loves has been transformed. Viewing figures, ticket sales and social media interest have rocketed. Darts has hit the bullseye. This Christmas more than 100,000 children are expected to be opening Littler-branded magnetic dartboards as presents. His impact has helped double the number of junior academies, prompted plans to expand the World Championship and generated interest in darts from Saudi Arabian backers.\n", + "\n", + "Just months after taking his GCSE exams and ranked 164th in the world, Littler beat former champions Raymond van Barneveld and Rob Cross en route to the PDC World Championship final in January, before his run ended with a 7-4 loss to Luke Humphries. With his nickname 'The Nuke' on his purple and yellow shirt and the Alexandra Palace crowd belting out his walk-on song, Pitbull's tune Greenlight, he became an instant hit. Electric on the stage, calm off it. The down-to-earth teenager celebrated with a kebab and computer games. \"We've been watching his progress since he was about seven. He was on our radar, but we never anticipated what would happen. The next thing we know 'Littlermania' is spreading everywhere,\" PDC president Barry Hearn told BBC Sport. A peak TV audience of 3.7 million people watched the final - easily Sky's biggest figure for a non-football sporting event. The teenager from Warrington in Cheshire was too young to legally drive or drink alcohol, but earned £200,000 for finishing second - part of £1m prize money in his first year as a professional - and an invitation to the elite Premier League competition. He turned 17 later in January but was he too young for the demanding event over 17 Thursday nights in 17 locations? He ended up winning the whole thing, and hit a nine-dart finish against Humphries in the final. 
From Bahrain to Wolverhampton, Littler claimed 10 titles in 2024 and is now eyeing the World Championship.\n", + "\n", + "As he progressed at the Ally Pally, the Manchester United fan was sent a good luck message by the club's former midfielder and ex-England captain David Beckham. In 12 months, Littler's Instagram followers have risen from 4,000 to 1.3m. Commercial backers include a clothing range, cereal firm and train company and he will appear in a reboot of the TV darts show Bullseye. Google say he was the most searched-for athlete online in the UK during 2024. On the back of his success, Littler darts, boards, cabinets, shirts are being snapped up in big numbers. \"This Christmas the junior magnetic dartboard is selling out, we're talking over 100,000. They're 20 quid and a great introduction for young children,\" said Garry Plummer, the boss of sponsors Target Darts, who first signed a deal with Littler's family when he was aged 12. \"All the toy shops want it, they all want him - 17, clean, doesn't drink, wonderful.\"\n", + "\n", + "Littler beat Luke Humphries to win the Premier League title in May\n", + "\n", + "The number of academies for children under the age of 16 has doubled in the last year, says Junior Darts Corporation chairman Steve Brown. There are 115 dedicated groups offering youngsters equipment, tournaments and a place to develop, with bases including Australia, Bulgaria, Greece, Norway, USA and Mongolia. \"We've seen so many inquiries from around the world, it's been such a boom. It took us 14 years to get 1,600 members and within 12 months we have over 3,000, and waiting lists,\" said Brown. \"When I played darts as a child, I was quite embarrassed to tell my friends what my hobby was. All these kids playing darts now are pretty popular at school. It's a bit rock 'n roll and recognised as a cool thing to do.\" Plans are being hatched to extend the World Championship by four days and increase the number of players from 96 to 128. 
That will boost the number of tickets available by 25,000 to 115,000 but Hearn reckons he could sell three times as many. He says Saudi Arabia wants to host a tournament, which is likely to happen if no-alcohol regulations are relaxed. \"They will change their rules in the next 12 months probably for certain areas having alcohol, and we'll take darts there and have a party in Saudi,\" he said. \"When I got involved in darts, the total prize money was something like £300,000 for the year. This year it will go to £20m. I expect in five years' time, we'll be playing for £40m.\"\n", + "\n", + "Former electrician Cross charged to the 2018 world title in his first full season, while Adrian Lewis and Michael van Gerwen were multiple victors in their 20s and 16-time champion Phil ‘The Power’ Taylor is widely considered the greatest of all time. Littler is currently fourth in the world rankings, although that is based on a two-year Order of Merit. There have been suggestions from others the spotlight on the teenager means world number one Humphries, 29, has been denied the coverage he deserves, but no darts player has made a mark at such a young age as Littler. \"Luke Humphries is another fabulous player who is going to be around for years. Sport is a very brutal world. It is about winning and claiming the high ground. There will be envy around,\" Hearn said. \"Luke Littler is the next Tiger Woods for darts so they better get used to it, and the only way to compete is to get better.\" World number 38 Martin Lukeman was awestruck as he described facing a peak Littler after being crushed 16-3 in the Grand Slam final, with the teenager winning 15 consecutive legs. \"I can't compete with that, it was like Godly. He was relentless, he is so good it's ridiculous,\" he said. Lukeman can still see the benefits he brings, adding: \"What he's done for the sport is brilliant. If it wasn't for him, our wages wouldn't be going up. 
There's more sponsors, more money coming in, all good.\" Hearn feels future competition may come from players even younger than Littler. \"I watched a 10-year-old a few months ago who averaged 104.89 and checked out a 4-3 win with a 136 finish. They smell the money, the fame and put the hard work in,\" he said. How much better Littler can get is guesswork, although Plummer believes he wants to reach new heights. \"He never says 'how good was I?' But I think he wants to break records and beat Phil Taylor's 16 World Championships and 16 World Matchplay titles,\" he said. \"He's young enough to do it.\" A version of this article was originally published on 29 November.\n", + "• None Know a lot about Littler? Take our quiz\n", + "--------------------------------------------------------------------------------\n", + "Distance: 0.4317, Text: Luke Littler is one of six contenders for the 2024 BBC Sports Personality of the Year award.\n", + "\n", + "Here BBC Sport takes a look at the darts player's year in five photos.\n", + "--------------------------------------------------------------------------------\n", + "Distance: 0.4817, Text: Littler is Young Sports Personality of the Year\n", + "\n", + "This video can not be played To play this video you need to enable JavaScript in your browser.\n", + "\n", + "Darts player Luke Littler has been named BBC Young Sports Personality of the Year 2024. The 17-year-old has enjoyed a breakthrough year after finishing runner-up at the 2024 PDC World Darts Championship in January. The Englishman, who has won 10 senior titles on the Professional Darts Corporation tour this year, is the first darts player to claim the award. \"It shows how well I have done this year, not only for myself, but I have changed the sport of darts,\" Littler told BBC One. 
\"I know the amount of academies that have been brought up in different locations, tickets selling out at Ally Pally in hours and the Premier League selling out - it just shows how much I have changed it.\"\n", + "\n", + "He was presented with the trophy by Harry Aikines-Aryeetey - a former sprinter who won the award in 2005 - and ex-rugby union player Jodie Ounsley, both of whom are stars of the BBC television show Gladiators. Skateboarder Sky Brown, 16, and Para-swimmer William Ellard, 18, were also shortlisted for the award. Littler became a household name at the start of 2024 by reaching the World Championship final aged just 16 years and 347 days. That achievement was just the start of a trophy-laden year, with Littler winning the Premier League Darts, Grand Slam and World Series of Darts Finals among his haul of titles. Littler has gone from 164th to fourth in the world rankings and earned more than £1m in prize money in 2024. The judging panel for Young Sports Personality of the Year included Paralympic gold medallist Sammi Kinghorn, Olympic silver medal-winning BMX freestyler Keiran Reilly, television presenter Qasa Alom and Radio 1 DJ Jeremiah Asiamah, as well as representatives from the Youth Sport Trust, Blue Peter and BBC Sport.\n", + "--------------------------------------------------------------------------------\n", + "Distance: 0.4823, Text: Wright is the 17th seed at the World Championship\n", + "\n", + "Two-time champion Peter Wright won his opening game at the PDC World Championship, while Ryan Meikle edged out Fallon Sherrock to set up a match against teenage prodigy Luke Littler. Scotland's Wright, the 2020 and 2022 winner, has been out of form this year, but overcame Wesley Plaisier 3-1 in the second round at Alexandra Palace in London. \"It was this crowd that got me through, they wanted me to win. I thank you all,\" said Wright. 
Meikle came from a set down to claim a 3-2 victory in his first-round match against Sherrock, who was the first woman to win matches at the tournament five years ago. The 28-year-old will now play on Saturday against Littler, who was named BBC Young Sports Personality of the Year and runner-up in the main award to athlete Keely Hodgkinson on Tuesday night. Littler, 17, will be competing on the Ally Pally stage for the first time since his rise to stardom when finishing runner-up in January's world final to Luke Humphries. Earlier on Tuesday, World Grand Prix champion Mike de Decker – the 24th seed - suffered a surprise defeat to Luke Woodhouse in the second round. He is the second seed to exit following 16th seed James Wade's defeat on Monday to Jermaine Wattimena, who meets Wright in round three. Kevin Doets recovered from a set down to win 3-1 against Noa-Lynn van Leuven, who was making history as the first transgender woman to compete in the tournament.\n", + "\n", + "Sherrock drew level at 2-2 but lost the final set to Meikle\n", + "\n", + "The 54-year-old Wright only averaged 89.63 to his opponent's 93.77, but did enough to progress. Sporting a purple mohawk and festive outfit, crowd favourite 'Snakebite' showed glimpses of his best to win the first set and survived eight set darts to go 2-0 ahead. He lost the next but Dutchman Plaisier missed two more set darts in the fourth and Wright seized his opportunity. \"Wesley had his chances but he missed them and I took them,\" he said. \"He's got his tour card and he's going to be a dangerous player next year for all the players playing against him.\" Sherrock, 30, fought back from 2-1 down to force a decider against her English compatriot Meikle. She then narrowly missed the bull to take out 170 in the fourth leg before left-hander Meikle held his nerve to hit double 18 for a 96 finish to seal a hard-fought success. \"I felt under pressure from the start and to come through feels unbelievable,\" said Meikle. 
\"It's an unbelievable prize to play Luke here on this stage. It's the biggest stage of them all. I'm so happy.\" World number 81 Jeffrey de Graaf, who was born in the Netherlands but now represents Sweden, looked in trouble against Rashad Sweeting before prevailing 3-1. Sweeting, who was making history as the first player from the Bahamas to compete in the tournament, took the first set, but De Graaf fought back to clinch a second-round meeting with two-time champion Gary Anderson Germany's Ricardo Pietreczko, ranked 34, beat China's Xiaochen Zong 3-1 and will face Gian van Veen next.\n", + "--------------------------------------------------------------------------------\n", + "Distance: 0.4823, Text: Wright is the 17th seed at the World Championship\n", + "\n", + "Two-time champion Peter Wright won his opening game at the PDC World Championship, while Ryan Meikle edged out Fallon Sherrock to set up a match against teenage prodigy Luke Littler. Scotland's Wright, the 2020 and 2022 winner, has been out of form this year, but overcame Wesley Plaisier 3-1 in the second round at Alexandra Palace in London. \"It was this crowd that got me through, they wanted me to win. I thank you all,\" said Wright. Meikle came from a set down to claim a 3-2 victory in his first-round match against Sherrock, who was the first woman to win matches at the tournament five years ago. The 28-year-old will now play on Saturday against Littler, who was named BBC Young Sports Personality of the Year and runner-up in the main award to athlete Keely Hodgkinson on Tuesday night. Littler, 17, will be competing on the Ally Pally stage for the first time since his rise to stardom when finishing runner-up in January's world final to Luke Humphries. Earlier on Tuesday, World Grand Prix champion Mike de Decker – the 24th seed - suffered a surprise defeat to Luke Woodhouse in the second round. 
He is the second seed to exit following 16th seed James Wade's defeat on Monday to Jermaine Wattimena, who meets Wright in round three. Kevin Doets recovered from a set down to win 3-1 against Noa-Lynn van Leuven, who was making history as the first transgender woman to compete in the tournament.\n", + "\n", + "Sherrock drew level at 2-2 but lost the final set to Meikle\n", + "\n", + "The 54-year-old Wright only averaged 89.63 to his opponent's 93.77, but did enough to progress. Sporting a purple mohawk and festive outfit, crowd favourite 'Snakebite' showed glimpses of his best to win the first set and survived eight set darts to go 2-0 ahead. He lost the next but Dutchman Plaisier missed two more set darts in the fourth and Wright seized his opportunity. \"Wesley had his chances but he missed them and I took them,\" he said. \"He's got his tour card and he's going to be a dangerous player next year for all the players playing against him.\" Sherrock, 30, fought back from 2-1 down to force a decider against her English compatriot Meikle. She then narrowly missed the bull to take out 170 in the fourth leg before left-hander Meikle held his nerve to hit double 18 for a 96 finish to seal a hard-fought success. \"I felt under pressure from the start and to come through feels unbelievable,\" said Meikle. \"It's an unbelievable prize to play Luke here on this stage. It's the biggest stage of them all. I'm so happy.\" World number 81 Jeffrey de Graaf, who was born in the Netherlands but now represents Sweden, looked in trouble against Rashad Sweeting before prevailing 3-1. 
Sweeting, who was making history as the first player from the Bahamas to compete in the tournament, took the first set, but De Graaf fought back to clinch a second-round meeting with two-time champion Gary Anderson Germany's Ricardo Pietreczko, ranked 34, beat China's Xiaochen Zong 3-1 and will face Gian van Veen next.\n", + "--------------------------------------------------------------------------------\n", + "Distance: 0.5302, Text: Luke Littler trends higher than PM on Google in 2024\n", + "\n", + "Luke Littler shot to fame when he became the youngest player to reach the World Darts Championship final in January\n", + "\n", + "Dart sensation Luke Littler has said he \"can't quite believe\" he has trended higher than the prime minister and the King in Google's most searched for lists in 2024. The 17-year-old star was an unknown when he came to prominence as the youngest player to reach the World Darts Championship final in January. He has subsequently risen to fourth in the world rankings and his fame has led him to lie behind only Catherine, Princess of Wales, and US president elect Donald Trump as Google's most searched for person in the UK in 2024. He has also taken the top slot as the most searched for athlete on the search engine, which he said was \"a proud moment\" in what had been \"an amazing year\".\n", + "\n", + "A peak TV audience of 3.7m watched the then-16-year-old's appearance in the final. He lost by seven sets to four to world number one Luke Humphries, but earned £200,000 as the runner-up. He beat Michael van Gerwen later in the same month to win the Bahrain Darts Masters and secure his first Professional Darts Corporation (PDC) senior title. 
The event also saw he become the youngest person to make a nine-dart finish on live television, which is considered one of the sport's highest achievements and sees a player score the required 501 in the lowest number of darts possible.\n", + "\n", + "Luke Littler said the award was a \"huge honour\"\n", + "\n", + "In May, Littler won the 2024 Premier League Darts, his first major PDC title, and in November, Littler won the Grand Slam of Darts for his first major ranking title. The corporation's statistics showed that after each win, there was increased interest in Littler online, with even his first round exit on his World Grand Prix debut in October appearing in searches.\n", + "\n", + "Littler, who plays under the nickname of The Nuke, said it had been \"an amazing year for me personally, and for the sport of darts as a whole\". \"To be recognised in two Year in Search lists is a huge honour,\" he said. \"I can't quite believe I'm trending higher than both the prime minister and the King in the 'People' category—and in a year of such great sporting achievements, it's a proud moment for me to be the top trending athlete in 2024.\"\n", + "\n", + "Google's most searched people in UK in 2024 Google's most searched for athletes in UK in 2024\n", + "\n", + "Google's Year in Search lists, external were also impacted by the announced return of rock superstars Oasis for a 2025 tour. The Mancunian legends topped the list of most searched for musicians, ahead of Sabrina Carpenter, One Direction, Dave Grohl and Raye, while \"how to get Oasis tickets\" was second only to \"how to vote in the UK\" in the list of searched questions. Matt Cooke from the Google News Initiative said 2024 had been \"a year of comebacks, curiosity, and community\". \"Whether it's fans reuniting for Oasis, young sports stars like Luke Littler making waves, or Brits voting in everything from elections to Eurovision, these searches show a nation full of passion and interest,\" he said. 
\"It's amazing to see what captivated the UK, and it's always a privilege to highlight these moments in our Year in Search.\"\n", + "--------------------------------------------------------------------------------\n", + "Distance: 0.6582, Text: Cross loses as record number of seeds out of Worlds\n", + "\n", + "Rob Cross has suffered three second-round exits in his eight World Championships\n", + "\n", + "Former champion Rob Cross became the latest high-profile casualty as a record-breaking 14th seed exited the PDC World Darts Championship in the second round. The number five seed was beaten 3-1 by close friend Scott Williams, who will face Germany's Ricardo Pietreczko in round three. Cross, who won the event on his debut in 2018, took the opening set but failed to reach anywhere near his best as he suffered his third second-round exit. He was joined by number six seed David Chisnall, who was beaten 3-2 in a sudden-death leg by Ricky Evans, who came into the tournament 46th in the PDC's Order of Merit. The 2021 semi-finalist won the opening set, but then found himself 2-1 down to an inspired Evans, who was cheered on relentlessly by the Alexandra Palace crowd. He forced the game into a deciding set and faced match dart but Evans missed bullseye by the width of the wire. Chisnall then missed his own match dart on double tops, before he made a miscalculation when attempting to checkout 139 at 5-4 down. No real harm was done with a sudden-death leg forced but he was unable to hold off Evans, who reaches the third round for the third time in the last five years. \"It's not even what it is, again I've played a world-class darts player. I've played quite well and won,\" Evans told Sky Sports. \"Look at this [the crowd], wow. I don't understand it, why are they cheering me on? \"I don't get this reception in my household. Thank you very much. 
You've made a very fat guy very happy.\" Evans will face unseeded Welshman Robert Owen when the third round starts after the three-day Christmas break.\n", + "\n", + "World youth champion Gian van Veen had become the 12th seed to be knocked out when he lost 3-1 to Pietreczko. The 28th seed lost the opening set, having missed nine darts at double, but levelled. However, the Dutchman was unable to match Pietreczko, who closed out a comfortable win with a checkout percentage of 55.6%. Pietreczko said: \"I am over the moon to win. It is very important for me to be in the third round after Christmas. I love the big stage.\" The 26th seed trailed 1-0 and 2-1, and both players went on to miss match darts, before Gurney won the final set 3-1 on legs.\n", + "\n", + "Jonny Clayton is into the third round of the PDC World Darts Championship for a sixth consecutive year\n", + "\n", + "In the afternoon session, Welsh number seven seed Jonny Clayton also needed sudden death to pull off a sensational final-set comeback against Mickey Mansell in. He was a leg away from defeat twice to his Northern Irish opponent, but came from behind to win the final set 6-5 in a sudden-death leg to win 3-2. Clayton, who will play Gurney in round three, lost the opening set of the match, but fought back to lead 2-1, before being pegged back again by 51-year-old Mansell, who then missed match darts on double tops in the deciding set. \"I was very emotional. I've got to be honest, that meant a lot,\" said Clayton, who is in the favourable half of the draw following shock second-round exits for former world champions Michael Smith and Gary Anderson. \"I had chances before and Mickey definitely had chances before. It wasn't great to play in, not the best - I wouldn't wish that on my worst enemy. \"There is a lot of weight off my shoulders after that. 
I know there is another gear or two in the bank, but I'll be honest that meant a lot to me, it is a tester and will try and make me believe again.\" Clayton was 2-0 down in the fifth set after consecutive 136 and 154 checkouts from Mansell, but won three legs on the trot in 15, 12 and 10 darts to wrestle a 3-2 lead. He missed three darts for the match, before his unseeded opponent held and broke Clayton's throw to lead 4-3. Mansell missed a match dart at double 20, before Clayton won on double five after two missed checkouts. Elsewhere, Northern Ireland's Josh Rock booked his place in the third round against England's Chris Dobey with a 3-0 win over Wales' Rhys Griffin. Martin Lukeman, runner-up to Luke Littler at the Grand Slam of Darts last month, is out after a 3-1 loss to number 21 seed Andrew Gilding. The final day before the Christmas break started with Poland's number 31 seed Krzysztof Ratajski recording a 3-1 win over Alexis Toylo of the Philippines.\n", + "\n", + "All times are GMT and subject to change. Two fourth-round matches will also be played\n", + "--------------------------------------------------------------------------------\n", + "Distance: 0.6872, Text: Michael van Gerwen has made just one major ranking event final in 2024\n", + "\n", + "Michael van Gerwen enjoyed a comfortable 3-0 victory over English debutant James Hurrell in his opening match of the PDC World Darts Championship. The three-time world champion has had a tough year by his standards, having fallen behind Luke Littler and Luke Humphries, so a relatively stress-free opening match at Alexandra Palace was just what was needed. Hurrell, 40, offered some resistance early on when taking the opening leg of the match, but he would win just two more as Van Gerwen proved far too strong. The third-seeded Dutchman averaged 94.85, took out two three-figure checkouts and hit 50% of his doubles - with six of his nine misses coming in one scrappy leg. 
Van Gerwen, 35, will now face either Brendan Dolan or Lok Yin Lee in the third round.\n", + "\n", + "\"I think I played OK,\" Van Gerwen told Sky Sports after his match. \"Of course, I was a bit nervous. Like everyone knows it's been a tough year for me. \"Overall, it was a good performance. I was confident. I won the game, that's the main thing.\" Also on Friday night, Germany's Florian Hempel showed why he loves playing on the Alexandra Palace stage with a thrilling 3-1 victory in a high-quality contest against Jeffrey de Zwaan. Both men hit seven 180s in a match played at a fast and furious pace, but 34-year-old Hempel's superior doubles gave him a fourth straight first-round victory in the competition. Hempel moves on to a tie with 26th seed Daryl Gurney but it was a damaging loss for De Zwaan, 28, who came through a late qualifier in November and needed a good run here to keep his PDC tour card for next season. Mickey Mansell earned a second-round date with world number seven Jonny Clayton after a scrappy 3-1 win over Japan's Tomoya Goto, while Dylan Slevin came through an all-Irish tie against William O'Connor to progress to a meeting with Dimitri van den Bergh.\n", + "\n", + "Stephen Bunting is in the third round of the PDC World Darts Championship for a third consecutive year\n", + "\n", + "In the afternoon session, Stephen Bunting came from behind to beat Kai Gotthardt 3-1 and book his place in the third round. Englishman Bunting, ranked eighth in the world, dropped the first set and almost went 2-0 down in the match before staging an impressive recovery. Tournament debutant Gotthardt missed three darts at double eight to win the second set, allowing Bunting to take out double 10 to level the match before powering away to victory by winning the third and fourth sets without losing a leg. Victory for \"The Bullet\" sets up a last 32 meeting with the winner of Dirk van Duijvenbode's meeting with Madars Razma after Christmas. 
Should Bunting progress further, he is seeded to face world number one and defending world champion Luke Humphries in the quarter-finals on New Year's Day. Elsewhere in Friday afternoon's session, the Dutch duo of Alexander Merkx and Wessel Nijman advanced to the second round with wins over Stephen Burton and Cameron Carolissen respectively. England's Ian White was handed a walkover victory against Sandro Eric Sosing of the Philippines. Sosing withdrew from the competition on medical grounds and was taken to hospital following chest pains.\n", + "--------------------------------------------------------------------------------\n", + "Distance: 0.7012, Text: Christian Kist was sealing his first televised nine-darter\n", + "\n", + "Christian Kist hit a nine-darter but lost his PDC World Championship first-round match to Madars Razma. The Dutchman became the first player to seal a perfect leg in the tournament since Michael Smith did so on the way to beating Michael van Gerwen in the 2023 final. Kist, the 2012 BDO world champion at Lakeside, collects £60,000 for the feat, with the same amount being awarded by sponsors to a charity and to one spectator inside Alexandra Palace in London. The 38-year-old's brilliant finish sealed the opening set, but his Latvian opponent bounced back to win 3-1. Darts is one of the few sports that can measure perfection; snooker has the 147 maximum break, golf has the hole-in-one, darts has the nine-dart finish. Kist scored two maximum 180s to leave a 141 checkout which he completed with a double 12, to the delight of more than 3,000 spectators. The English 12th seed, who has been troubled by wrist and back injuries, could next play Andrew Gilding in the third round - which begins on 27 December - should Gilding beat the winner of Martin Lukeman's match against qualifier Nitin Kumar. 
Aspinall faces a tough task to reach the last four again, with 2018 champion Rob Cross and 2024 runner-up Luke Littler both in his side of the draw.\n", + "\n", + "Kist - who was knocked out of last year's tournament by teenager Littler - will still earn a bigger cheque than he would have got for a routine run to the quarter-finals. His nine-darter was the 15th in the history of the championship and first since the greatest leg in darts history when Smith struck, moments after Van Gerwen just missed his attempt. Darts fan Kris, a railway worker from Sutton in south London, was the random spectator picked out to receive £60,000, with Prostate Cancer UK getting the same sum from tournament sponsors Paddy Power. \"I'm speechless to be honest. I didn't expect it to happen to me,\" Kris said. \"This was a birthday present so it makes it even better. My grandad got me tickets. It was just a normal day - I came here after work.\" Kist said: \"Hitting the double 12 felt amazing. It was a lovely moment for everyone and I hope Kris enjoys the money. Maybe I will go on vacation next month.\" Earlier, Jim Williams was favourite against Paolo Nebrida but lost 3-2 in an epic lasting more than an hour. The Filipino took a surprise 2-1 lead and Williams only went ahead for the first time in the opening leg of the deciding set. The Welshman looked on course for victory but missed five match darts. 
UK Open semi-finalist Ricky Evans set up a second-round match against Dave Chisnall, checking out on 109 to edge past Gordon Mathers 3-2.\n", + "--------------------------------------------------------------------------------\n" + ] + } + ], + "source": [ + "\n", + "query = \"What were Luke Littler's key achievements and records in his recent PDC World Championship match?\"\n", + "\n", + "try:\n", + " # Perform the semantic search\n", + " start_time = time.time()\n", + " search_results = vector_store.similarity_search_with_score(query, k=10)\n", + " search_elapsed_time = time.time() - start_time\n", + "\n", + " logging.info(f\"Semantic search completed in {search_elapsed_time:.2f} seconds\")\n", + "\n", + " # Display search results\n", + " print(f\"\\nSemantic Search Results (completed in {search_elapsed_time:.2f} seconds):\")\n", + " print(\"-\" * 80)\n", + "\n", + " for doc, score in search_results:\n", + " print(f\"Distance: {score:.4f}, Text: {doc.page_content}\")\n", + " print(\"-\" * 80)\n", + "\n", + "except CouchbaseException as e:\n", + " raise RuntimeError(f\"Error performing semantic search: {str(e)}\")\n", + "except Exception as e:\n", + " raise RuntimeError(f\"Unexpected error: {str(e)}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note: To create a COMPOSITE index, the below code can be used.\n", + "Choose based on your specific use case and query patterns. For this tutorial's news search scenario, either index type would work, but BHIVE might be more efficient for pure semantic search across news articles." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain_couchbase.vectorstores import IndexType\n", + "vector_store.create_index(index_type=IndexType.COMPOSITE, index_name=\"bedrock_composite_index\", index_description=\"IVF,SQ8\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Retrieval-Augmented Generation (RAG) with Couchbase and LangChain\n", + "Couchbase and LangChain can be seamlessly integrated to create RAG (Retrieval-Augmented Generation) chains, enhancing the process of generating contextually relevant responses. In this setup, Couchbase serves as the vector store, where embeddings of documents are stored. When a query is made, LangChain retrieves the most relevant documents from Couchbase by comparing the query’s embedding with the stored document embeddings. These documents, which provide contextual information, are then passed to a generative language model within LangChain.\n", + "\n", + "The language model, equipped with the context from the retrieved documents, generates a response that is both informed and contextually accurate. This integration allows the RAG chain to leverage Couchbase’s efficient storage and retrieval capabilities, while LangChain handles the generation of responses based on the context provided by the retrieved documents. Together, they create a powerful system that can deliver highly relevant and accurate answers by combining the strengths of both retrieval and generation." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025-09-02 12:25:08,521 - INFO - Successfully created RAG chain\n" + ] + } + ], + "source": [ + "# Create RAG prompt template\n", + "rag_prompt = ChatPromptTemplate.from_messages([\n", + " (\"system\", \"You are a helpful assistant that answers questions based on the provided context.\"),\n", + " (\"human\", \"Context: {context}\\n\\nQuestion: {question}\")\n", + "])\n", + "\n", + "# Create RAG chain\n", + "rag_chain = (\n", + " {\"context\": vector_store.as_retriever(), \"question\": RunnablePassthrough()}\n", + " | rag_prompt\n", + " | llm\n", + " | StrOutputParser()\n", + ")\n", + "logging.info(\"Successfully created RAG chain\")" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "RAG Response: \n", + "Luke Littler hit a tournament record 140.91 set average as he started his bid for the PDC World Championship title with a dramatic 3-1 win over Ryan Meikle. The 17-year-old made headlines around the world when he reached the tournament final in January, where he lost to Luke Humphries. Starting this campaign on Saturday, Littler was millimetres away from a nine-darter when he missed double 12 as he blew Meikle away in the fourth and final set of the second-round match. 
Littler was overcome with emotion at the end\n", + "RAG response generated in 0.41 seconds\n" + ] + } + ], + "source": [ + "start_time = time.time()\n", + "# Turn off excessive Logging \n", + "logging.basicConfig(level=logging.WARNING, format='%(asctime)s - %(levelname)s - %(message)s', force=True)\n", + "\n", + "try:\n", + " rag_response = rag_chain.invoke(query)\n", + " rag_elapsed_time = time.time() - start_time\n", + " print(f\"RAG Response: {rag_response}\")\n", + " print(f\"RAG response generated in {rag_elapsed_time:.2f} seconds\")\n", + "except InternalServerFailureException as e:\n", + " if \"query request rejected\" in str(e):\n", + " print(\"Error: Search request was rejected due to rate limiting. Please try again later.\")\n", + " else:\n", + " print(f\"Internal server error occurred: {str(e)}\")\n", + "except Exception as e:\n", + " print(f\"Unexpected error occurred: {str(e)}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Using Couchbase as a caching mechanism\n", + "Couchbase can be effectively used as a caching mechanism for RAG (Retrieval-Augmented Generation) responses by storing and retrieving precomputed results for specific queries. This approach enhances the system's efficiency and speed, particularly when dealing with repeated or similar queries. When a query is first processed, the RAG chain retrieves relevant documents, generates a response using the language model, and then stores this response in Couchbase, with the query serving as the key.\n", + "\n", + "For subsequent requests with the same query, the system checks Couchbase first. If a cached response is found, it is retrieved directly from Couchbase, bypassing the need to re-run the entire RAG process. This significantly reduces response time because the computationally expensive steps of document retrieval and response generation are skipped. 
Couchbase's role in this setup is to provide a fast and scalable storage solution for caching these responses, ensuring that frequently asked queries can be answered more quickly and efficiently.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Query 1: What happened in the match between Fullham and Liverpool?\n", + "Response: The match between Fullham and Liverpool ended in a 2-2 draw.\n", + "Time taken: 2.30 seconds\n", + "\n", + "Query 2: What were Luke Littler's key achievements and records in his recent PDC World Championship match?\n", + "Response: \n", + "Luke Littler hit a tournament record 140.91 set average as he started his bid for the PDC World Championship title with a dramatic 3-1 win over Ryan Meikle. The 17-year-old made headlines around the world when he reached the tournament final in January, where he lost to Luke Humphries. Starting this campaign on Saturday, Littler was millimetres away from a nine-darter when he missed double 12 as he blew Meikle away in the fourth and final set of the second-round match. 
Littler was overcome with emotion at the end\n", + "Time taken: 0.40 seconds\n", + "\n", + "Query 3: What happened in the match between Fullham and Liverpool?\n", + "Response: The match between Fullham and Liverpool ended in a 2-2 draw.\n", + "Time taken: 0.36 seconds\n" + ] + } + ], + "source": [ + "try:\n", + " queries = [\n", + " \"What happened in the match between Fullham and Liverpool?\",\n", + " \"What were Luke Littler's key achievements and records in his recent PDC World Championship match?\",\n", + " \"What happened in the match between Fullham and Liverpool?\", # Repeated query\n", + " ]\n", + "\n", + " for i, query in enumerate(queries, 1):\n", + " print(f\"\\nQuery {i}: {query}\")\n", + " start_time = time.time()\n", + "\n", + " response = rag_chain.invoke(query)\n", + " elapsed_time = time.time() - start_time\n", + " print(f\"Response: {response}\")\n", + " print(f\"Time taken: {elapsed_time:.2f} seconds\")\n", + "\n", + "except InternalServerFailureException as e:\n", + " if \"query request rejected\" in str(e):\n", + " print(\"Error: Search request was rejected due to rate limiting. Please try again later.\")\n", + " else:\n", + " print(f\"Internal server error occurred: {str(e)}\")\n", + "except Exception as e:\n", + " print(f\"Unexpected error occurred: {str(e)}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Conclusion\n", + "By following these steps, you'll have a fully functional semantic search engine that leverages the strengths of Couchbase and AWS Bedrock. This guide is designed not just to show you how to build the system, but also to explain why each step is necessary, giving you a deeper understanding of the principles behind semantic search and of how querying data efficiently with GSI can significantly improve your RAG performance.
Whether you're a newcomer to software development or an experienced developer looking to expand your skills, this guide will provide you with the knowledge and tools you need to create a powerful, AI-driven search engine." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python (jupyter_env)", + "language": "python", + "name": "jupyter_env" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.16" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/awsbedrock/gsi/frontmatter.md b/awsbedrock/gsi/frontmatter.md new file mode 100644 index 00000000..f3eaa925 --- /dev/null +++ b/awsbedrock/gsi/frontmatter.md @@ -0,0 +1,21 @@ +--- +# frontmatter +path: "/tutorial-aws-bedrock-couchbase-rag-with-global-secondary-index" +title: Retrieval-Augmented Generation (RAG) with Couchbase and Amazon Bedrock using GSI index +short_title: RAG with Couchbase and Amazon Bedrock using GSI index +description: + - Learn how to build a semantic search engine using Couchbase and Amazon Bedrock using GSI. + - This tutorial demonstrates how to integrate Couchbase's vector search capabilities with Amazon Bedrock's Titan embeddings and Claude language model. + - You'll understand how to perform Retrieval-Augmented Generation (RAG) using LangChain and Couchbase. 
+content_type: tutorial +filter: sdk +technology: + - vector search +tags: + - Artificial Intelligence + - LangChain + - Amazon Bedrock +sdk_language: + - python +length: 60 Mins +--- diff --git a/claudeai/.env.sample b/claudeai/fts/.env.sample similarity index 100% rename from claudeai/.env.sample rename to claudeai/fts/.env.sample diff --git a/claudeai/RAG_with_Couchbase_and_Claude(by_Anthropic).ipynb b/claudeai/fts/RAG_with_Couchbase_and_Claude(by_Anthropic).ipynb similarity index 99% rename from claudeai/RAG_with_Couchbase_and_Claude(by_Anthropic).ipynb rename to claudeai/fts/RAG_with_Couchbase_and_Claude(by_Anthropic).ipynb index ce1e0d79..02732b95 100644 --- a/claudeai/RAG_with_Couchbase_and_Claude(by_Anthropic).ipynb +++ b/claudeai/fts/RAG_with_Couchbase_and_Claude(by_Anthropic).ipynb @@ -7,7 +7,7 @@ }, "source": [ "# Introduction\n", - "In this guide, we will walk you through building a powerful semantic search engine using Couchbase as the backend database, [OpenAI](https://openai.com/) as the AI-powered embedding and [Anthropic](https://claude.ai/) as the language model provider. Semantic search goes beyond simple keyword matching by understanding the context and meaning behind the words in a query, making it an essential tool for applications that require intelligent information retrieval. This tutorial is designed to be beginner-friendly, with clear, step-by-step instructions that will equip you with the knowledge to create a fully functional semantic search system from scratch." + "In this guide, we will walk you through building a powerful semantic search engine using Couchbase as the backend database, [OpenAI](https://openai.com/) as the AI-powered embedding and [Anthropic](https://claude.ai/) as the language model provider. Semantic search goes beyond simple keyword matching by understanding the context and meaning behind the words in a query, making it an essential tool for applications that require intelligent information retrieval. 
This tutorial is designed to be beginner-friendly, with clear, step-by-step instructions that will equip you with the knowledge to create a fully functional semantic search system using the FTS service from scratch. Alternatively if you want to perform semantic search using the GSI index, please take a look at [this.](https://developer.couchbase.com/tutorial-openai-claude-couchbase-rag-with-global-secondary-index/)" ] }, { @@ -675,9 +675,8 @@ "\n", "This approach offers several benefits:\n", "1. Memory Efficiency: Processing in smaller batches prevents memory overload\n", - "2. Error Handling: If an error occurs, only the current batch is affected\n", - "3. Progress Tracking: Easier to monitor and track the ingestion progress\n", - "4. Resource Management: Better control over CPU and network resource utilization\n", + "2. Progress Tracking: Easier to monitor and track the ingestion progress\n", + "3. Resource Management: Better control over CPU and network resource utilization\n", "\n", "We use a conservative batch size of 100 to ensure reliable operation.\n", "The optimal batch size depends on many factors including:\n", @@ -769,8 +768,8 @@ "id": "uehAx36o9Rlm" }, "source": [ - "# Using the Claude 3.7 Sonnet Language Model (LLM)\n", - "Language models are AI systems that are trained to understand and generate human language. We'll be using the `Claude 3.7 Sonnet` language model to process user queries and generate meaningful responses. This model is a key component of our semantic search engine, allowing it to go beyond simple keyword matching and truly understand the intent behind a query. By creating this language model, we equip our search engine with the ability to interpret complex queries, understand the nuances of language, and provide more accurate and contextually relevant responses.\n", + "# Using the Claude Sonnet 4 Language Model (LLM)\n", + "Language models are AI systems that are trained to understand and generate human language.
We'll be using the `Claude Sonnet 4` language model to process user queries and generate meaningful responses. This model is a key component of our semantic search engine, allowing it to go beyond simple keyword matching and truly understand the intent behind a query. By creating this language model, we equip our search engine with the ability to interpret complex queries, understand the nuances of language, and provide more accurate and contextually relevant responses.\n", "\n", "The language model's ability to understand context and generate coherent responses is what makes our search engine truly intelligent. It can not only find the right information but also present it in a way that is useful and understandable to the user.\n", "\n" @@ -778,7 +777,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -797,7 +796,7 @@ ], "source": [ "try:\n", - " llm = ChatAnthropic(temperature=0.1, anthropic_api_key=ANTHROPIC_API_KEY, model_name='claude-3-7-sonnet-20250219') \n", + " llm = ChatAnthropic(temperature=0.1, anthropic_api_key=ANTHROPIC_API_KEY, model_name='claude-sonnet-4-20250514') \n", " logging.info(\"Successfully created ChatAnthropic\")\n", "except Exception as e:\n", " logging.error(f\"Error creating ChatAnthropic: {str(e)}.
Please check your API key and network connection.\")\n", diff --git a/claudeai/claude_index.json b/claudeai/fts/claude_index.json similarity index 100% rename from claudeai/claude_index.json rename to claudeai/fts/claude_index.json diff --git a/claudeai/frontmatter.md b/claudeai/fts/frontmatter.md similarity index 72% rename from claudeai/frontmatter.md rename to claudeai/fts/frontmatter.md index 8b96a208..4df5ac11 100644 --- a/claudeai/frontmatter.md +++ b/claudeai/fts/frontmatter.md @@ -1,10 +1,10 @@ --- # frontmatter -path: "/tutorial-openai-claude-couchbase-rag" -title: Retrieval-Augmented Generation (RAG) with Couchbase, OpenAI, and Claude -short_title: RAG with Couchbase, OpenAI, and Claude +path: "/tutorial-openai-claude-couchbase-rag-with-fts" +title: Retrieval-Augmented Generation (RAG) with Couchbase, OpenAI, and Claude using FTS service +short_title: RAG with Couchbase, OpenAI, and Claude using FTS service description: - - Learn how to build a semantic search engine using Couchbase, OpenAI embeddings, and Anthropic's Claude. + - Learn how to build a semantic search engine using Couchbase, OpenAI embeddings, and Anthropic's Claude using FTS service. - This tutorial demonstrates how to integrate Couchbase's vector search capabilities with OpenAI embeddings and use Claude as the language model. - You'll understand how to perform Retrieval-Augmented Generation (RAG) using LangChain and Couchbase. 
content_type: tutorial diff --git a/claudeai/gsi/.env.sample b/claudeai/gsi/.env.sample new file mode 100644 index 00000000..a8f2bbb8 --- /dev/null +++ b/claudeai/gsi/.env.sample @@ -0,0 +1,11 @@ +ANTHROPIC_API_KEY = "" +OPENAI_API_KEY = "" + +CB_HOST="" +CB_USERNAME="" +CB_PASSWORD="" +CB_BUCKET_NAME="" + +SCOPE_NAME="" +COLLECTION_NAME="" +CACHE_COLLECTION="" \ No newline at end of file diff --git a/claudeai/gsi/RAG_with_Couchbase_and_Claude(by_Anthropic).ipynb b/claudeai/gsi/RAG_with_Couchbase_and_Claude(by_Anthropic).ipynb new file mode 100644 index 00000000..8f58d252 --- /dev/null +++ b/claudeai/gsi/RAG_with_Couchbase_and_Claude(by_Anthropic).ipynb @@ -0,0 +1,3317 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "kNdImxzypDlm" + }, + "source": [ + "# Introduction\n", + "In this guide, we will walk you through building a powerful semantic search engine using Couchbase as the backend database, [OpenAI](https://openai.com/) as the AI-powered embedding and [Anthropic](https://claude.ai/) as the language model provider. Semantic search goes beyond simple keyword matching by understanding the context and meaning behind the words in a query, making it an essential tool for applications that require intelligent information retrieval. This tutorial is designed to be beginner-friendly, with clear, step-by-step instructions that will equip you with the knowledge to create a fully functional semantic search system using GSI (Global Secondary Index) from scratch. Alternatively, if you want to perform semantic search using the FTS index, please take a look at [this tutorial](https://developer.couchbase.com/tutorial-openai-claude-couchbase-rag-with-fts/)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# How to run this tutorial\n", + "\n", + "This tutorial is available as a Jupyter Notebook (`.ipynb` file) that you can run interactively.
You can access the original notebook [here](https://github.com/couchbase-examples/vector-search-cookbook/blob/main/claudeai/gsi/RAG_with_Couchbase_and_Claude(by_Anthropic).ipynb).\n", + "\n", + "You can either download the notebook file and run it on [Google Colab](https://colab.research.google.com/) or run it on your system by setting up the Python environment." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Before you start\n", + "\n", + "## Get Credentials for OpenAI and Anthropic\n", + "\n", + "* Please follow the [instructions](https://platform.openai.com/docs/quickstart) to generate the OpenAI credentials.\n", + "* Please follow the [instructions](https://docs.anthropic.com/en/api/getting-started) to generate the Anthropic credentials.\n", + "\n", + "## Create and Deploy Your Free Tier Operational cluster on Capella\n", + "\n", + "To get started with Couchbase Capella, create an account and use it to deploy a forever free tier operational cluster. This account provides you with an environment where you can explore and learn about Capella with no time constraint.\n", + "\n", + "To learn more, please follow the [instructions](https://docs.couchbase.com/cloud/get-started/create-account.html).\n", + "\n", + "Note: To run this tutorial, you will need Capella with Couchbase Server version 8.0 or above as GSI vector search is supported only from version 8.0.\n", + "\n", + "### Couchbase Capella Configuration\n", + "\n", + "When running Couchbase using [Capella](https://cloud.couchbase.com/sign-in), the following prerequisites need to be met.\n", + "\n", + "* Create the [database credentials](https://docs.couchbase.com/cloud/clusters/manage-database-users.html) to access the required bucket (Read and Write) used in the application.\n", + "\n", + "* [Allow access](https://docs.couchbase.com/cloud/clusters/allow-ip-address.html) to the Cluster from the IP on which the application is running."
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "NH2o6pqa69oG" + }, + "source": [ + "# Setting the Stage: Installing Necessary Libraries\n", + "To build our semantic search engine, we need a robust set of tools. The libraries we install handle everything from connecting to databases to performing complex machine learning tasks. Each library has a specific role: Couchbase libraries manage database operations, LangChain handles AI model integrations, and OpenAI provides advanced AI models for generating embeddings and Claude(by Anthropic) for understanding natural language. By setting up these libraries, we ensure our environment is equipped to handle the data-intensive and computationally complex tasks required for semantic search." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "DYhPj0Ta8l_A", + "outputId": "6af482de-ca00-4a67-abb2-d5fac8bd5818" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], + "source": [ + "%pip install --quiet datasets==3.5.0 langchain-couchbase==0.5.0rc1 langchain-anthropic==0.3.19 langchain-openai==0.3.32 python-dotenv==1.1.1" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "1pp7GtNg8mB9" + }, + "source": [ + "# Importing Necessary Libraries\n", + "The script starts by importing a series of libraries required for various tasks, including handling JSON, logging, time tracking, Couchbase connections, embedding generation, and dataset loading. These libraries provide essential functions for working with data, managing database connections, and processing machine learning models." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "id": "8GzS6tfL8mFP" + }, + "outputs": [], + "source": [ + "import getpass\n", + "import json\n", + "import logging\n", + "import os\n", + "import time\n", + "from datetime import timedelta\n", + "from multiprocessing import AuthenticationError\n", + "\n", + "from couchbase.auth import PasswordAuthenticator\n", + "from couchbase.cluster import Cluster\n", + "from couchbase.exceptions import (CouchbaseException,\n", + " InternalServerFailureException,\n", + " QueryIndexAlreadyExistsException,\n", + " ServiceUnavailableException)\n", + "from couchbase.management.buckets import CreateBucketSettings\n", + "from couchbase.management.search import SearchIndex\n", + "from couchbase.options import ClusterOptions\n", + "from datasets import load_dataset\n", + "from dotenv import load_dotenv\n", + "from langchain_anthropic import ChatAnthropic\n", + "from langchain_core.globals import set_llm_cache\n", + "from langchain_core.prompts.chat import (ChatPromptTemplate,\n", + " HumanMessagePromptTemplate,\n", + " SystemMessagePromptTemplate)\n", + "from langchain_core.runnables import RunnablePassthrough\n", + "from langchain_couchbase.cache import CouchbaseCache\n", + "from langchain_couchbase.vectorstores import CouchbaseQueryVectorStore\n", + "from langchain_couchbase.vectorstores import DistanceStrategy\n", + "from langchain_openai import OpenAIEmbeddings\n", + "from langchain_couchbase.vectorstores import IndexType" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "pBnMp5vb8mIb" + }, + "source": [ + "# Setup Logging\n", + "Logging is configured to track the progress of the script and capture any errors or warnings. This is crucial for debugging and understanding the flow of execution. 
The logging output includes timestamps, log levels (e.g., INFO, ERROR), and messages that describe what is happening in the script.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "id": "Yv8kWcuf8mLx" + }, + "outputs": [], + "source": [ + "logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', force=True)\n", + "\n", + "# Disable all logging except critical to prevent OpenAI API request logs\n", + "logging.getLogger(\"httpx\").setLevel(logging.CRITICAL)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "K9G5a0en8mPA" + }, + "source": [ + "# Loading Sensitive Information\n", + "In this section, we prompt the user to input essential configuration settings needed. These settings include sensitive information like API keys, database credentials, and specific configuration names. Instead of hardcoding these details into the script, we request the user to provide them at runtime, ensuring flexibility and security.\n", + "\n", + "The script also validates that all required inputs are provided, raising an error if any crucial information is missing. This approach ensures that your integration is both secure and correctly configured without hardcoding sensitive information, enhancing the overall security and maintainability of your code."
+ ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "PFGyHll18mSe", + "outputId": "421e1142-85dd-4a95-b0ad-cb16430c6dcf" + }, + "outputs": [], + "source": [ + "load_dotenv()\n", + "\n", + "# Load from environment variables or prompt for input in one-liners\n", + "ANTHROPIC_API_KEY = os.getenv('ANTHROPIC_API_KEY') or getpass.getpass('Enter your Anthropic API key: ')\n", + "OPENAI_API_KEY = os.getenv('OPENAI_API_KEY') or getpass.getpass('Enter your OpenAI API key: ')\n", + "CB_HOST = os.getenv('CB_HOST', 'couchbase://localhost') or input('Enter your Couchbase host (default: couchbase://localhost): ') or 'couchbase://localhost'\n", + "CB_USERNAME = os.getenv('CB_USERNAME', 'Administrator') or input('Enter your Couchbase username (default: Administrator): ') or 'Administrator'\n", + "CB_PASSWORD = os.getenv('CB_PASSWORD', 'password') or getpass.getpass('Enter your Couchbase password (default: password): ') or 'password'\n", + "CB_BUCKET_NAME = os.getenv('CB_BUCKET_NAME', 'query-vector-search-testing') or input('Enter your Couchbase bucket name (default: query-vector-search-testing): ') or 'query-vector-search-testing'\n", + "SCOPE_NAME = os.getenv('SCOPE_NAME', 'shared') or input('Enter your scope name (default: shared): ') or 'shared'\n", + "COLLECTION_NAME = os.getenv('COLLECTION_NAME', 'claude') or input('Enter your collection name (default: claude): ') or 'claude'\n", + "CACHE_COLLECTION = os.getenv('CACHE_COLLECTION', 'cache') or input('Enter your cache collection name (default: cache): ') or 'cache'\n", + "# Check if the variables are correctly loaded\n", + "if not ANTHROPIC_API_KEY:\n", + " raise ValueError(\"ANTHROPIC_API_KEY is not set in the environment.\")\n", + "if not OPENAI_API_KEY:\n", + " raise ValueError(\"OPENAI_API_KEY is not set in the environment.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "qtGrYzUY8mV3" + }, + "source": [ + "# 
Connecting to the Couchbase Cluster\n", + "Connecting to a Couchbase cluster is the foundation of our project. Couchbase will serve as our primary data store, handling all the storage and retrieval operations required for our semantic search engine. By establishing this connection, we enable our application to interact with the database, allowing us to perform operations such as storing embeddings, querying data, and managing collections. This connection is the gateway through which all data will flow, so ensuring it's set up correctly is paramount.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Zb3kK-7W8mZK", + "outputId": "0c574ca7-9e21-4c74-b44f-d17b3667cd76" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025-09-09 12:15:22,899 - INFO - Successfully connected to Couchbase\n" + ] + } + ], + "source": [ + "try:\n", + " auth = PasswordAuthenticator(CB_USERNAME, CB_PASSWORD)\n", + " options = ClusterOptions(auth)\n", + " cluster = Cluster(CB_HOST, options)\n", + " cluster.wait_until_ready(timedelta(seconds=5))\n", + " logging.info(\"Successfully connected to Couchbase\")\n", + "except Exception as e:\n", + " raise ConnectionError(f\"Failed to connect to Couchbase: {str(e)}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "C_Gpy32N8mcZ" + }, + "source": [ + "## Setting Up Collections in Couchbase\n", + "\n", + "The setup_collection() function handles creating and configuring the hierarchical data organization in Couchbase:\n", + "\n", + "1. Bucket Creation:\n", + " - Checks if specified bucket exists, creates it if not\n", + " - Sets bucket properties like RAM quota (1024MB) and replication (disabled)\n", + " - Note: You will not be able to create a bucket on Capella\n", + "\n", + "\n", + "2. 
Scope Management: \n", + " - Verifies if requested scope exists within bucket\n", + " - Creates new scope if needed (unless it's the default \"_default\" scope)\n", + "\n", + "3. Collection Setup:\n", + " - Checks for collection existence within scope\n", + " - Creates collection if it doesn't exist\n", + " - Waits 2 seconds for collection to be ready\n", + "\n", + "Additional Tasks:\n", + "- Clears any existing documents for clean state\n", + "- Implements comprehensive error handling and logging\n", + "\n", + "The function is called twice to set up:\n", + "1. Main collection for vector embeddings\n", + "2. Cache collection for storing results\n" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "ACZcwUnG8mf2", + "outputId": "a0db89a6-c9bc-4e26-ae37-d53c0700024b" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025-09-09 12:15:26,795 - INFO - Bucket 'query-vector-search-testing' exists.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025-09-09 12:15:26,808 - INFO - Collection 'claude' does not exist. Creating it...\n", + "2025-09-09 12:15:26,854 - INFO - Collection 'claude' created successfully.\n", + "2025-09-09 12:15:29,065 - INFO - All documents cleared from the collection.\n", + "2025-09-09 12:15:29,066 - INFO - Bucket 'query-vector-search-testing' exists.\n", + "2025-09-09 12:15:29,074 - INFO - Collection 'cache' already exists. 
Skipping creation.\n", + "2025-09-09 12:15:31,115 - INFO - All documents cleared from the collection.\n" + ] + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def setup_collection(cluster, bucket_name, scope_name, collection_name):\n", + " try:\n", + " # Check if bucket exists, create if it doesn't\n", + " try:\n", + " bucket = cluster.bucket(bucket_name)\n", + " logging.info(f\"Bucket '{bucket_name}' exists.\")\n", + " except Exception as e:\n", + " logging.info(f\"Bucket '{bucket_name}' does not exist. Creating it...\")\n", + " bucket_settings = CreateBucketSettings(\n", + " name=bucket_name,\n", + " bucket_type='couchbase',\n", + " ram_quota_mb=1024,\n", + " flush_enabled=True,\n", + " num_replicas=0\n", + " )\n", + " cluster.buckets().create_bucket(bucket_settings)\n", + " time.sleep(2) # Wait for bucket creation to complete and become available\n", + " bucket = cluster.bucket(bucket_name)\n", + " logging.info(f\"Bucket '{bucket_name}' created successfully.\")\n", + "\n", + " bucket_manager = bucket.collections()\n", + "\n", + " # Check if scope exists, create if it doesn't\n", + " scopes = bucket_manager.get_all_scopes()\n", + " scope_exists = any(scope.name == scope_name for scope in scopes)\n", + " \n", + " if not scope_exists and scope_name != \"_default\":\n", + " logging.info(f\"Scope '{scope_name}' does not exist. Creating it...\")\n", + " bucket_manager.create_scope(scope_name)\n", + " logging.info(f\"Scope '{scope_name}' created successfully.\")\n", + "\n", + " # Check if collection exists, create if it doesn't\n", + " collections = bucket_manager.get_all_scopes()\n", + " collection_exists = any(\n", + " scope.name == scope_name and collection_name in [col.name for col in scope.collections]\n", + " for scope in collections\n", + " )\n", + "\n", + " if not collection_exists:\n", + " logging.info(f\"Collection '{collection_name}' does not exist. 
Creating it...\")\n", + " bucket_manager.create_collection(scope_name, collection_name)\n", + " logging.info(f\"Collection '{collection_name}' created successfully.\")\n", + " else:\n", + " logging.info(f\"Collection '{collection_name}' already exists. Skipping creation.\")\n", + "\n", + " # Wait for collection to be ready\n", + " collection = bucket.scope(scope_name).collection(collection_name)\n", + " time.sleep(2) # Give the collection time to be ready for queries\n", + "\n", + " # Clear all documents in the collection\n", + " try:\n", + " query = f\"DELETE FROM `{bucket_name}`.`{scope_name}`.`{collection_name}`\"\n", + " cluster.query(query).execute()\n", + " logging.info(\"All documents cleared from the collection.\")\n", + " except Exception as e:\n", + " logging.warning(f\"Error while clearing documents: {str(e)}. The collection might be empty.\")\n", + "\n", + " return collection\n", + " except Exception as e:\n", + " raise RuntimeError(f\"Error setting up collection: {str(e)}\")\n", + " \n", + "setup_collection(cluster, CB_BUCKET_NAME, SCOPE_NAME, COLLECTION_NAME)\n", + "setup_collection(cluster, CB_BUCKET_NAME, SCOPE_NAME, CACHE_COLLECTION)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "7FvxRsg38m3G" + }, + "source": [ + "# Creating OpenAI Embeddings\n", + "Embeddings are at the heart of semantic search. They are numerical representations of text that capture the semantic meaning of the words and phrases. Unlike traditional keyword-based search, which looks for exact matches, embeddings allow our search engine to understand the context and nuances of language, enabling it to retrieve documents that are semantically similar to the query, even if they don't contain the exact keywords. By creating embeddings using OpenAI, we equip our search engine with the ability to understand and process natural language in a way that's much closer to how humans understand language. 
This step transforms our raw text data into a format that the search engine can use to find and rank relevant documents.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "_75ZyCRh8m6m", + "outputId": "711f7aa9-4a1b-4252-b64c-7bb5cfbafa2f" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025-09-09 12:15:54,388 - INFO - Successfully created OpenAIEmbeddings\n" + ] + } + ], + "source": [ + "try:\n", + " embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY, model='text-embedding-3-small')\n", + " logging.info(\"Successfully created OpenAIEmbeddings\")\n", + "except Exception as e:\n", + " raise ValueError(f\"Error creating OpenAIEmbeddings: {str(e)}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8IwZMUnF8m-N" + }, + "source": [ + "# Setting Up the Couchbase Query Vector Store\n", + "A vector store is where we'll keep our embeddings. The query vector store is specifically designed to handle embeddings and perform similarity searches. When a user inputs a query, the query is converted into an embedding using the configured embedding model and then compared against the embeddings stored in the vector store. This allows the engine to find documents that are semantically similar to the query, even if they don't contain the exact same words. By setting up the vector store in Couchbase, we create a powerful tool that enables us to understand and retrieve information based on the meaning and context of the query, rather than just the specific words used.\n", + "\n", + "The vector store requires a distance metric to determine how similarity between vectors is calculated. This is crucial for accurate semantic search results as different distance metrics can yield different similarity rankings. Some of the supported distance strategies are dot, l2, euclidean, cosine, l2_squared, euclidean_squared.
In our implementation we will use cosine which is particularly effective for text embeddings." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "DwIJQjYT9RV_", + "outputId": "6aae222c-65d2-48d7-ce0a-17291e8e06b6" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025-09-09 12:16:02,578 - INFO - Successfully created vector store\n" + ] + } + ], + "source": [ + "try:\n", + " vector_store = CouchbaseQueryVectorStore(\n", + " cluster=cluster,\n", + " bucket_name=CB_BUCKET_NAME,\n", + " scope_name=SCOPE_NAME,\n", + " collection_name=COLLECTION_NAME,\n", + " embedding = embeddings,\n", + " distance_metric=DistanceStrategy.COSINE\n", + " )\n", + " logging.info(\"Successfully created vector store\")\n", + "except Exception as e:\n", + " raise ValueError(f\"Failed to create vector store: {str(e)}\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Load the BBC News Dataset\n", + "To build a search engine, we need data to search through. We use the BBC News dataset from RealTimeData, which provides real-world news articles. This dataset contains news articles from BBC covering various topics and time periods. Loading the dataset is a crucial step because it provides the raw material that our search engine will work with. The quality and diversity of the news articles make it an excellent choice for testing and refining our search engine, ensuring it can handle real-world news content effectively.\n", + "\n", + "The BBC News dataset allows us to work with authentic news articles, enabling us to build and test a search engine that can effectively process and retrieve relevant news content. The dataset is loaded using the Hugging Face datasets library, specifically accessing the \"RealTimeData/bbc_news_alltime\" dataset with the \"2024-12\" version." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025-09-09 12:16:16,461 - INFO - Successfully loaded the BBC News dataset with 2687 rows.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Loaded the BBC News dataset with 2687 rows\n" + ] + } + ], + "source": [ + "try:\n", + " news_dataset = load_dataset(\n", + " \"RealTimeData/bbc_news_alltime\", \"2024-12\", split=\"train\"\n", + " )\n", + " print(f\"Loaded the BBC News dataset with {len(news_dataset)} rows\")\n", + " logging.info(f\"Successfully loaded the BBC News dataset with {len(news_dataset)} rows.\")\n", + "except Exception as e:\n", + " raise ValueError(f\"Error loading the BBC News dataset: {str(e)}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Cleaning up the Data\n", + "We will use the content of the news articles for our RAG system.\n", + "\n", + "The dataset contains a few duplicate records. We are removing them to avoid duplicate results in the retrieval stage of our RAG system." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "We have 1749 unique articles in our database.\n" + ] + } + ], + "source": [ + "news_articles = news_dataset[\"content\"]\n", + "unique_articles = set()\n", + "for article in news_articles:\n", + " if article:\n", + " unique_articles.add(article)\n", + "unique_news_articles = list(unique_articles)\n", + "print(f\"We have {len(unique_news_articles)} unique articles in our database.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Saving Data to the Vector Store\n", + "To efficiently handle the large number of articles, we process them in batches rather than all at once.
This batch processing approach helps manage memory usage and provides better control over the ingestion process.\n", + "\n", + "We first filter out any articles that exceed 50,000 characters to avoid potential issues with token limits. Then, using the vector store's add_texts method, we add the filtered articles to our vector database. The batch_size parameter controls how many articles are processed in each iteration.\n", + "\n", + "This approach offers several benefits:\n", + "1. Memory Efficiency: Processing in smaller batches prevents memory overload\n", + "2. Progress Tracking: Easier to monitor and track the ingestion progress\n", + "3. Resource Management: Better control over CPU and network resource utilization\n", + "\n", + "We use a conservative batch size of 100 to ensure reliable operation.\n", + "The optimal batch size depends on many factors including:\n", + "- Document sizes being inserted\n", + "- Available system resources\n", + "- Network conditions\n", + "- Concurrent workload\n", + "\n", + "Consider measuring performance with your specific workload before adjusting.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025-09-09 12:18:40,320 - INFO - Document ingestion completed successfully.\n" + ] + } + ], + "source": [ + "batch_size = 100\n", + "\n", + "# Automatic Batch Processing\n", + "articles = [article for article in unique_news_articles if article and len(article) <= 50000]\n", + "\n", + "try:\n", + " vector_store.add_texts(\n", + " texts=articles,\n", + " batch_size=batch_size\n", + " )\n", + " logging.info(\"Document ingestion completed successfully.\")\n", + "except Exception as e:\n", + " raise ValueError(f\"Failed to save documents to vector store: {str(e)}\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8Pn8-dQw9RfQ" + }, + "source": [ + "# Setting Up a Couchbase Cache\n", + "To further optimize our 
system, we set up a Couchbase-based cache. A cache is a temporary storage layer that holds data that is frequently accessed, speeding up operations by reducing the need to repeatedly retrieve the same information from the database. In our setup, the cache will help us accelerate repetitive tasks, such as looking up similar documents. By implementing a cache, we enhance the overall performance of our search engine, ensuring that it can handle high query volumes and deliver results quickly.\n", + "\n", + "Caching is particularly valuable in scenarios where users may submit similar queries multiple times or where certain pieces of information are frequently requested. By storing these in a cache, we can significantly reduce the time it takes to respond to these queries, improving the user experience.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "V2y7dyjf9Rid", + "outputId": "5e342184-2fb1-44d3-90c1-cf45b3ca97f3" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025-09-09 12:18:47,269 - INFO - Successfully created cache\n" + ] + } + ], + "source": [ + "try:\n", + " cache = CouchbaseCache(\n", + " cluster=cluster,\n", + " bucket_name=CB_BUCKET_NAME,\n", + " scope_name=SCOPE_NAME,\n", + " collection_name=CACHE_COLLECTION,\n", + " )\n", + " logging.info(\"Successfully created cache\")\n", + " set_llm_cache(cache)\n", + "except Exception as e:\n", + " raise ValueError(f\"Failed to create cache: {str(e)}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "uehAx36o9Rlm" + }, + "source": [ + "# Using the Claude 4 Sonnet Language Model (LLM)\n", + "Language models are AI systems that are trained to understand and generate human language. We'll be using the `Claude 4 Sonnet` language model to process user queries and generate meaningful responses. 
This model is a key component of our semantic search engine, allowing it to go beyond simple keyword matching and truly understand the intent behind a query. By creating this language model, we equip our search engine with the ability to interpret complex queries, understand the nuances of language, and provide more accurate and contextually relevant responses.\n", + "\n", + "The language model's ability to understand context and generate coherent responses is what makes our search engine truly intelligent. It can not only find the right information but also present it in a way that is useful and understandable to the user.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "yRAfBRLH9RpO", + "outputId": "4b48b13f-ee7b-4f73-ac7a-db9d3da3f094" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025-09-09 12:20:36,212 - INFO - Successfully created ChatAnthropic\n" + ] + } + ], + "source": [ + "try:\n", + " llm = ChatAnthropic(temperature=0.1, anthropic_api_key=ANTHROPIC_API_KEY, model_name='claude-sonnet-4-20250514') \n", + " logging.info(\"Successfully created ChatAnthropic\")\n", + "except Exception as e:\n", + " logging.error(f\"Error creating ChatAnthropic: {str(e)}. Please check your API key and network connection.\")\n", + " raise" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "k_XDfCx19UvG" + }, + "source": [ + "# Perform Semantic Search\n", + "Semantic search in Couchbase involves converting queries and documents into vector representations using an embeddings model. These vectors capture the semantic meaning of the text and are stored directly in Couchbase. When a query is made, Couchbase performs a similarity search by comparing the query vector against the stored document vectors. 
The similarity metric used for this comparison is configurable, allowing flexibility in how the relevance of documents is determined. Common metrics include cosine similarity, Euclidean distance, or dot product, but other metrics can be implemented based on specific use cases. Different embedding models like BERT, Word2Vec, or GloVe can also be used depending on the application's needs, with the vectors generated by these models stored and searched within Couchbase itself.\n", + "\n", + "In the provided code, the search process begins by recording the start time, followed by executing the `similarity_search_with_score` method of the `CouchbaseQueryVectorStore`. This method searches Couchbase for the most relevant documents based on the vector similarity to the query. The search results include the document content and the distance that reflects how closely each document aligns with the query in the defined semantic space. The time taken to perform this search is then calculated and logged, and the results are displayed, showing the most relevant documents along with their similarity scores. This approach leverages Couchbase as both a storage and retrieval engine for vector data, enabling efficient and scalable semantic searches. The integration of vector storage and search capabilities within Couchbase allows for sophisticated semantic search operations without relying on external services for vector storage or comparison." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Pk-oFbnC9Uym", + "outputId": "2a0ee48a-dc92-4d06-86d2-ae479caf5250" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025-09-09 12:21:34,292 - INFO - Semantic search completed in 1.91 seconds\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Semantic Search Results (completed in 1.91 seconds):\n", + "--------------------------------------------------------------------------------\n", + "Score: 0.2502, Text: A map shown during the draw for the 2026 Fifa World Cup has been criticised by Ukraine as an \"unacceptable error\" after it appeared to exclude Crimea as part of the country. The graphic - showing countries that cannot be drawn to play each other for geopolitical reasons - highlighted Ukraine but did not include the peninsula that is internationally recognised to be part of it. Crimea has been under Russian occupation since 2014 and just a handful of countries recognise the peninsula as Russian territory. Ukraine Foreign Ministry spokesman Heorhiy Tykhy said that the nation expects \"a public apology\". Fifa said it was \"aware of an issue\" and the image had been removed.\n", + "\n", + "Writing on X, Tykhy said that Fifa had not only \"acted against international law\" but had also \"supported Russian propaganda, war crimes, and the crime of aggression against Ukraine\". He added a \"fixed\" version of the map to his post, highlighting Crimea as part of Ukraine's territory. Among the countries that cannot play each other are Ukraine and Belarus, Spain and Gibraltar and Kosovo versus either Bosnia and Herzegovina or Serbia.\n", + "\n", + "This Twitter post cannot be displayed in your browser. Please enable Javascript or try a different browser. View original content on Twitter The BBC is not responsible for the content of external sites. 
Skip twitter post by Heorhii Tykhyi This article contains content provided by Twitter. We ask for your permission before anything is loaded, as they may be using cookies and other technologies. You may want to read Twitter’s cookie policy, external and privacy policy, external before accepting. To view this content choose ‘accept and continue’. The BBC is not responsible for the content of external sites.\n", + "\n", + "The Ukrainian Football Association has also sent a letter to Fifa secretary-general Mathias Grafström and UEFA secretary-general Theodore Theodoridis over the matter. \"We appeal to you to express our deep concern about the infographic map [shown] on December 13, 2024,\" the letter reads. \"Taking into account a number of official decisions and resolutions adopted by the Fifa Council and the UEFA executive committee since 2014... we emphasize that today's version of the cartographic image of Ukraine... is completely unacceptable and looks like an inconsistent position of Fifa and UEFA.\" The 2026 World Cup will start on 11 June that year in Mexico City and end on 19 July in New Jersey. The expanded 48-team tournament will last a record 39 days. Ukraine were placed in Group D alongside Iceland, Azerbaijan and the yet-to-be-determined winners of France's Nations League quarter-final against Croatia.\n", + "--------------------------------------------------------------------------------\n", + "Score: 0.5698, Text: Defending champions Manchester City will face Juventus in the group stage of the Fifa Club World Cup next summer, while Chelsea meet Brazilian side Flamengo. Pep Guardiola's City, who beat Brazilian side Fluminense to win the tournament for the first time in 2023, begin their title defence against Morocco's Wydad and also play Al Ain of the United Arab Emirates in Group G. Chelsea, winners of the 2021 final, were also drawn alongside Mexico's Club Leon and Tunisian side Esperance Sportive de Tunisie in Group D. 
The revamped Fifa Club World Cup, which has been expanded to 32 teams, will take place in the United States between 15 June and 13 July next year.\n", + "\n", + "A complex and lengthy draw ceremony was held across two separate Miami locations and lasted more than 90 minutes, during which a new Club World Cup trophy was revealed. There was also a video message from incoming US president Donald Trump, whose daughter Ivanka drew the first team. Lionel Messi's Inter Miami will take on Egyptian side Al Ahly at the Hard Rock Stadium in the opening match, staged in Miami. Elsewhere, Paris St-Germain were drawn against Atletico Madrid in Group B, while Bayern Munich meet Benfica in another all-European group-stage match-up. Teams will play each other once in the group phase and the top two will progress to the knockout stage.\n", + "\n", + "This video can not be played To play this video you need to enable JavaScript in your browser. What is the Club World Cup?\n", + "\n", + "Teams from each of the six international football confederations will be represented at next summer's tournament, including 12 European clubs - the highest quota of any confederation. The European places were decided by clubs' Champions League performances over the past four seasons, with recent winners Chelsea, Manchester City and Real Madrid guaranteed places. Al Ain, the most successful club in the UAE with 14 league titles, are owned by the country's president Sheikh Mohamed bin Zayed Al Nahyan - the older brother of City owner Sheikh Mansour. Real, who lifted the Fifa Club World Cup trophy for a record-extending fifth time in 2022, will open up against Saudi Pro League champions Al-Hilal, who currently have Neymar in their ranks. One place was reserved for a club from the host nation, which Fifa controversially awarded to Inter Miami, who will contest the tournament curtain-raiser. 
Messi's side were winners of the regular-season MLS Supporters' Shield but beaten in the MLS play-offs, meaning they are not this season's champions.\n", + "• None How does the new Club World Cup work & why is it so controversial?\n", + "\n", + "Matches will be played across 12 venues in the US which, alongside Canada and Mexico, also host the 2026 World Cup. Fifa is facing legal action from player unions and leagues about the scheduling of the event, which begins two weeks after the Champions League final at the end of the 2024-25 European calendar and ends five weeks before the first Premier League match of the 2025-2026 season. But football's world governing body believes the dates allow sufficient rest time before the start of the domestic campaigns. The Club World Cup will now take place once every four years, when it was previously held annually and involved just seven teams. Streaming platform DAZN has secured exclusive rights to broadcast next summer's tournament, during which 63 matches will take place over 29 days.\n", + "--------------------------------------------------------------------------------\n", + "Score: 0.5792, Text: After Fifa awards Saudi Arabia the hosting rights for the men's 2034 World Cup, BBC analysis editor Ros Atkins looks at how we got here and the controversies surrounding the decision.\n", + "--------------------------------------------------------------------------------\n", + "Score: 0.5877, Text: FA still to decide on endorsing Saudi World Cup bid\n", + "\n", + "The King Abdullah Sports City Stadium in Jeddah would be refurbished before the tournament\n", + "\n", + "The Football Association is still to decide whether it will officially endorse Saudi Arabia's 2034 men's World Cup bid at next week's virtual Fifa Congress. Insiders have told the BBC that discussions involving the FA's board remain ongoing. 
It has been suggested that rather than a traditional vote, ratification might instead be confirmed by acclamation - with federations in favour asked to show their support by applauding. However, Fifa is yet to confirm this, and FA officials have sought clarification on the process. On Friday, the DFB - Germany's national football association - announced it would vote in favour of both the Saudi Arabia bid and a multi-nation bid for the 2030 event. The BBC has learned that FA officials are mindful of concerns over Saudi Arabia's human rights record. However, some are also wary of suggestions of hypocrisy if the body declines to endorse the tournament, but then participates in it. Saudi Arabia is the sole bidder for the 2034 event, while the 2030 World Cup is set to be awarded to unopposed co-hosts Spain, Morocco and Portugal, with early matches also being played in Uruguay, Argentina and Paraguay. The ratification process has been combined so the 2030 and 2034 hosts will be decided jointly. It would appear, therefore, that if federations oppose one bid, they would have to support neither, with no separate acclamation for each of the potential hosts.\n", + "\n", + "In recent years, Saudi Arabia has hosted many major sports events, including Formula One, tennis, boxing and golf. Yet the kingdom's human rights record, restrictions on women's rights and the criminalisation of the LGBTQ+ community has prompted controversy over its ambition to host the men's World Cup in 10 years time. Last week, Fifa released its evaluation report for Saudi Arabia's bid, awarding it an average score of 4.2 out of 5 - the highest ever - with a conclusion that the tournament posed a 'medium' human rights risk. The assessment sparked condemnation from human rights groups.\n", + "\n", + "This video can not be played To play this video you need to enable JavaScript in your browser. 
Dec 2023: Saudi sports minister tells BBC sports editor Dan Roan that 'all are welcome'\n", + "\n", + "The DFB held a committee meeting on Friday to decide its stance, and unanimous approval was given, external to support the 2030 and 2034 bids. Its president, Bernd Neuendor, explained: \"We did not make the decision lightly and carefully examined the application for the 2034 World Cup. \"There was an exchange with many interest groups and experts, including human rights organisations and fans, on the basis of which a well-founded decision was made. \"We take the criticism of the applicant country seriously and will continue to engage in dialogue. Our goal is to work together with Fifa to improve the situation in the coming years.\" In 2022, England and Germany were among a group of European teams that abandoned plans to promote diversity and inclusion by wearing 'OneLove' armbands at the Qatar World Cup after Fifa threatened sporting sanctions. The FA had spoken out about human rights in the country.\n", + "\n", + "It is a sign of the complexity of this decision that the FA's hierarchy are still to come down one way or another, with just five days to go. While Saudi Arabia has become a fairly regular host of top-level sport, this would be another level entirely. The German FA's statement essentially acknowledges it will be criticised, and makes clear it did not take the decision lightly. It seems likely, therefore, that whatever decision the English FA takes, it will divide opinion. Any support of the event comes amid strong criticism from human rights and environmental campaigners, and just six weeks after more than one hundred professional women's footballers wrote to Fifa urging it to drop the Saudi oil giant Aramco as a sponsor. They called such a deal a \"punch in the stomach\" to the sport, so awarding the biggest football tournament in the world to the kingdom would likely provoke similar if not stronger criticism. 
However, there is a school of thought that sport can be a force for positive change, and that putting a spotlight on Saudi Arabia, if it was to host the World Cup, could help accelerate and enhance reforms.\n", + "--------------------------------------------------------------------------------\n", + "Score: 0.5905, Text: Uefa 'not worried' at Euro 2025 clash with Club World Cup\n", + "\n", + "England won Euro 2022 on home soil beating Germany in the final at Wembley\n", + "\n", + "Uefa is \"not worried\" that some Women's Euro 2025 matches will clash with the Fifa men's Club World Cup, says managing director Nadine Kessler. Fifa's new expanded men's tournament takes place in the United States from 15 June to 13 July and involves 32 teams, including Chelsea and Manchester City. Switzerland is hosting Euro 2025 from 2-27 July, meaning five or six matches are likely to be played at the same time as some in the men's competition. \"Overall, I'm not really worried. We're talking about two tournaments, in two different countries, with two different timezones,\" said Kessler.\n", + "\n", + "\"There are also broadcasting agreements in place that differs from ours. Again, I'm not worried. \"As much as we always try to get full exclusivity for our women's football tournaments, in the times we live in, with our men's football tournaments having so many in a year, to get that privilege of full exclusivity is not easy any more. \"We must also stick to our plan because I think it's important we get a professional, respected international calendar in place for women's football. We have to co-exist.\" In May, Fifa rejected claims that Fifpro and the World Leagues Association were not consulted over its plans for the Club World Cup. BBC Sport asked Fifa to justify the scheduling clash with Euro 2025 this month. 
A Fifa spokesperson told BBC Sport: \"The international match calendar for 2025-2030 was approved by the Fifa Council in 2023, which is made up of members from each of the six confederations, including Uefa. \"While Fifa accepts that both the men's and women's international match calendars are constrained by obvious limitations, this was deemed to be the most balanced solution.\"\n", + "\n", + "Uefa is aiming to sell out all matches at Euro 2025 and make it the most watched women's European football tournament. It has a target for a total attendance of more than 700,000. A portion of tickets went on sale on 1 October and over 200,000 have been sold. St Jakob Park in Basel will host the final with a capacity of 34,050. Uefa's events chief executive Martin Kallen said the tournament is getting \"bigger and bigger\" and this may be the last chance for such a small nation to host the Women's Euros. \"It's already at the edge for Switzerland to be able to do this Euros. I think they waited for the right moment to ask for it,\" said Kallen. \"They got it because in the future I think the stadiums and infrastructure in Switzerland is too small.\" The draw takes place on Monday, 16 December, when defending champions England and tournament debutants Wales will find out their group-stage opponents. Among Uefa's key objectives is an aim to meet \"men's Euros standards\" in terms of team facilities, football technology and analysis. Video assistant referee, goalline technology and semi-automated offsides will all be included. Artificial pitches in Bern and Thun will be overlaid with natural grass in June for the duration of the tournament in order to ensure conditions are the same across all venues. 
These pitches will be retained until September.\n", + "--------------------------------------------------------------------------------\n", + "Score: 0.6002, Text: The King Abdullah Sports City Stadium is one of 15 stadiums in line to host games in 2034\n", + "\n", + "The Football Association says it supported Saudi Arabia's bid to host the 2034 World Cup after being assured all fans would be safe and welcome. As expected, the governing body for English football backed the Saudi bid and a multi-nation hosting of the 2030 World Cup at Wednesday afternoon's online Fifa Congress, where the tournament hosts were officially confirmed. The FA released a statement after the meeting, which said: \"Our focus is on ensuring that all our fans can attend and enjoy tournaments. \"The FA board met the Saudi Arabian Football Federation last month to discuss their bid in more detail. \"We asked them to commit to ensuring all fans would be safe and welcome in Saudi Arabia in 2034 - including LGBTQ+ fans. They assured us that they are fully committed to providing a safe and welcome environment for all fans.\" Some senior FA officials are known to have been wary of accusations of hypocrisy if it were not to support Saudi Arabia but then wants England to participate. The FA will also be mindful of having caved in to Fifa's threats of sporting sanctions at the Qatar World Cup, when it and some other associations abandoned plans for players to wear 'OneLove' armbands intended as an anti-discrimination protest. But with a potential joint bid for the 2031 Women's World Cup, the British football federations may have been keen to avoid a rift with Fifa. And the FA will also have been aware of Saudi Arabia's importance to the UK Government as a key ally in the Middle East, with Prime Minister Sir Keir Starmer visiting the country's Crown Prince this week in a bid to strengthen economic ties between the two countries. 
Last year, Jake Daniels, the UK's only openly gay active male professional footballer, told the BBC he \"wouldn't feel safe\" at a Saudi World Cup. When the country's sports minister Prince Abdulaziz bin Turki Al Faisal was asked by BBC Sport last year what he would say to female and gay fans worrying whether they would be safe to attend, he said that \"everyone is welcome\". Saudi Arabia was the sole bidder for the 2034 event, while the 2030 World Cup was awarded to unopposed co-hosts Spain, Morocco and Portugal, with early matches also being played in Uruguay, Argentina and Paraguay. Rather than a traditional vote, the ratification process was confirmed by acclamation - with federations in favour asked to show their support by applauding for each bid in turn.\n", + "\n", + "This video can not be played To play this video you need to enable JavaScript in your browser.\n", + "\n", + "Norway's football federation abstained from the vote, arguing the bidding process \"undermines Fifa's own reforms for good governance\". On Friday, the DFB - Germany's national football association - announced it would vote in favour of both bids. \"We did not make the decision lightly and carefully examined the application for the 2034 World Cup,\" DFB president Bernd Neuendorf said. \"We take the criticism of the applicant country seriously and will continue to engage in dialogue. Our goal is to work together with Fifa to improve the situation in the coming years.\"\n", + "--------------------------------------------------------------------------------\n", + "Score: 0.6122, Text: BBC and ITV agree World Cup deal for 2026 and 2030\n", + "\n", + "Argentina, led by Lionel Messi, won the 2022 men's World Cup in Qatar\n", + "\n", + "BBC Sport has agreed a deal to share live coverage of the men's Fifa World Cup in 2026 and 2030 with ITV and will broadcast the tournament across TV, audio and digital platforms. 
The 2026 World Cup in the United States, Canada and Mexico will be the biggest yet, with 48 teams playing 104 matches over 39 days - beginning in Mexico City on 11 June and ending in East Rutherford, New Jersey on 19 July. The two broadcasters will share the rights equally, splitting matches between them, including a shared final, ensuring continued free-to-air coverage of the Fifa World Cup. Alongside live TV coverage and highlights across the BBC TV channels and iPlayer, live audio commentary will be broadcast on BBC Radio 5 Live and 5 Sports Extra. Fans will be able to listen to 5 Live coverage on BBC Sounds and follow all the action on the BBC Sport website and app.\n", + "\n", + "ITV will deliver free-to-air coverage of live fixtures across ITV1, ITV4 and ITVX, plus highlights and exclusive content on ITV Sport social accounts. The 2030 tournament will be held across three continents and six countries. Spain, Portugal, and Morocco are co-hosting, but to mark 100 years since Uruguay staged the first World Cup there will be three matches played in South America - Argentina, Paraguay and Uruguay hosting one each - to open the tournament. Alex Kay-Jelski, BBC director of sport, said: \"Securing these iconic tournaments means BBC Sport is once again bringing people together for the biggest sporting moments. 
\"The World Cup is magical, something the whole planet stops to experience, and we can't wait to show it to audiences across all platforms.\" BBC Sport's rights portfolio also includes the Olympic Games, the FA Cup, men's Euro 2028 and women's Euro 2025, the Women's Super League, the Wimbledon Championships, the Women's Rugby World Cup 2025 and Match of the Day.\n", + "--------------------------------------------------------------------------------\n", + "Score: 0.6445, Text: Five killed in strike on Russia's Kursk after deadly missile attack on Kyiv\n", + "\n", + "Ukrainian officials said at least one person was killed and nine others were injured in the attack on Kyiv\n", + "\n", + "Russia says five people have been killed in a Ukrainian strike in the western Kursk region. Ukrainian officials reported earlier that Moscow had launched a fresh missile attack on Kyiv, damaging a building hosting several embassies. In Russia, the acting governor of the Kursk region said in addition to those killed, nine had been taken to hospital following the attack on the town of Rylsk. Alexander Khinshtein said a cultural centre, a fitness complex, a school and homes had been damaged in the strike which took place at 15:30 local time (12:30 GMT) on Friday.\n", + "\n", + "Russian officials earlier reported six killed, including a child, in Rylsk, about 25km (16 miles) from the Ukrainian border. But in an audio message on Telegram on Saturday morning, Khinshtein gave the latest update, saying there were five fatalities. \"There were no children among those [killed],\" he said. Ukrainian troops still hold parts of the Kursk region after launching a surprise cross-border offensive in early August. Ukraine's foreign ministry said Russia's strike on Kyiv had affected the diplomatic missions of Albania, Argentina, North Macedonia, Palestine, Portugal and Montenegro. It is unclear whether the building housing them was directly targeted in the Ukrainian capital. 
At least one person was killed and nine others were injured in the strike which damaged a number of buildings in the city, Ukraine's military said. It is not thought that any of the embassy diplomats were injured. In a verified video filmed in the Pecherskyi District, Kyiv's second oldest Roman Catholic church, St Nicholas Cathedral, is shown with windows shattered following a nearby blast. Ukraine's military said Russia had launched 65 drones and missiles across the country overnight, with most shot down. One man in Kyiv, who said he was the owner of a restaurant that suffered extensive damage following the attack, was filmed cursing the Russians as \"beasts\" as he surveyed the charred shell of a building in front of him. The video was widely shared on social media.\n", + "\n", + "Oksana, another resident, sent the BBC photos of her destroyed apartment, with the windows blown in and glass and brickwork strewn across the floors. \"I don't understand how I survived,\" she said. \"My balcony flew away, half my walls are gone. My neighbour is in such shock she can't even speak. I have no words for the people who did this.\" A local journalist at the scene told the BBC that one of the buildings nearby had been used by the Ukrainian Security Service, the SBU, and was likely to have been the target of the strikes, although much of the damage seen by the BBC had affected residential buildings. In a statement confirming the attack, the Russian defence ministry said missiles had been launched at an SBU \"command post\" in response to a strike on a chemical plant in Russia's Rostov Region two days ago. But there is also speculation in Kyiv that Friday's attack could be linked to the killing of a Russian general, Lt-Gen Igor Kirillov, in Moscow on Tuesday. Friday's attack come one day after Vladimir Putin's end-of-year press conference and phone-in show, in which he threatened to launch more ballistic missiles at the Ukrainian capital. 
There is concern in Ukraine that Russia could use a so-called Oreshnik intermediate-range ballistic missile to hit Kyiv. Moscow test-fired the missile on the central city of Dnipro earlier this month. Earlier on Friday morning, the Ukrainian authorities issued an air alert linked to the possible launch of an Oreshnik missile, and urged people in Kyiv to urgently seek shelter. It turned out to be a false alarm.\n", + "--------------------------------------------------------------------------------\n", + "Score: 0.6464, Text: Watch five iconic England goals from Euro 2022 including Georgia Stanway's screamer against Spain and Alessia Russo's cheeky backheel versus Sweden.\n", + "\n", + "Watch Women’s Euro 2025 draw at 16:55 GMT on Monday, 16 December, on BBC Two and the BBC Sport website & app\n", + "\n", + "Available to UK users only.\n", + "--------------------------------------------------------------------------------\n", + "Score: 0.6507, Text: I should have invaded Ukraine earlier, Putin tells Russians in TV marathon\n", + "\n", + "Russian President Vladimir Putin has said Russia should have launched a full-scale invasion of Ukraine earlier and been better prepared for the war. In his end-of-year press conference on Thursday, Putin said, with hindsight, there should have been \"systemic preparation\" for the 2022 invasion, which he refers to as a \"special military operation\". Russia seized Crimea from Ukraine in 2014 and pro-Russian forces began a conflict in eastern Ukraine, but it was eight years later that Putin tried to seize Kyiv. During his four-hour long appearance, Putin also talked about Syria's deposed leader, Russia's more aggressive nuclear doctrine as well as domestic issues, like the price of butter.\n", + "\n", + "Billed as \"Results of the Year with Vladimir Putin\", the event was broadcast live across the main state TV channels on Thursday. 
Putin appeared in front of a large blue screen emblazoned with a map of the Russian Federation, complete with annexed parts of Ukraine. He took questions from members of the public, foreign journalists and pensioners - but it was a highly choreographed and tightly controlled affair. When asked by the BBC's Russia editor Steve Rosenberg whether he felt the country was in a better state than where his predecessor, Boris Yeltsin, had left it 25 years ago, Putin said Russia had regained its \"sovereignty\". \"With everything that was happening to Russia before that, we were heading towards a complete, total loss of our sovereignty.\"\n", + "\n", + "This video can not be played To play this video you need to enable JavaScript in your browser.\n", + "\n", + "Asked about the fall of the Assad regime in Syria, Putin insisted it was not a defeat for the Kremlin - which supported President Bashar al-Assad militarily for years - but he admitted the situation was \"complicated\". He said he had not yet spoken to ousted Syrian leader, who fled to Moscow as rebel forces closed in on Damascus earlier this month, but planned to do so soon. He added that Russia was in talks with Syria's new rulers to retain two strategically important military bases on the Mediterranean coast and that Moscow would consider using them for humanitarian purposes.\n", + "\n", + "Russia holds some airbases in Syria, including this one at Hmeimim military base in Latakia province\n", + "\n", + "On US President-elect Donald Trump, Putin said the pair had not spoken in four years, but he was ready to meet him \"if he wants it\". When put to him he was in a weak position compared to Trump, who is set to take office in January, Putin quoted American writer Mark Twain: \"The rumours of my death are much exaggerated,\" prompting a smattering of laughs in the conference hall. 
Moving on to China, Putin said Russia's relations with its eastern neighbour had reached an all-time high and the two countries were coordinating actions on the world stage. \"In the last decade, the level and quality of our [Russia-China] relations have reached a point that has never existed throughout our entire history, \" he said. A lengthy portion of the session was focused on the war in Ukraine, with Putin saying he was \"open to compromises\" to end the war - although it was unclear what such compromises could entail. Russian forces are making progress on the frontlines \"everyday\", he said, describing his troops as \"heroes\". At one point, he produced a signed flag he said was given to him by Russian marines who were \"fighting for the motherland\" in the Kursk region, and ushered two observers to hold it behind him for the cameras.\n", + "\n", + "Putin produced a flag he said was given to him by Russian marines fighting in Kursk\n", + "\n", + "He also talked up Russia's construction projects in areas it has seized from Ukraine, claiming the standard of roads in the Ukrainian region of Luhansk had greatly improved since it was seized by Russia-backed forces in 2014. When asked by an audience member if the West had \"received the message\" on Russia's change to its nuclear doctrine, which Putin pushed through in November, he said \"you'll have to ask them.\" The new nuclear doctrine allows Russia to conduct a nuclear strike on any country, if it is backed by a nuclear power. That means if Ukraine were to launch a large attack on Russia with conventional missiles, drones or aircraft, that could meet the criteria for a nuclear response, as could an attack on Belarus or any critical threat to Russia's sovereignty. Putin also emphasised the capabilities of Russia's new intermediate-range ballistic missile, Oreshnik, which was used in a strike on Ukraine in November. 
In order to test its power, he suggested Russia should fire the Oreshnik towards Ukraine, and Ukrainian air defence - using US-supplied systems - should try to bring it down. As for the name \"Oreshnik\"? \"Honestly,\" Putin said with a smirk, \"No idea. No clue.\" A dominant theme throughout the event was \"Russian sovereignty\", with Putin claiming that less reliance on international partners - partly a result of Western sanctions - was one of the key achievements of his invasion of Ukraine. He said the economy was \"stable\", pointing to higher growth than countries like Germany, but admitted inflation of 9.1% was \"alarming\". In fact, the economy is overheating and highly reliant on military production - sometimes termed the \"military industrial complex\". Throughout the address, Putin also answered questions on domestic issues - from telephone scammers to young people's struggles with getting a mortgage.\n", + "--------------------------------------------------------------------------------\n" + ] + } + ], + "source": [ + "query = \"What happened with the map shown during the 2026 FIFA World Cup draw regarding Ukraine and Crimea? 
What was the controversy?\"\n", + "\n", + "try:\n", + " # Perform the semantic search\n", + " start_time = time.time()\n", + " search_results = vector_store.similarity_search_with_score(query, k=10)\n", + " search_elapsed_time = time.time() - start_time\n", + "\n", + " logging.info(f\"Semantic search completed in {search_elapsed_time:.2f} seconds\")\n", + "\n", + " # Display search results\n", + " print(f\"\\nSemantic Search Results (completed in {search_elapsed_time:.2f} seconds):\")\n", + " print(\"-\" * 80) # Add separator line\n", + " for doc, score in search_results:\n", + " print(f\"Score: {score:.4f}, Text: {doc.page_content}\")\n", + " print(\"-\" * 80) # Add separator between results\n", + "\n", + "except CouchbaseException as e:\n", + " raise RuntimeError(f\"Error performing semantic search: {str(e)}\")\n", + "except Exception as e:\n", + " raise RuntimeError(f\"Unexpected error: {str(e)}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Optimizing Vector Search with Global Secondary Index (GSI)\n", + "\n", + "While the above semantic search using similarity_search_with_score works effectively, we can significantly improve query performance by leveraging Global Secondary Index (GSI) in Couchbase.\n", + "\n", + "Couchbase offers three types of vector indexes, but for GSI-based vector search we focus on two main types:\n", + "\n", + "Hyperscale Vector Indexes (BHIVE)\n", + "- Best for pure vector searches - content discovery, recommendations, semantic search\n", + "- High performance with low memory footprint - designed to scale to billions of vectors\n", + "- Optimized for concurrent operations - supports simultaneous searches and inserts\n", + "- Use when: You primarily perform vector-only queries without complex scalar filtering\n", + "- Ideal for: Large-scale semantic search, recommendation systems, content discovery\n", + "\n", + "Composite Vector Indexes \n", + "- Best for filtered vector searches - combines vector 
search with scalar value filtering\n", + "- Efficient pre-filtering - scalar attributes reduce the vector comparison scope\n", + "- Use when: Your queries combine vector similarity with scalar filters that eliminate large portions of data\n", + "- Ideal for: Compliance-based filtering, user-specific searches, time-bounded queries\n", + "\n", + "Choosing the Right Index Type\n", + "- Start with Hyperscale Vector Index for pure vector searches and large datasets\n", + "- Use Composite Vector Index when scalar filters significantly reduce your search space\n", + "- Consider your dataset size: Hyperscale scales to billions, Composite works well for tens of millions to billions\n", + "\n", + "For more details, see the [Couchbase Vector Index documentation](https://docs.couchbase.com/server/current/vector-index/use-vector-indexes.html).\n", + "\n", + "\n", + "## Understanding Index Configuration (Couchbase 8.0 Feature)\n", + "\n", + "The index_description parameter controls how Couchbase optimizes vector storage and search performance through centroids and quantization:\n", + "\n", + "Format: `'IVF[<centroids>],{PQ|SQ}<settings>'`\n", + "\n", + "Centroids (IVF - Inverted File):\n", + "- Controls how the dataset is subdivided for faster searches\n", + "- More centroids = faster search, slower training \n", + "- Fewer centroids = slower search, faster training\n", + "- If omitted (like IVF,SQ8), Couchbase auto-selects based on dataset size\n", + "\n", + "Quantization Options:\n", + "- SQ (Scalar Quantization): SQ4, SQ6, SQ8 (4, 6, or 8 bits per dimension)\n", + "- PQ (Product Quantization): `PQ<subquantizers>x<bits>` (e.g., PQ32x8)\n", + "- Higher values = better accuracy, larger index size\n", + "\n", + "Common Examples:\n", + "- IVF,SQ8 - Auto centroids, 8-bit scalar quantization (good default)\n", + "- IVF1000,SQ6 - 1000 centroids, 6-bit scalar quantization \n", + "- IVF,PQ32x8 - Auto centroids, 32 subquantizers with 8 bits\n", + "\n", + "For detailed 
configuration options, see the [Quantization & Centroid Settings](https://docs.couchbase.com/server/current/vector-index/hyperscale-vector-index.html#algo_settings).\n", + "\n", + "In the code below, we demonstrate creating a BHIVE index. The `create_index` method takes an index type (BHIVE or COMPOSITE) and an `index_description` parameter for optimization settings. Alternatively, GSI indexes can be created manually from the Couchbase UI." + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "vector_store.create_index(index_type=IndexType.BHIVE, index_name=\"claude_bhive_index\",index_description=\"IVF,SQ8\")" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025-09-09 12:26:01,504 - INFO - Semantic search completed in 0.44 seconds\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Semantic Search Results (completed in 0.44 seconds):\n", + "--------------------------------------------------------------------------------\n", + "Score: 0.2502, Text: A map shown during the draw for the 2026 Fifa World Cup has been criticised by Ukraine as an \"unacceptable error\" after it appeared to exclude Crimea as part of the country. The graphic - showing countries that cannot be drawn to play each other for geopolitical reasons - highlighted Ukraine but did not include the peninsula that is internationally recognised to be part of it. Crimea has been under Russian occupation since 2014 and just a handful of countries recognise the peninsula as Russian territory. Ukraine Foreign Ministry spokesman Heorhiy Tykhy said that the nation expects \"a public apology\". 
Fifa said it was \"aware of an issue\" and the image had been removed.\n", + "\n", + "Writing on X, Tykhy said that Fifa had not only \"acted against international law\" but had also \"supported Russian propaganda, war crimes, and the crime of aggression against Ukraine\". He added a \"fixed\" version of the map to his post, highlighting Crimea as part of Ukraine's territory. Among the countries that cannot play each other are Ukraine and Belarus, Spain and Gibraltar and Kosovo versus either Bosnia and Herzegovina or Serbia.\n", + "\n", + "This Twitter post cannot be displayed in your browser. Please enable Javascript or try a different browser. View original content on Twitter The BBC is not responsible for the content of external sites. Skip twitter post by Heorhii Tykhyi This article contains content provided by Twitter. We ask for your permission before anything is loaded, as they may be using cookies and other technologies. You may want to read Twitter’s cookie policy, external and privacy policy, external before accepting. To view this content choose ‘accept and continue’. The BBC is not responsible for the content of external sites.\n", + "\n", + "The Ukrainian Football Association has also sent a letter to Fifa secretary-general Mathias Grafström and UEFA secretary-general Theodore Theodoridis over the matter. \"We appeal to you to express our deep concern about the infographic map [shown] on December 13, 2024,\" the letter reads. \"Taking into account a number of official decisions and resolutions adopted by the Fifa Council and the UEFA executive committee since 2014... we emphasize that today's version of the cartographic image of Ukraine... is completely unacceptable and looks like an inconsistent position of Fifa and UEFA.\" The 2026 World Cup will start on 11 June that year in Mexico City and end on 19 July in New Jersey. The expanded 48-team tournament will last a record 39 days. 
Ukraine were placed in Group D alongside Iceland, Azerbaijan and the yet-to-be-determined winners of France's Nations League quarter-final against Croatia.\n", + "--------------------------------------------------------------------------------\n", + "Score: 0.5698, Text: Defending champions Manchester City will face Juventus in the group stage of the Fifa Club World Cup next summer, while Chelsea meet Brazilian side Flamengo. Pep Guardiola's City, who beat Brazilian side Fluminense to win the tournament for the first time in 2023, begin their title defence against Morocco's Wydad and also play Al Ain of the United Arab Emirates in Group G. Chelsea, winners of the 2021 final, were also drawn alongside Mexico's Club Leon and Tunisian side Esperance Sportive de Tunisie in Group D. The revamped Fifa Club World Cup, which has been expanded to 32 teams, will take place in the United States between 15 June and 13 July next year.\n", + "\n", + "A complex and lengthy draw ceremony was held across two separate Miami locations and lasted more than 90 minutes, during which a new Club World Cup trophy was revealed. There was also a video message from incoming US president Donald Trump, whose daughter Ivanka drew the first team. Lionel Messi's Inter Miami will take on Egyptian side Al Ahly at the Hard Rock Stadium in the opening match, staged in Miami. Elsewhere, Paris St-Germain were drawn against Atletico Madrid in Group B, while Bayern Munich meet Benfica in another all-European group-stage match-up. Teams will play each other once in the group phase and the top two will progress to the knockout stage.\n", + "\n", + "This video can not be played To play this video you need to enable JavaScript in your browser. What is the Club World Cup?\n", + "\n", + "Teams from each of the six international football confederations will be represented at next summer's tournament, including 12 European clubs - the highest quota of any confederation. 
The European places were decided by clubs' Champions League performances over the past four seasons, with recent winners Chelsea, Manchester City and Real Madrid guaranteed places. Al Ain, the most successful club in the UAE with 14 league titles, are owned by the country's president Sheikh Mohamed bin Zayed Al Nahyan - the older brother of City owner Sheikh Mansour. Real, who lifted the Fifa Club World Cup trophy for a record-extending fifth time in 2022, will open up against Saudi Pro League champions Al-Hilal, who currently have Neymar in their ranks. One place was reserved for a club from the host nation, which Fifa controversially awarded to Inter Miami, who will contest the tournament curtain-raiser. Messi's side were winners of the regular-season MLS Supporters' Shield but beaten in the MLS play-offs, meaning they are not this season's champions.\n", + "• None How does the new Club World Cup work & why is it so controversial?\n", + "\n", + "Matches will be played across 12 venues in the US which, alongside Canada and Mexico, also host the 2026 World Cup. Fifa is facing legal action from player unions and leagues about the scheduling of the event, which begins two weeks after the Champions League final at the end of the 2024-25 European calendar and ends five weeks before the first Premier League match of the 2025-2026 season. But football's world governing body believes the dates allow sufficient rest time before the start of the domestic campaigns. The Club World Cup will now take place once every four years, when it was previously held annually and involved just seven teams. 
Streaming platform DAZN has secured exclusive rights to broadcast next summer's tournament, during which 63 matches will take place over 29 days.\n", + "--------------------------------------------------------------------------------\n", + "Score: 0.5792, Text: After Fifa awards Saudi Arabia the hosting rights for the men's 2034 World Cup, BBC analysis editor Ros Atkins looks at how we got here and the controversies surrounding the decision.\n", + "--------------------------------------------------------------------------------\n", + "Score: 0.5877, Text: FA still to decide on endorsing Saudi World Cup bid\n", + "\n", + "The King Abdullah Sports City Stadium in Jeddah would be refurbished before the tournament\n", + "\n", + "The Football Association is still to decide whether it will officially endorse Saudi Arabia's 2034 men's World Cup bid at next week's virtual Fifa Congress. Insiders have told the BBC that discussions involving the FA's board remain ongoing. It has been suggested that rather than a traditional vote, ratification might instead be confirmed by acclamation - with federations in favour asked to show their support by applauding. However, Fifa is yet to confirm this, and FA officials have sought clarification on the process. On Friday, the DFB - Germany's national football association - announced it would vote in favour of both the Saudi Arabia bid and a multi-nation bid for the 2030 event. The BBC has learned that FA officials are mindful of concerns over Saudi Arabia's human rights record. However, some are also wary of suggestions of hypocrisy if the body declines to endorse the tournament, but then participates in it. Saudi Arabia is the sole bidder for the 2034 event, while the 2030 World Cup is set to be awarded to unopposed co-hosts Spain, Morocco and Portugal, with early matches also being played in Uruguay, Argentina and Paraguay. The ratification process has been combined so the 2030 and 2034 hosts will be decided jointly. 
It would appear, therefore, that if federations oppose one bid, they would have to support neither, with no separate acclamation for each of the potential hosts.\n", + "\n", + "In recent years, Saudi Arabia has hosted many major sports events, including Formula One, tennis, boxing and golf. Yet the kingdom's human rights record, restrictions on women's rights and the criminalisation of the LGBTQ+ community has prompted controversy over its ambition to host the men's World Cup in 10 years time. Last week, Fifa released its evaluation report for Saudi Arabia's bid, awarding it an average score of 4.2 out of 5 - the highest ever - with a conclusion that the tournament posed a 'medium' human rights risk. The assessment sparked condemnation from human rights groups.\n", + "\n", + "This video can not be played To play this video you need to enable JavaScript in your browser. Dec 2023: Saudi sports minister tells BBC sports editor Dan Roan that 'all are welcome'\n", + "\n", + "The DFB held a committee meeting on Friday to decide its stance, and unanimous approval was given, external to support the 2030 and 2034 bids. Its president, Bernd Neuendor, explained: \"We did not make the decision lightly and carefully examined the application for the 2034 World Cup. \"There was an exchange with many interest groups and experts, including human rights organisations and fans, on the basis of which a well-founded decision was made. \"We take the criticism of the applicant country seriously and will continue to engage in dialogue. Our goal is to work together with Fifa to improve the situation in the coming years.\" In 2022, England and Germany were among a group of European teams that abandoned plans to promote diversity and inclusion by wearing 'OneLove' armbands at the Qatar World Cup after Fifa threatened sporting sanctions. 
The FA had spoken out about human rights in the country.\n", + "\n", + "It is a sign of the complexity of this decision that the FA's hierarchy are still to come down one way or another, with just five days to go. While Saudi Arabia has become a fairly regular host of top-level sport, this would be another level entirely. The German FA's statement essentially acknowledges it will be criticised, and makes clear it did not take the decision lightly. It seems likely, therefore, that whatever decision the English FA takes, it will divide opinion. Any support of the event comes amid strong criticism from human rights and environmental campaigners, and just six weeks after more than one hundred professional women's footballers wrote to Fifa urging it to drop the Saudi oil giant Aramco as a sponsor. They called such a deal a \"punch in the stomach\" to the sport, so awarding the biggest football tournament in the world to the kingdom would likely provoke similar if not stronger criticism. However, there is a school of thought that sport can be a force for positive change, and that putting a spotlight on Saudi Arabia, if it was to host the World Cup, could help accelerate and enhance reforms.\n", + "--------------------------------------------------------------------------------\n", + "Score: 0.5905, Text: Uefa 'not worried' at Euro 2025 clash with Club World Cup\n", + "\n", + "England won Euro 2022 on home soil beating Germany in the final at Wembley\n", + "\n", + "Uefa is \"not worried\" that some Women's Euro 2025 matches will clash with the Fifa men's Club World Cup, says managing director Nadine Kessler. Fifa's new expanded men's tournament takes place in the United States from 15 June to 13 July and involves 32 teams, including Chelsea and Manchester City. Switzerland is hosting Euro 2025 from 2-27 July, meaning five or six matches are likely to be played at the same time as some in the men's competition. \"Overall, I'm not really worried. 
We're talking about two tournaments, in two different countries, with two different timezones,\" said Kessler.\n", + "\n", + "\"There are also broadcasting agreements in place that differs from ours. Again, I'm not worried. \"As much as we always try to get full exclusivity for our women's football tournaments, in the times we live in, with our men's football tournaments having so many in a year, to get that privilege of full exclusivity is not easy any more. \"We must also stick to our plan because I think it's important we get a professional, respected international calendar in place for women's football. We have to co-exist.\" In May, Fifa rejected claims that Fifpro and the World Leagues Association were not consulted over its plans for the Club World Cup. BBC Sport asked Fifa to justify the scheduling clash with Euro 2025 this month. A Fifa spokesperson told BBC Sport: \"The international match calendar for 2025-2030 was approved by the Fifa Council in 2023, which is made up of members from each of the six confederations, including Uefa. \"While Fifa accepts that both the men's and women's international match calendars are constrained by obvious limitations, this was deemed to be the most balanced solution.\"\n", + "\n", + "Uefa is aiming to sell out all matches at Euro 2025 and make it the most watched women's European football tournament. It has a target for a total attendance of more than 700,000. A portion of tickets went on sale on 1 October and over 200,000 have been sold. St Jakob Park in Basel will host the final with a capacity of 34,050. Uefa's events chief executive Martin Kallen said the tournament is getting \"bigger and bigger\" and this may be the last chance for such a small nation to host the Women's Euros. \"It's already at the edge for Switzerland to be able to do this Euros. I think they waited for the right moment to ask for it,\" said Kallen. 
\"They got it because in the future I think the stadiums and infrastructure in Switzerland is too small.\" The draw takes place on Monday, 16 December, when defending champions England and tournament debutants Wales will find out their group-stage opponents. Among Uefa's key objectives is an aim to meet \"men's Euros standards\" in terms of team facilities, football technology and analysis. Video assistant referee, goalline technology and semi-automated offsides will all be included. Artificial pitches in Bern and Thun will be overlaid with natural grass in June for the duration of the tournament in order to ensure conditions are the same across all venues. These pitches will be retained until September.\n", + "--------------------------------------------------------------------------------\n", + "Score: 0.6002, Text: The King Abdullah Sports City Stadium is one of 15 stadiums in line to host games in 2034\n", + "\n", + "The Football Association says it supported Saudi Arabia's bid to host the 2034 World Cup after being assured all fans would be safe and welcome. As expected, the governing body for English football backed the Saudi bid and a multi-nation hosting of the 2030 World Cup at Wednesday afternoon's online Fifa Congress, where the tournament hosts were officially confirmed. The FA released a statement after the meeting, which said: \"Our focus is on ensuring that all our fans can attend and enjoy tournaments. \"The FA board met the Saudi Arabian Football Federation last month to discuss their bid in more detail. \"We asked them to commit to ensuring all fans would be safe and welcome in Saudi Arabia in 2034 - including LGBTQ+ fans. They assured us that they are fully committed to providing a safe and welcome environment for all fans.\" Some senior FA officials are known to have been wary of accusations of hypocrisy if it were not to support Saudi Arabia but then wants England to participate. 
The FA will also be mindful of having caved in to Fifa's threats of sporting sanctions at the Qatar World Cup, when it and some other associations abandoned plans for players to wear 'OneLove' armbands intended as an anti-discrimination protest. But with a potential joint bid for the 2031 Women's World Cup, the British football federations may have been keen to avoid a rift with Fifa. And the FA will also have been aware of Saudi Arabia's importance to the UK Government as a key ally in the Middle East, with Prime Minister Sir Keir Starmer visiting the country's Crown Prince this week in a bid to strengthen economic ties between the two countries. Last year, Jake Daniels, the UK's only openly gay active male professional footballer, told the BBC he \"wouldn't feel safe\" at a Saudi World Cup. When the country's sports minister Prince Abdulaziz bin Turki Al Faisal was asked by BBC Sport last year what he would say to female and gay fans worrying whether they would be safe to attend, he said that \"everyone is welcome\". Saudi Arabia was the sole bidder for the 2034 event, while the 2030 World Cup was awarded to unopposed co-hosts Spain, Morocco and Portugal, with early matches also being played in Uruguay, Argentina and Paraguay. Rather than a traditional vote, the ratification process was confirmed by acclamation - with federations in favour asked to show their support by applauding for each bid in turn.\n", + "\n", + "This video can not be played To play this video you need to enable JavaScript in your browser.\n", + "\n", + "Norway's football federation abstained from the vote, arguing the bidding process \"undermines Fifa's own reforms for good governance\". On Friday, the DFB - Germany's national football association - announced it would vote in favour of both bids. \"We did not make the decision lightly and carefully examined the application for the 2034 World Cup,\" DFB president Bernd Neuendorf said. 
\"We take the criticism of the applicant country seriously and will continue to engage in dialogue. Our goal is to work together with Fifa to improve the situation in the coming years.\"\n", + "--------------------------------------------------------------------------------\n", + "Score: 0.6122, Text: BBC and ITV agree World Cup deal for 2026 and 2030\n", + "\n", + "Argentina, led by Lionel Messi, won the 2022 men's World Cup in Qatar\n", + "\n", + "BBC Sport has agreed a deal to share live coverage of the men's Fifa World Cup in 2026 and 2030 with ITV and will broadcast the tournament across TV, audio and digital platforms. The 2026 World Cup in the United States, Canada and Mexico will be the biggest yet, with 48 teams playing 104 matches over 39 days - beginning in Mexico City on 11 June and ending in East Rutherford, New Jersey on 19 July. The two broadcasters will share the rights equally, splitting matches between them, including a shared final, ensuring continued free-to-air coverage of the Fifa World Cup. Alongside live TV coverage and highlights across the BBC TV channels and iPlayer, live audio commentary will be broadcast on BBC Radio 5 Live and 5 Sports Extra. Fans will be able to listen to 5 Live coverage on BBC Sounds and follow all the action on the BBC Sport website and app.\n", + "\n", + "ITV will deliver free-to-air coverage of live fixtures across ITV1, ITV4 and ITVX, plus highlights and exclusive content on ITV Sport social accounts. The 2030 tournament will be held across three continents and six countries. Spain, Portugal, and Morocco are co-hosting, but to mark 100 years since Uruguay staged the first World Cup there will be three matches played in South America - Argentina, Paraguay and Uruguay hosting one each - to open the tournament. Alex Kay-Jelski, BBC director of sport, said: \"Securing these iconic tournaments means BBC Sport is once again bringing people together for the biggest sporting moments. 
\"The World Cup is magical, something the whole planet stops to experience, and we can't wait to show it to audiences across all platforms.\" BBC Sport's rights portfolio also includes the Olympic Games, the FA Cup, men's Euro 2028 and women's Euro 2025, the Women's Super League, the Wimbledon Championships, the Women's Rugby World Cup 2025 and Match of the Day.\n", + "--------------------------------------------------------------------------------\n", + "Score: 0.6464, Text: Watch five iconic England goals from Euro 2022 including Georgia Stanway's screamer against Spain and Alessia Russo's cheeky backheel versus Sweden.\n", + "\n", + "Watch Women’s Euro 2025 draw at 16:55 GMT on Monday, 16 December, on BBC Two and the BBC Sport website & app\n", + "\n", + "Available to UK users only.\n", + "--------------------------------------------------------------------------------\n", + "Score: 0.6565, Text: Romania's far right presidential frontrunner vows to end Ukraine aid\n", + "\n", + "Calin Georgescu is leading the presidential race in Romania\n", + "\n", + "Calin Georgescu, the fringe nationalist politician leading the presidential race in Romania, has told the BBC that he would end all support for Ukraine if elected. He is facing a second-round run-off in the elections on Sunday, where he will run against Elena Lasconi, a former TV presenter who is campaigning on a firmly pro-EU platform. Georgescu, whose only election campaigning has been on social media, said he would make \"the Romanian people\" his priority. 
But he denied that his surprise success so far was the result of a Russian-backed influence operation, saying he did not care about the \"lies\" of his country's intelligence agencies as he was working with God and the people.\n", + "\n", + "On Wednesday, in a highly unusual move, Romania's outgoing president published declassified documents that detailed what was called a massive and \"highly organised\" campaign for Georgescu on TikTok co-ordinated by a \"state actor\". The papers included an intelligence assessment that Russia was carrying out hybrid attacks on Romania, which it sees as an \"enemy state\". The constitutional court is now being flooded with requests to look into the allegations of meddling with a view to possibly cancelling the election. Prosecutors today announced they were opening a criminal investigation but there is no timeline on when that might conclude. \"They are afraid,\" is how Georgescu brushed away evidence that hundreds of thousands of dollars were spent pushing campaign content for him, breaking both Romanian election law and TikTok's own rules. He denied that he was \"Moscow's man\", referring derisively to Romania's \"un-intelligence agencies\". \"They can't accept that the Romanian people finally said, 'we want our life back, our country, our dignity',\" he said, portraying himself as battling against an unyielding establishment. In a sometimes tetchy interview in which he praised Donald Trump and the Hungarian populist leader Viktor Orban, Georgescu referred to Vladimir Putin as a \"patriot and a leader\". He then added: \"But I am not a fan.\" But when questioned about Russia's war on Ukraine, he first asked, \"Are you sure of that?\", appearing to deny the war's very existence. He then said Romania was interested only in pushing for peace on its border but refused to say that this should be on Kyiv's terms. 
When asked whether he agreed with standing by Ukraine, as the EU puts it, \"for as long as it takes\", Georgescu said \"No.\" He said things would change. \"I agree just that I have to take care of my people. I don't want to involve my people,\" he replied, clarifying that Romania – an EU and Nato member - would provide no more military or political support for its neighbour. \"Zero. Everything stops. I have to take care just about my people. We have a lot of problems ourselves.\" It would be a dramatic change in position for Bucharest, and one that would be music to Russian ears. The president in Romania has considerable power, including influence on areas such as foreign policy. He is also the commander-in-chief of the armed forces and appoints the prime minister. Romania shares a long border with Ukraine and has been a staunch supporter of Kyiv since the full-scale invasion in 2022. It's provided a Patriot missile defence system as well as financial support. It's also become a key export route for Ukrainian grain, as Russian bombardments have crippled the work of ports there. Under a Georgescu presidency, Romania would join Hungary and Slovakia as Russia-sympathisers on the eastern flank of Nato. It would also be a serious dent to EU solidarity on Ukraine, just as it faces the prospect of assuming more responsibility for aiding Kyiv with Donald Trump back in the White House. Georgescu underlined that he would keep Romania inside the EU and Nato, but that everything from now on would be \"negotiated\" and focus on his country's interests. He refused to say that Vladimir Putin's Russia was a security threat for the West. His endorsement of conspiracy theories has also sparked concerns, including denying the Covid pandemic and doubting that anyone ever landed on the Moon. Romania is home to a giant Nato military base, close to the Black Sea, as well as a US missile defence facility. 
Georgescu now describes himself as a university teacher, but has previously worked in the Ministry of Foreign Affairs and with the UN. He clearly does have supporters – his clean-living, Romania-first message has popular appeal, especially outside Bucharest. But in the capital many people are worried about the direction their country may be taking. When asked if he understood why they were scared, Georgescu shook his head: \"That's just propaganda.\" On Thursday evening, several thousand protesters gathered in central Bucharest to call for Romania to remain closely allied with Europe - many holding the blue EU flag. Others brought Romanian flags with a circular hole in the centre, a reminder of how after the 1989 revolution people cut out the communist symbols. Talk of Russian influence - Moscow meddling in any form - is an emotional topic for many. Several chanted \"Freedom!\" and \"Europe!\" One man told the BBC he and his friends had been with protesters on the streets of Bucharest 35 years ago, and couldn't bear to think of Romania going back to the past. Another woman, Anca, said she saw the \"long arm of Russia\" at work in Romania's presidential election and had come to the rally to show she believes her country's future has to be firmly in Europe.\n", + "--------------------------------------------------------------------------------\n", + "Score: 0.6575, Text: Coverage of the draw starts at 19:00 GMT\n", + "--------------------------------------------------------------------------------\n" + ] + } + ], + "source": [ + "query = \"What happened with the map shown during the 2026 FIFA World Cup draw regarding Ukraine and Crimea? 
What was the controversy?\"\n", + "\n", + "try:\n", + " # Perform the semantic search\n", + " start_time = time.time()\n", + " search_results = vector_store.similarity_search_with_score(query, k=10)\n", + " search_elapsed_time = time.time() - start_time\n", + "\n", + " logging.info(f\"Semantic search completed in {search_elapsed_time:.2f} seconds\")\n", + "\n", + " # Display search results\n", + " print(f\"\\nSemantic Search Results (completed in {search_elapsed_time:.2f} seconds):\")\n", + " print(\"-\" * 80) # Add separator line\n", + " for doc, score in search_results:\n", + " print(f\"Score: {score:.4f}, Text: {doc.page_content}\")\n", + " print(\"-\" * 80) # Add separator between results\n", + "\n", + "except CouchbaseException as e:\n", + " raise RuntimeError(f\"Error performing semantic search: {str(e)}\")\n", + "except Exception as e:\n", + " raise RuntimeError(f\"Unexpected error: {str(e)}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note: To create a COMPOSITE index, the below code can be used.\n", + "Choose based on your specific use case and query patterns. For this tutorial's news search scenario, either index type would work, but BHIVE might be more efficient for pure semantic search across news articles." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vector_store.create_index(index_type=IndexType.COMPOSITE, index_name=\"claude_composite_index\", index_description=\"IVF,SQ8\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "sS0FebHI9U1l" + }, + "source": [ + "# Retrieval-Augmented Generation (RAG) with Couchbase and LangChain\n", + "Couchbase and LangChain can be seamlessly integrated to create RAG (Retrieval-Augmented Generation) chains, enhancing the process of generating contextually relevant responses. In this setup, Couchbase serves as the vector store, where embeddings of documents are stored. 
When a query is made, LangChain retrieves the most relevant documents from Couchbase by comparing the query’s embedding with the stored document embeddings. These documents, which provide contextual information, are then passed to a generative language model within LangChain.\n", + "\n", + "The language model, equipped with the context from the retrieved documents, generates a response that is both informed and contextually accurate. This integration allows the RAG chain to leverage Couchbase’s efficient storage and retrieval capabilities, while LangChain handles the generation of responses based on the context provided by the retrieved documents. Together, they create a powerful system that can deliver highly relevant and accurate answers by combining the strengths of both retrieval and generation." + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "ZGUXQQmv9ge4", + "outputId": "8c19df4b-19ab-489e-bec4-c2b093e3736a" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025-09-09 12:26:10,540 - INFO - Successfully created RAG chain\n" + ] + } + ], + "source": [ + "system_template = \"You are a helpful assistant that answers questions based on the provided context.\"\n", + "system_message_prompt = SystemMessagePromptTemplate.from_template(system_template)\n", + "\n", + "human_template = \"Context: {context}\\n\\nQuestion: {question}\"\n", + "human_message_prompt = HumanMessagePromptTemplate.from_template(human_template)\n", + "\n", + "chat_prompt = ChatPromptTemplate.from_messages([\n", + " system_message_prompt,\n", + " human_message_prompt\n", + "])\n", + "\n", + "def format_docs(docs):\n", + " return \"\\n\\n\".join(doc.page_content for doc in docs)\n", + "\n", + "rag_chain = (\n", + " {\"context\": lambda x: format_docs(vector_store.similarity_search(x)), \"question\": RunnablePassthrough()}\n", + " | chat_prompt\n", + " | llm\n", + 
")\n", + "logging.info(\"Successfully created RAG chain\")" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Mia7XxM9978M", + "outputId": "be72f001-4a4c-4d71-95f1-54a2850d9dab" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "RAG Response: During the draw for the 2026 FIFA World Cup, a map was shown that excluded Crimea as part of Ukraine. This graphic, which was displaying countries that cannot be drawn to play each other for geopolitical reasons, highlighted Ukraine but did not include the Crimean peninsula, which is internationally recognized as Ukrainian territory.\n", + "\n", + "This omission sparked significant controversy because Crimea has been under Russian occupation since 2014, but only a handful of countries recognize it as Russian territory. The Ukrainian Foreign Ministry spokesman, Heorhiy Tykhy, called this an \"unacceptable error\" and stated that Ukraine expected \"a public apology\" from FIFA. He criticized FIFA for acting \"against international law\" and supporting \"Russian propaganda, war crimes, and the crime of aggression against Ukraine.\"\n", + "\n", + "The Ukrainian Football Association also sent a formal letter of complaint to FIFA and UEFA officials expressing their \"deep concern\" about the cartographic representation. 
FIFA acknowledged they were \"aware of an issue\" and subsequently removed the image.\n", + "RAG response generated in 8.68 seconds\n" + ] + } + ], + "source": [ + "try:\n", + " start_time = time.time()\n", + " rag_response = rag_chain.invoke(query)\n", + " rag_elapsed_time = time.time() - start_time\n", + "\n", + " print(f\"RAG Response: {rag_response.content}\")\n", + " print(f\"RAG response generated in {rag_elapsed_time:.2f} seconds\")\n", + "except AuthenticationError as e:\n", + " print(f\"Authentication error: {str(e)}\")\n", + "except InternalServerFailureException as e:\n", + " if \"query request rejected\" in str(e):\n", + " print(\"Error: Search request was rejected due to rate limiting. Please try again later.\")\n", + " else:\n", + " print(f\"Internal server error occurred: {str(e)}\")\n", + "except Exception as e:\n", + " print(f\"Unexpected error occurred: {str(e)}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "aIdayPzw9glT" + }, + "source": [ + "# Using Couchbase as a caching mechanism\n", + "Couchbase can be effectively used as a caching mechanism for RAG (Retrieval-Augmented Generation) responses by storing and retrieving precomputed results for specific queries. This approach enhances the system's efficiency and speed, particularly when dealing with repeated or similar queries. When a query is first processed, the RAG chain retrieves relevant documents, generates a response using the language model, and then stores this response in Couchbase, with the query serving as the key.\n", + "\n", + "For subsequent requests with the same query, the system checks Couchbase first. If a cached response is found, it is retrieved directly from Couchbase, bypassing the need to re-run the entire RAG process. This significantly reduces response time because the computationally expensive steps of document retrieval and response generation are skipped. 
Couchbase's role in this setup is to provide a fast and scalable storage solution for caching these responses, ensuring that frequently asked queries can be answered more quickly and efficiently.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "0xM2G3ef-GS2", + "outputId": "b09556d9-afc2-4f71-fc9a-33d591690e6c" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Query 1: What happened when Apple's AI feature generated a false BBC headline about a murder case in New York?\n", + "Response: According to the context, Apple Intelligence (an AI feature that summarizes notifications) generated a false headline that made it appear as if BBC News had published an article claiming Luigi Mangione, who was arrested for the murder of healthcare insurance CEO Brian Thompson in New York, had shot himself. This was completely false - Mangione had not shot himself.\n", + "\n", + "The BBC complained to Apple about this misrepresentation, with a BBC spokesperson stating they had \"contacted Apple to raise this concern and fix the problem.\" The spokesperson emphasized that it's \"essential\" that audiences can trust information published under the BBC name, including notifications.\n", + "\n", + "This wasn't an isolated incident, as the context mentions that Apple's AI feature also misrepresented a New York Times article, incorrectly summarizing it as \"Netanyahu arrested\" when the actual article was about the International Criminal Court issuing an arrest warrant for the Israeli prime minister.\n", + "Time taken: 6.22 seconds\n", + "\n", + "Query 2: What happened with the map shown during the 2026 FIFA World Cup draw regarding Ukraine and Crimea? What was the controversy?\n", + "Response: During the draw for the 2026 FIFA World Cup, a map was shown that excluded Crimea as part of Ukraine. 
This graphic, which was displaying countries that cannot be drawn to play each other for geopolitical reasons, highlighted Ukraine but did not include the Crimean peninsula, which is internationally recognized as Ukrainian territory.\n", + "\n", + "This omission sparked significant controversy because Crimea has been under Russian occupation since 2014, but only a handful of countries recognize it as Russian territory. The Ukrainian Foreign Ministry spokesman, Heorhiy Tykhy, called this an \"unacceptable error\" and stated that Ukraine expected \"a public apology\" from FIFA. He criticized FIFA for acting \"against international law\" and supporting \"Russian propaganda, war crimes, and the crime of aggression against Ukraine.\"\n", + "\n", + "The Ukrainian Football Association also sent a formal letter of complaint to FIFA and UEFA officials expressing their \"deep concern\" about the cartographic representation. FIFA acknowledged they were \"aware of an issue\" and subsequently removed the image.\n", + "Time taken: 0.47 seconds\n", + "\n", + "Query 3: What happened when Apple's AI feature generated a false BBC headline about a murder case in New York?\n", + "Response: According to the context, Apple Intelligence (an AI feature that summarizes notifications) generated a false headline that made it appear as if BBC News had published an article claiming Luigi Mangione, who was arrested for the murder of healthcare insurance CEO Brian Thompson in New York, had shot himself. 
This was completely false - Mangione had not shot himself.\n", + "\n", + "The BBC complained to Apple about this misrepresentation, with a BBC spokesperson stating they had \"contacted Apple to raise this concern and fix the problem.\" The spokesperson emphasized that it's \"essential\" that audiences can trust information published under the BBC name, including notifications.\n", + "\n", + "This wasn't an isolated incident, as the context mentions that Apple's AI feature also misrepresented a New York Times article, incorrectly summarizing it as \"Netanyahu arrested\" when the actual article was about the International Criminal Court issuing an arrest warrant for the Israeli prime minister.\n", + "Time taken: 0.46 seconds\n" + ] + } + ], + "source": [ + "try:\n", + " queries = [\n", + " \"What happened when Apple's AI feature generated a false BBC headline about a murder case in New York?\",\n", + " \"What happened with the map shown during the 2026 FIFA World Cup draw regarding Ukraine and Crimea? What was the controversy?\", # Repeated query\n", + " \"What happened when Apple's AI feature generated a false BBC headline about a murder case in New York?\", # Repeated query\n", + " ]\n", + "\n", + " for i, query in enumerate(queries, 1):\n", + " print(f\"\\nQuery {i}: {query}\")\n", + " start_time = time.time()\n", + "\n", + " response = rag_chain.invoke(query)\n", + " elapsed_time = time.time() - start_time\n", + " print(f\"Response: {response.content}\")\n", + " print(f\"Time taken: {elapsed_time:.2f} seconds\")\n", + "except AuthenticationError as e:\n", + " print(f\"Authentication error: {str(e)}\")\n", + "except InternalServerFailureException as e:\n", + " if \"query request rejected\" in str(e):\n", + " print(\"Error: Search request was rejected due to rate limiting. 
Please try again later.\")\n", + " else:\n", + " print(f\"Internal server error occurred: {str(e)}\")\n", + "except Exception as e:\n", + " print(f\"Unexpected error occurred: {str(e)}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "yJQ5P8E29go1" + }, + "source": [ + "## Conclusion\n", + "By following these steps, you’ll have a fully functional semantic search engine that leverages the strengths of Couchbase and Claude (by Anthropic). This guide is designed not just to show you how to build the system, but also to explain why each step is necessary, giving you a deeper understanding of the principles behind semantic search and of how using GSI makes querying your data more efficient, which can significantly improve your RAG performance. Whether you're a newcomer to software development or an experienced developer looking to expand your skills, this guide will provide you with the knowledge and tools you need to create a powerful, AI-driven search engine." + ] + } + ], + "metadata": { + "colab": { + "provenance": [], + "toc_visible": true + }, + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.3" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "00fec107b63648d2963143e1818a883f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout":
"IPY_MODEL_2764e246c4ee485cbe9c8a123d6fdd7d", + "placeholder": "​", + "style": "IPY_MODEL_8885c27b04284da381085aff0e4b20de", + "value": "Downloading builder script: 100%" + } + }, + "01fb5f4bf5ff46379df9d0de0985b8d2": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "02038b7ea0044af5b74fa2c57d594d5b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_242b078b293f4cffa8ac8225e9224954", + "IPY_MODEL_0945dcbd30414d16852d77a72a844d5e", + "IPY_MODEL_871088792d8e466eaff966a92fd87b80" + ], + "layout": "IPY_MODEL_db02362a599449558a7154c283d7aad5" + } + }, + "048c541e179a465e8e1f1b29e0daa666": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "0723a46e2b3e4781945261901aa37754": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": 
"@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "0813454e096645b7a81c255aae4ae0c0": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_294bbed142eb4689a0be83fab59dd5bb", + "IPY_MODEL_f9cc57f324294afdac594145771c1aee", + "IPY_MODEL_43dd7d8c3d8b4da6ac60ba04575f6832" + ], + "layout": "IPY_MODEL_535ad65f1cc64f45aafd5bbd5ecf70c2" + } + }, + "0945dcbd30414d16852d77a72a844d5e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + 
"_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_e7a1464199dc45ceb19068835bed9bc8", + "max": 500, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_be2029af0ad84060ae9a54e0d0f0547c", + "value": 500 + } + }, + "18c6e6b004b04757926c79a49393bade": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "194a2e2ba51e4b4bb6aca7bc26d84cda": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": 
"1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_1d223aa8e3e940a48f2a13be9bd92ae5", + "max": 5452, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_f2685cbcfa514cb6bd1e81164a09db91", + "value": 5452 + } + }, + "1cd504c80899404c9254a9139a0c6a55": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "1d223aa8e3e940a48f2a13be9bd92ae5": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + 
"order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "242b078b293f4cffa8ac8225e9224954": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_0723a46e2b3e4781945261901aa37754", + "placeholder": "​", + "style": "IPY_MODEL_acbd6da4056b4a999df1bc6145c2e0e7", + "value": "Generating test split: 100%" + } + }, + "26c5eda7756c4a8280a4af530a9c33f9": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_32f9ddc615fc486588e3b4ab18defb1b", + "IPY_MODEL_64a5e834f36c475c8421bfe72d02a9f0", + "IPY_MODEL_a7aa91bfc5d649fa9dcf1c9821e441e3" + ], + "layout": "IPY_MODEL_a682ad68842b40c49b4a2ce9fea76d52" + } + }, + "2764e246c4ee485cbe9c8a123d6fdd7d": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": 
null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "294bbed142eb4689a0be83fab59dd5bb": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_e2a142c3f4f44a5fafc861ab3482ce9d", + "placeholder": "​", + "style": "IPY_MODEL_8e97eb6fa69b4cfdbb98dbe07d19fdec", + "value": "Downloading data: 100%" + } + }, + "32f9ddc615fc486588e3b4ab18defb1b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_dbe5197b9c904b34bbb0afe0219fa48e", + 
"placeholder": "​", + "style": "IPY_MODEL_6a52248d2dc9412a988d48017c32d240", + "value": "Downloading readme: 100%" + } + }, + "43dd7d8c3d8b4da6ac60ba04575f6832": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_823a230ef2084d828ac5460afee96e0a", + "placeholder": "​", + "style": "IPY_MODEL_908d0ced42ae4adfbed2209518a7ba1d", + "value": " 23.4k/23.4k [00:00<00:00, 594kB/s]" + } + }, + "4d321c08dc124da79f3dbc2e824f20e3": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": 
null, + "visibility": null, + "width": null + } + }, + "5203dcca0a134e228652d8e4cfceab4a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_6a06c2ec79a44b55b8e1a8d6c91a3cd3", + "placeholder": "​", + "style": "IPY_MODEL_1cd504c80899404c9254a9139a0c6a55", + "value": "Generating train split: 100%" + } + }, + "535ad65f1cc64f45aafd5bbd5ecf70c2": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + 
"5e4cdf60ec2b43b7a5ee80f59d542101": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "5f367bd7da804a7b9a9159e591d93537": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_6eeb609f17cb4d6980ba2c4c48bf4718", + "placeholder": "​", + "style": "IPY_MODEL_ae6902c232a74c92989949579a659e8a", + "value": " 5.09k/5.09k [00:00<00:00, 42.8kB/s]" + } + }, + "64a5e834f36c475c8421bfe72d02a9f0": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_84482d2a1d4448aaa7eb6f0224dfd6e2", + "max": 10630, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_048c541e179a465e8e1f1b29e0daa666", + "value": 10630 + } + }, + "67f94c392f49486bb72df884a8dfb275": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": 
"1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "694a52495f6047f6b0c2f4e774de23a8": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "6a06c2ec79a44b55b8e1a8d6c91a3cd3": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + 
"_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "6a52248d2dc9412a988d48017c32d240": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "6a9c59c86df642b78d7cbaa4979ade56": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": 
null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "6eeb609f17cb4d6980ba2c4c48bf4718": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "724291f2d5af47dc863df357fc5f55f9": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + 
"_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_5203dcca0a134e228652d8e4cfceab4a", + "IPY_MODEL_194a2e2ba51e4b4bb6aca7bc26d84cda", + "IPY_MODEL_7a3a658d3885477c8a7c42d92133f4f0" + ], + "layout": "IPY_MODEL_6a9c59c86df642b78d7cbaa4979ade56" + } + }, + "7a3a658d3885477c8a7c42d92133f4f0": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_a8d45a6621f74d63ba3e97b7d3edeca2", + "placeholder": "​", + "style": "IPY_MODEL_5e4cdf60ec2b43b7a5ee80f59d542101", + "value": " 5452/5452 [00:00<00:00, 11622.18 examples/s]" + } + }, + "823a230ef2084d828ac5460afee96e0a": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": 
null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "84482d2a1d4448aaa7eb6f0224dfd6e2": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "863ebab7a9b8452faab6206ca2b71074": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + 
"_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_a102c5087141442ba954c9e97da23957", + "placeholder": "​", + "style": "IPY_MODEL_9773b079ecb848f19763d10984eebaaf", + "value": "Downloading data: 100%" + } + }, + "871088792d8e466eaff966a92fd87b80": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_a9e8901827ed43de85bc56d177311009", + "placeholder": "​", + "style": "IPY_MODEL_01fb5f4bf5ff46379df9d0de0985b8d2", + "value": " 500/500 [00:00<00:00, 7173.63 examples/s]" + } + }, + "8885c27b04284da381085aff0e4b20de": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "8e97eb6fa69b4cfdbb98dbe07d19fdec": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + 
"908d0ced42ae4adfbed2209518a7ba1d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "9773b079ecb848f19763d10984eebaaf": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "9b2d831e32c54be59e287479331cc117": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + 
"order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "9d0769b999b746748dc295a84ee8c871": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_00fec107b63648d2963143e1818a883f", + "IPY_MODEL_ee6a7ca0e2724bad8d001c356fd5004b", + "IPY_MODEL_5f367bd7da804a7b9a9159e591d93537" + ], + "layout": "IPY_MODEL_4d321c08dc124da79f3dbc2e824f20e3" + } + }, + "a102c5087141442ba954c9e97da23957": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": 
null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a682ad68842b40c49b4a2ce9fea76d52": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a7aa91bfc5d649fa9dcf1c9821e441e3": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_694a52495f6047f6b0c2f4e774de23a8", + "placeholder": "​", + "style": "IPY_MODEL_cd41975732464f99bcb74fd92e038461", + "value": " 10.6k/10.6k 
[00:00<00:00, 96.5kB/s]" + } + }, + "a8d45a6621f74d63ba3e97b7d3edeca2": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a9e8901827ed43de85bc56d177311009": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + 
"grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "ac5623d01ddb4ce8927be677d13d656e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_9b2d831e32c54be59e287479331cc117", + "max": 335858, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_ef13f5dfbf1941ee8c7d9c2205945c52", + "value": 335858 + } + }, + "acbd6da4056b4a999df1bc6145c2e0e7": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "ae6902c232a74c92989949579a659e8a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + 
"_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "be2029af0ad84060ae9a54e0d0f0547c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "bfae1814bf2d4c25b97d57538a66e628": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_863ebab7a9b8452faab6206ca2b71074", + "IPY_MODEL_ac5623d01ddb4ce8927be677d13d656e", + "IPY_MODEL_ee5a3d2681b546b08fe4bfa1b73a52b6" + ], + "layout": "IPY_MODEL_18c6e6b004b04757926c79a49393bade" + } + }, + "cd41975732464f99bcb74fd92e038461": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "d6c120b5e40b4206ba5868a318aa4a0d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + 
"state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "db02362a599449558a7154c283d7aad5": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "dbc785c6fe8844bfbcf7547dd400b795": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + 
"align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "dbe5197b9c904b34bbb0afe0219fa48e": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + 
"top": null, + "visibility": null, + "width": null + } + }, + "dd0e4f232c7741f9a692f38e378672d1": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e2a142c3f4f44a5fafc861ab3482ce9d": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + 
"grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e35a96d0387742d59115c3db9ba11734": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "e7a1464199dc45ceb19068835bed9bc8": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + 
"min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "ede1b42527c943b3b023939f14b7205a": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "ee5a3d2681b546b08fe4bfa1b73a52b6": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + 
"layout": "IPY_MODEL_dd0e4f232c7741f9a692f38e378672d1", + "placeholder": "​", + "style": "IPY_MODEL_e35a96d0387742d59115c3db9ba11734", + "value": " 336k/336k [00:00<00:00, 789kB/s]" + } + }, + "ee6a7ca0e2724bad8d001c356fd5004b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_dbc785c6fe8844bfbcf7547dd400b795", + "max": 5090, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_67f94c392f49486bb72df884a8dfb275", + "value": 5090 + } + }, + "ef13f5dfbf1941ee8c7d9c2205945c52": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "f2685cbcfa514cb6bd1e81164a09db91": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "f9cc57f324294afdac594145771c1aee": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": 
"1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_ede1b42527c943b3b023939f14b7205a", + "max": 23354, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_d6c120b5e40b4206ba5868a318aa4a0d", + "value": 23354 + } + } + } + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/claudeai/gsi/frontmatter.md b/claudeai/gsi/frontmatter.md new file mode 100644 index 00000000..56ee441b --- /dev/null +++ b/claudeai/gsi/frontmatter.md @@ -0,0 +1,21 @@ +--- +# frontmatter +path: "/tutorial-openai-claude-couchbase-rag-with-global-secondary-index" +title: Retrieval-Augmented Generation (RAG) with Couchbase, OpenAI, and Claude using GSI index +short_title: RAG with Couchbase, OpenAI, and Claude using GSI index +description: + - Learn how to build a semantic search engine using Couchbase, OpenAI embeddings, and Anthropic's Claude using GSI. + - This tutorial demonstrates how to integrate Couchbase's vector search capabilities with OpenAI embeddings and use Claude as the language model. + - You'll understand how to perform Retrieval-Augmented Generation (RAG) using LangChain and Couchbase. +content_type: tutorial +filter: sdk +technology: + - vector search +tags: + - Artificial Intelligence + - LangChain + - OpenAI +sdk_language: + - python +length: 60 Mins +---