diff --git a/azure/.env.sample b/azure/.env.sample new file mode 100644 index 00000000..0ce62469 --- /dev/null +++ b/azure/.env.sample @@ -0,0 +1,15 @@ +AZURE_OPENAI_KEY = "" +AZURE_OPENAI_ENDPOINT = "" +AZURE_OPENAI_EMBEDDING_DEPLOYMENT = "" +AZURE_OPENAI_CHAT_DEPLOYMENT = "" +AZURE_OPENAI_API_VERSION = "" + +CB_HOST="" +CB_USERNAME="" +CB_PASSWORD="" +CB_BUCKET_NAME="" + +INDEX_NAME="" +SCOPE_NAME="" +COLLECTION_NAME="" +CACHE_COLLECTION="" \ No newline at end of file diff --git a/azure/RAG_with_Couchbase_and_AzureOpenAI.ipynb b/azure/RAG_with_Couchbase_and_AzureOpenAI.ipynb index ef7391a3..08f7401a 100644 --- a/azure/RAG_with_Couchbase_and_AzureOpenAI.ipynb +++ b/azure/RAG_with_Couchbase_and_AzureOpenAI.ipynb @@ -33,7 +33,7 @@ "\n", "## Create and Deploy Your Free Tier Operational cluster on Capella\n", "\n", - "To get started with Couchbase Capella, create an account and use it to deploy a forever free tier operational cluster. This account provides you with a environment where you can explore and learn about Capella with no time constraint.\n", + "To get started with Couchbase Capella, create an account and use it to deploy a forever free tier operational cluster. This account provides you with an environment where you can explore and learn about Capella with no time constraint.\n", "\n", "To know more, please follow the [instructions](https://docs.couchbase.com/cloud/get-started/create-account.html).\n", "\n", @@ -70,135 +70,12 @@ "name": "stdout", "output_type": "stream", "text": [ - "Collecting datasets\n", - " Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)\n", - "Collecting langchain-couchbase\n", - " Downloading langchain_couchbase-0.1.1-py3-none-any.whl.metadata (1.9 kB)\n", - "Collecting langchain-openai\n", - " Downloading langchain_openai-0.1.23-py3-none-any.whl.metadata (2.6 kB)\n", - "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from datasets) (3.15.4)\n", - "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from datasets) (1.26.4)\n", - "Collecting pyarrow>=15.0.0 (from datasets)\n", - " Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)\n", - "Collecting dill<0.3.9,>=0.3.0 (from datasets)\n", - " Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)\n", - "Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from datasets) (2.1.4)\n", - "Requirement already satisfied: requests>=2.32.2 in /usr/local/lib/python3.10/dist-packages (from datasets) (2.32.3)\n", - "Requirement already satisfied: tqdm>=4.66.3 in /usr/local/lib/python3.10/dist-packages (from datasets) (4.66.5)\n", - "Collecting xxhash (from datasets)\n", - " Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)\n", - "Collecting multiprocess (from datasets)\n", - " Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)\n", - "Requirement already satisfied: fsspec<=2024.6.1,>=2023.1.0 in /usr/local/lib/python3.10/dist-packages (from fsspec[http]<=2024.6.1,>=2023.1.0->datasets) (2024.6.1)\n", - "Requirement already satisfied: aiohttp in /usr/local/lib/python3.10/dist-packages (from datasets) (3.10.5)\n", - "Requirement already satisfied: huggingface-hub>=0.21.2 in /usr/local/lib/python3.10/dist-packages (from datasets) (0.24.6)\n", - "Requirement already satisfied: packaging in /usr/local/lib/python3.10/dist-packages (from datasets) (24.1)\n", - "Requirement already satisfied: pyyaml>=5.1 in 
existing installation: tenacity 9.0.0\n", - " Uninstalling tenacity-9.0.0:\n", - " Successfully uninstalled tenacity-9.0.0\n", - " Attempting uninstall: pyarrow\n", - " Found existing installation: pyarrow 14.0.2\n", - " Uninstalling pyarrow-14.0.2:\n", - " Successfully uninstalled pyarrow-14.0.2\n", - "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n", - "cudf-cu12 24.4.1 requires pyarrow<15.0.0a0,>=14.0.1, but you have pyarrow 17.0.0 which is incompatible.\n", - "ibis-framework 8.0.0 requires pyarrow<16,>=2, but you have pyarrow 17.0.0 which is incompatible.\u001b[0m\u001b[31m\n", - "\u001b[0mSuccessfully installed couchbase-4.3.1 datasets-2.21.0 dill-0.3.8 h11-0.14.0 httpcore-1.0.5 httpx-0.27.2 jiter-0.5.0 jsonpatch-1.33 jsonpointer-3.0.0 langchain-core-0.2.38 langchain-couchbase-0.1.1 langchain-openai-0.1.23 langsmith-0.1.115 multiprocess-0.70.16 openai-1.43.1 orjson-3.10.7 pyarrow-17.0.0 tenacity-8.5.0 tiktoken-0.7.0 xxhash-3.5.0\n" + "Note: you may need to restart the kernel to use updated packages.\n" ] } ], "source": [ - "!pip install datasets langchain-couchbase langchain-openai" + "%pip install --quiet datasets langchain-couchbase langchain-openai python-dotenv" ] }, { @@ -222,22 +99,20 @@ "import getpass\n", "import json\n", "import logging\n", - "import sys\n", + "import os\n", "import time\n", "from datetime import timedelta\n", - "from uuid import uuid4\n", "\n", "from couchbase.auth import PasswordAuthenticator\n", "from couchbase.cluster import Cluster\n", - "from couchbase.exceptions import (\n", - " CouchbaseException,\n", - " InternalServerFailureException,\n", - " QueryIndexAlreadyExistsException,\n", - ")\n", + "from couchbase.exceptions import (CouchbaseException,\n", + " InternalServerFailureException,\n", + " QueryIndexAlreadyExistsException,ServiceUnavailableException)\n", + "from couchbase.management.buckets import CreateBucketSettings\n", "from couchbase.management.search import SearchIndex\n", "from couchbase.options import ClusterOptions\n", "from datasets import load_dataset\n", - "from langchain_core.documents import Document\n", + "from dotenv import load_dotenv\n", "from langchain_core.globals import set_llm_cache\n", "from langchain_core.output_parsers import StrOutputParser\n", "from langchain_core.prompts import ChatPromptTemplate\n", @@ -245,7 +120,7 @@ "from langchain_couchbase.cache import CouchbaseCache\n", "from langchain_couchbase.vectorstores import CouchbaseVectorStore\n", "from langchain_openai import AzureChatOpenAI, AzureOpenAIEmbeddings\n", - "from tqdm import tqdm" + "from openai import NotFoundError" ] }, { @@ -266,7 +141,10 @@ }, "outputs": [], "source": [ - "logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', force=True)" + "logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', force=True)\n", + "\n", + "# Suppress httpx logging\n", + "logging.getLogger('httpx').setLevel(logging.CRITICAL)" ] }, { @@ -291,40 +169,24 @@ "id": "PFGyHll18mSe", "outputId": "50d09055-cf2e-4d8a-d025-cc1a6a2e3193" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Enter your Azure OpenAI Key: ··········\n", - "Enter your Azure OpenAI Endpoint: https://first-couchbase-instance.openai.azure.com/\n", - "Enter your Azure OpenAI Embedding Deployment: text-embedding-ada-002\n", - "Enter your Azure OpenAI Chat Deployment: 
gpt-4o\n", - "Enter your Couchbase host (default: couchbase://localhost): couchbases://cb.hlcup4o4jmjr55yf.cloud.couchbase.com\n", - "Enter your Couchbase username (default: Administrator): vector-search-rag-demos\n", - "Enter your Couchbase password (default: password): ··········\n", - "Enter your Couchbase bucket name (default: vector-search-testing): \n", - "Enter your index name (default: vector_search_azure): \n", - "Enter your scope name (default: shared): \n", - "Enter your collection name (default: azure): \n", - "Enter your cache collection name (default: cache): \n" - ] - } - ], + "outputs": [], "source": [ - "AZURE_OPENAI_KEY = getpass.getpass('Enter your Azure OpenAI Key: ')\n", - "AZURE_OPENAI_ENDPOINT = input('Enter your Azure OpenAI Endpoint: ')\n", - "AZURE_OPENAI_EMBEDDING_DEPLOYMENT = input('Enter your Azure OpenAI Embedding Deployment: ')\n", - "AZURE_OPENAI_CHAT_DEPLOYMENT = input('Enter your Azure OpenAI Chat Deployment: ')\n", + "load_dotenv()\n", + "\n", + "AZURE_OPENAI_KEY = os.getenv('AZURE_OPENAI_KEY') or getpass.getpass('Enter your Azure OpenAI Key: ')\n", + "AZURE_OPENAI_ENDPOINT = os.getenv('AZURE_OPENAI_ENDPOINT') or input('Enter your Azure OpenAI Endpoint: ')\n", + "AZURE_OPENAI_EMBEDDING_DEPLOYMENT = os.getenv('AZURE_OPENAI_EMBEDDING_DEPLOYMENT') or input('Enter your Azure OpenAI Embedding Deployment: ')\n", + "AZURE_OPENAI_CHAT_DEPLOYMENT = os.getenv('AZURE_OPENAI_CHAT_DEPLOYMENT') or input('Enter your Azure OpenAI Chat Deployment: ')\n", + "AZURE_OPENAI_API_VERSION = os.getenv('AZURE_OPENAI_API_VERSION') or input('Enter your Azure OpenAI API Version/Model Version: (for example: 2024-11-20 for gpt-4o) ') or '2024-11-20'\n", "\n", - "CB_HOST = input('Enter your Couchbase host (default: couchbase://localhost): ') or 'couchbase://localhost'\n", - "CB_USERNAME = input('Enter your Couchbase username (default: Administrator): ') or 'Administrator'\n", - "CB_PASSWORD = getpass.getpass('Enter your Couchbase password (default: password): ') or 'password'\n", - "CB_BUCKET_NAME = input('Enter your Couchbase bucket name (default: vector-search-testing): ') or 'vector-search-testing'\n", - "INDEX_NAME = input('Enter your index name (default: vector_search_azure): ') or 'vector_search_azure'\n", - "SCOPE_NAME = input('Enter your scope name (default: shared): ') or 'shared'\n", - "COLLECTION_NAME = input('Enter your collection name (default: azure): ') or 'azure'\n", - "CACHE_COLLECTION = input('Enter your cache collection name (default: cache): ') or 'cache'\n", + "CB_HOST = os.getenv('CB_HOST') or input('Enter your Couchbase host (default: couchbase://localhost): ') or 'couchbase://localhost'\n", + "CB_USERNAME = os.getenv('CB_USERNAME') or input('Enter your Couchbase username (default: Administrator): ') or 'Administrator'\n", + "CB_PASSWORD = os.getenv('CB_PASSWORD') or getpass.getpass('Enter your Couchbase password (default: password): ') or 'password'\n", + "CB_BUCKET_NAME = os.getenv('CB_BUCKET_NAME') or input('Enter your Couchbase bucket name (default: vector-search-testing): ') or 'vector-search-testing'\n", + "INDEX_NAME = os.getenv('INDEX_NAME') or input('Enter your index name (default: vector_search_azure): ') or 'vector_search_azure'\n", + "SCOPE_NAME = os.getenv('SCOPE_NAME') or input('Enter your scope name (default: shared): ') or 'shared'\n", + "COLLECTION_NAME = os.getenv('COLLECTION_NAME') or input('Enter your collection name (default: azure): ') or 'azure'\n", + "CACHE_COLLECTION = os.getenv('CACHE_COLLECTION') or input('Enter your cache collection 
name (default: cache): ') or 'cache'\n", "\n", "# Check if the variables are correctly loaded\n", "if not all([AZURE_OPENAI_KEY, AZURE_OPENAI_ENDPOINT, AZURE_OPENAI_EMBEDDING_DEPLOYMENT, AZURE_OPENAI_CHAT_DEPLOYMENT]):\n", @@ -357,7 +219,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "2024-09-06 07:29:16,632 - INFO - Successfully connected to Couchbase\n" + "2025-02-05 23:32:31,737 - INFO - Successfully connected to Couchbase\n" ] } ], @@ -378,10 +240,31 @@ "id": "C_Gpy32N8mcZ" }, "source": [ - "# Setting Up Collections in Couchbase\n", - "In Couchbase, data is organized in buckets, which can be further divided into scopes and collections. Think of a collection as a table in a traditional SQL database. Before we can store any data, we need to ensure that our collections exist. If they don't, we must create them. This step is important because it prepares the database to handle the specific types of data our application will process. By setting up collections, we define the structure of our data storage, which is essential for efficient data retrieval and management.\n", + "## Setting Up Collections in Couchbase\n", + "\n", + "The setup_collection() function handles creating and configuring the hierarchical data organization in Couchbase:\n", + "\n", + "1. Bucket Creation:\n", + " - Checks if specified bucket exists, creates it if not\n", + " - Sets bucket properties like RAM quota (1024MB) and replication (disabled)\n", + "\n", + "2. Scope Management: \n", + " - Verifies if requested scope exists within bucket\n", + " - Creates new scope if needed (unless it's the default \"_default\" scope)\n", "\n", - "Moreover, setting up collections allows us to isolate different types of data within the same bucket, providing a more organized and scalable data structure. This is particularly useful when dealing with large datasets, as it ensures that related data is stored together, making it easier to manage and query." + "3. Collection Setup:\n", + " - Checks for collection existence within scope\n", + " - Creates collection if it doesn't exist\n", + " - Waits 2 seconds for collection to be ready\n", + "\n", + "Additional Tasks:\n", + "- Creates primary index on collection for query performance\n", + "- Clears any existing documents for clean state\n", + "- Implements comprehensive error handling and logging\n", + "\n", + "The function is called twice to set up:\n", + "1. Main collection for vector embeddings\n", + "2. Cache collection for storing results\n" ] }, { @@ -399,18 +282,25 @@ "name": "stderr", "output_type": "stream", "text": [ - "2024-09-06 07:29:17,029 - INFO - Collection 'azure' already exists.Skipping creation.\n", - "2024-09-06 07:29:17,095 - INFO - Primary index present or created successfully.\n", - "2024-09-06 07:29:17,775 - INFO - All documents cleared from the collection.\n", - "2024-09-06 07:29:17,841 - INFO - Collection 'cache' already exists.Skipping creation.\n", - "2024-09-06 07:29:17,907 - INFO - Primary index present or created successfully.\n", - "2024-09-06 07:29:17,973 - INFO - All documents cleared from the collection.\n" + "2025-02-05 23:32:31,797 - INFO - Bucket 'vector-search-testing' does not exist. Creating it...\n", + "2025-02-05 23:32:32,384 - INFO - Bucket 'vector-search-testing' created successfully.\n", + "2025-02-05 23:32:32,393 - INFO - Scope 'shared' does not exist. Creating it...\n", + "2025-02-05 23:32:32,450 - INFO - Scope 'shared' created successfully.\n", + "2025-02-05 23:32:32,462 - INFO - Collection 'azure' does not exist. 
Creating it...\n", + "2025-02-05 23:32:32,532 - INFO - Collection 'azure' created successfully.\n", + "2025-02-05 23:32:37,006 - INFO - Primary index present or created successfully.\n", + "2025-02-05 23:32:41,769 - INFO - All documents cleared from the collection.\n", + "2025-02-05 23:32:41,771 - INFO - Bucket 'vector-search-testing' exists.\n", + "2025-02-05 23:32:41,783 - INFO - Collection 'cache' does not exist. Creating it...\n", + "2025-02-05 23:32:41,835 - INFO - Collection 'cache' created successfully.\n", + "2025-02-05 23:32:48,115 - INFO - Primary index present or created successfully.\n", + "2025-02-05 23:32:48,126 - INFO - All documents cleared from the collection.\n" ] }, { "data": { "text/plain": [ - "" + "" ] }, "execution_count": 6, @@ -421,9 +311,34 @@ "source": [ "def setup_collection(cluster, bucket_name, scope_name, collection_name):\n", " try:\n", - " bucket = cluster.bucket(bucket_name)\n", + " # Check if bucket exists, create if it doesn't\n", + " try:\n", + " bucket = cluster.bucket(bucket_name)\n", + " logging.info(f\"Bucket '{bucket_name}' exists.\")\n", + " except Exception as e:\n", + " logging.info(f\"Bucket '{bucket_name}' does not exist. Creating it...\")\n", + " bucket_settings = CreateBucketSettings(\n", + " name=bucket_name,\n", + " bucket_type='couchbase',\n", + " ram_quota_mb=1024,\n", + " flush_enabled=True,\n", + " num_replicas=0\n", + " )\n", + " cluster.buckets().create_bucket(bucket_settings)\n", + " bucket = cluster.bucket(bucket_name)\n", + " logging.info(f\"Bucket '{bucket_name}' created successfully.\")\n", + "\n", " bucket_manager = bucket.collections()\n", "\n", + " # Check if scope exists, create if it doesn't\n", + " scopes = bucket_manager.get_all_scopes()\n", + " scope_exists = any(scope.name == scope_name for scope in scopes)\n", + " \n", + " if not scope_exists and scope_name != \"_default\":\n", + " logging.info(f\"Scope '{scope_name}' does not exist. Creating it...\")\n", + " bucket_manager.create_scope(scope_name)\n", + " logging.info(f\"Scope '{scope_name}' created successfully.\")\n", + "\n", " # Check if collection exists, create if it doesn't\n", " collections = bucket_manager.get_all_scopes()\n", " collection_exists = any(\n", @@ -436,9 +351,11 @@ " bucket_manager.create_collection(scope_name, collection_name)\n", " logging.info(f\"Collection '{collection_name}' created successfully.\")\n", " else:\n", - " logging.info(f\"Collection '{collection_name}' already exists.Skipping creation.\")\n", + " logging.info(f\"Collection '{collection_name}' already exists. Skipping creation.\")\n", "\n", + " # Wait for collection to be ready\n", " collection = bucket.scope(scope_name).collection(collection_name)\n", + " time.sleep(2) # Give the collection time to be ready for queries\n", "\n", " # Ensure primary index exists\n", " try:\n", @@ -458,9 +375,9 @@ " return collection\n", " except Exception as e:\n", " raise RuntimeError(f\"Error setting up collection: {str(e)}\")\n", - "\n", + " \n", "setup_collection(cluster, CB_BUCKET_NAME, SCOPE_NAME, COLLECTION_NAME)\n", - "setup_collection(cluster, CB_BUCKET_NAME, SCOPE_NAME, CACHE_COLLECTION)" + "setup_collection(cluster, CB_BUCKET_NAME, SCOPE_NAME, CACHE_COLLECTION)\n" ] }, { @@ -473,6 +390,8 @@ "\n", "Semantic search requires an efficient way to retrieve relevant documents based on a user's query. This is where the Couchbase **Vector Search Index** comes into play. In this step, we load the Vector Search Index definition from a JSON file, which specifies how the index should be structured. 
This includes the fields to be indexed, the dimensions of the vectors, and other parameters that determine how the search engine processes queries based on vector similarity.\n", "\n", + "This Azure vector search index configuration requires specific default settings to function properly. This tutorial uses the bucket named `vector-search-testing` with the scope `shared` and collection `azure`. The configuration is set up for vectors with exactly `1536 dimensions`, using dot product similarity and optimized for recall. If you want to use a different bucket, scope, or collection, you will need to modify the index configuration accordingly.\n", + "\n", "For more information on creating a vector search index, please follow the [instructions](https://docs.couchbase.com/cloud/vector-search/create-vector-search-index-ui.html).\n" ] }, @@ -487,233 +406,40 @@ "id": "y7xiCrOc8mmj", "outputId": "833d3fd1-f4e8-4869-f1e8-b4848136cd71" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Upload your index definition file\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " Upload widget is only available when the cell has been executed in the\n", - " current browser session. Please rerun this cell to enable.\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Saving azure_index.json to azure_index.json\n" - ] - } - ], + "outputs": [], "source": [ "# If you are running this script locally (not in Google Colab), uncomment the following line\n", "# and provide the path to your index definition file.\n", "\n", "# index_definition_path = '/path_to_your_index_file/azure_index.json' # Local setup: specify your file path here\n", "\n", - "# If you are running in Google Colab, use the following code to upload the index definition file\n", - "from google.colab import files\n", - "print(\"Upload your index definition file\")\n", - "uploaded = files.upload()\n", - "index_definition_path = list(uploaded.keys())[0]\n", + "# # Version for Google Colab\n", + "# def load_index_definition_colab():\n", + "# from google.colab import files\n", + "# print(\"Upload your index definition file\")\n", + "# uploaded = files.upload()\n", + "# index_definition_path = list(uploaded.keys())[0]\n", "\n", - "try:\n", - " with open(index_definition_path, 'r') as file:\n", - " index_definition = json.load(file)\n", - "except Exception as e:\n", - " raise ValueError(f\"Error loading index definition from {index_definition_path}: {str(e)}\")" + "# try:\n", + "# with open(index_definition_path, 'r') as file:\n", + "# index_definition = json.load(file)\n", + "# return index_definition\n", + "# except Exception as e:\n", + "# raise ValueError(f\"Error loading index definition from {index_definition_path}: {str(e)}\")\n", + "\n", + "# Version for Local Environment\n", + "def load_index_definition_local(index_definition_path):\n", + " try:\n", + " with open(index_definition_path, 'r') as file:\n", + " index_definition = json.load(file)\n", + " return index_definition\n", + " except Exception as e:\n", + " raise ValueError(f\"Error loading index definition from {index_definition_path}: {str(e)}\")\n", + "\n", + "# Usage\n", + "# Uncomment the appropriate line based on your environment\n", + "# index_definition = load_index_definition_colab()\n", + "index_definition = load_index_definition_local('azure_index.json')" ] }, { @@ -742,8 +468,8 @@ "name": "stderr", 
"output_type": "stream", "text": [ - "2024-09-06 07:30:01,070 - INFO - Index 'vector_search_azure' found\n", - "2024-09-06 07:30:01,373 - INFO - Index 'vector_search_azure' already exists. Skipping creation/update.\n" + "2025-02-05 23:32:48,217 - INFO - Creating new index 'vector_search_azure'...\n", + "2025-02-05 23:32:48,398 - INFO - Index 'vector_search_azure' successfully created/updated.\n" ] } ], @@ -769,243 +495,11 @@ "\n", "except QueryIndexAlreadyExistsException:\n", " logging.info(f\"Index '{index_name}' already exists. Skipping creation/update.\")\n", - "\n", + "except ServiceUnavailableException:\n", + " raise RuntimeError(\"Search service is not available. Please ensure the Search service is enabled in your Couchbase cluster.\")\n", "except InternalServerFailureException as e:\n", - " error_message = str(e)\n", - " logging.error(f\"InternalServerFailureException raised: {error_message}\")\n", - "\n", - " try:\n", - " # Accessing the response_body attribute from the context\n", - " error_context = e.context\n", - " response_body = error_context.response_body\n", - " if response_body:\n", - " error_details = json.loads(response_body)\n", - " error_message = error_details.get('error', '')\n", - "\n", - " if \"collection: 'azure' doesn't belong to scope: 'shared'\" in error_message:\n", - " raise ValueError(\"Collection 'azure' does not belong to scope 'shared'. Please check the collection and scope names.\")\n", - "\n", - " except ValueError as ve:\n", - " logging.error(str(ve))\n", - " raise\n", - "\n", - " except Exception as json_error:\n", - " logging.error(f\"Failed to parse the error message: {json_error}\")\n", - " raise RuntimeError(f\"Internal server error while creating/updating search index: {error_message}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "QRV4k06L8mwS" - }, - "source": [ - "# Load the TREC Dataset\n", - "To build a search engine, we need data to search through. We use the TREC dataset, a well-known benchmark in the field of information retrieval. This dataset contains a wide variety of text data that we'll use to train our search engine. Loading the dataset is a crucial step because it provides the raw material that our search engine will work with. The quality and diversity of the data in the TREC dataset make it an excellent choice for testing and refining our search engine, ensuring that it can handle a wide range of queries effectively.\n", - "\n", - "The TREC dataset's rich content allows us to simulate real-world scenarios where users ask complex questions, enabling us to fine-tune our search engine's ability to understand and respond to various types of queries." 