(docs) MMR page switch to AstraDBVectorStore (=data api) #474

Merged · 6 commits · Jun 11, 2024
Changes from 2 commits
89 changes: 39 additions & 50 deletions docs/modules/examples/pages/mmr.adoc
@@ -25,17 +25,12 @@ DB Access Token] with Database Administrator permissions.
+
[source,text]
----
ASTRA_DB_ID=aad075g999-8ab4-4d81-aa7d-7f58dbed3ead
ASTRA_DB_API_ENDPOINT=https://...
ASTRA_DB_APPLICATION_TOKEN=AstraCS:...
OPENAI_API_KEY=sk-...
ASTRA_DB_KEYSPACE=default_keyspace #optional
OPENAI_API_KEY=sk-...
----
+
[NOTE]
====
The `ASTRA_DB_ID` can be found in the {db-serverless} API Endpoint that's displayed for your vector-enabled database in {astra_ui}. If your API Endpoint is `https://aad075g999-8ab4-4d81-aa7d-7f58dbed3ead-us-east-2.apps.astra.datastax.com`, then your `ASTRA_DB_ID` is `aad075g999-8ab4-4d81-aa7d-7f58dbed3ead`.
====
+
. Install the following dependencies:
+
[source,python]
@@ -52,12 +47,11 @@ See the https://docs.datastax.com/en/ragstack/docs/prerequisites.html[Prerequisi
[source,python]
----
import os
import cassio
from dotenv import load_dotenv
from langchain.chains.qa_with_sources.retrieval import RetrievalQAWithSourcesChain
from langchain_openai import OpenAI, OpenAIEmbeddings
from langchain.indexes.vectorstore import VectorStoreIndexWrapper
from langchain_community.vectorstores import Cassandra
from langchain_astradb import AstraDBVectorStore

load_dotenv()
----
@@ -74,19 +68,14 @@ myEmbedding = OpenAIEmbeddings()
+
[source,python]
----
cassio.init(
database_id=os.environ["ASTRA_DB_ID"],
token=os.environ["ASTRA_DB_APPLICATION_TOKEN"],
keyspace=os.environ.get("ASTRA_DB_KEYSPACE"), # this is optional
)

myCassandraVStore = Cassandra(
myAstraDBVStore = AstraDBVectorStore(
embedding=myEmbedding,
session=None,
keyspace=None,
table_name='vs_test2',
api_endpoint=os.environ["ASTRA_DB_API_ENDPOINT"],
token=os.environ["ASTRA_DB_APPLICATION_TOKEN"],
namespace=os.environ.get("ASTRA_DB_KEYSPACE"), # this is optional
collection_name="mmr_test",
)
index = VectorStoreIndexWrapper(vectorstore=myCassandraVStore)
index = VectorStoreIndexWrapper(vectorstore=myAstraDBVStore)
----
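
Read end to end, the initialization added by this change amounts to the following — a consolidated sketch assembled from the added lines in the hunk above, with the imports repeated so it stands on its own:

[source,python]
----
import os

from dotenv import load_dotenv
from langchain_astradb import AstraDBVectorStore
from langchain_openai import OpenAI, OpenAIEmbeddings
from langchain.indexes.vectorstore import VectorStoreIndexWrapper

load_dotenv()

llm = OpenAI(temperature=0)
myEmbedding = OpenAIEmbeddings()

# The Data API client needs only an endpoint and a token; the
# keyspace (passed as `namespace`) is optional.
myAstraDBVStore = AstraDBVectorStore(
    embedding=myEmbedding,
    api_endpoint=os.environ["ASTRA_DB_API_ENDPOINT"],
    token=os.environ["ASTRA_DB_APPLICATION_TOKEN"],
    namespace=os.environ.get("ASTRA_DB_KEYSPACE"),  # optional
    collection_name="mmr_test",
)
index = VectorStoreIndexWrapper(vectorstore=myAstraDBVStore)
----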

== Populate the vector store
@@ -98,20 +87,20 @@ Note that the last sentence's content is considerably different from the others.
----
# declare data

BASE_SENTENCE_0 = ('The frogs and the toads were meeting in the night '
'for a party under the moon.')
BASE_SENTENCE_0 = ("The frogs and the toads were meeting in the night "
"for a party under the moon.")

BASE_SENTENCE_1 = ('There was a party under the moon, that all toads, '
'with the frogs, decided to throw that night.')
BASE_SENTENCE_1 = ("There was a party under the moon, that all toads, "
"with the frogs, decided to throw that night.")

BASE_SENTENCE_2 = ('And the frogs and the toads said: "Let us have a party '
'tonight, as the moon is shining".')
BASE_SENTENCE_2 = ("And the frogs and the toads said: \"Let us have a party "
"tonight, as the moon is shining\".")

BASE_SENTENCE_3 = ('I remember that night... toads, along with frogs, '
'were all busy planning a moonlit celebration.')
BASE_SENTENCE_3 = ("I remember that night... toads, along with frogs, "
"were all busy planning a moonlit celebration.")

DIFFERENT_SENTENCE = ('For the party, frogs and toads set a rule: '
'everyone was to wear a purple hat.')
DIFFERENT_SENTENCE = ("For the party, frogs and toads set a rule: "
"everyone was to wear a purple hat.")

# insert into index
texts = [
@@ -122,23 +111,23 @@ texts = [
DIFFERENT_SENTENCE,
]
metadatas = [
{'source': 'Barney\'s story at the pub'},
{'source': 'Barney\'s story at the pub'},
{'source': 'Barney\'s story at the pub'},
{'source': 'Barney\'s story at the pub'},
{'source': 'The chronicles at the village library'},
{"source": "Barney's story at the pub"},
{"source": "Barney's story at the pub"},
{"source": "Barney's story at the pub"},
{"source": "Barney's story at the pub"},
{"source": "The chronicles at the village library"},
]
----
+
. Load the sentences into the vector store and print their IDs.
+
[source,python]
----
ids = myCassandraVStore.add_texts(
ids = myAstraDBVStore.add_texts(
texts,
metadatas=metadatas,
)
print('\n'.join(ids))
print("\n".join(ids))
----
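
Before building any chains, a direct query is a quick way to confirm the five sentences actually landed in the collection — a small sketch using the store's built-in similarity search (the query string is arbitrary):

[source,python]
----
# Sanity check: pull the closest matches straight from the vector store.
hits = myAstraDBVStore.similarity_search("a party under the moon", k=3)
for doc in hits:
    print(doc.metadata["source"], "->", doc.page_content[:60])
----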

== Create and compare retrievers
@@ -152,17 +141,17 @@ Ask them a question, and see how the MMR response differs from the similarity re
+
[source,python]
----
QUESTION = 'Tell me about the party that night.'
QUESTION = "Tell me about the party that night."
----
+
. Create a retriever with similarity search.
+
[source,python]
----
retrieverSim = myCassandraVStore.as_retriever(
search_type='similarity',
retrieverSim = myAstraDBVStore.as_retriever(
search_type="similarity",
search_kwargs={
'k': 2,
"k": 2,
},
)

@@ -172,19 +161,19 @@ chainSimSrc = RetrievalQAWithSourcesChain.from_chain_type(
)

responseSimSrc = chainSimSrc.invoke({chainSimSrc.question_key: QUESTION})
print('Similarity-based chain:')
print(f' ANSWER : {responseSimSrc["answer"].strip()}')
print(f' SOURCES: {responseSimSrc["sources"].strip()}')
print("Similarity-based chain:")
print(f" ANSWER : {responseSimSrc['answer'].strip()}")
print(f" SOURCES: {responseSimSrc['sources'].strip()}")
----
+
. Create a retriever with MMR search.
+
[source,python]
----
retrieverMMR = myCassandraVStore.as_retriever(
search_type='mmr',
retrieverMMR = myAstraDBVStore.as_retriever(
search_type="mmr",
search_kwargs={
'k': 2,
"k": 2,
},
)

@@ -194,9 +183,9 @@ chainMMRSrc = RetrievalQAWithSourcesChain.from_chain_type(
)

responseMMRSrc = chainMMRSrc.invoke({chainMMRSrc.question_key: QUESTION})
print('MMR-based chain:')
print(f' ANSWER : {responseMMRSrc["answer"].strip()}')
print(f' SOURCES: {responseMMRSrc["sources"].strip()}')
print("MMR-based chain:")
print(f" ANSWER : {responseMMRSrc['answer'].strip()}")
print(f" SOURCES: {responseMMRSrc['sources'].strip()}")
----
+
. Run the code and observe the differences in the responses.
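
To see where the two retrievers diverge, it also helps to compare the raw document sets each search strategy returns, with no chain on top — a sketch that calls the vector store's search methods directly (`fetch_k` sets the size of the candidate pool that MMR re-ranks; the value here is arbitrary):

[source,python]
----
# Plain similarity: the top-k closest matches, which for this data set
# are near-duplicates of one another.
sim_docs = myAstraDBVStore.similarity_search(QUESTION, k=2)

# MMR: re-ranks a candidate pool to balance relevance against diversity,
# so the second slot tends to go to a less redundant document.
mmr_docs = myAstraDBVStore.max_marginal_relevance_search(QUESTION, k=2, fetch_k=10)

print("Similarity picks:")
for doc in sim_docs:
    print(" -", doc.page_content)

print("MMR picks:")
for doc in mmr_docs:
    print(" -", doc.page_content)
----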
85 changes: 40 additions & 45 deletions docs/modules/examples/partials/mmr-example.adoc
@@ -4,50 +4,45 @@
[source,python]
----
import os
import cassio
from dotenv import load_dotenv
from langchain.chains.qa_with_sources.retrieval import RetrievalQAWithSourcesChain
from langchain_openai import OpenAI, OpenAIEmbeddings
from langchain.indexes.vectorstore import VectorStoreIndexWrapper
from langchain_community.vectorstores import Cassandra
from langchain_astradb import AstraDBVectorStore

# Load environment variables
load_dotenv()

# Initialize OpenAI and embeddings
# Initialize the OpenAI model and embeddings.
llm = OpenAI(temperature=0)
myEmbedding = OpenAIEmbeddings()

cassio.init(
database_id=os.environ["ASTRA_DB_ID"],
token=os.environ["ASTRA_DB_APPLICATION_TOKEN"],
keyspace=os.environ.get("ASTRA_DB_KEYSPACE"), # this is optional
)

myCassandraVStore = Cassandra(
# Initialize the vector store.
myAstraDBVStore = AstraDBVectorStore(
embedding=myEmbedding,
session=None,
keyspace=None,
table_name='vs_test2',
api_endpoint=os.environ["ASTRA_DB_API_ENDPOINT"],
token=os.environ["ASTRA_DB_APPLICATION_TOKEN"],
namespace=os.environ.get("ASTRA_DB_KEYSPACE"), # this is optional
collection_name="mmr_test",
)
index = VectorStoreIndexWrapper(vectorstore=myCassandraVStore)
index = VectorStoreIndexWrapper(vectorstore=myAstraDBVStore)

# declare data

BASE_SENTENCE_0 = ('The frogs and the toads were meeting in the night '
'for a party under the moon.')
BASE_SENTENCE_0 = ("The frogs and the toads were meeting in the night "
"for a party under the moon.")

BASE_SENTENCE_1 = ('There was a party under the moon, that all toads, '
'with the frogs, decided to throw that night.')
BASE_SENTENCE_1 = ("There was a party under the moon, that all toads, "
"with the frogs, decided to throw that night.")

BASE_SENTENCE_2 = ('And the frogs and the toads said: "Let us have a party '
'tonight, as the moon is shining".')
BASE_SENTENCE_2 = ("And the frogs and the toads said: \"Let us have a party "
"tonight, as the moon is shining\".")

BASE_SENTENCE_3 = ('I remember that night... toads, along with frogs, '
'were all busy planning a moonlit celebration.')
BASE_SENTENCE_3 = ("I remember that night... toads, along with frogs, "
"were all busy planning a moonlit celebration.")

DIFFERENT_SENTENCE = ('For the party, frogs and toads set a rule: '
'everyone was to wear a purple hat.')
DIFFERENT_SENTENCE = ("For the party, frogs and toads set a rule: "
"everyone was to wear a purple hat.")

# insert into index
texts = [
@@ -58,29 +53,29 @@ texts = [
DIFFERENT_SENTENCE,
]
metadatas = [
{'source': 'Barney\'s story at the pub'},
{'source': 'Barney\'s story at the pub'},
{'source': 'Barney\'s story at the pub'},
{'source': 'Barney\'s story at the pub'},
{'source': 'The chronicles at the village library'},
{"source": "Barney's story at the pub"},
{"source": "Barney's story at the pub"},
{"source": "Barney's story at the pub"},
{"source": "Barney's story at the pub"},
{"source": "The chronicles at the village library"},
]

# add texts to vector store and print first
ids = myCassandraVStore.add_texts(
# add texts to vector store and print IDs
ids = myAstraDBVStore.add_texts(
texts,
metadatas=metadatas,
)
print('\n'.join(ids))
print("\n".join(ids))

# query the index

QUESTION = 'Tell me about the party that night.'
QUESTION = "Tell me about the party that night."

# manual creation of the "retriever" with the 'similarity' search type
retrieverSim = myCassandraVStore.as_retriever(
search_type='similarity',
retrieverSim = myAstraDBVStore.as_retriever(
search_type="similarity",
search_kwargs={
'k': 2,
"k": 2,
},
)

@@ -91,18 +86,18 @@ chainSimSrc = RetrievalQAWithSourcesChain.from_chain_type(

# Run the chain and print results with sources
responseSimSrc = chainSimSrc.invoke({chainSimSrc.question_key: QUESTION})
print('Similarity-based chain:')
print(f' ANSWER : {responseSimSrc["answer"].strip()}')
print(f' SOURCES: {responseSimSrc["sources"].strip()}')
print("Similarity-based chain:")
print(f" ANSWER : {responseSimSrc['answer'].strip()}")
print(f" SOURCES: {responseSimSrc['sources'].strip()}")


# mmr search with sources

# manual creation of the "retriever" with the 'MMR' search type
retrieverMMR = myCassandraVStore.as_retriever(
search_type='mmr',
retrieverMMR = myAstraDBVStore.as_retriever(
search_type="mmr",
search_kwargs={
'k': 2,
"k": 2,
},
)

@@ -113,8 +108,8 @@ chainMMRSrc = RetrievalQAWithSourcesChain.from_chain_type(

# Run the chain and print results with sources
responseMMRSrc = chainMMRSrc.invoke({chainMMRSrc.question_key: QUESTION})
print('MMR-based chain:')
print(f' ANSWER : {responseMMRSrc["answer"].strip()}')
print(f' SOURCES: {responseMMRSrc["sources"].strip()}')
print("MMR-based chain:")
print(f" ANSWER : {responseMMRSrc['answer'].strip()}")
print(f" SOURCES: {responseMMRSrc['sources'].strip()}")
----
====
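
Because `add_texts` is called without explicit IDs, each run of the full script inserts the five sentences again, so the `mmr_test` collection accumulates duplicates over time. A small cleanup sketch that reuses the `ids` returned by `add_texts` earlier in the script:

[source,python]
----
# Remove only the documents inserted by this run; the collection itself
# stays in place for the next experiment.
myAstraDBVStore.delete(ids)
----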