Add ruff rules for bandit #555

Merged 2 commits on Jul 10, 2024
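
This PR enables the Bandit-derived security checks that Ruff ships as its flake8-bandit ("S") rule set; the edits below fix or suppress the findings those rules raise. The configuration change itself is not visible in this diff excerpt, but a typical pyproject.toml snippet for turning the rules on looks roughly like this (a sketch, not the exact change from this PR):

    # pyproject.toml (hypothetical sketch; section names per Ruff's documented config)
    [tool.ruff.lint]
    extend-select = ["S"]  # flake8-bandit security rules

    [tool.ruff.lint.per-file-ignores]
    # assert is expected in pytest code, so S101 is commonly ignored for tests
    "**/tests/**" = ["S101"]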
2 changes: 1 addition & 1 deletion docker/examples/basic/app.py
@@ -32,4 +32,4 @@ def hello():

# Execute the application
if __name__ == "__main__":
-app.run(port=8080, host="0.0.0.0")
+app.run(port=8080)
2 changes: 1 addition & 1 deletion docker/examples/multistage/app.py
@@ -32,4 +32,4 @@ def hello():

# Execute the application
if __name__ == "__main__":
-app.run(port=8080, host="0.0.0.0")
+app.run(port=8080)
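
Both app.py changes above address Bandit's hardcoded-bind-all-interfaces check (S104 in Ruff) by dropping the explicit 0.0.0.0 binding, so Flask falls back to listening on localhost. If a containerized example does need to listen on every interface, one option is to take the host from the environment rather than hardcoding it. A minimal sketch of that pattern — the APP_HOST variable is hypothetical and not part of these examples:

    import os

    from flask import Flask

    app = Flask(__name__)


    @app.route("/")
    def hello():
        return "Hello, world!"


    # Execute the application
    if __name__ == "__main__":
        # Default to localhost; opt in to 0.0.0.0 explicitly via the environment.
        app.run(port=8080, host=os.getenv("APP_HOST", "127.0.0.1"))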
4 changes: 2 additions & 2 deletions docs/modules/examples/pages/langchain-unstructured-astra.adoc
@@ -86,7 +86,7 @@ load_dotenv()
url = "https://raw.githubusercontent.com/datastax/ragstack-ai/48bc55e7dc4de6a8b79fcebcedd242dc1254dd63/examples/notebooks/resources/attention_pages_9_10.pdf"
file_path = "./attention_pages_9_10.pdf"

-response = requests.get(url)
+response = requests.get(url, timeout=30)
if response.status_code == 200:
with open(file_path, "wb") as file:
file.write(response.content)
@@ -264,7 +264,7 @@ load_dotenv()
url = "https://raw.githubusercontent.com/datastax/ragstack-ai/48bc55e7dc4de6a8b79fcebcedd242dc1254dd63/examples/notebooks/resources/attention_pages_9_10.pdf"
file_path = "./attention_pages_9_10.pdf"

-response = requests.get(url)
+response = requests.get(url, timeout=30)
if response.status_code == 200:
with open(file_path, "wb") as file:
file.write(response.content)
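
The repeated requests.get edits in these documentation pages satisfy Bandit's request-without-timeout check (S113 in Ruff): a request with no timeout can hang indefinitely. A small helper capturing the same pattern — a sketch rather than code from this repository, and it uses raise_for_status() instead of the status-code check shown in the docs:

    import requests


    def download(url: str, dest: str, timeout: float = 30.0) -> None:
        """Fetch a file over HTTP with an explicit timeout and write it to disk."""
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # fail loudly instead of silently saving an error page
        with open(dest, "wb") as file:
            file.write(response.content)


    download("https://arxiv.org/pdf/1706.03762.pdf", "./attention.pdf")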
14 changes: 10 additions & 4 deletions docs/modules/examples/pages/langchain_multimodal_gemini.adoc
@@ -47,7 +47,10 @@ Let's see if Gemini Pro Vision can identify a part to an espresso machine and te
----
import requests

-source_img_data = requests.get('https://drive.google.com/uc?export=view&id=15ddcn-AIxpvRdWcFGvIr77XLWdo4Maof').content
+source_img_data = requests.get(
+'https://drive.google.com/uc?export=view&id=15ddcn-AIxpvRdWcFGvIr77XLWdo4Maof',
+timeout=30,
+).content
with open('coffee_maker_part.png', 'wb') as handler:
handler.write(source_img_data)
----
@@ -162,7 +165,7 @@ for i in range(len(df)):

# Download this product's image and save it to the Colab filesystem.
# In a production system this binary data would be stored in Google Cloud Storage
-img_data = requests.get(image).content
+img_data = requests.get(image, timeout=30).content
with open(f'{name}.png', 'wb') as handler:
handler.write(img_data)

@@ -303,7 +306,7 @@ from langchain.schema.messages import HumanMessage
from vertexai.preview.vision_models import MultiModalEmbeddingModel, Image
from astrapy.db import AstraDB

-source_img_data = requests.get('https://drive.google.com/uc?export=view&id=15ddcn-AIxpvRdWcFGvIr77XLWdo4Maof').content
+source_img_data = requests.get(
+'https://drive.google.com/uc?export=view&id=15ddcn-AIxpvRdWcFGvIr77XLWdo4Maof',
+timeout=30,
+).content
with open('coffee_maker_part.png', 'wb') as handler:
handler.write(source_img_data)

@@ -343,7 +349,7 @@ for i in range(len(df)):

# Download this product's image and save it to your local filesystem.
# In a production system this binary data would be stored in Google Cloud Storage
-img_data = requests.get(image).content
+img_data = requests.get(image, timeout=30).content
with open(f'{name}.png', 'wb') as handler:
handler.write(img_data)

2 changes: 1 addition & 1 deletion docs/modules/examples/pages/llama-parse-astra.adoc
@@ -83,7 +83,7 @@ Settings.embed_model = OpenAIEmbedding(
url = "https://arxiv.org/pdf/1706.03762.pdf"
file_path = "./attention.pdf"

-response = requests.get(url)
+response = requests.get(url, timeout=30)
if response.status_code == 200:
with open(file_path, "wb") as file:
file.write(response.content)
2 changes: 1 addition & 1 deletion docs/modules/examples/partials/llama-parse.adoc
@@ -31,7 +31,7 @@ Settings.embed_model = OpenAIEmbedding(
# Download a PDF for indexing
url = "https://arxiv.org/pdf/1706.03762.pdf"
file_path = "./attention.pdf"
-response = requests.get(url)
+response = requests.get(url, timeout=30)
if response.status_code == 200:
with open(file_path, "wb") as file:
file.write(response.content)
2 changes: 1 addition & 1 deletion examples/evaluation/tru_dashboard.py
@@ -1,4 +1,4 @@
import tru_shared

tru = tru_shared.init_tru()
-tru.run_dashboard(address="0.0.0.0", port=8501, force=True)
+tru.run_dashboard(force=True)
7 changes: 5 additions & 2 deletions examples/notebooks/conftest.py
@@ -1,5 +1,6 @@
import logging
import os
+import tempfile
import time

from astrapy.db import AstraDB
@@ -16,9 +17,11 @@ def get_required_env(name) -> str:

# vertex-ai
if "GOOGLE_APPLICATION_CREDENTIALS" not in os.environ:
with open("/tmp/gcloud-account-key.json", "w") as f:
with tempfile.NamedTemporaryFile(
prefix="gcloud-account-key", suffix=".json", mode="w", delete=False
) as f:
f.write(os.getenv("GCLOUD_ACCOUNT_KEY_JSON", ""))
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/tmp/gcloud-account-key.json"
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = f.name

client = AstraDB(
token=get_required_env("ASTRA_DB_APPLICATION_TOKEN"),
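
Writing credentials to a fixed path such as /tmp/gcloud-account-key.json trips Bandit's hardcoded-temporary-file check (S108 in Ruff). NamedTemporaryFile with delete=False creates an unpredictably named file that survives the with block, so its path can still be handed to GOOGLE_APPLICATION_CREDENTIALS. The same idea in isolation — a sketch, not the exact conftest code:

    import os
    import tempfile

    # A uniquely named file instead of a predictable /tmp path.
    with tempfile.NamedTemporaryFile(
        prefix="gcloud-account-key", suffix=".json", mode="w", delete=False
    ) as f:
        f.write(os.getenv("GCLOUD_ACCOUNT_KEY_JSON", ""))

    # delete=False keeps the file on disk after the context manager exits.
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = f.name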
2 changes: 1 addition & 1 deletion examples/notebooks/langchain-unstructured-astra.ipynb
@@ -69,7 +69,7 @@
"import requests\n",
"\n",
"url = \"https://raw.githubusercontent.com/datastax/ragstack-ai/main/examples/notebooks/resources/attention_pages_9_10.pdf\"\n",
"response = requests.get(url)\n",
"response = requests.get(url, timeout=30)\n",
"with open(\"attention_pages_9_10.pdf\", \"wb\") as file:\n",
" file.write(response.content)"
]
12 changes: 8 additions & 4 deletions examples/notebooks/langchain_multimodal_gemini.ipynb
@@ -146,12 +146,15 @@
" auth.authenticate_user()\n",
"except ImportError:\n",
" import os\n",
" import tempfile\n",
"\n",
" if \"GOOGLE_APPLICATION_CREDENTIALS\" not in os.environ:\n",
" credentials = getpass(\"Enter Google JSON credentials file: \")\n",
" with open(\"/tmp/gcloud-account-key.json\", \"w\") as f:\n",
" with tempfile.NamedTemporaryFile(\n",
" prefix=\"gcloud-account-key\", suffix=\".json\", delete=False\n",
" ) as f:\n",
" f.write(credentials)\n",
" os.environ[\"GOOGLE_APPLICATION_CREDENTIALS\"] = \"/tmp/gcloud-account-key.json\""
" os.environ[\"GOOGLE_APPLICATION_CREDENTIALS\"] = f.name"
]
},
{
@@ -194,7 +197,8 @@
"import requests\n",
"\n",
"source_img_data = requests.get(\n",
" \"https://drive.google.com/uc?export=view&id=15ddcn-AIxpvRdWcFGvIr77XLWdo4Maof\"\n",
" \"https://drive.google.com/uc?export=view&id=15ddcn-AIxpvRdWcFGvIr77XLWdo4Maof\",\n",
" timeout=30,\n",
").content\n",
"with open(\"coffee_maker_part.png\", \"wb\") as handler:\n",
" handler.write(source_img_data)"
@@ -607,7 +611,7 @@
"\n",
" # Download this product's image and save it to the Colab filesystem.\n",
" # In a production system this binary data would be stored in Google Cloud Storage\n",
" img_data = requests.get(image).content\n",
" img_data = requests.get(image, timeout=30).content\n",
" with open(f\"{name}.png\", \"wb\") as handler:\n",
" handler.write(img_data)\n",
"\n",
2 changes: 1 addition & 1 deletion examples/notebooks/llama-parse-astra.ipynb
@@ -148,7 +148,7 @@
"file_path = \"./attention.pdf\"\n",
"\n",
"# Perform the HTTP request\n",
"response = requests.get(url)\n",
"response = requests.get(url, timeout=30)\n",
"\n",
"# Check if the request was successful\n",
"if response.status_code == 200:\n",
7 changes: 5 additions & 2 deletions libs/e2e-tests/e2e_tests/conftest.py
@@ -1,6 +1,7 @@
import logging
import os
import pathlib
+import tempfile
import time

import pytest
@@ -197,6 +198,8 @@ def _report_to_file(stats_str: str, filename: str, report_lines: list):

# vertex-ai
if "GOOGLE_APPLICATION_CREDENTIALS" not in os.environ:
with open("/tmp/gcloud-account-key.json", "w") as f:
with tempfile.NamedTemporaryFile(
prefix="gcloud-account-key", suffix=".json", mode="w", delete=False
) as f:
f.write(os.getenv("GCLOUD_ACCOUNT_KEY_JSON", ""))
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/tmp/gcloud-account-key.json"
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = f.name
4 changes: 2 additions & 2 deletions libs/e2e-tests/e2e_tests/langchain/test_astra.py
@@ -74,7 +74,7 @@ def test_wrong_connection_parameters(vectorstore: AstraDBVectorStore):
AstraDBVectorStore(
collection_name="something",
embedding=MockEmbeddings(),
token="xxxxx",
token="xxxxx", # noqa: S106
# we assume that post 1234 is not open locally
api_endpoint="https://locahost:1234",
)
@@ -90,7 +90,7 @@ def test_wrong_connection_parameters(vectorstore: AstraDBVectorStore):
AstraDBVectorStore(
collection_name="something",
embedding=MockEmbeddings(),
token="this-is-a-wrong-token",
token="this-is-a-wrong-token", # noqa: S106
api_endpoint=api_endpoint,
)
pytest.fail("Should have thrown exception")
2 changes: 1 addition & 1 deletion libs/e2e-tests/e2e_tests/langchain/test_cassandra_tool.py
@@ -39,7 +39,7 @@ def test_tool_with_openai_tool():
f"""
INSERT INTO default_keyspace.tool_table_users (user_id, user_name)
VALUES ({user_id}, 'my_user');
"""
""" # noqa: S608
)
db = CassandraDatabase()

4 changes: 2 additions & 2 deletions libs/e2e-tests/e2e_tests/llama_index/test_astra.py
@@ -97,7 +97,7 @@ def test_ingest_errors(environment: Environment):
def test_wrong_connection_parameters(environment: Environment):
try:
AstraDBVectorStore(
token="xxxxx",
token="xxxxx", # noqa: S106
# we assume that post 1234 is not open locally
api_endpoint="https://locahost:1234",
collection_name="something",
@@ -113,7 +113,7 @@ def test_wrong_connection_parameters(environment: Environment):
try:
print("api_endpoint:", api_endpoint)
AstraDBVectorStore(
token="this-is-a-wrong-token",
token="this-is-a-wrong-token", # noqa: S106
api_endpoint=api_endpoint,
collection_name="something",
embedding_dimension=1536,
@@ -36,7 +36,7 @@ def test_tool_with_openai_tool():
f"""
INSERT INTO default_keyspace.tool_table_users (user_id, user_name)
VALUES ({user_id}, 'my_user');
"""
""" # noqa: S608
)
db = CassandraDatabase()

21 changes: 17 additions & 4 deletions libs/knowledge-graph/ragstack_knowledge_graph/knowledge_graph.py
@@ -1,4 +1,5 @@
import json
+import re
from itertools import repeat
from typing import Any, Dict, Iterable, Optional, Sequence, Tuple, Union, cast

@@ -29,6 +30,9 @@ def _parse_node(row: Any) -> Node:
)


+_CQL_IDENTIFIER_PATTERN = re.compile(r"[a-zA-Z][a-zA-Z0-9_]*")


class CassandraKnowledgeGraph:
"""Cassandra Knowledge Graph.

@@ -56,6 +60,15 @@ def __init__(
session = check_resolve_session(session)
keyspace = check_resolve_keyspace(keyspace)

+if not _CQL_IDENTIFIER_PATTERN.fullmatch(keyspace):
+raise ValueError(f"Invalid keyspace: {keyspace}")
+
+if not _CQL_IDENTIFIER_PATTERN.fullmatch(node_table):
+raise ValueError(f"Invalid node table name: {node_table}")
+
+if not _CQL_IDENTIFIER_PATTERN.fullmatch(edge_table):
+raise ValueError(f"Invalid edge table name: {edge_table}")

self._text_embeddings = text_embeddings
self._text_embeddings_dim = (
# Embedding vectors must have dimension:
@@ -78,23 +91,23 @@ def __init__(
f"""INSERT INTO {keyspace}.{node_table} (
name, type, text_embedding, properties_json
) VALUES (?, ?, ?, ?)
"""
""" # noqa: S608
)

self._insert_relationship = self._session.prepare(
f"""
INSERT INTO {keyspace}.{edge_table} (
source_name, source_type, target_name, target_type, edge_type
) VALUES (?, ?, ?, ?, ?)
"""
""" # noqa: S608
)

self._query_relationship = self._session.prepare(
f"""
SELECT name, type, properties_json
FROM {keyspace}.{node_table}
WHERE name = ? AND type = ?
"""
""" # noqa: S608
)

self._query_nodes_by_embedding = self._session.prepare(
Expand All @@ -103,7 +116,7 @@ def __init__(
FROM {keyspace}.{node_table}
ORDER BY text_embedding ANN OF ?
LIMIT ?
"""
""" # noqa: S608
)

def _apply_schema(self) -> None:
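
The # noqa: S608 markers above suppress Ruff's hardcoded-SQL-expression check for the f-string-built CQL statements. The suppression is reasonable here because the interpolated keyspace and table names are now validated against _CQL_IDENTIFIER_PATTERN, and all values are bound through prepared-statement placeholders rather than interpolated. A minimal sketch of that pattern with the cassandra-driver, assuming a reachable cluster and an existing table (the keyspace and table names here are placeholders):

    import re

    from cassandra.cluster import Cluster

    _CQL_IDENTIFIER_PATTERN = re.compile(r"[a-zA-Z][a-zA-Z0-9_]*")

    keyspace, table = "default_keyspace", "nodes"
    for identifier in (keyspace, table):
        # Reject identifiers that could smuggle CQL syntax into the statement.
        if not _CQL_IDENTIFIER_PATTERN.fullmatch(identifier):
            raise ValueError(f"Invalid CQL identifier: {identifier}")

    session = Cluster().connect()
    # Identifiers are validated above; values go through bound parameters.
    insert = session.prepare(
        f"INSERT INTO {keyspace}.{table} (name, type) VALUES (?, ?)"  # noqa: S608
    )
    session.execute(insert, ("example-node", "document"))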
10 changes: 8 additions & 2 deletions libs/knowledge-graph/ragstack_knowledge_graph/runnables.py
@@ -39,8 +39,14 @@ def extract_entities(
`{format_instructions}` which describe how to produce the output.
"""
prompt = ChatPromptTemplate.from_messages([keyword_extraction_prompt])
assert "question" in prompt.input_variables
assert "format_instructions" in prompt.input_variables
if "question" not in prompt.input_variables:
raise ValueError(
"Missing 'question' placeholder in extraction prompt template."
)
if "format_instructions" not in prompt.input_variables:
raise ValueError(
"Missing 'format_instructions' placeholder in extraction prompt template."
)

class SimpleNode(BaseModel):
"""Represents a node in a graph with associated properties."""
1 change: 1 addition & 0 deletions libs/knowledge-store/notebooks/astra_support.ipynb
@@ -68,6 +68,7 @@
" \"User-Agent\": \"Mozilla/5.0 (X11; Linux x86_64; rv:58.0) Gecko/20100101 \"\n",
" \"Firefox/58.0\",\n",
" },\n",
" timeout=30,\n",
" )\n",
" xml = r.text\n",
"\n",
6 changes: 5 additions & 1 deletion libs/knowledge-store/ragstack_knowledge_store/_mmr_helper.py
@@ -127,7 +127,11 @@ def _pop_candidate(self, candidate_id: str) -> NDArray[np.float32]:
"""
# Get the embedding for the id.
index = self.candidate_id_to_index.pop(candidate_id)
-assert self.candidates[index].id == candidate_id
+if not self.candidates[index].id == candidate_id:
+raise ValueError(
+"ID in self.candidate_id_to_index doesn't match the ID of the "
+"corresponding index in self.candidates"
+)
embedding: NDArray[np.float32] = self.candidate_embeddings[index].copy()

# Swap that index with the last index in the candidates and