feat/primekg loader #49
Merged: +9,773 −26
Changes from all commits (25 commits)
2e23586  feat: add primekg loader tool (awmulyadi)
114d1f5  chpre: fix comments to load dataframes (awmulyadi)
e2f9298  fix: change file_id of dataverse (awmulyadi)
989d2c4  feat: add starkqa-primekg loader tool (awmulyadi)
1c01cea  chore: merge changes from main (awmulyadi)
95c2af0  feat: add tutorials of talk2knowledgegraphs (awmulyadi)
2bae5b3  fix: execute a notebook (awmulyadi)
2c82016  feat: include processed node information of StarkQA-PrimeKG (awmulyadi)
d931aab  fix: update the approach to load PrimeKG (awmulyadi)
319beca  fix: reorganize the dataset loaders (awmulyadi)
27deb4e  feat: add biobridge-primekg loader (awmulyadi)
3e23007  chore: merge changes from main (awmulyadi)
29df106  feat: update mkdocs for talk2knowledgegraphs (awmulyadi)
783bc33  feat: update gha workflows (awmulyadi)
542b38d  fix: modify workflows (awmulyadi)
4b2dd9b  fix: remove argument for negative sampling (awmulyadi)
68a434b  fix: remove check-path workflows (awmulyadi)
ac06b05  fix: modify coverage (awmulyadi)
9e15375  fix: modify coverage over multi-os (awmulyadi)
40cce60  feat: add embeddings for starkqa, node info for biobridge (awmulyadi)
f474202  chore: merge changes from main (awmulyadi)
9bf8730  fix: change torch version (awmulyadi)
ad48b53  fix: update workflow for installing reqs (awmulyadi)
d7e6eb9  fix: update biobridge (awmulyadi)
632b625  fix: remove bash on workflows (awmulyadi)
New file (124 lines): the GitHub Actions workflow "TESTS Talk2BioModels":

# This is a basic workflow to help you get started with GitHub Actions
name: TESTS Talk2BioModels

# Controls when the workflow will run
on:
  # Triggers the workflow on push or pull request events
  pull_request:
    branches: [ main ]
    paths:
      - 'aiagents4pharma/talk2biomodels/**'

  # Allows you to run this workflow manually from the Actions tab
  workflow_dispatch:

env:
  OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}

# This workflow contains jobs covering linting and code coverage (along with testing).
jobs:
  pylint-windows-ubuntu-macos:
    # The type of runner that the job will run on
    name: pylint-windows-ubuntu-macos
    runs-on: ${{ matrix.os }}

    strategy:
      matrix:
        os: [windows-latest, ubuntu-latest, macos-latest]

    # Steps represent a sequence of tasks that will be executed as part of the job
    steps:
      # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
      - uses: actions/checkout@v3

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: 3.12

      # install requirements
      - name: Install the requirements
        run: |
          pip3 install --break-system-packages -r requirements.txt

      # pylint
      - name: Run pylint
        run: |
          pylint --disable=R0801,R0902,W0221,W0122 aiagents4pharma/talk2biomodels

  # code coverage job for ubuntu and macos
  code-cov-ubuntu-macos:
    name: code-coverage-ubuntu-macos
    runs-on: ${{ matrix.os }}

    strategy:
      matrix:
        os: [ubuntu-latest, macos-latest]

    steps:
      - name: Checkout code
        uses: actions/checkout@v3

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: 3.12

      - name: Install dependencies
        run: pip3 install -r requirements.txt # Adjust this according to your project

      - name: Run tests with coverage
        run: coverage run --include=aiagents4pharma/talk2biomodels/* -m pytest --cache-clear aiagents4pharma/talk2biomodels/tests/

      - name: Check coverage
        run: |
          coverage report -m
          TOTAL_COVERAGE=$(coverage report -m | awk 'END {print int($NF)}')
          if [[ $TOTAL_COVERAGE -ne 100 ]]; then
            echo "Code coverage is not 100%. Please check the coverage report."
            exit 1
          fi
        env:
          COVERAGE_FILE: './.coverage'

  # code coverage job for windows
  code-cov-windows:
    name: code-coverage-windows
    runs-on: ${{ matrix.os }}

    strategy:
      matrix:
        os: [windows-latest]

    steps:
      - name: Checkout code
        uses: actions/checkout@v3

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: 3.x

      - name: Install dependencies
        run: pip3 install -r requirements.txt # Adjust this according to your project

      - name: Run tests with coverage
        run: coverage run --include=aiagents4pharma/talk2biomodels/* -m pytest --cache-clear aiagents4pharma/talk2biomodels/tests/

      - name: Check coverage
        run: |
          coverage report -m
          # Extract the last line of the coverage report
          $TOTAL_COVERAGE=(& coverage report -m | Select-Object -Last 1)
          # split and extract the last element
          $TOTAL_COVERAGE=($TOTAL_COVERAGE -split " ")[-1]
          # remove non-numeric characters
          $TOTAL_COVERAGE=($TOTAL_COVERAGE -replace "[^\d]")
          # convert to int
          $TOTAL_COVERAGE=[int]$TOTAL_COVERAGE
          echo "Total coverage: $TOTAL_COVERAGE"
          if ($TOTAL_COVERAGE -ne 100) {
            Write-Host "Code coverage is not 100%. Please check the coverage report."
            exit 1
          }
        env:
          COVERAGE_FILE: './.coverage'
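For contributors who want to reproduce the CI coverage gate locally, here is a hedged sketch using coverage.py's Python API instead of the awk/PowerShell parsing above; it assumes the tests have already been run under coverage run as in the workflow:

# Sketch: reproduce the workflow's 100% coverage gate locally.
# Assumes `coverage run ... -m pytest ...` has already produced ./.coverage.
import sys
from coverage import Coverage

cov = Coverage(data_file=".coverage")
cov.load()
total = cov.report()  # prints the report and returns the total percentage as a float
if int(total) != 100:
    print("Code coverage is not 100%. Please check the coverage report.")
    sys.exit(1)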
New file (124 lines): the GitHub Actions workflow "TESTS Talk2KnowledgeGraphs". It is identical to the TESTS Talk2BioModels workflow, except that every occurrence of talk2biomodels is replaced by talk2knowledgegraphs: the name field, the pull_request path filter ('aiagents4pharma/talk2knowledgegraphs/**'), the pylint target, and the coverage --include and pytest test paths.
aiagents4pharma/__init__.py (updated): the docstring now covers all aiagents4pharma modules (it previously mentioned only talk2biomodels), and talk2knowledgegraphs joins the imports:

'''
This file is used to import aiagents4pharma modules.
'''

from . import talk2biomodels
from . import talk2cells
from . import talk2knowledgegraphs
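For orientation, a minimal sketch of how these modules are imported once the package is installed; the import paths are taken from the __init__ files in this diff:

# Sketch: the agent modules re-exported by the top-level __init__.
from aiagents4pharma import talk2biomodels, talk2cells, talk2knowledgegraphs

# The new dataset loaders added in this PR live one level down.
from aiagents4pharma.talk2knowledgegraphs import datasets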
aiagents4pharma/talk2knowledgegraphs/__init__.py (new file, 4 lines):

'''
This file is used to import the datasets, utils, and tools.
'''
from . import datasets
aiagents4pharma/talk2knowledgegraphs/datasets/__init__.py (new file, 7 lines):

'''
This file is used to import all the datasets in the package.
'''
from . import dataset
from . import primekg
from . import starkqa_primekg
from . import biobridge_primekg
aiagents4pharma/talk2knowledgegraphs/datasets/biobridge_primekg.py (new file, 553 additions; large diff not rendered).
aiagents4pharma/talk2knowledgegraphs/datasets/dataset.py (new file, 23 lines):

#!/usr/bin/env python3

"""
Abstract class for dataset.
"""

from abc import ABC, abstractmethod

class Dataset(ABC):
    """
    Abstract class for dataset.
    """
    @abstractmethod
    def setup(self):
        """
        A method to set up the dataset.
        """

    @abstractmethod
    def load_data(self):
        """
        A method to load the dataset and potentially preprocess it.
        """
aiagents4pharma/talk2knowledgegraphs/datasets/primekg.py (new file, 201 lines):

"""
Class for loading PrimeKG dataset.
"""

import os
import requests
from tqdm import tqdm
import pandas as pd
from .dataset import Dataset

class PrimeKG(Dataset):
    """
    Class for loading PrimeKG dataset.
    It downloads the data from the Harvard Dataverse and stores it in the local directory.
    The data is then loaded into pandas DataFrame of nodes and edges.
    """

    def __init__(self, local_dir: str = "../../../data/primekg/"):
        """
        Constructor for PrimeKG class.

        Args:
            local_dir (str): The local directory where the data will be stored.
        """
        self.name: str = "primekg"
        self.server_path: str = "https://dataverse.harvard.edu/api/access/datafile/"
        self.file_ids: dict = {"nodes": 6180617, "edges": 6180616}
        self.local_dir: str = local_dir

        # Attributes to store the data
        self.nodes: pd.DataFrame = None
        self.edges: pd.DataFrame = None

        # Set up the dataset
        self.setup()

    def setup(self):
        """
        A method to set up the dataset.
        """
        # Make the directory if it doesn't exist
        os.makedirs(os.path.dirname(self.local_dir), exist_ok=True)

    def _download_file(self, remote_url: str, local_path: str):
        """
        A helper function to download a file from remote URL to the local directory.

        Args:
            remote_url (str): The remote URL of the file to be downloaded.
            local_path (str): The local path where the file will be saved.
        """
        response = requests.get(remote_url, stream=True, timeout=300)
        response.raise_for_status()
        progress_bar = tqdm(
            total=int(response.headers.get("content-length", 0)),
            unit="iB",
            unit_scale=True,
        )
        with open(local_path, "wb") as file:
            for data in response.iter_content(1024):
                progress_bar.update(len(data))
                file.write(data)
        progress_bar.close()

    def _load_nodes(self) -> pd.DataFrame:
        """
        Private method to load the nodes dataframe of PrimeKG dataset.
        This method downloads the nodes file from the Harvard Dataverse if it does not exist
        in the local directory. Otherwise, it loads the data from the local directory.
        It further processes the dataframe of nodes and returns it.

        Returns:
            pd.DataFrame: The nodes dataframe of PrimeKG dataset.
        """
        local_file = os.path.join(self.local_dir, f"{self.name}_nodes.tsv.gz")
        if os.path.exists(local_file):
            print(f"{local_file} already exists. Loading the data from the local directory.")

            # Load the dataframe from the local directory and assign it to the nodes attribute
            nodes = pd.read_csv(local_file, sep="\t", compression="gzip", low_memory=False)
        else:
            print(f"Downloading node file from {self.server_path}{self.file_ids['nodes']}")

            # Download the file from the Harvard Dataverse with designated file_id for node
            self._download_file(f"{self.server_path}{self.file_ids['nodes']}",
                                os.path.join(self.local_dir, "nodes.tab"))

            # Load the downloaded file into a pandas DataFrame
            nodes = pd.read_csv(os.path.join(self.local_dir, "nodes.tab"),
                                sep="\t", low_memory=False)

            # Further processing of the dataframe
            nodes = nodes[
                ["node_index", "node_name", "node_source", "node_id", "node_type"]
            ]

            # Store compressed dataframe in the local directory
            nodes.to_csv(local_file, index=False, sep="\t", compression="gzip")

        return nodes

    def _load_edges(self, nodes: pd.DataFrame) -> pd.DataFrame:
        """
        Private method to load the edges dataframe of PrimeKG dataset.
        This method downloads the edges file from the Harvard Dataverse if it does not exist
        in the local directory. Otherwise, it loads the data from the local directory.
        It further processes the dataframe of edges and returns it.

        Args:
            nodes (pd.DataFrame): The nodes dataframe of PrimeKG dataset.

        Returns:
            pd.DataFrame: The edges dataframe of PrimeKG dataset.
        """
        local_file = os.path.join(self.local_dir, f"{self.name}_edges.tsv.gz")
        if os.path.exists(local_file):
            print(f"{local_file} already exists. Loading the data from the local directory.")

            # Load the dataframe from the local directory and assign it to the edges attribute
            edges = pd.read_csv(local_file, sep="\t", compression="gzip", low_memory=False)
        else:
            print(f"Downloading edge file from {self.server_path}{self.file_ids['edges']}")

            # Download the file from the Harvard Dataverse with designated file_id for edge
            self._download_file(f"{self.server_path}{self.file_ids['edges']}",
                                os.path.join(self.local_dir, "edges.csv"))

            # Load the downloaded file into a pandas DataFrame
            edges = pd.read_csv(os.path.join(self.local_dir, "edges.csv"),
                                sep=",", low_memory=False)

            # Further processing of the dataframe
            edges = edges.merge(
                nodes, left_on="x_index", right_on="node_index"
            )
            edges.drop(["x_index"], axis=1, inplace=True)
            edges.rename(
                columns={
                    "node_index": "head_index",
                    "node_name": "head_name",
                    "node_source": "head_source",
                    "node_id": "head_id",
                    "node_type": "head_type",
                },
                inplace=True,
            )
            edges = edges.merge(
                nodes, left_on="y_index", right_on="node_index"
            )
            edges.drop(["y_index"], axis=1, inplace=True)
            edges.rename(
                columns={
                    "node_index": "tail_index",
                    "node_name": "tail_name",
                    "node_source": "tail_source",
                    "node_id": "tail_id",
                    "node_type": "tail_type"
                },
                inplace=True,
            )
            edges = edges[
                [
                    "head_index", "head_name", "head_source", "head_id", "head_type",
                    "tail_index", "tail_name", "tail_source", "tail_id", "tail_type",
                    "display_relation", "relation",
                ]
            ]

            # Store compressed dataframe in the local directory
            edges.to_csv(local_file, index=False, sep="\t", compression="gzip")

        return edges

    def load_data(self):
        """
        Load the PrimeKG dataset into pandas DataFrame of nodes and edges.
        """
        print("Loading nodes of PrimeKG dataset ...")
        self.nodes = self._load_nodes()

        print("Loading edges of PrimeKG dataset ...")
        self.edges = self._load_edges(self.nodes)

    def get_nodes(self) -> pd.DataFrame:
        """
        Get the nodes dataframe of PrimeKG dataset.

        Returns:
            pd.DataFrame: The nodes dataframe of PrimeKG dataset.
        """
        return self.nodes

    def get_edges(self) -> pd.DataFrame:
        """
        Get the edges dataframe of PrimeKG dataset.

        Returns:
            pd.DataFrame: The edges dataframe of PrimeKG dataset.
        """
        return self.edges
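Using the class above is a two-step affair: construct it (which creates the local directory) and call load_data() (which downloads on first use and reads the cached TSVs afterwards). A minimal sketch, using only the public API shown above:

from aiagents4pharma.talk2knowledgegraphs.datasets.primekg import PrimeKG

# First call downloads nodes/edges from the Harvard Dataverse; subsequent calls
# load the cached primekg_nodes.tsv.gz / primekg_edges.tsv.gz instead.
primekg = PrimeKG(local_dir="../data/primekg/")
primekg.load_data()

nodes = primekg.get_nodes()  # node_index, node_name, node_source, node_id, node_type
edges = primekg.get_edges()  # head_*/tail_* columns plus display_relation and relation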
aiagents4pharma/talk2knowledgegraphs/datasets/starkqa_primekg.py (new file, 198 lines):

"""
Class for loading StarkQAPrimeKG dataset.
"""

import os
import shutil
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch
from huggingface_hub import hf_hub_download, list_repo_files
import gdown
from .dataset import Dataset

class StarkQAPrimeKG(Dataset):
    """
    Class for loading StarkQAPrimeKG dataset.
    It downloads the data from the HuggingFace repo and stores it in the local directory.
    The data is then loaded into pandas DataFrame of QA pairs, dictionary of split indices,
    and node information.
    """

    def __init__(self, local_dir: str = "../../../data/starkqa_primekg/"):
        """
        Constructor for StarkQAPrimeKG class.

        Args:
            local_dir (str): The local directory to store the dataset files.
        """
        self.name: str = "starkqa_primekg"
        self.hf_repo_id: str = "snap-stanford/stark"
        self.local_dir: str = local_dir
        # Attributes to store the data
        self.starkqa: pd.DataFrame = None
        self.starkqa_split_idx: dict = None
        self.starkqa_node_info: dict = None
        self.query_emb_dict: dict = None
        self.node_emb_dict: dict = None

        # Set up the dataset
        self.setup()

    def setup(self):
        """
        A method to set up the dataset.
        """
        # Make the directory if it doesn't exist
        os.makedirs(os.path.dirname(self.local_dir), exist_ok=True)

    def _load_stark_repo(self) -> tuple:
        """
        Private method to load related files of StarkQAPrimeKG dataset.

        Returns:
            tuple: The QA dataframe, the split indices, and the node information.
        """
        # Download the file if it does not exist in the local directory
        # Otherwise, load the data from the local directory
        local_file = os.path.join(self.local_dir, "qa/prime/stark_qa/stark_qa.csv")
        if os.path.exists(local_file):
            print(f"{local_file} already exists. Loading the data from the local directory.")
        else:
            print(f"Downloading files from {self.hf_repo_id}")

            # List all related files in the HuggingFace Hub repository
            files = list_repo_files(self.hf_repo_id, repo_type="dataset")
            files = [f for f in files if ((f.startswith("qa/prime/") or
                                           f.startswith("skb/prime/")) and f.find("raw") == -1)]

            # Download and save each file in the specified folder
            for file in tqdm(files):
                _ = hf_hub_download(self.hf_repo_id,
                                    file,
                                    repo_type="dataset",
                                    local_dir=self.local_dir)

            # Unzip the processed files
            shutil.unpack_archive(
                os.path.join(self.local_dir, "skb/prime/processed.zip"),
                os.path.join(self.local_dir, "skb/prime/")
            )

        # Load StarkQA dataframe
        starkqa = pd.read_csv(
            os.path.join(self.local_dir, "qa/prime/stark_qa/stark_qa.csv"),
            low_memory=False)

        # Read split indices
        qa_indices = sorted(starkqa['id'].tolist())
        starkqa_split_idx = {}
        for split in ['train', 'val', 'test', 'test-0.1']:
            indices_file = os.path.join(self.local_dir, "qa/prime/split", f'{split}.index')
            with open(indices_file, 'r', encoding='utf-8') as f:
                indices = f.read().strip().split('\n')
            query_ids = [int(idx) for idx in indices]
            starkqa_split_idx[split] = np.array(
                [qa_indices.index(query_id) for query_id in query_ids]
            )

        # Load the node info of PrimeKG preprocessed for StarkQA
        with open(os.path.join(self.local_dir, 'skb/prime/processed/node_info.pkl'), 'rb') as f:
            starkqa_node_info = pickle.load(f)

        return starkqa, starkqa_split_idx, starkqa_node_info

    def _load_stark_embeddings(self) -> tuple:
        """
        Private method to load the embeddings of StarkQAPrimeKG dataset.

        Returns:
            tuple: A tuple of query and node embeddings dictionaries.
        """
        # Load the provided embeddings of query and nodes
        # Note that they utilized 'text-embedding-ada-002' for embeddings
        emb_model = 'text-embedding-ada-002'
        query_emb_url = 'https://drive.google.com/uc?id=1MshwJttPZsHEM2cKA5T13SIrsLeBEdyU'
        node_emb_url = 'https://drive.google.com/uc?id=16EJvCMbgkVrQ0BuIBvLBp-BYPaye-Edy'

        # Prepare respective directories to store the embeddings
        emb_dir = os.path.join(self.local_dir, emb_model)
        query_emb_dir = os.path.join(emb_dir, "query")
        node_emb_dir = os.path.join(emb_dir, "doc")
        os.makedirs(query_emb_dir, exist_ok=True)
        os.makedirs(node_emb_dir, exist_ok=True)
        query_emb_path = os.path.join(query_emb_dir, "query_emb_dict.pt")
        node_emb_path = os.path.join(node_emb_dir, "candidate_emb_dict.pt")

        # Download the embeddings if they do not exist in the local directory
        if not os.path.exists(query_emb_path) or not os.path.exists(node_emb_path):
            # Download the query embeddings
            gdown.download(query_emb_url, query_emb_path, quiet=False)

            # Download the node embeddings
            gdown.download(node_emb_url, node_emb_path, quiet=False)

        # Load the embeddings
        query_emb_dict = torch.load(query_emb_path)
        node_emb_dict = torch.load(node_emb_path)

        return query_emb_dict, node_emb_dict

    def load_data(self):
        """
        Load the StarkQAPrimeKG dataset into pandas DataFrame of QA pairs,
        dictionary of split indices, and node information.
        """
        print("Loading StarkQAPrimeKG dataset...")
        self.starkqa, self.starkqa_split_idx, self.starkqa_node_info = self._load_stark_repo()

        print("Loading StarkQAPrimeKG embeddings...")
        self.query_emb_dict, self.node_emb_dict = self._load_stark_embeddings()

    def get_starkqa(self) -> pd.DataFrame:
        """
        Get the dataframe of StarkQAPrimeKG dataset, containing the QA pairs.

        Returns:
            pd.DataFrame: The QA pairs dataframe of StarkQAPrimeKG dataset.
        """
        return self.starkqa

    def get_starkqa_split_indicies(self) -> dict:
        """
        Get the split indices of StarkQAPrimeKG dataset.

        Returns:
            dict: The split indices of StarkQAPrimeKG dataset.
        """
        return self.starkqa_split_idx

    def get_starkqa_node_info(self) -> dict:
        """
        Get the node information of StarkQAPrimeKG dataset.

        Returns:
            dict: The node information of StarkQAPrimeKG dataset.
        """
        return self.starkqa_node_info

    def get_query_embeddings(self) -> dict:
        """
        Get the query embeddings of StarkQAPrimeKG dataset.

        Returns:
            dict: The query embeddings of StarkQAPrimeKG dataset.
        """
        return self.query_emb_dict

    def get_node_embeddings(self) -> dict:
        """
        Get the node embeddings of StarkQAPrimeKG dataset.

        Returns:
            dict: The node embeddings of StarkQAPrimeKG dataset.
        """
        return self.node_emb_dict
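The StarkQA loader follows the same construct-then-load pattern; a short sketch of the accessors defined above, with the quoted sizes taken from the tests later in this PR:

from aiagents4pharma.talk2knowledgegraphs.datasets.starkqa_primekg import StarkQAPrimeKG

# Downloads the QA files from the HuggingFace Hub and the
# text-embedding-ada-002 embeddings from Google Drive on first use.
starkqa = StarkQAPrimeKG(local_dir="../data/starkqa_primekg/")
starkqa.load_data()

qa_df = starkqa.get_starkqa()                     # QA pairs (11,204 rows per the tests)
split_idx = starkqa.get_starkqa_split_indicies()  # 'train', 'val', 'test', 'test-0.1'
node_info = starkqa.get_starkqa_node_info()
query_emb = starkqa.get_query_embeddings()        # 1536-dim ada-002 vectors
node_emb = starkqa.get_node_embeddings()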
New packaging file (43 lines) defining the aiagents4pharma-talk2knowledgegraphs distribution:

[build-system]
requires = ["setuptools", "wheel"]
build-backend = "setuptools.build_meta"

[project]
name = "aiagents4pharma-talk2knowledgegraphs"
description = "AI Agents for drug discovery, drug development, and other pharmaceutical R&D"
readme = "README.md"
requires-python = ">=3.10"
classifiers = [
    "Programming Language :: Python :: 3",
    "License :: OSI Approved :: MIT License",
    "Operating System :: OS Independent",
]
dependencies = [
    "coverage==7.6.4",
    "gdown==5.2.0",
    "huggingface_hub==0.26.5",
    "joblib==1.4.2",
    "pandas==2.2.3",
    "pydantic==2.9.2",
    "pylint==3.3.1",
    "pytest==8.3.3",
    "torch==2.5.1",
    "tqdm==4.66.6",
    "mkdocs==1.6.1",
    "mkdocs-jupyter==0.25.1",
    "mkdocs-material==9.5.47",
    "mkdocstrings-python==1.12.2",
    "mkdocs-include-markdown-plugin==7.1.2",
    "mkdocstrings==0.27.0"
]
dynamic = ["version"]

[tool.setuptools.dynamic]
version = {file = "release_version.txt"}

# find packages
[tool.setuptools]
packages = ["aiagents4pharma",
    "aiagents4pharma.talk2knowledgegraphs",
    "aiagents4pharma.talk2knowledgegraphs.datasets",
    "aiagents4pharma.talk2knowledgegraphs.utils"]
One new empty file is also added (no content to display).
aiagents4pharma/talk2knowledgegraphs/tests/test_biobridge_primekg_loader.py (new file, 242 lines):

"""
Test cases for biobridge_primekg.py
"""

import os
import shutil
import pytest
from ..datasets.biobridge_primekg import BioBridgePrimeKG

# Remove the data folder for testing if it exists
PRIMEKG_LOCAL_DIR = "../data/primekg_test/"
LOCAL_DIR = "../data/biobridge_primekg_test/"
shutil.rmtree(LOCAL_DIR, ignore_errors=True)

@pytest.fixture(name="biobridge_primekg")
def biobridge_primekg_fixture():
    """
    Fixture for creating an instance of BioBridgePrimeKG.
    """
    return BioBridgePrimeKG(primekg_dir=PRIMEKG_LOCAL_DIR,
                            local_dir=LOCAL_DIR)

def test_download_primekg(biobridge_primekg):
    """
    Test the loading method of the BioBridge-PrimeKG class by downloading data from repository.
    """
    # Load BioBridge-PrimeKG data
    biobridge_primekg.load_data()
    primekg_nodes = biobridge_primekg.get_primekg().get_nodes()
    primekg_edges = biobridge_primekg.get_primekg().get_edges()
    biobridge_data_config = biobridge_primekg.get_data_config()
    biobridge_emb_dict = biobridge_primekg.get_node_embeddings()
    biobridge_triplets = biobridge_primekg.get_primekg_triplets()
    biobridge_splits = biobridge_primekg.get_train_test_split()
    biobridge_node_info = biobridge_primekg.get_node_info_dict()

    # Check if the local directories exist
    assert os.path.exists(biobridge_primekg.primekg_dir)
    assert os.path.exists(biobridge_primekg.local_dir)
    # Check if downloaded and processed files exist
    # PrimeKG files
    files = ["nodes.tab", "primekg_nodes.tsv.gz",
             "edges.csv", "primekg_edges.tsv.gz"]
    for file in files:
        path = f"{biobridge_primekg.primekg_dir}/{file}"
        assert os.path.exists(path)
    # BioBridge data config
    assert os.path.exists(f"{biobridge_primekg.local_dir}/data_config.json")
    # BioBridge embeddings
    files = [
        "protein.pkl",
        "mf.pkl",
        "cc.pkl",
        "bp.pkl",
        "drug.pkl",
        "disease.pkl",
        "embedding_dict.pkl"
    ]
    for file in files:
        path = f"{biobridge_primekg.local_dir}/embeddings/{file}"
        assert os.path.exists(path)
    # BioBridge processed files
    files = [
        "protein.csv",
        "mf.csv",
        "cc.csv",
        "bp.csv",
        "drug.csv",
        "disease.csv",
        "triplet_full.tsv.gz",
        "triplet_full_altered.tsv.gz",
        "node_train.tsv.gz",
        "triplet_train.tsv.gz",
        "node_test.tsv.gz",
        "triplet_test.tsv.gz",
    ]
    for file in files:
        path = f"{biobridge_primekg.local_dir}/processed/{file}"
        assert os.path.exists(path)
    # Check processed PrimeKG dataframes
    # Nodes
    assert primekg_nodes is not None
    assert len(primekg_nodes) > 0
    assert primekg_nodes.shape[0] == 129375
    # Edges
    assert primekg_edges is not None
    assert len(primekg_edges) > 0
    assert primekg_edges.shape[0] == 8100498
    # Check processed BioBridge data config
    assert biobridge_data_config is not None
    assert len(biobridge_data_config) > 0
    assert len(biobridge_data_config['node_type']) == 10
    assert len(biobridge_data_config['relation_type']) == 18
    assert len(biobridge_data_config['emb_dim']) == 6
    # Check processed BioBridge embeddings
    assert biobridge_emb_dict is not None
    assert len(biobridge_emb_dict) > 0
    assert len(biobridge_emb_dict) == 85466
    # Check processed BioBridge triplets
    assert biobridge_triplets is not None
    assert len(biobridge_triplets) > 0
    assert biobridge_triplets.shape[0] == 3904610
    assert list(biobridge_splits.keys()) == ['train', 'node_train', 'test', 'node_test']
    assert len(biobridge_splits['train']) == 3510930
    assert len(biobridge_splits['node_train']) == 76486
    assert len(biobridge_splits['test']) == 393680
    assert len(biobridge_splits['node_test']) == 8495
    # Check node info dictionary
    assert list(biobridge_node_info.keys()) == ['gene/protein',
                                                'molecular_function',
                                                'cellular_component',
                                                'biological_process',
                                                'drug',
                                                'disease']
    assert len(biobridge_node_info['gene/protein']) == 19162
    assert len(biobridge_node_info['molecular_function']) == 10966
    assert len(biobridge_node_info['cellular_component']) == 4013
    assert len(biobridge_node_info['biological_process']) == 27478
    assert len(biobridge_node_info['drug']) == 6948
    assert len(biobridge_node_info['disease']) == 44133

def test_load_existing_primekg(biobridge_primekg):
    """
    Test the loading method of the BioBridge-PrimeKG class by loading existing data
    from the local directory.
    """
    # Load BioBridge-PrimeKG data
    biobridge_primekg.load_data()
    primekg_nodes = biobridge_primekg.get_primekg().get_nodes()
    primekg_edges = biobridge_primekg.get_primekg().get_edges()
    biobridge_data_config = biobridge_primekg.get_data_config()
    biobridge_emb_dict = biobridge_primekg.get_node_embeddings()
    biobridge_triplets = biobridge_primekg.get_primekg_triplets()
    biobridge_splits = biobridge_primekg.get_train_test_split()
    biobridge_node_info = biobridge_primekg.get_node_info_dict()

    # Check if the local directories exist
    assert os.path.exists(biobridge_primekg.primekg_dir)
    assert os.path.exists(biobridge_primekg.local_dir)
    # Check if downloaded and processed files exist
    # PrimeKG files
    files = ["nodes.tab", "primekg_nodes.tsv.gz",
             "edges.csv", "primekg_edges.tsv.gz"]
    for file in files:
        path = f"{biobridge_primekg.primekg_dir}/{file}"
        assert os.path.exists(path)
    # BioBridge data config
    assert os.path.exists(f"{biobridge_primekg.local_dir}/data_config.json")
    # BioBridge embeddings
    files = [
        "protein.pkl",
        "mf.pkl",
        "cc.pkl",
        "bp.pkl",
        "drug.pkl",
        "disease.pkl",
        "embedding_dict.pkl"
    ]
    for file in files:
        path = f"{biobridge_primekg.local_dir}/embeddings/{file}"
        assert os.path.exists(path)
    # BioBridge processed files
    files = [
        "protein.csv",
        "mf.csv",
        "cc.csv",
        "bp.csv",
        "drug.csv",
        "disease.csv",
        "triplet_full.tsv.gz",
        "triplet_full_altered.tsv.gz",
        "node_train.tsv.gz",
        "triplet_train.tsv.gz",
        "node_test.tsv.gz",
        "triplet_test.tsv.gz",
    ]
    for file in files:
        path = f"{biobridge_primekg.local_dir}/processed/{file}"
        assert os.path.exists(path)
    # Check processed PrimeKG dataframes
    # Nodes
    assert primekg_nodes is not None
    assert len(primekg_nodes) > 0
    assert primekg_nodes.shape[0] == 129375
    # Edges
    assert primekg_edges is not None
    assert len(primekg_edges) > 0
    assert primekg_edges.shape[0] == 8100498
    # Check processed BioBridge data config
    assert biobridge_data_config is not None
    assert len(biobridge_data_config) > 0
    assert len(biobridge_data_config['node_type']) == 10
    assert len(biobridge_data_config['relation_type']) == 18
    assert len(biobridge_data_config['emb_dim']) == 6
    # Check processed BioBridge embeddings
    assert biobridge_emb_dict is not None
    assert len(biobridge_emb_dict) > 0
    assert len(biobridge_emb_dict) == 85466
    # Check processed BioBridge triplets
    assert biobridge_triplets is not None
    assert len(biobridge_triplets) > 0
    assert biobridge_triplets.shape[0] == 3904610
    assert list(biobridge_splits.keys()) == ['train', 'node_train', 'test', 'node_test']
    assert len(biobridge_splits['train']) == 3510930
    assert len(biobridge_splits['node_train']) == 76486
    assert len(biobridge_splits['test']) == 393680
    assert len(biobridge_splits['node_test']) == 8495
    # Check node info dictionary
    assert list(biobridge_node_info.keys()) == ['gene/protein',
                                                'molecular_function',
                                                'cellular_component',
                                                'biological_process',
                                                'drug',
                                                'disease']
    assert len(biobridge_node_info['gene/protein']) == 19162
    assert len(biobridge_node_info['molecular_function']) == 10966
    assert len(biobridge_node_info['cellular_component']) == 4013
    assert len(biobridge_node_info['biological_process']) == 27478
    assert len(biobridge_node_info['drug']) == 6948
    assert len(biobridge_node_info['disease']) == 44133

# def test_load_existing_primekg_with_negative_triplets(biobridge_primekg):
#     """
#     Test the loading method of the BioBridge-PrimeKG class by loading existing data in local.
#     In addition, it builds negative triplets for training data.
#     """
#     # Load BioBridge-PrimeKG data
#     # Using 1 negative sample per positive triplet
#     biobridge_primekg.load_data(build_neg_triplest=True, n_neg_samples=1)
#     biobridge_neg_triplets = biobridge_primekg.get_primekg_triplets_negative()
#
#     # Check if the local directories exist
#     assert os.path.exists(biobridge_primekg.primekg_dir)
#     assert os.path.exists(biobridge_primekg.local_dir)
#     # Check if downloaded and processed files exist
#     path = f"{biobridge_primekg.local_dir}/processed/triplet_train_negative.tsv.gz"
#     assert os.path.exists(path)
#     # Check processed BioBridge triplets
#     assert biobridge_neg_triplets is not None
#     assert len(biobridge_neg_triplets) > 0
#     assert biobridge_neg_triplets.shape[0] == 3510930
#     assert len(biobridge_neg_triplets.negative_tail_index[0]) == 1
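The 553-line biobridge_primekg.py diff is not rendered on this page, but the tests above pin down its public surface. A hedged usage sketch based solely on the fixture and the accessors exercised by the tests (nothing beyond those calls is shown in this PR):

from aiagents4pharma.talk2knowledgegraphs.datasets.biobridge_primekg import BioBridgePrimeKG

# Reuses a PrimeKG download (primekg_dir) and stores BioBridge artifacts in local_dir.
biobridge = BioBridgePrimeKG(primekg_dir="../data/primekg/",
                             local_dir="../data/biobridge_primekg/")
biobridge.load_data()

primekg_nodes = biobridge.get_primekg().get_nodes()
config = biobridge.get_data_config()         # node_type / relation_type / emb_dim
emb_dict = biobridge.get_node_embeddings()   # dict of per-node embeddings
triplets = biobridge.get_primekg_triplets()  # full triplet dataframe
splits = biobridge.get_train_test_split()    # 'train', 'node_train', 'test', 'node_test'
node_info = biobridge.get_node_info_dict()   # per-node-type info dictionaries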
aiagents4pharma/talk2knowledgegraphs/tests/test_primekg_loader.py (new file, 73 lines):

"""
Test cases for primekg.py
"""

import os
import shutil
import pytest
from ..datasets.primekg import PrimeKG

# Remove the data folder for testing if it exists
LOCAL_DIR = "../data/primekg_test/"
shutil.rmtree(LOCAL_DIR, ignore_errors=True)

@pytest.fixture(name="primekg")
def primekg_fixture():
    """
    Fixture for creating an instance of PrimeKG.
    """
    return PrimeKG(local_dir=LOCAL_DIR)

def test_download_primekg(primekg):
    """
    Test the loading method of the PrimeKG class by downloading PrimeKG from the server.
    """
    # Load PrimeKG data
    primekg.load_data()
    primekg_nodes = primekg.get_nodes()
    primekg_edges = primekg.get_edges()

    # Check if the local directory exists
    assert os.path.exists(primekg.local_dir)
    # Check if downloaded and processed files exist
    files = ["nodes.tab", f"{primekg.name}_nodes.tsv.gz",
             "edges.csv", f"{primekg.name}_edges.tsv.gz"]
    for file in files:
        path = f"{primekg.local_dir}/{file}"
        assert os.path.exists(path)
    # Check processed PrimeKG dataframes
    # Nodes
    assert primekg_nodes is not None
    assert len(primekg_nodes) > 0
    assert primekg_nodes.shape[0] == 129375
    # Edges
    assert primekg_edges is not None
    assert len(primekg_edges) > 0
    assert primekg_edges.shape[0] == 8100498

def test_load_existing_primekg(primekg):
    """
    Test the loading method of the PrimeKG class by loading existing PrimeKG
    from the local directory.
    """
    # Load PrimeKG data
    primekg.load_data()
    primekg_nodes = primekg.get_nodes()
    primekg_edges = primekg.get_edges()

    # Check if the local directory exists
    assert os.path.exists(primekg.local_dir)
    # Check if downloaded and processed files exist
    files = ["nodes.tab", f"{primekg.name}_nodes.tsv.gz",
             "edges.csv", f"{primekg.name}_edges.tsv.gz"]
    for file in files:
        path = f"{primekg.local_dir}/{file}"
        assert os.path.exists(path)
    # Check processed PrimeKG dataframes
    # Nodes
    assert primekg_nodes is not None
    assert len(primekg_nodes) > 0
    assert primekg_nodes.shape[0] == 129375
    # Edges
    assert primekg_edges is not None
    assert len(primekg_edges) > 0
    assert primekg_edges.shape[0] == 8100498
aiagents4pharma/talk2knowledgegraphs/tests/test_starkqa_primekg_loader.py (new file, 116 lines):

"""
Test cases for starkqa_primekg.py
"""

import os
import shutil
import pytest
from ..datasets.starkqa_primekg import StarkQAPrimeKG

# Remove the data folder for testing if it exists
LOCAL_DIR = "../data/starkqa_primekg_test/"
shutil.rmtree(LOCAL_DIR, ignore_errors=True)

@pytest.fixture(name="starkqa_primekg")
def starkqa_primekg_fixture():
    """
    Fixture for creating an instance of StarkQAPrimeKG.
    """
    return StarkQAPrimeKG(local_dir=LOCAL_DIR)

def test_download_starkqa_primekg(starkqa_primekg):
    """
    Test the loading method of the StarkQAPrimeKG class by downloading files
    from HuggingFace Hub.
    """
    # Load StarkQA PrimeKG data
    starkqa_primekg.load_data()
    starkqa_df = starkqa_primekg.get_starkqa()
    primekg_node_info = starkqa_primekg.get_starkqa_node_info()
    split_idx = starkqa_primekg.get_starkqa_split_indicies()
    query_embeddings = starkqa_primekg.get_query_embeddings()
    node_embeddings = starkqa_primekg.get_node_embeddings()

    # Check if the local directory exists
    assert os.path.exists(starkqa_primekg.local_dir)
    # Check if downloaded files exist in the local directory
    files = ['qa/prime/split/test-0.1.index',
             'qa/prime/split/test.index',
             'qa/prime/split/train.index',
             'qa/prime/split/val.index',
             'qa/prime/stark_qa/stark_qa.csv',
             'qa/prime/stark_qa/stark_qa_human_generated_eval.csv',
             'skb/prime/processed.zip']
    for file in files:
        path = f"{starkqa_primekg.local_dir}/{file}"
        assert os.path.exists(path)
    # Check dataframe
    assert starkqa_df is not None
    assert len(starkqa_df) > 0
    assert starkqa_df.shape[0] == 11204
    # Check node information
    assert primekg_node_info is not None
    assert len(primekg_node_info) == 129375
    # Check split indices
    assert list(split_idx.keys()) == ['train', 'val', 'test', 'test-0.1']
    assert len(split_idx['train']) == 6162
    assert len(split_idx['val']) == 2241
    assert len(split_idx['test']) == 2801
    assert len(split_idx['test-0.1']) == 280
    # Check query embeddings
    assert query_embeddings is not None
    assert len(query_embeddings) == 11204
    assert query_embeddings[0].shape[1] == 1536
    # Check node embeddings
    assert node_embeddings is not None
    assert len(node_embeddings) == 129375
    assert node_embeddings[0].shape[1] == 1536

def test_load_existing_starkqa_primekg(starkqa_primekg):
    """
    Test the loading method of the StarkQAPrimeKG class by loading existing files
    in the local directory.
    """
    # Load StarkQA PrimeKG data
    starkqa_primekg.load_data()
    starkqa_df = starkqa_primekg.get_starkqa()
    primekg_node_info = starkqa_primekg.get_starkqa_node_info()
    split_idx = starkqa_primekg.get_starkqa_split_indicies()
    query_embeddings = starkqa_primekg.get_query_embeddings()
    node_embeddings = starkqa_primekg.get_node_embeddings()

    # Check if the local directory exists
    assert os.path.exists(starkqa_primekg.local_dir)
    # Check if downloaded and processed files exist
    files = ['qa/prime/split/test-0.1.index',
             'qa/prime/split/test.index',
             'qa/prime/split/train.index',
             'qa/prime/split/val.index',
             'qa/prime/stark_qa/stark_qa.csv',
             'qa/prime/stark_qa/stark_qa_human_generated_eval.csv',
             'skb/prime/processed.zip']
    for file in files:
        path = f"{starkqa_primekg.local_dir}/{file}"
        assert os.path.exists(path)
    # Check dataframe
    assert starkqa_df is not None
    assert len(starkqa_df) > 0
    assert starkqa_df.shape[0] == 11204
    # Check node information
    assert primekg_node_info is not None
    assert len(primekg_node_info) == 129375
    # Check split indices
    assert list(split_idx.keys()) == ['train', 'val', 'test', 'test-0.1']
    assert len(split_idx['train']) == 6162
    assert len(split_idx['val']) == 2241
    assert len(split_idx['test']) == 2801
    assert len(split_idx['test-0.1']) == 280
    # Check query embeddings
    assert query_embeddings is not None
    assert len(query_embeddings) == 11204
    assert query_embeddings[0].shape[1] == 1536
    # Check node embeddings
    assert node_embeddings is not None
    assert len(node_embeddings) == 129375
    assert node_embeddings[0].shape[1] == 1536
Four new one-line documentation stubs (one file each) register the dataset modules with mkdocstrings:

::: aiagents4pharma.talk2knowledgegraphs.datasets.biobridge_primekg

::: aiagents4pharma.talk2knowledgegraphs.datasets.dataset

::: aiagents4pharma.talk2knowledgegraphs.datasets.primekg

::: aiagents4pharma.talk2knowledgegraphs.datasets.starkqa_primekg
Three tutorial notebooks are added (large diffs not rendered):

docs/notebooks/talk2knowledgegraphs/tutorial_biobridge_primekg_loader.ipynb (4,648 additions)
docs/notebooks/talk2knowledgegraphs/tutorial_primekg_loader.ipynb (665 additions)
docs/notebooks/talk2knowledgegraphs/tutorial_starkqa_primekg_loader.ipynb (2,704 additions)
Review comment: "Fine to include the relation embeddings in a subsequent PR as mentioned in another comment."