From 3c237482d2eec8b209c53be83c91608a85939cfa Mon Sep 17 00:00:00 2001
From: YSK
Date: Fri, 5 Dec 2025 13:45:46 -0500
Subject: [PATCH] Implement complete pipeline: generate questions, create embeddings, upload to Qdrant

---
 db_builder/README.md        |  98 +++++
 db_builder/qdrant_loader.py | 762 +++++++++++++++++++++++++++++++++---
 2 files changed, 809 insertions(+), 51 deletions(-)
 create mode 100644 db_builder/README.md

diff --git a/db_builder/README.md b/db_builder/README.md
new file mode 100644
index 0000000..92fbc55
--- /dev/null
+++ b/db_builder/README.md
@@ -0,0 +1,98 @@
+# Qdrant Question Generation Pipeline
+
+Pipeline for generating natural language questions from study descriptions using an LLM, creating vector embeddings, and uploading them to a Qdrant vector database.
+
+## Setup
+
+1. **Install dependencies:**
+   ```bash
+   pip install qdrant-client langchain-ollama pandas requests
+   ```
+
+2. **Configure environment:**
+   ```bash
+   cp .env.example .env
+   # Edit .env with your settings
+   ```
+
+3. **Run the pipeline:**
+   ```bash
+   python qdrant_loader.py
+   ```
+
+## Configuration
+
+All configuration is managed through environment variables in the `.env` file.
+
+| Variable | Description | Default |
+|----------|-------------|---------|
+| **File Paths** |||
+| `INPUT_STUDIES_FILE` | Path to studies JSON file | `99_select_studies.json` |
+| `INPUT_PERSONAS_FILE` | Path to personas CSV file | `personas.csv` |
+| `OUTPUT_QUESTIONS_FILE` | Generated questions output | `generated_questions_payloads.json` |
+| `INPUT_EMBEDDINGS_FILE` | Questions with embeddings | `generated_questions_payloads_with_embeddings.json` |
+| **LLM** |||
+| `LLM_BASE_URL` | LLM service endpoint | `http://localhost:52236` |
+| `LLM_MODEL_NAME` | Model for question generation | `google/gemma-3-12b-it` |
+| **Embeddings** |||
+| `EMBEDDING_BASE_URL` | Embedding service endpoint | `http://localhost:11434` |
+| `EMBEDDING_MODEL_NAME` | Embedding model | `bge-m3` |
+| `EMBEDDING_BATCH_SIZE` | Batch size for embedding | `100` |
+| `EMBEDDING_SAVE_INTERVAL` | Save interval (embeddings) | `50` |
+| **Qdrant** |||
+| `QDRANT_URL` | Qdrant server URL | `http://localhost:60850` |
+| `QDRANT_COLLECTION_NAME` | Target collection name | `Collection_name` |
+| `VECTOR_SIZE` | Embedding dimensions | `1024` |
+| `DISTANCE_METRIC` | Distance metric | `Cosine` |
+| `UPLOAD_BATCH_SIZE` | Upload batch size | `100` |
+| **Pipeline** |||
+| `START_PERSONA` | Starting persona index | `1` |
+| `MAX_STUDIES` | Limit studies (empty = all) | - |
+| `MAX_PERSONAS` | Limit personas (empty = all) | - |
+| `SAVE_INTERVAL` | Save interval (questions) | `100` |
+
+## Usage
+
+The pipeline provides six operational modes. Edit the `if __name__ == "__main__":` block in `qdrant_loader.py` and uncomment the desired option.
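+
+For example, to run Option 3 (the full question-generation pass), the end of `qdrant_loader.py` would look like this, with the other options left commented out:
+
+```python
+if __name__ == "__main__":
+    # Option 3: Generate questions using the LLM (full pipeline)
+    run_question_generation_pipeline()
+```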
+
+### Available Operations
+
+| Option | Function | Description |
+|--------|----------|-------------|
+| 1 | `connect_and_display_records()` | View sample records from the Qdrant collection |
+| 2 | `create_bdc_collection()` | Create a new Qdrant collection with config settings |
+| 3 | `run_question_generation_pipeline()` | Generate questions from studies (full dataset) |
+| 4 | `test_question_generation_pipeline()` | Test generation with limited data (5 studies, 1 persona) |
+| 5 | `generate_embeddings()` | Create vector embeddings for questions |
+| 6 | `upload_embeddings_to_qdrant()` | Upload embeddings to Qdrant |
+
+### Complete Workflow
+
+For a full end-to-end execution:
+
+```bash
+# Step 1: Generate questions from studies
+# Uncomment: run_question_generation_pipeline()
+python qdrant_loader.py
+
+# Step 2: Generate embeddings
+# Uncomment: generate_embeddings()
+python qdrant_loader.py
+
+# Step 3: Upload to Qdrant
+# Uncomment: upload_embeddings_to_qdrant()
+python qdrant_loader.py
+```
+
+## Pipeline Flow
+
+```
+Studies (JSON) → Question Generation (LLM) → Embedding Generation (Ollama) → Upload to Qdrant
+```
+
+## Notes
+
+- Progress is saved incrementally to avoid data loss
+- Use the `START_PERSONA` setting to resume interrupted runs
+- All configuration is managed through the `.env` file
+- The collection is created automatically if it doesn't exist
diff --git a/db_builder/qdrant_loader.py b/db_builder/qdrant_loader.py
index 50e4045..c253e59 100644
--- a/db_builder/qdrant_loader.py
+++ b/db_builder/qdrant_loader.py
@@ -1,61 +1,721 @@
+"""
+Qdrant Question Generation and Upload Pipeline
+
+This script provides a complete pipeline for:
+1. Generating natural language questions from study descriptions using an LLM
+2. Creating payloads with question embeddings
+3. Uploading the embeddings to a Qdrant vector database
+
+Author: YSK
+Date: 2025
+"""
 from qdrant_client import QdrantClient
-from qdrant_client.models import Distance, VectorParams
-from qdrant_client.models import PointStruct
+from qdrant_client.models import VectorParams, PointStruct, Distance
+import json
+import csv
+import requests
 import pandas as pd
-import glob,os,time
-
-client = QdrantClient(url="http://localhost:54713/",timeout=120)
-'''client.create_collection(
-    collection_name="LLM_generated_question_collection",
-    vectors_config=VectorParams(size=4, distance=Distance.DOT),
-)'''
-
-
-#Read the question no and it's corresponding vector embedding.
-def read_vector_embed(collect_name):
-    input_directory ="/Users/ykale/Documents/Dev/koios/Koios/QuestionEmbedding-4"
-    records = []
-    main_index= 1
-    for count,filename in enumerate(glob.glob(os.path.join(input_directory, '*.csv'))):
-        if main_index % 5 == 0:
-            print("Adding delay of 3 seconds...")
-            time.sleep(3)
-        # Load the CSV file
-        print("Working on",filename)
-        df = pd.read_csv(filename, header=None)
-        df.columns = ['question_id','question', 'embedding']
-        # Convert embeddings from string to list of floats
-        df['embedding'] = df['embedding'].apply(lambda x: list(map(float, x.strip('[]').split(','))))
-        for index, row in df.iterrows():
-            record = {
-                'id': main_index, # Incremental integer id
-                'vector': row['embedding'],
-                'payload': {'question_id': row['question_id'],'question':row['question']} # Store question_id in payload
-            }
-            main_index = main_index + 1
-            records.append(record)
-        # Upload records to Qdrant
-        client.upsert(
-            collection_name=collect_name,
-            wait=True,
-            points=records
+from typing import List, Dict
+import re
+import time
+from pathlib import Path
+
+# ============================================================================
+# CONFIGURATION - Load from .env file
+# ============================================================================
+
+def load_env_config():
+    """Load configuration from .env file if it exists."""
+    env_file = Path(__file__).parent / ".env"
+    config = {}
+
+    if env_file.exists():
+        with open(env_file, 'r') as f:
+            for line in f:
+                line = line.strip()
+                if line and not line.startswith('#') and '=' in line:
+                    key, value = line.split('=', 1)
+                    key = key.strip()
+                    value = value.strip()
+
+                    # Convert to appropriate type
+                    if value.isdigit():
+                        value = int(value)
+                    elif value.lower() == 'true':
+                        value = True
+                    elif value.lower() == 'false':
+                        value = False
+                    elif value == '':
+                        value = None
+
+                    config[key] = value
+
+    return config
+
+# Load configuration from .env
+_config = load_env_config()
+
+# File paths
+INPUT_STUDIES_FILE = _config.get('INPUT_STUDIES_FILE', "99_select_studies.json")
+INPUT_PERSONAS_FILE = _config.get('INPUT_PERSONAS_FILE', "personas.csv")
+OUTPUT_QUESTIONS_FILE = _config.get('OUTPUT_QUESTIONS_FILE', "generated_questions_payloads.json")
+INPUT_EMBEDDINGS_FILE = _config.get('INPUT_EMBEDDINGS_FILE', "generated_questions_payloads_with_embeddings.json")
+
+# LLM configuration
+LLM_BASE_URL = _config.get('LLM_BASE_URL', "http://localhost:52236")
+LLM_MODEL_NAME = _config.get('LLM_MODEL_NAME', "google/gemma-3-12b-it")
+
+# Embedding configuration
+EMBEDDING_BASE_URL = _config.get('EMBEDDING_BASE_URL', "http://localhost:11434")
+EMBEDDING_MODEL_NAME = _config.get('EMBEDDING_MODEL_NAME', "bge-m3")
+EMBEDDING_BATCH_SIZE = _config.get('EMBEDDING_BATCH_SIZE', 100)
+EMBEDDING_SAVE_INTERVAL = _config.get('EMBEDDING_SAVE_INTERVAL', 50)
+
+# Qdrant configuration
+QDRANT_URL = _config.get('QDRANT_URL', "http://localhost:60850")
+QDRANT_COLLECTION_NAME = _config.get('QDRANT_COLLECTION_NAME', "Collection_name")
+VECTOR_SIZE = _config.get('VECTOR_SIZE', 1024)
+DISTANCE_METRIC = _config.get('DISTANCE_METRIC', "Cosine")  # Options: Cosine, Euclid, Dot
+UPLOAD_BATCH_SIZE = _config.get('UPLOAD_BATCH_SIZE', 100)
+
+# Generation parameters
+START_PERSONA = _config.get('START_PERSONA', 1)  # Default matches the README; raise to resume an interrupted run
+MAX_STUDIES = _config.get('MAX_STUDIES', None)  # Set to a number to limit, None for all
+MAX_PERSONAS = _config.get('MAX_PERSONAS', None)  # Set to a number to limit, None for all
+SAVE_INTERVAL = _config.get('SAVE_INTERVAL', 100)  # Save progress every N 
questions + +# ============================================================================ + +def connect_and_display_records(qdrant_url=QDRANT_URL, collection_name=QDRANT_COLLECTION_NAME): + """Display sample records from a Qdrant collection.""" + client = QdrantClient(url=qdrant_url) + + try: + # Get collection info to understand vector dimensions + collection_info = client.get_collection(collection_name) + print(f"Collection info: {collection_info}") + print(f"Vector size: {collection_info.config.params.vectors.size}") + print(f"Distance metric: {collection_info.config.params.vectors.distance}") + print("-" * 50) + + records = client.scroll( + collection_name=collection_name, + limit=2, + with_payload=True, + with_vectors=True + ) + + print(f"First 2 records from collection '{collection_name}':") + print("-" * 50) + + for i, record in enumerate(records[0], 1): + print(f"Record {i}:") + print(f"ID: {record.id}") + print(f"Payload: {record.payload}") + if record.vector: + print(f"Vector dimensions: {len(record.vector)}") + print(f"First 5 vector values: {record.vector[:5]}") + print("-" * 30) + + return collection_info.config.params.vectors.size, collection_info.config.params.vectors.distance + + except Exception as e: + print(f"Error connecting to Qdrant: {e}") + return None, None + +def create_bdc_collection(qdrant_url=QDRANT_URL, new_collection_name=QDRANT_COLLECTION_NAME): + """Create a new Qdrant collection with the same configuration as the reference collection.""" + client = QdrantClient(url=qdrant_url) + + try: + # First get the vector dimensions from the existing collection + vector_size, distance_metric = connect_and_display_records() + + if vector_size is None: + print("Could not get vector dimensions from existing collection") + return + + print(f"\nCreating new collection '{new_collection_name}' with:") + print(f"Vector size: {vector_size}") + print(f"Distance metric: {distance_metric}") + + # Create the new collection with same configuration + client.create_collection( + collection_name=new_collection_name, + vectors_config=VectorParams( + size=vector_size, + distance=distance_metric + ) + ) + + print(f"Successfully created collection '{new_collection_name}'") + + # Verify the collection was created + collections = client.get_collections() + print(f"Available collections: {[c.name for c in collections.collections]}") + + except Exception as e: + print(f"Error creating collection: {e}") + +class QuestionGenerationPipeline: + """Pipeline for generating natural language questions from study descriptions using LLM.""" + + def __init__(self, llm_base_url=LLM_BASE_URL, model_name=LLM_MODEL_NAME): + self.llm_base_url = llm_base_url + self.model_name = model_name + + def load_studies(self, json_file_path: str) -> List[Dict]: + with open(json_file_path, 'r', encoding='utf-8') as file: + return json.load(file) + + def load_personas(self, csv_file_path: str) -> List[str]: + personas = [] + with open(csv_file_path, 'r', encoding='utf-8') as file: + reader = csv.reader(file) + for row in reader: + if len(row) >= 2: + personas.append(row[1]) + return personas + + def generate_questions_with_llm(self, description: str, persona: str) -> List[str]: + system_prompt = ( + f"You are generating natural search queries from the perspective of: {persona}. " + "These queries will be embedded using semantic search to retrieve this specific study abstract when real users ask similar questions. 
" + "Your goal is to anticipate how THIS persona would naturally phrase questions that this study could answer. " + "\n" + "Generate exactly 5 diverse, natural-language search queries that would lead users to THIS specific study. " + "\n" + "Critical Requirements: " + "1. NATURAL LANGUAGE: Write as real users would ask - use complete thoughts, not keyword lists. Mix questions and statements naturally. " + "2. DIVERSE USER INTENTS: Cover different reasons someone would seek this study: " + " - Intent 1: Finding datasets by disease/condition (e.g., 'where can I find data on...')" + " - Intent 2: Methodology/design questions (e.g., 'what studies use longitudinal cohort design for...')" + " - Intent 3: Specific features (e.g., 'datasets with genetic data and family history for...')" + " - Intent 4: Population/eligibility (e.g., 'studies that include multi-generational participants...')" + " - Intent 5: Access/practical concerns (e.g., 'available cardiovascular datasets with phenotype and genotype data')" + "3. PERSONA-AUTHENTIC: Frame each query using terminology and concerns this specific persona would have " + "4. STUDY-SPECIFIC: Include 2-3 unique identifiers per query (study name, disease, sample size, specific methodology, unique features) that distinguish this study from others " + "5. SEMANTIC VARIETY: Use synonyms and different phrasings - avoid repeating the same keywords across all 5 queries " + "6. CONVERSATIONAL: Write how people talk to chatbots, not how they write academic searches " + "7. NO QUESTION MARKS: Write queries as statements or questions without ending punctuation " + "8. NO MARKDOWN: Do not use markdown code blocks like ```json or ``` in your response " + "9. NO EXTRA TEXT: Do not include any explanations, notes, or text outside the JSON array " + "\n" + "Format Requirements: " + "- Return ONLY a valid JSON array of exactly 5 strings " + "- The array must start with [ and end with ] " + "- Each string must be enclosed in double quotes " + "- Strings must be separated by commas " + "- Do NOT wrap the array in markdown code blocks " + "- Do NOT add any text before or after the array " + "\n" + "Example of GOOD queries (natural, diverse intents, conversational):\n" + "[\"Where can I find longitudinal heart disease data that follows families across multiple generations\", " + "\"I need cardiovascular risk factor datasets with both genetic and clinical data from large cohorts\", " + "\"What studies have echocardiography imaging paired with long-term health outcomes for heart disease\", " + "\"Looking for datasets that track blood pressure cholesterol and smoking in the same participants over decades\", " + "\"Are there any large family-based studies with genome-wide genotyping for cardiovascular disease research\"]\n" + "\n" + "BAD example (keyword lists, no diversity):\n" + "[\"Framingham Heart Study cardiovascular risk factors 5209 participants\", " + "\"Framingham study blood pressure cholesterol\", " + "\"Framingham three generation family study\", " + "\"Framingham genome analysis\", " + "\"Framingham echocardiography imaging\"]" + ) + + human_prompt = f"The study abstract starts here after : \n {description}" + + payload = { + "model": self.model_name, + "messages": [ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": human_prompt} + ], + "max_tokens": 1000, + "temperature": 0.7 + } + + try: + response = requests.post(f"{self.llm_base_url}/v1/chat/completions", json=payload) + response.raise_for_status() + result = response.json() + + 
generated_text = result['choices'][0]['message']['content'] + + # Parse the generated questions (assuming they're in a list format) + questions = self.parse_questions_from_response(generated_text) + + return questions[:5] # Return only 5 questions as requested + + except Exception as e: + print(f"Error generating questions: {e}") + return [] + + def parse_questions_from_response(self, response_text: str) -> List[str]: + # Try to extract questions from the response + # Remove any extra formatting and split by common separators + cleaned_text = response_text.strip() + + # If it looks like a Python list, try to evaluate it safely + if cleaned_text.startswith('[') and cleaned_text.endswith(']'): + try: + # Use regex to extract quoted strings from the list + questions = re.findall(r'"([^"]*)"', cleaned_text) + if not questions: + questions = re.findall(r"'([^']*)'", cleaned_text) + return questions + except: + pass + + # Fallback: split by common separators + if ',' in cleaned_text: + questions = [q.strip().strip('"\'') for q in cleaned_text.split(',')] + elif '\n' in cleaned_text: + questions = [q.strip().strip('"\'') for q in cleaned_text.split('\n') if q.strip()] + else: + questions = [cleaned_text.strip().strip('"\'')] + + # Clean up questions + questions = [q for q in questions if q and len(q) > 10] + return questions + + def create_qdrant_payload(self, study_data: Dict, persona: str, questions: List[str], persona_num: int, start_question_num: int) -> List[Dict]: + """Create payload structure matching existing Qdrant collection format with correct question_id numbering""" + payloads = [] + for i, question in enumerate(questions): + # Create question_id in format: study_id_personanumber_questionnumber (e.g., phs003529.v1.p1_1_1) + question_number = start_question_num + i + # Use StudyId as study_id for the new JSON structure + study_id = study_data.get('StudyId', '') + question_id = f"{study_id}_{persona_num}_{question_number}" + + # Payload structure matching existing 'questions_on_study_abstract_bge' collection + payload = { + 'question': question, # Main question text (matches existing format) + 'question_id': question_id, # Unique ID (format: study_id_personanumber_questionnumber) + # Additional metadata (not in original but useful for filtering/search) + 'study_id': study_id, + 'study_name': study_data.get('StudyName', ''), + 'persona': persona, + 'description': study_data.get('Description', ''), + 'permalink': study_data.get('Permalink', ''), + # Placeholder for vector embedding (to be added later) + 'embedding': None + } + payloads.append(payload) + return payloads + + def process_all_studies_and_personas(self, json_file_path: str, csv_file_path: str, + max_studies: int = None, max_personas: int = None, + start_persona: int = 1, + incremental_save: bool = True, + save_interval: int = 100, + output_file: str = "generated_questions_payloads.json") -> List[Dict]: + studies = self.load_studies(json_file_path) + personas = self.load_personas(csv_file_path) + + # Limit for testing if specified + if max_studies: + studies = studies[:max_studies] + print(f"Limited to first {max_studies} studies for testing") + + if max_personas: + personas = personas[:max_personas] + print(f"Limited to first {max_personas} personas for testing") + + # Skip personas before start_persona + if start_persona > 1: + personas = personas[start_persona - 1:] + print(f"Starting from persona {start_persona}") + + all_payloads = [] + total_combinations = len(studies) * len(personas) + processed = 0 + last_save_count = 
0 + + print(f"Processing {len(studies)} studies with {len(personas)} personas...") + print(f"Total combinations to process: {total_combinations}") + print(f"Incremental save: {'enabled' if incremental_save else 'disabled'}") + if incremental_save: + print(f"Saving every {save_interval} questions") + + for persona_idx, persona in enumerate(personas): + # Adjust persona_num to account for start_persona offset + persona_num = persona_idx + start_persona + + print(f"\n{'='*70}") + print(f"PROCESSING PERSONA {persona_num}/{start_persona + len(personas) - 1}") + print(f"Persona: {persona[:80]}...") + print(f"{'='*70}") + + for study_idx, study in enumerate(studies): + study_id = study.get('StudyId', 'unknown') + description = study.get('Description', '') + + if not description: + print(f"Skipping study {study_id} - no description") + continue + + processed += 1 + print(f"Study {study_idx + 1}/{len(studies)}: {study_id} (Overall: {processed}/{total_combinations})") + + questions = self.generate_questions_with_llm(description, persona) + + if questions: + # Each persona gets questions numbered 1 to 5 (or however many questions are generated) + payloads = self.create_qdrant_payload(study, persona, questions, persona_num, 1) + all_payloads.extend(payloads) + print(f" ✓ Generated {len(questions)} questions (Total: {len(all_payloads)})") + + # Save every save_interval questions + if incremental_save and len(all_payloads) - last_save_count >= save_interval: + print(f"\n{'='*70}") + print(f"SAVING PROGRESS: {len(all_payloads)} questions generated") + print(f"New questions since last save: {len(all_payloads) - last_save_count}") + self.save_payloads_to_json(all_payloads, output_file) + print(f"Saved to {output_file}") + print(f"Pausing for 10 seconds...") + time.sleep(10) + print(f"{'='*70}\n") + last_save_count = len(all_payloads) + else: + print(f" ✗ No questions generated") + + # Final save if there are any remaining unsaved payloads + if incremental_save and len(all_payloads) > last_save_count: + print(f"\n{'='*70}") + print(f"FINAL SAVE: {len(all_payloads)} total questions") + print(f"New questions since last save: {len(all_payloads) - last_save_count}") + self.save_payloads_to_json(all_payloads, output_file) + print(f"Saved to {output_file}") + print(f"{'='*70}\n") + + print(f"\nTotal payloads created: {len(all_payloads)}") + return all_payloads + + def save_payloads_to_json(self, payloads: List[Dict], output_file: str = "generated_questions_payloads.json"): + with open(output_file, 'w', encoding='utf-8') as file: + json.dump(payloads, file, indent=2, ensure_ascii=False) + print(f"Payloads saved to {output_file}") + + def save_payloads_to_csv(self, payloads: List[Dict], output_file: str = "generated_questions_payloads.csv"): + """Save payloads to CSV for easy viewing""" + if not payloads: + print("No payloads to save") + return + + df = pd.DataFrame(payloads) + df.to_csv(output_file, index=False, encoding='utf-8') + print(f"Payloads saved to {output_file}") + print(f"CSV contains {len(df)} rows and {len(df.columns)} columns") + print(f"Columns: {list(df.columns)}") + + def run_complete_pipeline(self, + json_file_path: str = INPUT_STUDIES_FILE, + csv_file_path: str = INPUT_PERSONAS_FILE, + output_file: str = OUTPUT_QUESTIONS_FILE, + max_studies: int = MAX_STUDIES, + max_personas: int = MAX_PERSONAS, + start_persona: int = START_PERSONA, + save_interval: int = SAVE_INTERVAL, + save_csv: bool = True): + print("Starting Question Generation Pipeline...") + print("=" * 60) + + # Process studies and 
personas + payloads = self.process_all_studies_and_personas( + json_file_path, csv_file_path, max_studies, max_personas, + start_persona, incremental_save=True, save_interval=save_interval, + output_file=output_file ) - print(f"Inserted all the records Successfully") - return records + # Save final results + self.save_payloads_to_json(payloads, output_file) + + if save_csv: + self.save_payloads_to_csv(payloads) + + print("\nPipeline completed!") + print(f"Generated {len(payloads)} question payloads with Qdrant-compatible format") + print("Next step: Add embeddings to the 'embedding' field before inserting to Qdrant") + print("Payload structure matches existing collection: 'question' and 'question_id' fields") + + return payloads -def create_a_collection_qdrant(collect_name): - collection_name = collect_name - vector_size = 4096 # Assuming all vectors are of the same length - client.recreate_collection( - collection_name=collection_name, - vectors_config=VectorParams(size=vector_size, distance='Cosine') #'Cosine' to 'Euclidean' or 'Dot' as needed +def run_question_generation_pipeline(): + """Run the complete question generation pipeline with configured parameters.""" + pipeline = QuestionGenerationPipeline( + llm_base_url=LLM_BASE_URL, + model_name=LLM_MODEL_NAME ) + return pipeline.run_complete_pipeline() + +def test_question_generation_pipeline(): + """Run pipeline with limited data for testing.""" + pipeline = QuestionGenerationPipeline( + llm_base_url=LLM_BASE_URL, + model_name=LLM_MODEL_NAME + ) + return pipeline.run_complete_pipeline( + max_studies=5, + max_personas=1, + save_csv=True + ) + +def generate_embeddings( + input_file: str = OUTPUT_QUESTIONS_FILE, + output_file: str = INPUT_EMBEDDINGS_FILE, + embedding_base_url: str = EMBEDDING_BASE_URL, + embedding_model: str = EMBEDDING_MODEL_NAME, + batch_size: int = EMBEDDING_BATCH_SIZE, + save_interval: int = EMBEDDING_SAVE_INTERVAL, + start_index: int = 0 +): + """ + Generate embeddings for questions using Ollama embeddings. + + Args: + input_file: Path to input JSON file with questions + output_file: Path to output JSON file with embeddings + embedding_base_url: Base URL for embedding service + embedding_model: Name of the embedding model + batch_size: Number of questions to process at once + save_interval: Save after this many embeddings + start_index: Index to start from (useful for resuming) + + Returns: + List of questions with embeddings + """ + try: + from langchain_ollama import OllamaEmbeddings + except ImportError: + print("Error: langchain_ollama not installed. 
Install with: pip install langchain-ollama") + return None + + print(f"Initializing OllamaEmbeddings with model: {embedding_model}") + print(f"Base URL: {embedding_base_url}") + ollama_emb = OllamaEmbeddings(model=embedding_model, base_url=embedding_base_url) + + # Load questions + with open(input_file, 'r', encoding='utf-8') as file: + questions = json.load(file) + + total_questions = len(questions) + print(f"\n{'='*70}") + print(f"Starting embedding generation") + print(f"Total questions: {total_questions}") + print(f"Starting from index: {start_index}") + print(f"Batch size: {batch_size}") + print(f"Save interval: {save_interval}") + print(f"{'='*70}\n") + + last_save_index = start_index + + # Process questions in batches + for i in range(start_index, total_questions, batch_size): + batch_end = min(i + batch_size, total_questions) + batch = questions[i:batch_end] + + # Extract question texts for this batch + question_texts = [q['question'] for q in batch] + + try: + print(f"Processing questions {i+1}-{batch_end}/{total_questions}...") + embeddings = ollama_emb.embed_documents(question_texts) + + # Add embeddings to the questions + for j, embedding in enumerate(embeddings): + questions[i + j]['embedding'] = embedding + + print(f" ✓ Generated {len(embeddings)} embeddings") + + # Save incrementally + if (i + batch_size) - last_save_index >= save_interval or batch_end == total_questions: + print(f"\n{'='*70}") + print(f"SAVING PROGRESS: {batch_end}/{total_questions} questions processed") + print(f"New embeddings since last save: {batch_end - last_save_index}") + with open(output_file, 'w', encoding='utf-8') as file: + json.dump(questions, file, indent=2, ensure_ascii=False) + print(f"Saved to {output_file}") + + if batch_end < total_questions: + print(f"Pausing for 10 seconds...") + time.sleep(10) + + print(f"{'='*70}\n") + last_save_index = batch_end + + except Exception as e: + print(f" ✗ Error processing batch {i+1}-{batch_end}: {e}") + print(f"Saving progress before error...") + with open(output_file, 'w', encoding='utf-8') as file: + json.dump(questions, file, indent=2, ensure_ascii=False) + raise + + print(f"\n{'='*70}") + print(f"COMPLETED: All {total_questions} embeddings generated") + print(f"Output saved to: {output_file}") + print(f"{'='*70}\n") + + return questions + +def upload_embeddings_to_qdrant( + json_file_path: str = INPUT_EMBEDDINGS_FILE, + qdrant_url: str = QDRANT_URL, + collection_name: str = QDRANT_COLLECTION_NAME, + batch_size: int = UPLOAD_BATCH_SIZE, + create_if_missing: bool = True, + vector_size: int = VECTOR_SIZE, + distance_metric: str = DISTANCE_METRIC +): + """ + Upload embeddings from JSON file to Qdrant collection. 
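Payloads that are
+    missing an embedding are skipped, and points are sent in batches so a
+    single failed batch does not abort the rest of the upload.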
+ + Args: + json_file_path: Path to the JSON file containing question payloads with embeddings + qdrant_url: URL of the Qdrant instance + collection_name: Name of the collection to upload to + batch_size: Number of points to upload in each batch + create_if_missing: Create collection if it doesn't exist + vector_size: Size of the embedding vectors + distance_metric: Distance metric to use (Cosine, Euclid, or Dot) + + Returns: + Number of successfully uploaded points + """ + print(f"Loading embeddings from {json_file_path}...") + with open(json_file_path, 'r', encoding='utf-8') as file: + payloads = json.load(file) + + print(f"Loaded {len(payloads)} payloads") + + # Connect to Qdrant + client = QdrantClient(url=qdrant_url) + + # Check if collection exists + try: + collection_info = client.get_collection(collection_name) + print(f"Collection '{collection_name}' exists") + print(f"Vector size: {collection_info.config.params.vectors.size}") + print(f"Distance metric: {collection_info.config.params.vectors.distance}") + except Exception as e: + if create_if_missing: + print(f"Collection does not exist. Creating '{collection_name}'...") + + # Map distance metric string to enum + distance_map = { + "Cosine": Distance.COSINE, + "Euclid": Distance.EUCLID, + "Dot": Distance.DOT + } + + try: + client.create_collection( + collection_name=collection_name, + vectors_config=VectorParams( + size=vector_size, + distance=distance_map.get(distance_metric, Distance.COSINE) + ) + ) + print(f"Successfully created collection '{collection_name}'") + print(f"Vector size: {vector_size}") + print(f"Distance metric: {distance_metric}") + except Exception as create_error: + print(f"Error creating collection: {create_error}") + return 0 + else: + print(f"Collection does not exist: {e}") + return 0 + + # Prepare points for upload + points = [] + skipped = 0 + + for idx, payload in enumerate(payloads): + # Check if embedding exists + if 'embedding' not in payload or payload['embedding'] is None: + skipped += 1 + continue + + # Extract embedding and create payload without it (Qdrant stores vectors separately) + embedding = payload['embedding'] + + # Create payload for Qdrant (matching reference collection format) + # Only include 'question' and 'question_id' fields + qdrant_payload = { + 'question': payload.get('question', ''), + 'question_id': payload.get('question_id', '') + } + + # Use numeric index as point ID (Qdrant requires unsigned int or UUID) + point_id = idx + + # Create PointStruct + point = PointStruct( + id=point_id, + vector=embedding, + payload=qdrant_payload + ) + + points.append(point) + + print(f"Prepared {len(points)} points for upload (skipped {skipped} without embeddings)") + + # Upload in batches + total_uploaded = 0 + for i in range(0, len(points), batch_size): + batch = points[i:i + batch_size] + try: + client.upsert( + collection_name=collection_name, + points=batch + ) + total_uploaded += len(batch) + print(f"Uploaded batch {i // batch_size + 1}: {total_uploaded}/{len(points)} points") + except Exception as e: + print(f"Error uploading batch {i // batch_size + 1}: {e}") + # Continue with next batch even if one fails + + print(f"\nUpload completed!") + print(f"Total points uploaded: {total_uploaded}") + print(f"Total points skipped: {skipped}") + + return total_uploaded + +if __name__ == "__main__": + """ + Main execution block. Uncomment the function you want to run. + All configuration is set at the top of the file in the CONFIGURATION section. 
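A typical first run is
+    run_question_generation_pipeline() -> generate_embeddings() ->
+    upload_embeddings_to_qdrant(); the target collection is created
+    automatically during upload if it does not exist.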
+ """ + + # ======================================================================== + # Option 1: Display existing records from Qdrant collection + # ======================================================================== + # connect_and_display_records() + + # ======================================================================== + # Option 2: Create a new Qdrant collection + # ======================================================================== + # create_bdc_collection() + + # ======================================================================== + # Option 3: Generate questions using LLM (full pipeline) + # ======================================================================== + # run_question_generation_pipeline() + # ======================================================================== + # Option 4: Test question generation with limited data + # ======================================================================== + # test_question_generation_pipeline() -collect_name = 'test_collection2' -create_a_collection_qdrant(collect_name) -records = read_vector_embed(collect_name) -#insert_data_qdrant(records) + # ======================================================================== + # Option 5: Generate embeddings for questions + # ======================================================================== + # generate_embeddings() + # ======================================================================== + # Option 6: Upload embeddings to Qdrant + # ======================================================================== + upload_embeddings_to_qdrant() \ No newline at end of file