139 changes: 139 additions & 0 deletions STREAMLIT_UPDATE_PROMPT.md
@@ -0,0 +1,139 @@
# Streamlit App Update: Support Timestamp-Enabled OpenSearch Index

## Context

A new timestamp-enabled OpenSearch index has been created alongside the existing index. This new index includes timestamp information for each chunk, allowing the RAG system to generate timestamped YouTube URLs that open an episode at the exact moment a relevant snippet occurs.

## Index Structure

The timestamp-enabled index has all the same fields as the standard index, plus these additional fields:

- **`youtube_url`** (keyword): Full YouTube URL (e.g., `https://www.youtube.com/watch?v=BVM6TUSfn3E`)
- **`youtube_video_id`** (keyword): Just the video ID (e.g., `BVM6TUSfn3E`)
- **`timestamp`** (integer): Chunk start time in seconds (e.g., 1296 for 21:36)
- **`chunk_index`** (integer): Position of chunk within the episode
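
For reference, the mapping fragment for just these four fields might look like the following (a sketch for illustration only; the real index also carries all of the standard index's existing fields):

```python
# Additional-field mapping fragment for the timestamp-enabled index.
# Field types match the list above; this is illustrative, not the full mapping.
TIMESTAMP_FIELDS = {
    "youtube_url": {"type": "keyword"},
    "youtube_video_id": {"type": "keyword"},
    "timestamp": {"type": "integer"},
    "chunk_index": {"type": "integer"},
}
```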

## Environment Variables

The timestamp-enabled index name is determined by:
- `INDEX_NAME_WITH_TIMESTAMPS` environment variable (if set)
- Otherwise defaults to `{INDEX_NAME}_timestamps` (e.g., if `INDEX_NAME=transcripts`, then `transcripts_timestamps`)
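
That resolution logic can be sketched as follows (the `transcripts` fallback default is only illustrative):

```python
import os

def resolve_timestamp_index() -> str:
    """Resolve the timestamp-enabled index name from the environment."""
    explicit = os.getenv("INDEX_NAME_WITH_TIMESTAMPS")
    if explicit:
        return explicit
    # Otherwise fall back to "{INDEX_NAME}_timestamps"
    return f"{os.getenv('INDEX_NAME', 'transcripts')}_timestamps"
```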

## Required Changes

### 1. Update Index Name Configuration

Add support for selecting which index to use (standard or timestamp-enabled). You can either:
- Add a configuration option/environment variable to choose the index
- Or add a toggle in the Streamlit UI to switch between indices
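
A minimal sketch of the UI-toggle approach (the widget label is hypothetical, and `st.toggle` is assumed to be available in your Streamlit version):

```python
def select_index(base_index: str, use_timestamps: bool) -> str:
    """Pick the standard or timestamp-enabled index by name."""
    return f"{base_index}_timestamps" if use_timestamps else base_index

# In the Streamlit app:
# use_ts = st.sidebar.toggle("Jump to exact moment", value=True)
# index_name = select_index(base_index, use_ts)
```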

### 2. Update Search Query to Include Timestamp Fields

When querying the timestamp-enabled index, ensure your search results include the new fields:

```python
# Example: Include timestamp fields in the _source
search_body = {
    "query": {...},
    "_source": [
        "title", "url", "content",
        "youtube_url", "youtube_video_id", "timestamp", "chunk_index",
        ...
    ]
}
```

### 3. Construct Timestamped YouTube URLs

When displaying episode URLs in search results, check if `timestamp` and `youtube_video_id` are available. If they are, construct a timestamped URL:

**URL Format:**
- Base YouTube URL: `https://www.youtube.com/watch?v={video_id}`
- Timestamped URL: `https://youtu.be/{video_id}?t={timestamp}`

**Example:**
- Video ID: `BVM6TUSfn3E`
- Timestamp: `1296` (seconds)
- Result: `https://youtu.be/BVM6TUSfn3E?t=1296`

**Implementation:**
```python
def build_episode_url(hit):
    """Build episode URL with timestamp if available."""
    source = hit.get('_source', {})
    url = source.get('url', '')
    youtube_video_id = source.get('youtube_video_id')
    timestamp = source.get('timestamp')

    # If we have a YouTube video ID and a timestamp, use the timestamped URL
    if youtube_video_id and timestamp is not None:
        return f"https://youtu.be/{youtube_video_id}?t={timestamp}"

    # Otherwise, fall back to the original episode URL
    return url
```

### 4. Handle Missing Timestamps Gracefully

Some chunks may not have timestamps (e.g., old transcripts without Whisper segments). Always check if `timestamp` is not None before constructing timestamped URLs:

```python
if timestamp is not None:
    link = f"https://youtu.be/{youtube_video_id}?t={timestamp}"  # timestamped URL
else:
    link = url  # regular episode URL
```

### 5. Display Timestamps in UI (Optional Enhancement)

Consider displaying the timestamp in a human-readable format alongside the URL:

```python
def format_timestamp(seconds):
    """Convert seconds to MM:SS or HH:MM:SS format."""
    if seconds is None:
        return None
    hours = seconds // 3600
    minutes = (seconds % 3600) // 60
    secs = seconds % 60
    if hours > 0:
        return f"{hours}:{minutes:02d}:{secs:02d}"
    return f"{minutes}:{secs:02d}"

# In your UI (check against None so a timestamp of 0 seconds still renders):
if timestamp is not None:
    st.write(f"⏱️ {format_timestamp(timestamp)}")
```

## Example Search Result Structure

When querying the timestamp-enabled index, search results will have this structure:

```python
{
    "_source": {
        "title": "Episode Title",
        "url": "https://rss.com/podcasts/vector-podcast/599924",
        "content": "Chunk text content...",
        "youtube_url": "https://www.youtube.com/watch?v=BVM6TUSfn3E",  # May be None
        "youtube_video_id": "BVM6TUSfn3E",  # May be None
        "timestamp": 1296,  # Integer seconds, may be None
        "chunk_index": 5,
        # ... other existing fields
    }
}
```

## Testing Checklist

- [ ] Update index name to use timestamp-enabled index
- [ ] Verify search queries return timestamp fields
- [ ] Test URL construction with timestamps
- [ ] Test fallback to regular URL when timestamp is None
- [ ] Test with episodes that have YouTube URLs
- [ ] Test with episodes that don't have YouTube URLs
- [ ] Verify timestamped URLs open YouTube at correct time
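
The URL-construction items can be smoke-tested without the UI. A minimal sketch, repeating the section 3 helper so it runs standalone:

```python
def build_episode_url(hit):
    """Same logic as section 3, inlined so this sketch is self-contained."""
    source = hit.get("_source", {})
    video_id = source.get("youtube_video_id")
    timestamp = source.get("timestamp")
    if video_id and timestamp is not None:
        return f"https://youtu.be/{video_id}?t={timestamp}"
    return source.get("url", "")

# Timestamped case
hit = {"_source": {"url": "https://rss.com/e/1", "youtube_video_id": "BVM6TUSfn3E", "timestamp": 1296}}
assert build_episode_url(hit) == "https://youtu.be/BVM6TUSfn3E?t=1296"

# Fallback when the chunk has no YouTube data
hit = {"_source": {"url": "https://rss.com/e/1", "youtube_video_id": None, "timestamp": None}}
assert build_episode_url(hit) == "https://rss.com/e/1"
```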

## Notes

- The timestamp-enabled index can coexist with the standard index
- Both indices can be queried independently
- The standard index remains unchanged and continues to work as before
- Episodes without Whisper segments will have `timestamp: None`
- Episodes without YouTube URLs in description will have `youtube_url: None` and `youtube_video_id: None`

18 changes: 18 additions & 0 deletions rewrite_rules.json
@@ -0,0 +1,18 @@
{
  "corner shortened": "Connor Shorten",
  "Dimeji Conan": "Dmitry Kan",
  "Dimitri Can": "Dmitry Kan",
  "Dmitri Khan": "Dmitry Kan",
  "Dimitri": "Dmitry",
  "Mietri": "Dmitry",
  "Leo Boyzov": "Leo Boytsov",
  "Cupid": "Quepid",
  "Daniel Tungerling": "Daniel Tunkelang",
  "Daniel Tungaling": "Daniel Tunkelang",
  "VV8": "Weaviate",
  "Weeveate": "Weaviate",
  "Pine Cone": "Pinecone",
  "FAYS": "FAISS",
  "Yanny Vaknin": "Yaniv Vaknin",
  "deep set": "Deepset"
}
40 changes: 40 additions & 0 deletions src/add_title_keyword_field.py
@@ -0,0 +1,40 @@
"""
Add keyword subfield to title field in existing OpenSearch index.
This allows aggregations on the title field.
"""

import os
from dotenv import load_dotenv
from opensearchpy import OpenSearch

load_dotenv()

connection_string = os.getenv("OPENSEARCH_SERVICE_URI")
index_name = os.getenv("INDEX_NAME", "embedded_vp_transcripts")
client = OpenSearch(connection_string, use_ssl=True, timeout=100)

# Update mapping to add keyword subfield to title
# Update mapping to add keyword subfield to title
mapping_update = {
    "properties": {
        "title": {
            "type": "text",
            "fields": {
                "keyword": {"type": "keyword"}
            }
        }
    }
}

print(f"Updating mapping for index: {index_name}")
try:
    response = client.indices.put_mapping(
        index=index_name,
        body=mapping_update,
    )
    print(f"✅ Mapping updated successfully: {response}")
    print("\nNote: The keyword subfield will only be available for new documents.")
    print("To make it available for existing documents, you'll need to reindex.")
except Exception as e:
    print(f"❌ Error updating mapping: {e}")
    raise

185 changes: 185 additions & 0 deletions src/apply_rewrite_rules.py
@@ -0,0 +1,185 @@
"""Apply rewrite rules to transcript files to correct common transcription errors."""

import json
import pathlib
import re
from typing import Dict

import frontmatter
import typer
import typing_extensions
from rich.progress import track

app = typer.Typer()


def load_rewrite_rules(rules_file: pathlib.Path) -> Dict[str, str]:
    """Load rewrite rules from a JSON file."""
    if not rules_file.exists():
        typer.echo(f"Error: Rules file not found: {rules_file}", err=True)
        raise typer.Exit(1)

    try:
        with open(rules_file, "r", encoding="utf-8") as f:
            rules = json.load(f)
        return rules
    except json.JSONDecodeError as e:
        typer.echo(f"Error: Invalid JSON in rules file: {e}", err=True)
        raise typer.Exit(1)


def apply_rewrite_rules(text: str, rules: Dict[str, str]) -> str:
    """
    Apply rewrite rules to text using case-insensitive matching with word boundaries.

    Args:
        text: The text to process
        rules: Dictionary mapping incorrect text to correct text

    Returns:
        Text with rewrite rules applied
    """
    result = text

    for incorrect, correct in rules.items():
        # Build a case-insensitive pattern with word boundaries. For
        # multi-word phrases, spaces become \s+ so flexible whitespace
        # (including line breaks) still matches.
        escaped = re.escape(incorrect)
        # re.escape() no longer escapes spaces on Python 3.7+, so handle
        # both the escaped ("\ ") and plain (" ") forms.
        pattern = escaped.replace(r"\ ", r"\s+").replace(" ", r"\s+")
        # Word boundaries only at the start and end of the phrase
        pattern = r"\b" + pattern + r"\b"
        result = re.sub(pattern, correct, result, flags=re.IGNORECASE)

    return result


def process_transcript_file(
    input_file: pathlib.Path,
    output_file: pathlib.Path,
    rules: Dict[str, str],
) -> bool:
    """
    Process a single transcript file by applying rewrite rules.

    Args:
        input_file: Path to input transcript file
        output_file: Path to output transcript file
        rules: Dictionary of rewrite rules

    Returns:
        True if successful, False otherwise
    """
    try:
        # Read and parse the file with frontmatter
        content = input_file.read_text(encoding="utf-8")
        post = frontmatter.loads(content)

        # Apply rewrite rules to the content (not the frontmatter metadata)
        corrected_content = apply_rewrite_rules(post.content, rules)

        # Update the post with corrected content
        post.content = corrected_content

        # Ensure output directory exists
        output_file.parent.mkdir(parents=True, exist_ok=True)

        # Write the corrected file
        output_file.write_text(frontmatter.dumps(post), encoding="utf-8")

        return True
    except Exception as e:
        typer.echo(f"Error processing {input_file}: {e}", err=True)
        return False


@app.command()
def apply_rules(
    input_dir: typing_extensions.Annotated[
        pathlib.Path,
        typer.Option("--input-dir", "-i", help="Input directory containing transcript files"),
    ] = None,
    output_dir: typing_extensions.Annotated[
        pathlib.Path,
        typer.Option("--output-dir", "-o", help="Output directory for corrected transcript files"),
    ] = None,
    rules_file: typing_extensions.Annotated[
        pathlib.Path,
        typer.Option("--rules-file", "-r", help="Path to rewrite rules JSON file"),
    ] = pathlib.Path("rewrite_rules.json"),
    with_timestamps: typing_extensions.Annotated[
        bool,
        typer.Option("--with-timestamps", help="Process transcripts_with_timestamps/ directory instead of transcripts/"),
    ] = False,
    in_place: typing_extensions.Annotated[
        bool,
        typer.Option("--in-place", help="Overwrite input files instead of writing to output directory"),
    ] = False,
):
    """
    Apply rewrite rules to transcript files.

    Processes all .md files in the input directory recursively and writes
    corrected versions to the output directory, preserving directory structure.

    If --with-timestamps is used, defaults to transcripts_with_timestamps/ directory.
    If --in-place is used, files are overwritten in the input directory.
    """
    # Set default directories based on the with_timestamps flag
    if input_dir is None:
        input_dir = pathlib.Path("transcripts_with_timestamps" if with_timestamps else "transcripts")

    if output_dir is None:
        if in_place:
            output_dir = input_dir
        else:
            # Default output: add a _corrected suffix
            output_dir = pathlib.Path(f"{input_dir}_corrected")

    # Validate the input directory
    if not input_dir.exists():
        typer.echo(f"Error: Input directory does not exist: {input_dir}", err=True)
        raise typer.Exit(1)

    if not input_dir.is_dir():
        typer.echo(f"Error: Input path is not a directory: {input_dir}", err=True)
        raise typer.Exit(1)

    # Load rewrite rules
    typer.echo(f"Loading rewrite rules from {rules_file}")
    rules = load_rewrite_rules(rules_file)
    typer.echo(f"Loaded {len(rules)} rewrite rules")

    # Find all markdown files recursively
    md_files = list(input_dir.rglob("*.md"))

    if not md_files:
        typer.echo(f"No .md files found in {input_dir}")
        return

    typer.echo(f"Found {len(md_files)} transcript files to process")

    # Process each file, preserving the directory structure
    successful = 0
    failed = 0

    for input_file in track(md_files, description="Processing files"):
        relative_path = input_file.relative_to(input_dir)
        output_file = output_dir / relative_path

        if process_transcript_file(input_file, output_file, rules):
            successful += 1
        else:
            failed += 1

    typer.echo("\nProcessing complete:")
    typer.echo(f"  ✓ Successfully processed: {successful}")
    if failed > 0:
        typer.echo(f"  ✗ Failed: {failed}", err=True)


if __name__ == "__main__":
    app()
