Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CLAUDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,7 @@ Phase 3 migration complete: all 28 routers moved from `app/routers/` to domain m
| `modules/chat/` | `router.py` | `app/routers/chat.py` |
| `modules/llm/` | `router.py` | `app/routers/llm.py` |
| `modules/google/` | `router.py` (+ `callback_router`) | `app/routers/google.py` |
| `modules/knowledge/` | `router_google_drive.py` | Google Drive RAG sync (`/admin/google-drive/*`) |

**Phase 4 routers** (extracted from `orchestrator.py`, not from `app/routers/`):

Expand Down
9 changes: 9 additions & 0 deletions admin/src/plugins/i18n.ts
Original file line number Diff line number Diff line change
Expand Up @@ -765,6 +765,9 @@ const messages = {
adminUsersList: "Администраторы бота",
adminUsersDesc: "Пользователи с доступом к команде /status",
noAdmins: "Нет администраторов",
botUsers: "Пользователи бота",
noUsers: "Ещё нет пользователей",
userSince: "с",
activeSessions: "Активные сессии",
sessionsCount: "активных",
clearSessions: "Очистить",
Expand Down Expand Up @@ -2119,6 +2122,9 @@ const messages = {
adminUsersList: "Bot administrators",
adminUsersDesc: "Users with access to /status command",
noAdmins: "No administrators",
botUsers: "Bot users",
noUsers: "No users yet",
userSince: "since",
activeSessions: "Active sessions",
sessionsCount: "active",
clearSessions: "Clear",
Expand Down Expand Up @@ -3473,6 +3479,9 @@ const messages = {
adminUsersList: "Бот әкімшілері",
adminUsersDesc: "/status командасына рұқсаты бар пайдаланушылар",
noAdmins: "Әкімшілер жоқ",
botUsers: "Бот пайдаланушылары",
noUsers: "Пайдаланушылар жоқ",
userSince: "бастап",
activeSessions: "Белсенді сессиялар",
sessionsCount: "белсенді",
clearSessions: "Тазалау",
Expand Down
38 changes: 38 additions & 0 deletions admin/src/views/TelegramView.vue
Original file line number Diff line number Diff line change
Expand Up @@ -997,6 +997,44 @@ watch(instances, (newInstances) => {

<!-- Users Tab -->
<template v-if="activeTab === 'users'">
<!-- Bot Users List -->
<div class="bg-card rounded-xl border border-border p-4">
<div class="flex items-center justify-between mb-4">
<h3 class="font-medium">{{ t('telegram.botUsers') }} ({{ sessions.length }})</h3>
</div>
<div v-if="sessions.length === 0" class="text-center py-8 text-muted-foreground">
{{ t('telegram.noUsers') }}
</div>
<div v-else class="space-y-2">
<div
v-for="session in sessions"
:key="session.user_id"
class="flex items-center justify-between p-3 bg-secondary rounded-lg"
>
<div class="flex items-center gap-3">
<div class="w-9 h-9 rounded-full bg-primary/20 text-primary flex items-center justify-center text-sm font-medium shrink-0">
{{ (session.first_name || session.username || 'U')[0].toUpperCase() }}
</div>
<div>
<p class="font-medium">
{{ session.first_name || session.username || `User ${session.user_id}` }}
<span v-if="session.last_name"> {{ session.last_name }}</span>
</p>
<p class="text-sm text-muted-foreground">
ID: {{ session.user_id }}
<span v-if="session.username"> · @{{ session.username }}</span>
</p>
</div>
</div>
<div class="text-right text-xs text-muted-foreground">
<p v-if="session.updated">{{ new Date(session.updated).toLocaleDateString() }}</p>
<p v-if="session.created" class="opacity-60">{{ t('telegram.userSince') }} {{ new Date(session.created).toLocaleDateString() }}</p>
</div>
</div>
</div>
</div>

<!-- Access Settings -->
<div class="bg-card rounded-xl border border-border p-4">
<h3 class="font-medium mb-2">{{ t('telegram.allowedUsersList') }}</h3>
<p class="text-sm text-muted-foreground mb-3">{{ t('telegram.allowedUsersDesc') }}</p>
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
"""add google_drive_projects table

Revision ID: 8338ee0a5c4d
Revises: ed1d201ecb55
Create Date: 2026-03-25 21:33:31.416445
"""

from typing import Sequence, Union

import sqlalchemy as sa

from alembic import op


revision: str = "8338ee0a5c4d"
down_revision: Union[str, None] = "ed1d201ecb55"
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None


def upgrade() -> None:
    """Create the ``google_drive_projects`` table.

    Columns are declared up front, in the order they appear in the table,
    and then handed to ``op.create_table`` in a single call.
    """
    columns = [
        sa.Column("id", sa.Integer(), primary_key=True, autoincrement=True),
        # Owning user; rows are removed together with the user.
        sa.Column(
            "user_id",
            sa.Integer(),
            sa.ForeignKey("users.id", ondelete="CASCADE"),
            nullable=False,
        ),
        sa.Column("name", sa.String(200), nullable=False),
        # Drive folder reference; defaults to the Drive root.
        sa.Column("folder_id", sa.String(200), nullable=False, server_default="root"),
        sa.Column("folder_name", sa.String(500), nullable=True),
        # Optional link to a knowledge collection.
        sa.Column(
            "collection_id",
            sa.Integer(),
            sa.ForeignKey("knowledge_collections.id"),
            nullable=True,
        ),
        # Sync bookkeeping.
        sa.Column("sync_status", sa.String(20), server_default="idle"),
        sa.Column("sync_error", sa.Text(), nullable=True),
        sa.Column("last_synced", sa.DateTime(), nullable=True),
        sa.Column("file_count", sa.Integer(), server_default="0"),
        sa.Column("total_size_bytes", sa.Integer(), server_default="0"),
        sa.Column("include_mime_types", sa.Text(), nullable=True),
        # Workspace scoping; the server default is workspace 1.
        sa.Column(
            "workspace_id",
            sa.Integer(),
            sa.ForeignKey("workspaces.id"),
            nullable=False,
            server_default="1",
        ),
        sa.Column("created", sa.DateTime(), server_default=sa.text("CURRENT_TIMESTAMP")),
        sa.Column("updated", sa.DateTime(), server_default=sa.text("CURRENT_TIMESTAMP")),
    ]
    op.create_table("google_drive_projects", *columns)


def downgrade() -> None:
    """Revert this revision by dropping the ``google_drive_projects`` table."""
    op.drop_table("google_drive_projects")
247 changes: 247 additions & 0 deletions app/services/google_drive_sync_service.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,247 @@
"""Google Drive sync service — downloads files from Drive folder, converts to markdown."""

import logging
import re
from collections import deque
from pathlib import Path

import httpx


logger = logging.getLogger(__name__)

# Google Apps MIME types that can be exported.
# Maps the Drive MIME type to a tuple of
# (export MIME type requested from Drive, file extension, source type label).
EXPORTABLE_TYPES = {
    "application/vnd.google-apps.document": ("text/plain", ".txt", "google_docs"),
    "application/vnd.google-apps.spreadsheet": (
        "text/csv",
        ".csv",
        "google_sheets",
    ),
    "application/vnd.google-apps.presentation": (
        "text/plain",
        ".txt",
        "google_slides",
    ),
}

# Regular (non Google Apps) file types we can read as text.
# Entries are lowercase and include the leading dot, because they are
# compared against the lowercased ``Path(name).suffix`` of a Drive file.
TEXT_EXTENSIONS = {
    ".txt",
    ".md",
    ".csv",
    ".json",
    ".xml",
    ".html",
    ".yml",
    ".yaml",
    ".toml",
    ".ini",
    ".cfg",
    ".log",
    ".py",
    ".js",
    ".ts",
    ".vue",
    ".jsx",
    ".tsx",
    ".css",
    ".scss",
    ".sql",
    ".sh",
    ".bash",
    ".rb",
    ".go",
    ".rs",
    ".java",
    ".kt",
    ".c",
    ".cpp",
    ".h",
    ".hpp",
}

# Size limit in bytes for a regular file; larger files are skipped.
MAX_FILE_SIZE = 512 * 1024  # 512 KB per file


async def list_drive_folder_recursive(
    access_token: str,
    folder_id: str = "root",
    max_files: int = 500,
) -> list[dict]:
    """List files under a Drive folder, breadth-first, up to ``max_files``.

    Sub-folders are queued and scanned after the folder that contains them.
    Folder entries themselves are never returned, only the files inside them.

    Args:
        access_token: OAuth bearer token sent in the ``Authorization`` header.
        folder_id: ID of the folder to start from (``"root"`` by default).
        max_files: Maximum number of file entries to collect.

    Returns:
        File metadata dicts as returned by the Drive ``files.list`` endpoint,
        with the fields ``id``, ``name``, ``mimeType``, ``size`` and
        ``modifiedTime`` requested.

    Raises:
        httpx.HTTPStatusError: If Drive answers a listing request with an
            error status.
    """
    files: list[dict] = []
    folders_to_scan = deque([folder_id])
    # IDs already queued or collected, so that an entry reachable through
    # more than one parent is neither scanned nor returned twice.
    seen_ids = {folder_id}
    headers = {"Authorization": f"Bearer {access_token}"}

    async with httpx.AsyncClient(timeout=30) as client:
        while folders_to_scan and len(files) < max_files:
            current_folder = folders_to_scan.popleft()
            # Backslashes and single quotes must be escaped inside a quoted
            # value of a Drive search expression; otherwise an unusual ID
            # would break (or alter) the query.
            escaped_folder = current_folder.replace("\\", "\\\\").replace("'", "\\'")
            page_token = None

            while True:
                params: dict = {
                    "q": f"'{escaped_folder}' in parents and trashed = false",
                    "fields": "nextPageToken, files(id, name, mimeType, size, modifiedTime)",
                    "pageSize": 100,
                }
                if page_token:
                    params["pageToken"] = page_token

                resp = await client.get(
                    "https://www.googleapis.com/drive/v3/files",
                    headers=headers,
                    params=params,
                )
                resp.raise_for_status()
                data = resp.json()

                for f in data.get("files", []):
                    if f["id"] in seen_ids:
                        continue
                    seen_ids.add(f["id"])
                    if f["mimeType"] == "application/vnd.google-apps.folder":
                        folders_to_scan.append(f["id"])
                    else:
                        files.append(f)
                        if len(files) >= max_files:
                            break

                page_token = data.get("nextPageToken")
                if not page_token or len(files) >= max_files:
                    break

    return files


async def download_and_convert_file(
    access_token: str,
    file_info: dict,
) -> tuple[str, str, int] | None:
    """Download a single Drive file and return its text content.

    Google Apps files listed in ``EXPORTABLE_TYPES`` are fetched through the
    Drive export endpoint. Any other file is downloaded as-is, but only when
    its extension is in ``TEXT_EXTENSIONS`` and it does not exceed
    ``MAX_FILE_SIZE``.

    Args:
        access_token: OAuth bearer token sent in the ``Authorization`` header.
        file_info: Drive file metadata; ``id``, ``name`` and ``mimeType`` are
            required, ``size`` is optional.

    Returns:
        ``(title, content, size_bytes)`` where ``size_bytes`` is the UTF-8
        encoded length of ``content``, or ``None`` if the file is skipped or
        cannot be fetched.
    """
    file_id = file_info["id"]
    name = file_info["name"]
    mime_type = file_info["mimeType"]
    export_spec = EXPORTABLE_TYPES.get(mime_type)

    if export_spec is not None:
        # Google Apps files (Docs, Sheets, Slides) — export
        url = f"https://www.googleapis.com/drive/v3/files/{file_id}/export"
        params = {"mimeType": export_spec[0]}
        action = "export"
    else:
        # Regular files — check size and extension before opening any
        # connection, so skipped files cost no HTTP client setup.
        # ``or 0`` also covers a ``size`` key that is present but None.
        size = int(file_info.get("size") or 0)
        if size > MAX_FILE_SIZE:
            logger.info("Skipping %s: too large (%s bytes)", name, size)
            return None

        ext = Path(name).suffix.lower()
        if ext not in TEXT_EXTENSIONS:
            logger.info("Skipping %s: unsupported extension %s", name, ext)
            return None

        url = f"https://www.googleapis.com/drive/v3/files/{file_id}"
        params = {"alt": "media"}
        action = "download"

    async with httpx.AsyncClient(timeout=60) as client:
        resp = await client.get(
            url,
            headers={"Authorization": f"Bearer {access_token}"},
            params=params,
        )

    if resp.status_code != 200:
        logger.warning("Failed to %s %s: %s", action, name, resp.status_code)
        return None

    try:
        content = resp.text
    except Exception as exc:
        # Best effort: an undecodable body means the file is skipped, but
        # leave a trace instead of dropping it silently.
        logger.warning("Failed to decode %s: %s", name, exc)
        return None

    size_bytes = len(content.encode("utf-8"))
    # The metadata check above is bypassed when Drive omits ``size``;
    # enforce the limit on what was actually downloaded as well.
    if export_spec is None and size_bytes > MAX_FILE_SIZE:
        logger.info("Skipping %s: too large (%s bytes)", name, size_bytes)
        return None

    return (name, content, size_bytes)


def _sanitize_filename(name: str) -> str:
"""Convert file name to safe filename for disk."""
name = re.sub(r"[^\w\s\-.]", "_", name)
name = re.sub(r"\s+", "_", name)
return name[:200]


def _unique_markdown_name(title: str, used_names: set[str]) -> str:
    """Return a sanitized ``.md`` filename for ``title`` not yet in ``used_names``.

    The chosen name is recorded in ``used_names`` (case-folded, so names that
    differ only by case are treated as clashes).
    """
    safe_name = _sanitize_filename(title)
    if safe_name.endswith(".md"):
        stem = safe_name[: -len(".md")]
    elif "." in safe_name:
        # Replace the original extension with ".md".
        stem = safe_name.rsplit(".", 1)[0]
    else:
        stem = safe_name
    # Avoid producing a bare ".md" for empty or dot-only names.
    stem = stem or "untitled"

    candidate = f"{stem}.md"
    counter = 2
    while candidate.casefold() in used_names:
        candidate = f"{stem}_{counter}.md"
        counter += 1
    used_names.add(candidate.casefold())
    return candidate


async def sync_drive_folder(
    access_token: str,
    folder_id: str,
    output_dir: str,
    max_files: int = 500,
) -> list[dict]:
    """Sync an entire Drive folder to disk as markdown files.

    The folder is listed recursively, each supported file is downloaded and
    written to ``output_dir`` as ``<name>.md`` with the title as a heading.
    Files from all sub-folders land in one flat directory, so clashing names
    get a numeric suffix (``name_2.md``, ``name_3.md``, ...).

    Args:
        access_token: OAuth bearer token for the Drive API.
        folder_id: ID of the Drive folder to sync.
        output_dir: Directory to write to; created if missing. Existing
            ``*.md`` files in it are removed once the listing has succeeded.
        max_files: Maximum number of Drive files to consider.

    Returns:
        One dict per written document with the keys ``filename``, ``title``,
        ``source_type``, ``file_size_bytes`` and ``section_count``
        (for the DatasetSynced event).
    """
    out_dir = Path(output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    # List first: if the listing fails (e.g. expired token) the exception
    # propagates before the previous sync result has been deleted.
    drive_files = await list_drive_folder_recursive(access_token, folder_id, max_files)
    logger.info(
        "Google Drive sync: found %s files in folder %s", len(drive_files), folder_id
    )

    # Clean old files
    for old_file in out_dir.glob("*.md"):
        old_file.unlink()

    documents: list[dict] = []
    total_size = 0
    used_names: set[str] = set()

    for file_info in drive_files:
        try:
            result = await download_and_convert_file(access_token, file_info)
        except Exception as e:
            # One broken file must not abort the whole sync.
            logger.warning("Error processing %s: %s", file_info.get("name"), e)
            continue
        if not result:
            continue
        title, content, size_bytes = result

        # Determine source type
        export_spec = EXPORTABLE_TYPES.get(file_info["mimeType"])
        source_type = export_spec[2] if export_spec else "google_drive"

        # Write to disk as markdown, wrapped with the title as a heading
        safe_name = _unique_markdown_name(title, used_names)
        md_content = f"# {title}\n\n{content}"
        (out_dir / safe_name).write_text(md_content, encoding="utf-8")

        # Count level-2 and level-3 headings as sections (at least one)
        section_count = len(re.findall(r"^#{2,3}\s+.+$", md_content, re.MULTILINE))

        documents.append(
            {
                "filename": safe_name,
                "title": title,
                "source_type": source_type,
                "file_size_bytes": size_bytes,
                "section_count": max(section_count, 1),
            }
        )
        total_size += size_bytes

    logger.info(
        "Google Drive sync complete: %s documents, %.1f KB total",
        len(documents),
        total_size / 1024,
    )
    return documents
Loading
Loading