feat(issue-5): Implement Training Dataset backend with vector embeddings #73
The controller, `TrainingDataset.controller.ts` (new file, 243 lines):

```ts
import { Request, Response, NextFunction } from 'express'
import TrainingDatasetService from './TrainingDataset.service'
import * as multer from 'multer'
import * as path from 'path'
import * as os from 'os'

// Configure multer for file uploads
const uploadDir = path.join(os.tmpdir(), 'training-datasets')
const storage = multer.diskStorage({
  destination: (req, file, cb) => {
    cb(null, uploadDir)
  },
  filename: (req, file, cb) => {
    const timestamp = Date.now()
    const random = Math.random().toString(36).substring(7)
    cb(null, `${timestamp}-${random}-${file.originalname}`)
  },
})

const upload = multer({
  storage,
  limits: { fileSize: 100 * 1024 * 1024 }, // 100MB max
  fileFilter: (req, file, cb) => {
    const allowedMimes = ['text/csv', 'application/json', 'text/markdown', 'text/plain']
    const allowedExts = ['.csv', '.json', '.md', '.markdown', '.txt']

    const ext = path.extname(file.originalname).toLowerCase()
    const isMimeAllowed = allowedMimes.includes(file.mimetype)
    const isExtAllowed = allowedExts.includes(ext)

    if (isMimeAllowed || isExtAllowed) {
      cb(null, true)
    } else {
      cb(new Error(`File type not supported. Allowed: ${allowedExts.join(', ')}`))
    }
  },
})

class TrainingDatasetController {
  /**
   * POST /api/v1/training-datasets/upload
   * Upload and process a training dataset file
   */
  async uploadDataset(req: Request, res: Response, next: NextFunction): Promise<void> {
    try {
      const userId = req.user?.id
      if (!userId) {
        res.status(401).json({ error: 'Unauthorized' })
        return
      }

      if (!req.file) {
        res.status(400).json({ error: 'No file provided' })
        return
      }

      const { name, description } = req.body

      if (!name) {
        res.status(400).json({ error: 'Dataset name is required' })
        return
      }

      // Determine file type from extension
      const ext = path.extname(req.file.originalname).toLowerCase()
      let fileType: 'csv' | 'json' | 'markdown' | 'text'

      switch (ext) {
        case '.csv':
          fileType = 'csv'
          break
        case '.json':
          fileType = 'json'
          break
        case '.md':
        case '.markdown':
          fileType = 'markdown'
          break
        case '.txt':
          fileType = 'text'
          break
        default:
          res.status(400).json({ error: 'Unsupported file type' })
          return
      }

      // Create dataset record
      const dataset = await TrainingDatasetService.createDataset(
        userId,
        req.file.originalname,
        fileType,
        req.file.size,
        name,
        description
      )

      // Process dataset asynchronously
      TrainingDatasetService.processDataset(dataset._id.toString(), userId, req.file.path).catch(error => {
        console.error('Error processing dataset:', error)
      })

      res.status(201).json({
        success: true,
        data: dataset,
        message: 'Dataset uploaded successfully and is being processed',
      })
    } catch (error: any) {
      console.error('Error uploading dataset:', error)
      res.status(500).json({ error: error.message || 'Failed to upload dataset' })
    }
  }

  /**
   * GET /api/v1/training-datasets
   * Get all datasets for a user
   */
  async getDatasets(req: Request, res: Response, next: NextFunction): Promise<void> {
    try {
      const userId = req.user?.id
      if (!userId) {
        res.status(401).json({ error: 'Unauthorized' })
        return
      }

      const { skip = 0, limit = 20 } = req.query

      const result = await TrainingDatasetService.getDatasets(
        userId,
        parseInt(skip as string) || 0,
        parseInt(limit as string) || 20
      )

      res.status(200).json({
        success: true,
        data: result.datasets,
        pagination: {
          skip: parseInt(skip as string) || 0,
          limit: parseInt(limit as string) || 20,
          total: result.total,
        },
      })
```
Contributor commented on lines +125 to +141:

The pagination parameters are parsed twice, once for the service call and again for the response body. Parse them once and reuse the values:

```ts
const skip = parseInt(req.query.skip as string, 10) || 0
const limit = parseInt(req.query.limit as string, 10) || 20

const result = await TrainingDatasetService.getDatasets(userId, skip, limit)

res.status(200).json({
  success: true,
  data: result.datasets,
  pagination: {
    skip,
    limit,
    total: result.total,
  },
})
```
```ts
    } catch (error) {
      console.error('Error fetching datasets:', error)
      res.status(500).json({ error: 'Failed to fetch datasets' })
    }
  }

  /**
   * GET /api/v1/training-datasets/:id
   * Get dataset by ID
   */
  async getDataset(req: Request, res: Response, next: NextFunction): Promise<void> {
    try {
      const userId = req.user?.id
      const { id } = req.params

      if (!userId) {
        res.status(401).json({ error: 'Unauthorized' })
        return
      }

      const dataset = await TrainingDatasetService.getDataset(id, userId)

      if (!dataset) {
        res.status(404).json({ error: 'Dataset not found' })
        return
      }

      res.status(200).json({
        success: true,
        data: dataset,
      })
    } catch (error) {
      console.error('Error fetching dataset:', error)
      res.status(500).json({ error: 'Failed to fetch dataset' })
    }
  }

  /**
   * DELETE /api/v1/training-datasets/:id
   * Delete dataset
   */
  async deleteDataset(req: Request, res: Response, next: NextFunction): Promise<void> {
    try {
      const userId = req.user?.id
      const { id } = req.params
      const { deleteSamples = false } = req.query

      if (!userId) {
        res.status(401).json({ error: 'Unauthorized' })
        return
      }

      const success = await TrainingDatasetService.deleteDataset(id, userId, deleteSamples === 'true')

      if (!success) {
        res.status(404).json({ error: 'Dataset not found' })
        return
      }

      res.status(200).json({
        success: true,
        message: 'Dataset deleted successfully',
      })
    } catch (error) {
      console.error('Error deleting dataset:', error)
      res.status(500).json({ error: 'Failed to delete dataset' })
    }
  }

  /**
   * GET /api/v1/training-datasets/stats
   * Get dataset statistics
   */
  async getStatistics(req: Request, res: Response, next: NextFunction): Promise<void> {
    try {
      const userId = req.user?.id
      if (!userId) {
        res.status(401).json({ error: 'Unauthorized' })
        return
      }

      const stats = await TrainingDatasetService.getStatistics(userId)

      res.status(200).json({
        success: true,
        data: stats,
      })
    } catch (error) {
      console.error('Error fetching statistics:', error)
      res.status(500).json({ error: 'Failed to fetch statistics' })
    }
  }

  /**
   * Get multer upload middleware
   */
  getUploadMiddleware() {
    return upload.single('file')
  }
}

export default new TrainingDatasetController()
```
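For a quick check of the upload flow, here is a minimal sketch using supertest; it assumes the router is mounted at `/api/v1/training-datasets` and that auth is stubbed so `req.user` is populated. The `app` import, fixture path, and field values are hypothetical, not part of this PR:

```ts
import request from 'supertest'
import app from '../app' // hypothetical app entry point

async function smokeTestUpload(): Promise<void> {
  const res = await request(app)
    .post('/api/v1/training-datasets/upload')
    .field('name', 'FAQ corpus') // required: uploadDataset rejects a missing name with 400
    .field('description', 'Example upload') // optional
    .attach('file', 'fixtures/faq.csv') // must pass the multer fileFilter (.csv is allowed)

  // Expect 201 with the created dataset record; processing continues asynchronously.
  console.log(res.status, res.body)
}
```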
The Mongoose model (new file, 87 lines):

```ts
import { Schema, Document, Types, model } from 'mongoose'

export interface ITrainingDataset extends Document {
  userId: Types.ObjectId
  name: string
  description?: string
  fileName: string
  fileType: 'csv' | 'json' | 'markdown' | 'text'
  fileSize: number // in bytes
  sampleCount: number
  status: 'pending' | 'processing' | 'completed' | 'failed'
  errorMessage?: string
  importedAt?: Date
  isActive: boolean
  metadata?: {
    headers?: string[]
    delimiter?: string
    encoding?: string
  }
  createdAt: Date
  updatedAt: Date
}

const TrainingDatasetSchema = new Schema<ITrainingDataset>(
  {
    userId: {
      type: Schema.Types.ObjectId,
      ref: 'User',
      required: true,
    },
    name: {
      type: String,
      required: true,
      maxlength: 255,
    },
    description: {
      type: String,
      maxlength: 1000,
    },
    fileName: {
      type: String,
      required: true,
      index: true,
    },
    fileType: {
      type: String,
      enum: ['csv', 'json', 'markdown', 'text'],
      required: true,
    },
    fileSize: {
      type: Number,
      required: true,
    },
    sampleCount: {
      type: Number,
      default: 0,
    },
    status: {
      type: String,
      enum: ['pending', 'processing', 'completed', 'failed'],
      default: 'pending',
      index: true,
    },
    errorMessage: String,
    importedAt: Date,
    isActive: {
      type: Boolean,
      default: true,
      index: true,
    },
    metadata: {
      headers: [String],
      delimiter: String,
      encoding: String,
    },
  },
  {
    timestamps: true,
  }
)

// Compound indices for common queries
TrainingDatasetSchema.index({ userId: 1, status: 1 })
TrainingDatasetSchema.index({ userId: 1, isActive: 1 })
TrainingDatasetSchema.index({ userId: 1, createdAt: -1 })

export const TrainingDataset = model<ITrainingDataset>('TrainingDataset', TrainingDatasetSchema)
```
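As a usage note, the compound indices line up with the listing pattern in the controller. A minimal sketch of the kind of query the `{ userId: 1, createdAt: -1 }` index can serve (illustrative only; the real queries live in `TrainingDataset.service`, which is not in this diff, and the import path is assumed):

```ts
import { Types } from 'mongoose'
import { TrainingDataset } from './TrainingDataset.model' // assumed file name

async function listRecentDatasets(userId: string, skip = 0, limit = 20) {
  // Filter by owner, newest first; the { userId: 1, createdAt: -1 } compound
  // index covers the equality-plus-sort shape of this query.
  return TrainingDataset.find({ userId: new Types.ObjectId(userId) })
    .sort({ createdAt: -1 })
    .skip(skip)
    .limit(limit)
    .lean()
}
```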
The routes (new file, 52 lines):

```ts
import { Router } from 'express'
import TrainingDatasetController from './TrainingDataset.controller'

const router = Router()

/**
 * Training Dataset Routes
 * All routes require authentication
 */

/**
 * POST /api/v1/training-datasets/upload
 * Upload a training dataset file
 * Supports: CSV, JSON, Markdown, Text files
 * Max file size: 100MB
 */
router.post(
  '/upload',
  TrainingDatasetController.getUploadMiddleware(),
  TrainingDatasetController.uploadDataset.bind(TrainingDatasetController)
)

/**
 * GET /api/v1/training-datasets
 * Get all datasets for the user
 * Query params:
 *  - skip: number (default: 0)
 *  - limit: number (default: 20)
 */
router.get('/', TrainingDatasetController.getDatasets.bind(TrainingDatasetController))

/**
 * GET /api/v1/training-datasets/stats
 * Get statistics about datasets
 */
router.get('/stats', TrainingDatasetController.getStatistics.bind(TrainingDatasetController))

/**
 * GET /api/v1/training-datasets/:id
 * Get a single dataset
 */
router.get('/:id', TrainingDatasetController.getDataset.bind(TrainingDatasetController))

/**
 * DELETE /api/v1/training-datasets/:id
 * Delete a dataset
 * Query params:
 *  - deleteSamples: boolean (default: false) - whether to delete associated samples
 */
router.delete('/:id', TrainingDatasetController.deleteDataset.bind(TrainingDatasetController))

export default router
```
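Finally, a sketch of how this router might be wired into the app. The mount point follows the `/api/v1/training-datasets` paths in the route comments, and the `authenticate` middleware is an assumption based on "All routes require authentication"; it stands in for whatever populates `req.user` for the controller's checks and is not shown in this PR:

```ts
import * as express from 'express'
import trainingDatasetRouter from './TrainingDataset.routes' // assumed file name
import { authenticate } from '../middleware/auth' // hypothetical auth middleware

const app = express()

// Mount behind authentication so every handler can rely on req.user being set.
app.use('/api/v1/training-datasets', authenticate, trainingDatasetRouter)

export default app
```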