diff --git a/LocalMind-Backend/src/api/v1/TrainingSample/TrainingDataset.controller.ts b/LocalMind-Backend/src/api/v1/TrainingSample/TrainingDataset.controller.ts new file mode 100644 index 0000000..f671222 --- /dev/null +++ b/LocalMind-Backend/src/api/v1/TrainingSample/TrainingDataset.controller.ts @@ -0,0 +1,243 @@ +import { Request, Response, NextFunction } from 'express' +import TrainingDatasetService from './TrainingDataset.service' +import * as multer from 'multer' +import * as path from 'path' +import * as os from 'os' + +// Configure multer for file uploads +const uploadDir = path.join(os.tmpdir(), 'training-datasets') +const storage = multer.diskStorage({ + destination: (req, file, cb) => { + cb(null, uploadDir) + }, + filename: (req, file, cb) => { + const timestamp = Date.now() + const random = Math.random().toString(36).substring(7) + cb(null, `${timestamp}-${random}-${file.originalname}`) + }, +}) + +const upload = multer({ + storage, + limits: { fileSize: 100 * 1024 * 1024 }, // 100MB max + fileFilter: (req, file, cb) => { + const allowedMimes = ['text/csv', 'application/json', 'text/markdown', 'text/plain'] + const allowedExts = ['.csv', '.json', '.md', '.markdown', '.txt'] + + const ext = path.extname(file.originalname).toLowerCase() + const isMimeAllowed = allowedMimes.includes(file.mimetype) + const isExtAllowed = allowedExts.includes(ext) + + if (isMimeAllowed || isExtAllowed) { + cb(null, true) + } else { + cb(new Error(`File type not supported. Allowed: ${allowedExts.join(', ')}`)) + } + }, +}) + +class TrainingDatasetController { + /** + * POST /api/v1/training-datasets/upload + * Upload and process a training dataset file + */ + async uploadDataset(req: Request, res: Response, next: NextFunction): Promise { + try { + const userId = req.user?.id + if (!userId) { + res.status(401).json({ error: 'Unauthorized' }) + return + } + + if (!req.file) { + res.status(400).json({ error: 'No file provided' }) + return + } + + const { name, description } = req.body + + if (!name) { + res.status(400).json({ error: 'Dataset name is required' }) + return + } + + // Determine file type from extension + const ext = path.extname(req.file.originalname).toLowerCase() + let fileType: 'csv' | 'json' | 'markdown' | 'text' + + switch (ext) { + case '.csv': + fileType = 'csv' + break + case '.json': + fileType = 'json' + break + case '.md': + case '.markdown': + fileType = 'markdown' + break + case '.txt': + fileType = 'text' + break + default: + res.status(400).json({ error: 'Unsupported file type' }) + return + } + + // Create dataset record + const dataset = await TrainingDatasetService.createDataset( + userId, + req.file.originalname, + fileType, + req.file.size, + name, + description + ) + + // Process dataset asynchronously + TrainingDatasetService.processDataset(dataset._id.toString(), userId, req.file.path).catch(error => { + console.error('Error processing dataset:', error) + }) + + res.status(201).json({ + success: true, + data: dataset, + message: 'Dataset uploaded successfully and is being processed', + }) + } catch (error: any) { + console.error('Error uploading dataset:', error) + res.status(500).json({ error: error.message || 'Failed to upload dataset' }) + } + } + + /** + * GET /api/v1/training-datasets + * Get all datasets for a user + */ + async getDatasets(req: Request, res: Response, next: NextFunction): Promise { + try { + const userId = req.user?.id + if (!userId) { + res.status(401).json({ error: 'Unauthorized' }) + return + } + + const { skip = 0, limit = 20 } = 
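
A note on the multer configuration above: when `destination` is supplied as a callback, multer does not create the directory itself (it only does that when `destination` is a plain string), so the first upload after a fresh boot of the host can fail with `ENOENT` because the `os.tmpdir()` subdirectory does not exist yet. A minimal guard next to the `uploadDir` declaration is shown below; it is also prudent to pass `file.originalname` through `path.basename()` in the `filename` callback, since that string is client-controlled.

```ts
import * as fs from 'fs'
import * as os from 'os'
import * as path from 'path'

const uploadDir = path.join(os.tmpdir(), 'training-datasets')

// multer only auto-creates the destination when it is given as a string,
// not when it is returned from a callback, so create it up front.
fs.mkdirSync(uploadDir, { recursive: true })
```
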
req.query + + const result = await TrainingDatasetService.getDatasets( + userId, + parseInt(skip as string) || 0, + parseInt(limit as string) || 20 + ) + + res.status(200).json({ + success: true, + data: result.datasets, + pagination: { + skip: parseInt(skip as string) || 0, + limit: parseInt(limit as string) || 20, + total: result.total, + }, + }) + } catch (error) { + console.error('Error fetching datasets:', error) + res.status(500).json({ error: 'Failed to fetch datasets' }) + } + } + + /** + * GET /api/v1/training-datasets/:id + * Get dataset by ID + */ + async getDataset(req: Request, res: Response, next: NextFunction): Promise { + try { + const userId = req.user?.id + const { id } = req.params + + if (!userId) { + res.status(401).json({ error: 'Unauthorized' }) + return + } + + const dataset = await TrainingDatasetService.getDataset(id, userId) + + if (!dataset) { + res.status(404).json({ error: 'Dataset not found' }) + return + } + + res.status(200).json({ + success: true, + data: dataset, + }) + } catch (error) { + console.error('Error fetching dataset:', error) + res.status(500).json({ error: 'Failed to fetch dataset' }) + } + } + + /** + * DELETE /api/v1/training-datasets/:id + * Delete dataset + */ + async deleteDataset(req: Request, res: Response, next: NextFunction): Promise { + try { + const userId = req.user?.id + const { id } = req.params + const { deleteSamples = false } = req.query + + if (!userId) { + res.status(401).json({ error: 'Unauthorized' }) + return + } + + const success = await TrainingDatasetService.deleteDataset(id, userId, deleteSamples === 'true') + + if (!success) { + res.status(404).json({ error: 'Dataset not found' }) + return + } + + res.status(200).json({ + success: true, + message: 'Dataset deleted successfully', + }) + } catch (error) { + console.error('Error deleting dataset:', error) + res.status(500).json({ error: 'Failed to delete dataset' }) + } + } + + /** + * GET /api/v1/training-datasets/stats + * Get dataset statistics + */ + async getStatistics(req: Request, res: Response, next: NextFunction): Promise { + try { + const userId = req.user?.id + if (!userId) { + res.status(401).json({ error: 'Unauthorized' }) + return + } + + const stats = await TrainingDatasetService.getStatistics(userId) + + res.status(200).json({ + success: true, + data: stats, + }) + } catch (error) { + console.error('Error fetching statistics:', error) + res.status(500).json({ error: 'Failed to fetch statistics' }) + } + } + + /** + * Get multer upload middleware + */ + getUploadMiddleware() { + return upload.single('file') + } +} + +export default new TrainingDatasetController() diff --git a/LocalMind-Backend/src/api/v1/TrainingSample/TrainingDataset.model.ts b/LocalMind-Backend/src/api/v1/TrainingSample/TrainingDataset.model.ts new file mode 100644 index 0000000..a9ab41e --- /dev/null +++ b/LocalMind-Backend/src/api/v1/TrainingSample/TrainingDataset.model.ts @@ -0,0 +1,87 @@ +import { Schema, Document, Types, model } from 'mongoose' + +export interface ITrainingDataset extends Document { + userId: Types.ObjectId + name: string + description?: string + fileName: string + fileType: 'csv' | 'json' | 'markdown' | 'text' + fileSize: number // in bytes + sampleCount: number + status: 'pending' | 'processing' | 'completed' | 'failed' + errorMessage?: string + importedAt?: Date + isActive: boolean + metadata?: { + headers?: string[] + delimiter?: string + encoding?: string + } + createdAt: Date + updatedAt: Date +} + +const TrainingDatasetSchema = new Schema( + { + userId: { + 
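
Every handler here reads `req.user?.id`, which only typechecks if something augments Express's `Request` type. A minimal sketch of that augmentation follows, assuming the project's auth middleware attaches a `{ id: string }` payload (the real shape depends on that middleware):

```ts
// express.d.ts: module augmentation so `req.user?.id` typechecks.
// The payload shape is an assumption; align it with the actual auth middleware.
declare global {
  namespace Express {
    interface Request {
      user?: { id: string }
    }
  }
}

export {}
```
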
type: Schema.Types.ObjectId, + ref: 'User', + required: true, + }, + name: { + type: String, + required: true, + maxlength: 255, + }, + description: { + type: String, + maxlength: 1000, + }, + fileName: { + type: String, + required: true, + index: true, + }, + fileType: { + type: String, + enum: ['csv', 'json', 'markdown', 'text'], + required: true, + }, + fileSize: { + type: Number, + required: true, + }, + sampleCount: { + type: Number, + default: 0, + }, + status: { + type: String, + enum: ['pending', 'processing', 'completed', 'failed'], + default: 'pending', + index: true, + }, + errorMessage: String, + importedAt: Date, + isActive: { + type: Boolean, + default: true, + index: true, + }, + metadata: { + headers: [String], + delimiter: String, + encoding: String, + }, + }, + { + timestamps: true, + } +) + +// Compound indices for common queries +TrainingDatasetSchema.index({ userId: 1, status: 1 }) +TrainingDatasetSchema.index({ userId: 1, isActive: 1 }) +TrainingDatasetSchema.index({ userId: 1, createdAt: -1 }) + +export const TrainingDataset = model('TrainingDataset', TrainingDatasetSchema) diff --git a/LocalMind-Backend/src/api/v1/TrainingSample/TrainingDataset.routes.ts b/LocalMind-Backend/src/api/v1/TrainingSample/TrainingDataset.routes.ts new file mode 100644 index 0000000..24f97bf --- /dev/null +++ b/LocalMind-Backend/src/api/v1/TrainingSample/TrainingDataset.routes.ts @@ -0,0 +1,52 @@ +import { Router } from 'express' +import TrainingDatasetController from './TrainingDataset.controller' + +const router = Router() + +/** + * Training Dataset Routes + * All routes require authentication + */ + +/** + * POST /api/v1/training-datasets/upload + * Upload a training dataset file + * Supports: CSV, JSON, Markdown, Text files + * Max file size: 100MB + */ +router.post( + '/upload', + TrainingDatasetController.getUploadMiddleware(), + TrainingDatasetController.uploadDataset.bind(TrainingDatasetController) +) + +/** + * GET /api/v1/training-datasets + * Get all datasets for the user + * Query params: + * - skip: number (default: 0) + * - limit: number (default: 20) + */ +router.get('/', TrainingDatasetController.getDatasets.bind(TrainingDatasetController)) + +/** + * GET /api/v1/training-datasets/stats + * Get statistics about datasets + */ +router.get('/stats', TrainingDatasetController.getStatistics.bind(TrainingDatasetController)) + +/** + * GET /api/v1/training-datasets/:id + * Get a single dataset + */ +router.get('/:id', TrainingDatasetController.getDataset.bind(TrainingDatasetController)) + +/** + * DELETE /api/v1/training-datasets/:id + * Delete a dataset + * Query params: + * - deleteSamples: boolean (default: false) - whether to delete associated samples + */ +router.delete('/:id', TrainingDatasetController.deleteDataset.bind(TrainingDatasetController)) + +export default router diff --git a/LocalMind-Backend/src/api/v1/TrainingSample/TrainingDataset.service.ts b/LocalMind-Backend/src/api/v1/TrainingSample/TrainingDataset.service.ts new file mode 100644 index 0000000..f8678d7 --- /dev/null +++ b/LocalMind-Backend/src/api/v1/TrainingSample/TrainingDataset.service.ts @@ -0,0 +1,218 @@ +import { Types } from 'mongoose' +import { TrainingDataset, ITrainingDataset } from './TrainingDataset.model' +import { TrainingSample } from './TrainingSample.model' +import TrainingSampleFileParser, { ParsedSample } from './TrainingSample.utils' +import EmbeddingUtils from './TrainingSample.embedding' +import * as fs from 'fs' + +class TrainingDatasetService { + /** + * Create a new dataset record 
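
The route files state that all routes require authentication, but no auth middleware is attached inside them, so that contract has to be enforced where the routers are mounted. A hedged sketch, with `requireAuth` and the import paths standing in for whatever the project actually uses:

```ts
import express from 'express'
import trainingSampleRoutes from './api/v1/TrainingSample/TrainingSample.routes'
import trainingDatasetRoutes from './api/v1/TrainingSample/TrainingDataset.routes'
import { requireAuth } from './middleware/auth' // hypothetical auth middleware

const app = express()
app.use(express.json())

// Auth runs before either router, making good on the "all routes require
// authentication" comment in the route files.
app.use('/api/v1/training-samples', requireAuth, trainingSampleRoutes)
app.use('/api/v1/training-datasets', requireAuth, trainingDatasetRoutes)
```
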
+ */ + async createDataset( + userId: string, + fileName: string, + fileType: 'csv' | 'json' | 'markdown' | 'text', + fileSize: number, + name: string, + description?: string + ): Promise { + const dataset = await TrainingDataset.create({ + userId: new Types.ObjectId(userId), + fileName, + fileType, + fileSize, + name, + description, + status: 'pending', + }) + + return dataset + } + + /** + * Process a dataset file and create training samples + */ + async processDataset(datasetId: string, userId: string, filePath: string): Promise { + const dataset = await TrainingDataset.findOne({ + _id: datasetId, + userId: new Types.ObjectId(userId), + }) + + if (!dataset) { + throw new Error('Dataset not found') + } + + try { + // Update status to processing + await TrainingDataset.updateOne({ _id: datasetId }, { status: 'processing' }) + + // Parse file + const parsedSamples = await TrainingSampleFileParser.parseFile(filePath) + + // Validate samples + const { valid: validSamples, errors } = TrainingSampleFileParser.validateSamples(parsedSamples) + + if (validSamples.length === 0) { + throw new Error(`No valid samples found. Errors: ${errors.join('; ')}`) + } + + // Generate embeddings and create samples + const createdSamples = [] + for (const sample of validSamples) { + try { + const embeddingText = `${sample.question} ${sample.answerTemplate.answer}` + const { embedding } = await EmbeddingUtils.generateEmbedding(embeddingText) + + const trainingSample = await TrainingSample.create({ + userId: new Types.ObjectId(userId), + ...sample, + embedding, + }) + + createdSamples.push(trainingSample) + } catch (error) { + console.error('Error creating sample:', sample.question, error) + // Continue with next sample + } + } + + // Update dataset with results + await TrainingDataset.updateOne( + { _id: datasetId }, + { + status: 'completed', + sampleCount: createdSamples.length, + importedAt: new Date(), + errorMessage: errors.length > 0 ? 
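
`processDataset` embeds and inserts strictly one sample at a time, so a 10k-row file pays 10k sequential embedding round trips, and the created samples never record which dataset they came from even though the sample model has a `datasetId` field. A sketch of the same loop with bounded concurrency that also stamps `datasetId`; the batch size is an assumption to tune against the embedding API's rate limits:

```ts
const BATCH_SIZE = 8 // assumption: tune to the embedding API's rate limits
const createdSamples: ITrainingSample[] = [] // ITrainingSample from './TrainingSample.types'

for (let i = 0; i < validSamples.length; i += BATCH_SIZE) {
  const batch = validSamples.slice(i, i + BATCH_SIZE)
  const settled = await Promise.allSettled(
    batch.map(async sample => {
      const embeddingText = `${sample.question} ${sample.answerTemplate.answer}`
      const { embedding } = await EmbeddingUtils.generateEmbedding(embeddingText)
      return TrainingSample.create({
        userId: new Types.ObjectId(userId),
        datasetId: dataset._id, // link each sample back to its dataset
        ...sample,
        embedding,
      })
    })
  )
  for (const result of settled) {
    if (result.status === 'fulfilled') createdSamples.push(result.value)
    else console.error('Error creating sample:', result.reason)
  }
}
```
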
errors.slice(0, 5).join('; ') : undefined, + } + ) + + // Clean up temporary file + if (fs.existsSync(filePath)) { + fs.unlinkSync(filePath) + } + + return createdSamples.length + } catch (error: any) { + console.error('Error processing dataset:', error) + + await TrainingDataset.updateOne( + { _id: datasetId }, + { + status: 'failed', + errorMessage: error.message, + } + ) + + throw error + } + } + + /** + * Get all datasets for a user + */ + async getDatasets( + userId: string, + skip: number = 0, + limit: number = 20 + ): Promise<{ datasets: ITrainingDataset[]; total: number }> { + const [datasets, total] = await Promise.all([ + TrainingDataset.find({ userId: new Types.ObjectId(userId), isActive: true }) + .skip(skip) + .limit(limit) + .sort({ createdAt: -1 }) + .exec(), + TrainingDataset.countDocuments({ + userId: new Types.ObjectId(userId), + isActive: true, + }), + ]) + + return { datasets, total } + } + + /** + * Get dataset by ID + */ + async getDataset(datasetId: string, userId: string): Promise { + return await TrainingDataset.findOne({ + _id: datasetId, + userId: new Types.ObjectId(userId), + }) + } + + /** + * Delete dataset (and optionally associated samples) + */ + async deleteDataset(datasetId: string, userId: string, deletesSamples: boolean = false): Promise { + const dataset = await this.getDataset(datasetId, userId) + + if (!dataset) { + return false + } + + if (deletesSamples) { + // Delete all samples associated with this dataset + await TrainingSample.deleteMany({ + sourceType: 'dataset', + userId: new Types.ObjectId(userId), + }) + } + + await TrainingDataset.updateOne({ _id: datasetId }, { isActive: false }) + + return true + } + + /** + * Get dataset statistics + */ + async getStatistics(userId: string): Promise<{ + totalDatasets: number + completedDatasets: number + failedDatasets: number + totalSamples: number + byFileType: Record + }> { + const userIdObj = new Types.ObjectId(userId) + + const [totalDatasets, completedDatasets, failedDatasets, totalSamples, byFileType] = await Promise.all([ + TrainingDataset.countDocuments({ userId: userIdObj, isActive: true }), + TrainingDataset.countDocuments({ + userId: userIdObj, + status: 'completed', + isActive: true, + }), + TrainingDataset.countDocuments({ + userId: userIdObj, + status: 'failed', + isActive: true, + }), + TrainingSample.countDocuments({ + userId: userIdObj, + sourceType: 'dataset', + isActive: true, + }), + TrainingDataset.aggregate([ + { $match: { userId: userIdObj, isActive: true } }, + { $group: { _id: '$fileType', count: { $sum: 1 } } }, + ]), + ]) + + const byFileTypeObj: Record = {} + byFileType.forEach((item: any) => { + byFileTypeObj[item._id] = item.count + }) + + return { + totalDatasets, + completedDatasets, + failedDatasets, + totalSamples, + byFileType: byFileTypeObj, + } + } +} + +export default new TrainingDatasetService() diff --git a/LocalMind-Backend/src/api/v1/TrainingSample/TrainingSample.controller.ts b/LocalMind-Backend/src/api/v1/TrainingSample/TrainingSample.controller.ts new file mode 100644 index 0000000..fd19402 --- /dev/null +++ b/LocalMind-Backend/src/api/v1/TrainingSample/TrainingSample.controller.ts @@ -0,0 +1,261 @@ +import { Request, Response, NextFunction } from 'express' +import TrainingSampleService from './TrainingSample.service' +import { createTrainingSampleSchema, updateTrainingSampleSchema, vectorSearchSchema } from './TrainingSample.validator' + +class TrainingSampleController { + /** + * POST /api/v1/training-samples + * Create a new training sample + */ + async 
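
A behavioral bug worth flagging in `deleteDataset` above: the `deleteMany` filter is `{ sourceType: 'dataset', userId }`, so opting in with `deleteSamples=true` wipes every dataset-imported sample the user owns, not just the rows of the dataset being deleted. Scoping by `datasetId` fixes it, provided `processDataset` stamps that field when creating samples (see the earlier sketch):

```ts
if (deletesSamples) {
  // Purge only the samples imported from THIS dataset, not every
  // dataset-sourced sample the user owns.
  await TrainingSample.deleteMany({
    datasetId: new Types.ObjectId(datasetId),
    userId: new Types.ObjectId(userId),
  })
}
```
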
createTrainingSample(req: Request, res: Response, next: NextFunction): Promise { + try { + const userId = req.user?.id + if (!userId) { + res.status(401).json({ error: 'Unauthorized' }) + return + } + + // Validate request body + const validatedData = createTrainingSampleSchema.parse(req.body) + + const sample = await TrainingSampleService.createSample(userId, validatedData) + + res.status(201).json({ + success: true, + data: sample, + message: 'Training sample created successfully', + }) + } catch (error: any) { + if (error.name === 'ZodError') { + res.status(400).json({ + error: 'Validation error', + details: error.errors, + }) + } else { + console.error('Error creating training sample:', error) + res.status(500).json({ error: 'Failed to create training sample' }) + } + } + } + + /** + * GET /api/v1/training-samples + * Get all training samples with filtering and pagination + */ + async getSamples(req: Request, res: Response, next: NextFunction): Promise { + try { + const userId = req.user?.id + if (!userId) { + res.status(401).json({ error: 'Unauthorized' }) + return + } + + const { + type, + tags, + sourceType, + isActive = true, + language, + skip = 0, + limit = 20, + } = req.query + + const filters = { + type: type ? (Array.isArray(type) ? type : [type]) : undefined, + tags: tags ? (Array.isArray(tags) ? tags : [tags]) : undefined, + sourceType: sourceType as 'manual' | 'dataset' | undefined, + isActive: isActive === 'true' ? true : isActive === 'false' ? false : undefined, + language: language as string | undefined, + } + + const result = await TrainingSampleService.getSamplesByUser( + userId, + filters, + parseInt(skip as string) || 0, + parseInt(limit as string) || 20 + ) + + res.status(200).json({ + success: true, + data: result.samples, + pagination: { + skip: parseInt(skip as string) || 0, + limit: parseInt(limit as string) || 20, + total: result.total, + }, + }) + } catch (error) { + console.error('Error fetching training samples:', error) + res.status(500).json({ error: 'Failed to fetch training samples' }) + } + } + + /** + * GET /api/v1/training-samples/:id + * Get a single training sample by ID + */ + async getSampleById(req: Request, res: Response, next: NextFunction): Promise { + try { + const userId = req.user?.id + const { id } = req.params + + if (!userId) { + res.status(401).json({ error: 'Unauthorized' }) + return + } + + const sample = await TrainingSampleService.getSampleById(id, userId) + + if (!sample) { + res.status(404).json({ error: 'Training sample not found' }) + return + } + + res.status(200).json({ + success: true, + data: sample, + }) + } catch (error) { + console.error('Error fetching training sample:', error) + res.status(500).json({ error: 'Failed to fetch training sample' }) + } + } + + /** + * PUT /api/v1/training-samples/:id + * Update a training sample + */ + async updateSample(req: Request, res: Response, next: NextFunction): Promise { + try { + const userId = req.user?.id + const { id } = req.params + + if (!userId) { + res.status(401).json({ error: 'Unauthorized' }) + return + } + + // Validate request body + const validatedData = updateTrainingSampleSchema.parse(req.body) + + const updatedSample = await TrainingSampleService.updateSample(id, userId, validatedData) + + if (!updatedSample) { + res.status(404).json({ error: 'Training sample not found' }) + return + } + + res.status(200).json({ + success: true, + data: updatedSample, + message: 'Training sample updated successfully', + }) + } catch (error: any) { + if (error.name === 'ZodError') { 
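
A subtle trap in the `getSamples` query handling above: the destructuring default for `isActive` is the boolean `true`, but the subsequent comparisons are against the strings `'true'` and `'false'`, so when the client omits the parameter the filter silently resolves to `undefined` and inactive samples come back despite the stated default. A small normalizer makes the intent explicit:

```ts
// Normalizes an Express query value to a boolean, with a fallback for absence.
const parseBooleanParam = (value: unknown, fallback: boolean): boolean => {
  if (value === 'true') return true
  if (value === 'false') return false
  return fallback
}

// In getSamples:
const filters = {
  // ...other filters unchanged...
  isActive: parseBooleanParam(req.query.isActive, true),
}
```
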
+ res.status(400).json({ + error: 'Validation error', + details: error.errors, + }) + } else { + console.error('Error updating training sample:', error) + res.status(500).json({ error: 'Failed to update training sample' }) + } + } + } + + /** + * DELETE /api/v1/training-samples/:id + * Delete a training sample (soft delete) + */ + async deleteSample(req: Request, res: Response, next: NextFunction): Promise { + try { + const userId = req.user?.id + const { id } = req.params + + if (!userId) { + res.status(401).json({ error: 'Unauthorized' }) + return + } + + const success = await TrainingSampleService.deleteSample(id, userId) + + if (!success) { + res.status(404).json({ error: 'Training sample not found' }) + return + } + + res.status(200).json({ + success: true, + message: 'Training sample deleted successfully', + }) + } catch (error) { + console.error('Error deleting training sample:', error) + res.status(500).json({ error: 'Failed to delete training sample' }) + } + } + + /** + * POST /api/v1/training-samples/search + * Vector semantic search + */ + async vectorSearch(req: Request, res: Response, next: NextFunction): Promise { + try { + const userId = req.user?.id + if (!userId) { + res.status(401).json({ error: 'Unauthorized' }) + return + } + + // Validate request body + const validatedData = vectorSearchSchema.parse(req.body) + + const result = await TrainingSampleService.vectorSearch(userId, validatedData) + + res.status(200).json({ + success: true, + data: result.samples, + metadata: { + totalResults: result.totalResults, + searchTime: result.searchTime, + query: validatedData.query, + }, + }) + } catch (error: any) { + if (error.name === 'ZodError') { + res.status(400).json({ + error: 'Validation error', + details: error.errors, + }) + } else { + console.error('Error performing vector search:', error) + res.status(500).json({ error: 'Failed to perform vector search' }) + } + } + } + + /** + * GET /api/v1/training-samples/stats + * Get statistics about training samples + */ + async getStatistics(req: Request, res: Response, next: NextFunction): Promise { + try { + const userId = req.user?.id + if (!userId) { + res.status(401).json({ error: 'Unauthorized' }) + return + } + + const stats = await TrainingSampleService.getStatistics(userId) + + res.status(200).json({ + success: true, + data: stats, + }) + } catch (error) { + console.error('Error fetching statistics:', error) + res.status(500).json({ error: 'Failed to fetch statistics' }) + } + } +} + +export default new TrainingSampleController() diff --git a/LocalMind-Backend/src/api/v1/TrainingSample/TrainingSample.embedding.ts b/LocalMind-Backend/src/api/v1/TrainingSample/TrainingSample.embedding.ts new file mode 100644 index 0000000..d1e174c --- /dev/null +++ b/LocalMind-Backend/src/api/v1/TrainingSample/TrainingSample.embedding.ts @@ -0,0 +1,69 @@ +import { GoogleGenerativeAI } from '@google/generative-ai' +import { IEmbeddingResponse } from './TrainingSample.types' + +class EmbeddingUtils { + private genAI: GoogleGenerativeAI + private embeddingModel = 'models/embedding-001' + + constructor() { + const apiKey = process.env.GOOGLE_API_KEY + if (!apiKey) { + throw new Error('GOOGLE_API_KEY environment variable is required for embeddings') + } + this.genAI = new GoogleGenerativeAI(apiKey) + } + + async generateEmbedding(text: string): Promise { + try { + const model = this.genAI.getGenerativeModel({ model: this.embeddingModel }) + + const result = await model.embedContent(text) + const embedding = result.embedding + + if (!embedding 
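
The embedding call is the flakiest dependency in this module (rate limits, transient 5xx responses), and both `createSample` and `processDataset` fail outright when a single call throws. A retry wrapper with exponential backoff is a cheap hedge; the retry count and base delay are assumptions. Relatedly, `generateEmbeddingBatch` below fires all texts through one unbounded `Promise.all`; the chunking pattern from the dataset sketch earlier applies there too.

```ts
// Retries an async operation with exponential backoff (500ms, 1s, 2s, ...).
async function withRetry<T>(fn: () => Promise<T>, retries = 3, baseDelayMs = 500): Promise<T> {
  let lastError: unknown
  for (let attempt = 0; attempt <= retries; attempt++) {
    try {
      return await fn()
    } catch (error) {
      lastError = error
      if (attempt < retries) {
        await new Promise(resolve => setTimeout(resolve, baseDelayMs * 2 ** attempt))
      }
    }
  }
  throw lastError
}

// Usage:
// const { embedding } = await withRetry(() => EmbeddingUtils.generateEmbedding(text))
```
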
|| !embedding.values) { + throw new Error('Failed to generate embedding - no embedding values returned') + } + + return { + embedding: embedding.values, + modelUsed: this.embeddingModel, + } + } catch (error) { + console.error('Error generating embedding:', error) + throw new Error(`Failed to generate embedding: ${(error as Error).message}`) + } + } + + async generateEmbeddingBatch(texts: string[]): Promise { + const results = await Promise.all(texts.map(text => this.generateEmbedding(text))) + return results + } + + // Calculate cosine similarity between two vectors + static cosineSimilarity(a: number[], b: number[]): number { + if (a.length !== b.length) { + throw new Error('Vectors must have the same length') + } + + let dotProduct = 0 + let magnitudeA = 0 + let magnitudeB = 0 + + for (let i = 0; i < a.length; i++) { + dotProduct += a[i] * b[i] + magnitudeA += a[i] * a[i] + magnitudeB += b[i] * b[i] + } + + magnitudeA = Math.sqrt(magnitudeA) + magnitudeB = Math.sqrt(magnitudeB) + + if (magnitudeA === 0 || magnitudeB === 0) { + return 0 + } + + return dotProduct / (magnitudeA * magnitudeB) + } +} + +export default new EmbeddingUtils() diff --git a/LocalMind-Backend/src/api/v1/TrainingSample/TrainingSample.model.ts b/LocalMind-Backend/src/api/v1/TrainingSample/TrainingSample.model.ts new file mode 100644 index 0000000..914361d --- /dev/null +++ b/LocalMind-Backend/src/api/v1/TrainingSample/TrainingSample.model.ts @@ -0,0 +1,133 @@ +import { Schema, model, Types } from 'mongoose' +import { ITrainingSample, ISection, IAnswerTemplate } from './TrainingSample.types' + +// Section schema for structured answers +const SectionSchema = new Schema( + { + title: { + type: String, + required: true, + trim: true, + }, + content: { + type: String, + required: true, + trim: true, + }, + }, + { _id: false } +) + +// Answer template schema +const AnswerTemplateSchema = new Schema( + { + greeting: { + type: String, + trim: true, + }, + answer: { + type: String, + required: true, + trim: true, + }, + sections: { + type: [SectionSchema], + default: [], + }, + suggestions: { + type: [String], + default: [], + }, + }, + { _id: false } +) + +// Main Training Sample schema +const TrainingSampleSchema = new Schema( + { + userId: { + type: Types.ObjectId, + ref: 'User', + required: true, + index: true, + }, + question: { + type: String, + required: true, + trim: true, + index: true, + }, + type: { + type: String, + enum: ['qa', 'snippet', 'doc', 'faq', 'other'], + default: 'qa', + index: true, + }, + answerTemplate: { + type: AnswerTemplateSchema, + required: true, + }, + codeSnippet: { + type: String, + default: null, + }, + // Vector embedding for semantic search + embedding: { + type: [Number], + required: true, + // Note: For MongoDB Atlas Vector Search, add vector index in MongoDB + // db.createIndex({ "embedding": "vector" }) + }, + filePath: { + type: String, + default: null, + }, + fileMimeType: { + type: String, + default: null, + }, + fileSizeInBytes: { + type: Number, + default: null, + }, + sourceType: { + type: String, + enum: ['manual', 'dataset'], + default: 'manual', + index: true, + }, + datasetId: { + type: Types.ObjectId, + ref: 'TrainingDataset', + default: null, + }, + tags: { + type: [String], + default: [], + index: true, + }, + language: { + type: String, + default: 'en', + trim: true, + index: true, + }, + isActive: { + type: Boolean, + default: true, + index: true, + }, + }, + { + timestamps: true, + versionKey: false, + } +) + +// Compound indices for common queries 
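
The schema comment on the `embedding` field gestures at MongoDB Atlas Vector Search, but `db.createIndex({ "embedding": "vector" })` is not how those indexes are created; they are defined as search indexes through Atlas (UI, Admin API, or `createSearchIndex` on recent drivers/mongosh). A plausible definition for this collection follows; `numDimensions: 768` assumes `embedding-001`'s output size (verify against the deployed model), and the filter fields mirror what the service's `vectorSearch` filters on:

```ts
// Sketch of an Atlas Vector Search index for the trainingsamples collection.
// All names here are assumptions; create it via the Atlas UI/Admin API or
// collection.createSearchIndex() on a recent driver.
const trainingSamplesVectorIndex = {
  name: 'training_samples_vector',
  type: 'vectorSearch',
  definition: {
    fields: [
      { type: 'vector', path: 'embedding', numDimensions: 768, similarity: 'cosine' },
      { type: 'filter', path: 'userId' },
      { type: 'filter', path: 'isActive' },
      { type: 'filter', path: 'type' },
      { type: 'filter', path: 'tags' },
      { type: 'filter', path: 'sourceType' },
      { type: 'filter', path: 'language' },
    ],
  },
}
```
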
+TrainingSampleSchema.index({ userId: 1, isActive: 1 }) +TrainingSampleSchema.index({ userId: 1, type: 1 }) +TrainingSampleSchema.index({ userId: 1, sourceType: 1 }) +TrainingSampleSchema.index({ tags: 1, userId: 1 }) + +export const TrainingSample = model('TrainingSample', TrainingSampleSchema) diff --git a/LocalMind-Backend/src/api/v1/TrainingSample/TrainingSample.routes.ts b/LocalMind-Backend/src/api/v1/TrainingSample/TrainingSample.routes.ts new file mode 100644 index 0000000..4ee3e18 --- /dev/null +++ b/LocalMind-Backend/src/api/v1/TrainingSample/TrainingSample.routes.ts @@ -0,0 +1,79 @@ +import { Router } from 'express' +import TrainingSampleController from './TrainingSample.controller' + +const router = Router() + +/** + * Training Sample Routes + * All routes require authentication + */ + +/** + * POST /api/v1/training-samples + * Create a new training sample + * Body: { + * question: string, + * answerTemplate: { answer, format, structure }, + * type: 'qa' | 'snippet' | 'doc' | 'faq' | 'other', + * sourceType: 'manual' | 'dataset', + * tags: string[], + * language: string + * } + */ +router.post('/', TrainingSampleController.createTrainingSample.bind(TrainingSampleController)) + +/** + * GET /api/v1/training-samples + * Get all training samples with filtering + * Query params: + * - type: string[] (comma-separated) + * - tags: string[] (comma-separated) + * - sourceType: 'manual' | 'dataset' + * - isActive: boolean + * - language: string + * - skip: number (default: 0) + * - limit: number (default: 20) + */ +router.get('/', TrainingSampleController.getSamples.bind(TrainingSampleController)) + +/** + * GET /api/v1/training-samples/stats + * Get statistics about training samples + */ +router.get('/stats', TrainingSampleController.getStatistics.bind(TrainingSampleController)) + +/** + * GET /api/v1/training-samples/:id + * Get a single training sample + */ +router.get('/:id', TrainingSampleController.getSampleById.bind(TrainingSampleController)) + +/** + * PUT /api/v1/training-samples/:id + * Update a training sample + */ +router.put('/:id', TrainingSampleController.updateSample.bind(TrainingSampleController)) + +/** + * DELETE /api/v1/training-samples/:id + * Delete a training sample (soft delete) + */ +router.delete('/:id', TrainingSampleController.deleteSample.bind(TrainingSampleController)) + +/** + * POST /api/v1/training-samples/search + * Vector semantic search + * Body: { + * query: string, + * topK: number (default: 5), + * filters: { + * type?: string[], + * tags?: string[], + * sourceType?: 'manual' | 'dataset', + * language?: string + * } + * } + */ +router.post('/search', TrainingSampleController.vectorSearch.bind(TrainingSampleController)) + +export default router diff --git a/LocalMind-Backend/src/api/v1/TrainingSample/TrainingSample.service.ts b/LocalMind-Backend/src/api/v1/TrainingSample/TrainingSample.service.ts new file mode 100644 index 0000000..19a66ca --- /dev/null +++ b/LocalMind-Backend/src/api/v1/TrainingSample/TrainingSample.service.ts @@ -0,0 +1,229 @@ +import { Types } from 'mongoose' +import { TrainingSample } from './TrainingSample.model' +import { ITrainingSample, IVectorSearchRequest, IVectorSearchResult } from './TrainingSample.types' +import EmbeddingUtils from './TrainingSample.embedding' + +class TrainingSampleService { + /** + * Create a new training sample with embedding generation + */ + async createSample(userId: string, data: any): Promise { + try { + // Generate embedding from question and answer + const embeddingText = `${data.question} 
${data.answerTemplate.answer}` + const { embedding } = await EmbeddingUtils.generateEmbedding(embeddingText) + + const sample = await TrainingSample.create({ + userId: new Types.ObjectId(userId), + ...data, + embedding, + }) + + return sample + } catch (error) { + console.error('Error creating training sample:', error) + throw error + } + } + + /** + * Get all training samples for a user with filters + */ + async getSamplesByUser( + userId: string, + filters: { + type?: string[] + tags?: string[] + sourceType?: 'manual' | 'dataset' + isActive?: boolean + language?: string + } = {}, + skip: number = 0, + limit: number = 20 + ): Promise<{ samples: ITrainingSample[]; total: number }> { + const query: any = { + userId: new Types.ObjectId(userId), + } + + if (filters.type && filters.type.length > 0) { + query.type = { $in: filters.type } + } + if (filters.tags && filters.tags.length > 0) { + query.tags = { $in: filters.tags } + } + if (filters.sourceType) { + query.sourceType = filters.sourceType + } + if (filters.isActive !== undefined) { + query.isActive = filters.isActive + } + if (filters.language) { + query.language = filters.language + } + + const [samples, total] = await Promise.all([ + TrainingSample.find(query).skip(skip).limit(limit).exec(), + TrainingSample.countDocuments(query), + ]) + + return { samples, total } + } + + /** + * Get a single training sample by ID + */ + async getSampleById(sampleId: string, userId: string): Promise { + return await TrainingSample.findOne({ + _id: new Types.ObjectId(sampleId), + userId: new Types.ObjectId(userId), + }) + } + + /** + * Update a training sample and regenerate embedding if content changed + */ + async updateSample(sampleId: string, userId: string, data: any): Promise { + const sample = await this.getSampleById(sampleId, userId) + + if (!sample) { + throw new Error('Training sample not found') + } + + // Regenerate embedding if question or answer changed + let embedding = sample.embedding + if (data.question || data.answerTemplate) { + const question = data.question || sample.question + const answer = data.answerTemplate?.answer || sample.answerTemplate.answer + const embeddingText = `${question} ${answer}` + const result = await EmbeddingUtils.generateEmbedding(embeddingText) + embedding = result.embedding + } + + const updated = await TrainingSample.findByIdAndUpdate( + sampleId, + { + ...data, + embedding, + }, + { new: true } + ) + + return updated + } + + /** + * Soft delete a training sample + */ + async deleteSample(sampleId: string, userId: string): Promise { + const result = await TrainingSample.findOneAndUpdate( + { + _id: new Types.ObjectId(sampleId), + userId: new Types.ObjectId(userId), + }, + { isActive: false }, + { new: true } + ) + + return !!result + } + + /** + * Vector semantic search with cosine similarity + */ + async vectorSearch(userId: string, searchRequest: IVectorSearchRequest): Promise { + const startTime = Date.now() + + try { + // Generate embedding for the query + const { embedding: queryEmbedding } = await EmbeddingUtils.generateEmbedding(searchRequest.query) + + // Build filter query + const filterQuery: any = { + userId: new Types.ObjectId(userId), + isActive: true, + } + + if (searchRequest.filters) { + if (searchRequest.filters.type && searchRequest.filters.type.length > 0) { + filterQuery.type = { $in: searchRequest.filters.type } + } + if (searchRequest.filters.tags && searchRequest.filters.tags.length > 0) { + filterQuery.tags = { $in: searchRequest.filters.tags } + } + if 
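
Unlike `TrainingDatasetService.getDatasets`, the `getSamplesByUser` query above applies `skip`/`limit` with no sort, so page boundaries are unspecified and rows can repeat or disappear between pages. Mirroring the dataset service keeps pagination deterministic:

```ts
const [samples, total] = await Promise.all([
  TrainingSample.find(query)
    .sort({ createdAt: -1 }) // stable ordering before skip/limit
    .skip(skip)
    .limit(limit)
    .exec(),
  TrainingSample.countDocuments(query),
])
```
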
(searchRequest.filters.sourceType) { + filterQuery.sourceType = searchRequest.filters.sourceType + } + if (searchRequest.filters.language) { + filterQuery.language = searchRequest.filters.language + } + } + + // Get all matching samples (note: for large datasets, consider MongoDB Atlas Vector Search) + const samples = await TrainingSample.find(filterQuery).exec() + + // Calculate similarity scores + const scoredSamples = samples + .map(sample => ({ + sample, + score: EmbeddingUtils.cosineSimilarity(queryEmbedding, sample.embedding), + })) + .sort((a, b) => b.score - a.score) + .slice(0, searchRequest.topK || 5) + + const searchTime = Date.now() - startTime + + return { + samples: scoredSamples.map(s => s.sample), + totalResults: scoredSamples.length, + searchTime, + } + } catch (error) { + console.error('Error performing vector search:', error) + throw error + } + } + + /** + * Get statistics for training samples + */ + async getStatistics(userId: string): Promise<{ + total: number + active: number + byType: Record + byLanguage: Record + }> { + const userIdObj = new Types.ObjectId(userId) + + const [total, active, byType, byLanguage] = await Promise.all([ + TrainingSample.countDocuments({ userId: userIdObj }), + TrainingSample.countDocuments({ userId: userIdObj, isActive: true }), + TrainingSample.aggregate([ + { $match: { userId: userIdObj } }, + { $group: { _id: '$type', count: { $sum: 1 } } }, + ]), + TrainingSample.aggregate([ + { $match: { userId: userIdObj } }, + { $group: { _id: '$language', count: { $sum: 1 } } }, + ]), + ]) + + const byTypeObj: Record = {} + byType.forEach((item: any) => { + byTypeObj[item._id] = item.count + }) + + const byLanguageObj: Record = {} + byLanguage.forEach((item: any) => { + byLanguageObj[item._id] = item.count + }) + + return { + total, + active, + byType: byTypeObj, + byLanguage: byLanguageObj, + } + } +} + +export default new TrainingSampleService() diff --git a/LocalMind-Backend/src/api/v1/TrainingSample/TrainingSample.types.ts b/LocalMind-Backend/src/api/v1/TrainingSample/TrainingSample.types.ts new file mode 100644 index 0000000..b419999 --- /dev/null +++ b/LocalMind-Backend/src/api/v1/TrainingSample/TrainingSample.types.ts @@ -0,0 +1,75 @@ +import { Document, Types } from 'mongoose' + +export interface ISection { + title: string + content: string +} + +export interface IAnswerTemplate { + greeting?: string + answer: string + sections: ISection[] + suggestions: string[] +} + +export interface IAgent { + provider: string + model: string + isPaid: boolean +} + +export interface ITrainingSample extends Document { + userId: Types.ObjectId + question: string + type: 'qa' | 'snippet' | 'doc' | 'faq' | 'other' + answerTemplate: IAnswerTemplate + codeSnippet?: string + embedding: number[] + filePath?: string + fileMimeType?: string + fileSizeInBytes?: number + sourceType: 'manual' | 'dataset' + datasetId?: Types.ObjectId + tags: string[] + language: string + isActive: boolean + createdAt: Date + updatedAt: Date +} + +export interface ITrainingDataset extends Document { + userId: Types.ObjectId + fileName: string + filePath: string + fileMimeType: string + fileSizeInBytes: number + totalSamples: number + processedSamples: number + status: 'pending' | 'processing' | 'completed' | 'failed' + errorMessage?: string + createdAt: Date + updatedAt: Date +} + +export interface IEmbeddingResponse { + embedding: number[] + modelUsed: string +} + +export interface IVectorSearchRequest { + query: string + topK?: number + filters?: { + type?: string[] + tags?: 
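
As the comment in `vectorSearch` above concedes, loading every matching document and ranking in Node is O(collection size) per query. On Atlas, the same lookup can be pushed into a `$vectorSearch` stage, assuming the vector index sketched earlier exists (the index name is a placeholder):

```ts
const topK = searchRequest.topK || 5

// Requires MongoDB Atlas with a vector index on `embedding`; the name
// 'training_samples_vector' is an assumption.
const results = await TrainingSample.aggregate([
  {
    $vectorSearch: {
      index: 'training_samples_vector',
      path: 'embedding',
      queryVector: queryEmbedding,
      numCandidates: topK * 20, // oversample, then let Atlas rank
      limit: topK,
      filter: { userId: new Types.ObjectId(userId), isActive: true },
    },
  },
  { $addFields: { score: { $meta: 'vectorSearchScore' } } },
])
```
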
string[] + sourceType?: 'manual' | 'dataset' + isActive?: boolean + language?: string + } +} + +export interface IVectorSearchResult { + samples: ITrainingSample[] + totalResults: number + searchTime: number +} diff --git a/LocalMind-Backend/src/api/v1/TrainingSample/TrainingSample.utils.ts b/LocalMind-Backend/src/api/v1/TrainingSample/TrainingSample.utils.ts new file mode 100644 index 0000000..dd9a66f --- /dev/null +++ b/LocalMind-Backend/src/api/v1/TrainingSample/TrainingSample.utils.ts @@ -0,0 +1,246 @@ +import * as fs from 'fs' +import * as path from 'path' +import * as readline from 'readline' + +export interface ParsedSample { + question: string + answerTemplate: { + answer: string + format?: string + structure?: string[] + } + type: 'qa' | 'snippet' | 'doc' | 'faq' | 'other' + tags?: string[] + language?: string + sourceType: 'dataset' +} + +/** + * Utility class for parsing training data files + */ +export class TrainingSampleFileParser { + /** + * Parse CSV file for training samples + * Expected format: question,answer,type,tags,language + */ + static async parseCSV(filePath: string): Promise { + const samples: ParsedSample[] = [] + + return new Promise((resolve, reject) => { + const fileStream = fs.createReadStream(filePath) + const rl = readline.createInterface({ + input: fileStream, + crlfDelay: Infinity, + }) + + let isFirstLine = true + + rl.on('line', (line: string) => { + // Skip header row + if (isFirstLine) { + isFirstLine = false + return + } + + try { + const [question, answer, type, tags, language] = line.split(',').map(s => s.trim()) + + if (!question || !answer) { + console.warn('Skipping invalid CSV row:', line) + return + } + + const sample: ParsedSample = { + question, + answerTemplate: { + answer, + format: 'text', + }, + type: (type || 'qa') as any, + tags: tags ? 
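
A known limitation of `parseCSV` above: `line.split(',')` mangles any question or answer that itself contains a comma, which ordinary prose often does. The robust fix is a real CSV library (e.g. `csv-parse`); if the extra dependency is unwanted, a minimal quote-aware splitter covers the common case. The sketch assumes RFC 4180-style double quoting and no embedded newlines (which the `readline` loop above cannot see anyway):

```ts
// Splits one CSV line, honoring double-quoted fields and "" escapes.
function splitCsvLine(line: string): string[] {
  const fields: string[] = []
  let current = ''
  let inQuotes = false

  for (let i = 0; i < line.length; i++) {
    const ch = line[i]
    if (inQuotes) {
      if (ch === '"' && line[i + 1] === '"') {
        current += '"'
        i++ // skip the escaped quote
      } else if (ch === '"') {
        inQuotes = false
      } else {
        current += ch
      }
    } else if (ch === '"') {
      inQuotes = true
    } else if (ch === ',') {
      fields.push(current)
      current = ''
    } else {
      current += ch
    }
  }
  fields.push(current)
  return fields
}

// Drop-in for the naive split above:
// const [question, answer, type, tags, language] = splitCsvLine(line).map(s => s.trim())
```
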
tags.split(';').map(t => t.trim()) : [], + language: language || 'en', + sourceType: 'dataset', + } + + samples.push(sample) + } catch (error) { + console.warn('Error parsing CSV line:', line, error) + } + }) + + rl.on('close', () => { + resolve(samples) + }) + + rl.on('error', reject) + }) + } + + /** + * Parse JSON file for training samples + * Expected format: Array of objects with question, answer, type, tags, language + */ + static async parseJSON(filePath: string): Promise { + const content = fs.readFileSync(filePath, 'utf-8') + const data = JSON.parse(content) + + if (!Array.isArray(data)) { + throw new Error('JSON file must contain an array of samples') + } + + return data.map((item: any) => { + if (!item.question || !item.answerTemplate?.answer) { + throw new Error('Each sample must have question and answerTemplate.answer') + } + + return { + question: item.question, + answerTemplate: { + answer: item.answerTemplate.answer, + format: item.answerTemplate.format || 'text', + structure: item.answerTemplate.structure || [], + }, + type: item.type || 'qa', + tags: item.tags || [], + language: item.language || 'en', + sourceType: 'dataset', + } as ParsedSample + }) + } + + /** + * Parse Markdown file for training samples + * Format: ## Question\nAnswer text\nTags: tag1, tag2\n---\n + */ + static async parseMarkdown(filePath: string): Promise { + const content = fs.readFileSync(filePath, 'utf-8') + const sections = content.split('---').map(s => s.trim()).filter(s => s) + + return sections.map((section, index) => { + const lines = section.split('\n').filter(l => l.trim()) + + if (lines.length < 2) { + throw new Error(`Invalid markdown section ${index}: must have at least question and answer`) + } + + // Extract question (first ## header) + const questionMatch = lines[0].match(/^#+\s+(.+)$/) + const question = questionMatch ? 
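
For reference, here is the on-disk shape `parseMarkdown` expects and what it returns, as a small self-contained check (the fixture content and temp path are illustrative):

```ts
import * as fs from 'fs'
import * as os from 'os'
import * as path from 'path'
import TrainingSampleFileParser from './TrainingSample.utils'

const fixture = [
  '## How do I reset my password?',
  'Open Settings > Account and choose "Reset password".',
  'Tags: account, auth',
  'Language: en',
  '---',
  '## How do I export my data?',
  'Use the Export button on the dashboard.',
  'Tags: data',
].join('\n')

const tmpFile = path.join(os.tmpdir(), 'parse-markdown-fixture.md')
fs.writeFileSync(tmpFile, fixture)

TrainingSampleFileParser.parseMarkdown(tmpFile).then(samples => {
  console.log(samples.length) // 2
  console.log(samples[0].question) // 'How do I reset my password?'
  console.log(samples[0].tags) // ['account', 'auth']
  console.log(samples[0].type) // 'doc': parseMarkdown hardcodes this
})
```
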
questionMatch[1] : lines[0] + + // Extract answer (everything except question and metadata) + let answer = '' + let tags: string[] = [] + let language = 'en' + + for (let i = 1; i < lines.length; i++) { + const line = lines[i] + if (line.startsWith('Tags:')) { + tags = line + .replace('Tags:', '') + .split(',') + .map(t => t.trim()) + } else if (line.startsWith('Language:')) { + language = line.replace('Language:', '').trim() + } else if (!line.startsWith('---')) { + answer += line + '\n' + } + } + + return { + question, + answerTemplate: { + answer: answer.trim(), + format: 'markdown', + }, + type: 'doc', + tags, + language, + sourceType: 'dataset', + } as ParsedSample + }) + } + + /** + * Parse Text file for training samples + * Simple format: pairs of questions and answers separated by newlines + */ + static async parseText(filePath: string): Promise { + const content = fs.readFileSync(filePath, 'utf-8') + const lines = content.split('\n').filter(l => l.trim()) + + const samples: ParsedSample[] = [] + + for (let i = 0; i < lines.length; i += 2) { + if (i + 1 < lines.length) { + samples.push({ + question: lines[i], + answerTemplate: { + answer: lines[i + 1], + format: 'text', + }, + type: 'qa', + tags: [], + language: 'en', + sourceType: 'dataset', + }) + } + } + + return samples + } + + /** + * Auto-detect and parse file based on extension + */ + static async parseFile(filePath: string): Promise { + const ext = path.extname(filePath).toLowerCase() + + switch (ext) { + case '.csv': + return this.parseCSV(filePath) + case '.json': + return this.parseJSON(filePath) + case '.md': + case '.markdown': + return this.parseMarkdown(filePath) + case '.txt': + return this.parseText(filePath) + default: + throw new Error(`Unsupported file format: ${ext}`) + } + } + + /** + * Validate parsed samples + */ + static validateSamples(samples: ParsedSample[]): { valid: ParsedSample[]; errors: string[] } { + const valid: ParsedSample[] = [] + const errors: string[] = [] + + samples.forEach((sample, index) => { + const errs: string[] = [] + + if (!sample.question || sample.question.trim().length < 5) { + errs.push(`Question too short (min 5 chars)`) + } + + if (!sample.answerTemplate.answer || sample.answerTemplate.answer.trim().length < 5) { + errs.push(`Answer too short (min 5 chars)`) + } + + if (!['qa', 'snippet', 'doc', 'faq', 'other'].includes(sample.type)) { + errs.push(`Invalid type: ${sample.type}`) + } + + if (errs.length === 0) { + valid.push(sample) + } else { + errors.push(`Sample ${index}: ${errs.join('; ')}`) + } + }) + + return { valid, errors } + } +} + +export default TrainingSampleFileParser diff --git a/LocalMind-Backend/src/api/v1/TrainingSample/TrainingSample.validator.ts b/LocalMind-Backend/src/api/v1/TrainingSample/TrainingSample.validator.ts new file mode 100644 index 0000000..b0dd51d --- /dev/null +++ b/LocalMind-Backend/src/api/v1/TrainingSample/TrainingSample.validator.ts @@ -0,0 +1,36 @@ +import { z } from 'zod' + +const SectionSchema = z.object({ + title: z.string().min(1, 'Section title is required').max(200, 'Title too long'), + content: z.string().min(1, 'Section content is required'), +}) + +const AnswerTemplateSchema = z.object({ + greeting: z.string().max(500, 'Greeting too long').optional(), + answer: z.string().min(10, 'Answer must be at least 10 characters').max(5000, 'Answer too long'), + sections: z.array(SectionSchema).default([]), + suggestions: z.array(z.string().min(1).max(200)).default([]), +}) + +export const createTrainingSampleSchema = z.object({ + question: 
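
`parseFile` plus `validateSamples` together amount to a dry run of `processDataset` with no writes, which is useful for a hypothetical preview endpoint or a pre-upload CLI check; a minimal sketch:

```ts
import TrainingSampleFileParser from './TrainingSample.utils'

// Parses and validates a file, reporting what an import would keep.
async function previewDataset(filePath: string) {
  const parsed = await TrainingSampleFileParser.parseFile(filePath)
  const { valid, errors } = TrainingSampleFileParser.validateSamples(parsed)

  console.log(`parsed=${parsed.length} valid=${valid.length} rejected=${errors.length}`)
  errors.slice(0, 5).forEach(e => console.warn(e))

  return valid
}
```
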
z.string().min(5, 'Question must be at least 5 characters').max(1000),
+ type: z.enum(['qa', 'snippet', 'doc', 'faq', 'other']).default('qa'),
+ answerTemplate: AnswerTemplateSchema,
+ codeSnippet: z.string().max(10000).optional(),
+ tags: z.array(z.string().min(1).max(50)).default([]),
+ // .max() must precede .default(): ZodDefault does not expose string validators
+ language: z.string().max(10).default('en'),
+})
+
+export const updateTrainingSampleSchema = createTrainingSampleSchema.partial()
+
+export const vectorSearchSchema = z.object({
+ query: z.string().min(1, 'Query is required'),
+ topK: z.number().int().min(1).max(100).default(5),
+ filters: z.object({
+ type: z.array(z.string()).optional(),
+ tags: z.array(z.string()).optional(),
+ sourceType: z.enum(['manual', 'dataset']).optional(),
+ isActive: z.boolean().optional(),
+ language: z.string().optional(),
+ }).optional(),
+})
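
Finally, tying the validators to the routes: client requests would look roughly like this. The base URL, auth scheme, and all field values are illustrative:

```ts
async function demo() {
  const BASE = 'http://localhost:3000/api/v1' // placeholder host
  const headers = {
    'Content-Type': 'application/json',
    Authorization: `Bearer ${process.env.API_TOKEN}`, // placeholder auth scheme
  }

  // POST /training-samples: body shape follows createTrainingSampleSchema.
  await fetch(`${BASE}/training-samples`, {
    method: 'POST',
    headers,
    body: JSON.stringify({
      question: 'How do I rotate an API key?',
      type: 'faq',
      answerTemplate: {
        answer: 'Revoke the old key under Settings > API keys, then generate a new one.',
        sections: [],
        suggestions: ['Where are API keys stored?'],
      },
      tags: ['api', 'security'],
      language: 'en',
    }),
  })

  // POST /training-samples/search: body shape follows vectorSearchSchema.
  await fetch(`${BASE}/training-samples/search`, {
    method: 'POST',
    headers,
    body: JSON.stringify({ query: 'rotate api key', topK: 3, filters: { type: ['faq'] } }),
  })
}
```
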