Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
38 commits
Select commit Hold shift + click to select a range
e6c212e
refactor: prepare_sample_dataset updated
AwaisKamran May 15, 2025
7307e36
refactor: misleading parameters removed from function
AwaisKamran May 15, 2025
05994f9
Merge branch 'main' of github.com:Conrad-X/text2SQL into refactor-pre…
AwaisKamran May 20, 2025
9091924
fix: replace add_question_id_for_bird_train with add_sequential_ids_t…
AwaisKamran May 20, 2025
479da2c
feat: Replace alive bar with tqdm
AwaisKamran May 20, 2025
5390337
refactor: replace write_train_data_to_file with save_json_to_file
AwaisKamran May 20, 2025
b206ccc
refactor: update docstrings
AwaisKamran May 20, 2025
760e0f9
refactor: rename error_messages.py to response_messages.py
AwaisKamran May 20, 2025
1bba42a
refactor: reused bird utils constants
AwaisKamran May 20, 2025
75be5ec
refactor: response messages updated for consistency
AwaisKamran May 20, 2025
bc96600
refactor: update import paths
AwaisKamran May 20, 2025
daced9a
refactor: use with statement instead of conventional connection creat…
AwaisKamran May 20, 2025
8afeb3d
refactor: create sql connection explicitly
AwaisKamran May 20, 2025
af5dd1c
refactor: update import statements
AwaisKamran May 20, 2025
f74ab77
docs: add docstring to create_database_connection function
AwaisKamran May 20, 2025
0a26d3a
refactor: fixed docstrings
AwaisKamran May 20, 2025
37363b2
refactor: copilot suggestions accommodated
AwaisKamran May 20, 2025
51a1c32
refactor: segregated current_db and train_data fetching
AwaisKamran May 20, 2025
7dd755e
refactor: typehints added
AwaisKamran May 20, 2025
b6e5a09
docs: add module docstring to indexing_constants
AwaisKamran May 20, 2025
3c5a6a7
Merge branch 'main' of github.com:Conrad-X/text2SQL into refactor-pre…
AwaisKamran May 20, 2025
b0cfc7d
feat: updated TODO comment for add_database_descriptions
AwaisKamran May 20, 2025
cf1b0ea
fix: import statement changed
AwaisKamran May 21, 2025
b503583
chore: merge conflicts resolved
AwaisKamran May 21, 2025
68fa986
refactor: error message name issue
AwaisKamran May 21, 2025
7d659c8
refactor: add_schema_used updated
AwaisKamran May 21, 2025
8cd8470
fix: update parameters for add_schema_used method
AwaisKamran May 21, 2025
b8f2238
chore: LLMConfig added to add_database_descriptions & typehints added…
AwaisKamran May 22, 2025
85d1a07
chore: connection removed from add_schema_used
AwaisKamran May 22, 2025
bb9e8bc
fix: fixed docstrings
AwaisKamran May 22, 2025
49b44fc
chore: added try/catch around updating schema_used
AwaisKamran May 22, 2025
db53d4b
chore: item[SCHEMA_USED] updated
AwaisKamran May 22, 2025
0a5c46b
fix: close_connection method removed
AwaisKamran May 22, 2025
c52ebaf
Merge branch 'main' of github.com:Conrad-X/text2SQL into refactor-pre…
AwaisKamran May 22, 2025
e526088
Merge branch 'main' of github.com:Conrad-X/text2SQL into refactor-pre…
AwaisKamran May 22, 2025
eeabf76
chore: add test cases for prepare_sample_dataset.py
AwaisKamran May 22, 2025
320d9ab
chore: update test cases
AwaisKamran May 22, 2025
21dbc1e
fix: update test cases
AwaisKamran May 23, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -108,8 +108,12 @@ __pycache__/

# Jest
/coverage
.coverage

# End of https://www.toptal.com/developers/gitignore/api/node

.venv/
venv/
venv/

# Sqlite
chroma.sqlite3
5 changes: 5 additions & 0 deletions server/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
"""
text2SQL Server Package.

This package handles the server-side operations for converting natural language text to SQL queries.
"""
17 changes: 13 additions & 4 deletions server/preprocess/prepare_sample_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@

import os
import shutil
from pathlib import Path

from preprocess.add_descriptions_bird_dataset import add_database_descriptions
from tqdm import tqdm
Expand Down Expand Up @@ -85,10 +86,16 @@ def create_train_file(train_file: str) -> None:
if PATH_CONFIG.sample_dataset_type == DatasetType.BIRD_TRAIN:
add_sequential_ids_to_questions(train_file)
except FileNotFoundError as e:
if isinstance(e, FileNotFoundError):
raise
logger.error(ERROR_FILE_NOT_FOUND.format(error=str(e)))
except IOError as e:
if isinstance(e, IOError):
raise
logger.error(ERROR_IO.format(error=str(e)))
except Exception as e:
if isinstance(e, PermissionError):
raise
logger.error(UNEXPECTED_ERROR.format(error=str(e)))

def copy_bird_train_file(train_file: str) -> None:
Expand Down Expand Up @@ -131,7 +138,7 @@ def get_train_data(train_file: str) -> list:
logger.error(ERROR_INVALID_TRAIN_FILE)
return None

def add_schema_used(train_data: list, dataset_type: str) -> None:
def add_schema_used(train_data: list, dataset_type: str, train_file: Path) -> None:
"""
Add schema_used field to each item in the train data.

Expand Down Expand Up @@ -162,7 +169,9 @@ def add_schema_used(train_data: list, dataset_type: str) -> None:
except Exception as e:
logger.warning(WARNING_FAILED_TO_ADD_SCHEMA_USED.format(question_id=item[QUESTION_ID_KEY]))
item[SCHEMA_USED] = None
except KeyboardInterrupt:
except KeyboardInterrupt as e:
if isinstance(e, KeyboardInterrupt):
raise
logger.error(ERROR_USER_KEYBOARD_INTERRUPTION)

finally:
Expand Down Expand Up @@ -198,10 +207,10 @@ def add_schema_used(train_data: list, dataset_type: str) -> None:
"""
train_file = get_train_file_path()
dataset_type = PATH_CONFIG.sample_dataset_type
train_data = get_train_data(train_file)
train_data = get_train_data(Path(train_file))
Copy link

Copilot AI May 22, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[nitpick] Consider using consistent types for file paths across your functions; either update get_train_data to accept a Path object or convert the Path to a string before passing it.

Suggested change
train_data = get_train_data(Path(train_file))
train_data = get_train_data(Path(train_file)) # Ensure get_train_data supports Path objects

Copilot uses AI. Check for mistakes.

if train_data:
add_schema_used(train_data, dataset_type)
add_schema_used(train_data, dataset_type, Path(train_file))
Copy link

Copilot AI May 22, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Both the public function and its caller share the same name 'add_schema_used' but now with an added parameter, which could lead to confusion or unintended recursion; consider renaming one of these to clearly differentiate their responsibilities.

Copilot uses AI. Check for mistakes.
# TODO: Update `add_database_descriptions` to work for sample datasets
add_database_descriptions(
dataset_type=dataset_type,
Expand Down
141 changes: 141 additions & 0 deletions server/test/preprocess/test_prepare_sample_dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
import os
import sys
import unittest
from unittest.mock import MagicMock, mock_open, patch

# Append the project root directory to sys.path
sys.path.append(os.path.join(os.path.dirname(__file__), '../../..'))
from pathlib import Path

from utilities.config import PATH_CONFIG
from utilities.constants.bird_utils.indexing_constants import (DB_ID_KEY,
QUESTION_ID_KEY,
SCHEMA_USED,
SQL)
from utilities.constants.database_enums import DatasetType

from server.preprocess.prepare_sample_dataset import (add_schema_used,
copy_bird_train_file,
create_train_file,
get_train_data,
get_train_file_path)


class TestGetTrainFilePathExisting(unittest.TestCase):
    """get_train_file_path should return the existing file without re-creating it."""

    @patch('os.path.exists', return_value=True)
    @patch('os.makedirs')
    @patch('shutil.copyfile')
    @patch('server.preprocess.prepare_sample_dataset.add_sequential_ids_to_questions')
    def test_get_train_file_path_existing(self, mock_add_ids, mock_copyfile, mock_makedirs, mock_exists):
        # Use patch.object so the PATH_CONFIG override is undone after the test;
        # the previous direct assignment leaked state into every later test.
        with patch.object(PATH_CONFIG, 'processed_train_path',
                          MagicMock(return_value='/path/to/train_file.json')):
            result = get_train_file_path()
        self.assertEqual(result, '/path/to/train_file.json')
        # File already exists, so no re-indexing of question ids should happen.
        mock_add_ids.assert_not_called()

class TestGetTrainFilePathNonExisting(unittest.TestCase):
    """When the processed train file is missing it must be created and IDs added."""

    @patch('os.path.exists', return_value=False)
    @patch('os.makedirs')
    @patch('shutil.copyfile')
    @patch('server.preprocess.prepare_sample_dataset.add_sequential_ids_to_questions')
    def test_get_train_file_path_non_existing(self, mock_add_ids, mock_copyfile, mock_makedirs, mock_exists):
        # Scope both PATH_CONFIG overrides to this test; direct attribute
        # assignment (the original approach) polluted global config for the
        # rest of the test run.
        with patch.object(PATH_CONFIG, 'processed_train_path',
                          MagicMock(return_value='/path/to/train_file.json')), \
             patch.object(PATH_CONFIG, 'sample_dataset_type', DatasetType.BIRD_TRAIN):
            result = get_train_file_path()
        self.assertEqual(result, '/path/to/train_file.json')
        # For BIRD_TRAIN, sequential question ids must be added to the new file.
        mock_add_ids.assert_called_once_with('/path/to/train_file.json')

class TestCreateTrainFile(unittest.TestCase):
    """Tests for create_train_file: happy path plus error propagation."""

    @patch('os.makedirs')
    @patch('shutil.copyfile')
    @patch('server.preprocess.prepare_sample_dataset.add_sequential_ids_to_questions')
    def test_create_train_file(self, mock_add_ids, mock_copyfile, mock_makedirs):
        """Normal behavior: target directory is created and the file is copied."""
        # Scope the config override so it does not leak into other tests
        # (the original assigned PATH_CONFIG.sample_dataset_type directly).
        with patch.object(PATH_CONFIG, 'sample_dataset_type', DatasetType.BIRD_TRAIN):
            create_train_file('/path/to/train_file.json')
        mock_makedirs.assert_called_once_with('/path/to', exist_ok=True)
        mock_copyfile.assert_called_once()

    @patch('os.makedirs')
    @patch('shutil.copyfile')
    @patch('server.preprocess.prepare_sample_dataset.add_sequential_ids_to_questions')
    def test_file_not_found_error(self, mock_add_ids, mock_copyfile, mock_makedirs):
        """A FileNotFoundError raised by copyfile must propagate to the caller."""
        mock_copyfile.side_effect = FileNotFoundError
        with self.assertRaises(FileNotFoundError):
            create_train_file('/path/to/train_file.json')

    @patch('os.makedirs')
    @patch('shutil.copyfile')
    @patch('server.preprocess.prepare_sample_dataset.add_sequential_ids_to_questions')
    def test_io_error(self, mock_add_ids, mock_copyfile, mock_makedirs):
        """An IOError raised by copyfile must propagate to the caller."""
        mock_copyfile.side_effect = IOError
        with self.assertRaises(IOError):
            create_train_file('/path/to/train_file.json')

    @patch('os.makedirs')
    @patch('shutil.copyfile')
    @patch('server.preprocess.prepare_sample_dataset.add_sequential_ids_to_questions')
    def test_permission_error(self, mock_add_ids, mock_copyfile, mock_makedirs):
        """A PermissionError raised by copyfile must propagate to the caller."""
        mock_copyfile.side_effect = PermissionError
        with self.assertRaises(PermissionError):
            create_train_file('/path/to/train_file.json')

class TestCreateTrainFilePermissionError(unittest.TestCase):
    """A PermissionError from os.makedirs must propagate out of create_train_file."""

    @patch('shutil.copyfile')
    @patch('os.makedirs', side_effect=PermissionError("Permission denied"))
    def test_create_train_file_permission_error(self, mock_makedirs, mock_copyfile):
        # Scope the config override to this test instead of mutating the
        # shared PATH_CONFIG object for the whole test run.
        with patch.object(PATH_CONFIG, 'sample_dataset_type', DatasetType.BIRD_TRAIN):
            with self.assertRaises(PermissionError):
                create_train_file('/path/to/train_file.json')

class TestCopyBirdTrainFile(unittest.TestCase):
    """copy_bird_train_file should copy from the configured BIRD source path."""

    @patch('shutil.copyfile')
    def test_copy_bird_train_file(self, mock_copyfile):
        # patch.object restores PATH_CONFIG.bird_file_path after the test;
        # the original assigned it directly and leaked the MagicMock globally.
        with patch.object(PATH_CONFIG, 'bird_file_path',
                          MagicMock(return_value='/path/to/source_file.json')):
            copy_bird_train_file('/path/to/train_file.json')
        mock_copyfile.assert_called_once_with('/path/to/source_file.json', '/path/to/train_file.json')

class TestGetTrainDataValid(unittest.TestCase):
    """get_train_data should return the parsed JSON content when the file exists."""

    @patch('builtins.open', new_callable=mock_open, read_data='{"db_id": "db1", "question_id": "q1", "sql": "SELECT * FROM table"}')
    @patch('os.path.exists', return_value=True)
    @patch('server.preprocess.prepare_sample_dataset.load_json_from_file', return_value=[{'db_id': 'db1', 'question_id': 'q1', 'sql': 'SELECT * FROM table'}])
    def test_get_train_data_valid(self, mock_load_json, mock_exists, mock_open_file):
        # NOTE: @patch decorators are applied bottom-up, so the bottom-most
        # decorator maps to the FIRST mock argument. The original signature
        # listed the names in top-down order, silently mislabelling the mocks
        # (mock_open_file actually held the load_json_from_file mock).
        result = get_train_data('/path/to/train_file.json')
        self.assertEqual(result, [{'db_id': 'db1', 'question_id': 'q1', 'sql': 'SELECT * FROM table'}])

class TestGetTrainDataInvalidFile(unittest.TestCase):
    """get_train_data must return None when the train file does not exist."""

    @patch('os.path.exists', return_value=False)
    def test_get_train_data_invalid_file(self, mock_exists):
        missing_path = "/path/to/non_existent_file.json"
        self.assertIsNone(get_train_data(missing_path))

class TestAddSchemaUsed(unittest.TestCase):
    """Tests for add_schema_used: schema enrichment and interrupt propagation."""

    @patch('server.preprocess.prepare_sample_dataset.save_json_to_file')
    @patch('server.preprocess.prepare_sample_dataset.get_sql_columns_dict', return_value={'columns': ['col1', 'col2']})
    @patch('builtins.open', new_callable=mock_open, read_data='{"db_id": "db1", "question_id": "q1", "sql": "SELECT * FROM table"}')
    def test_add_schema_used(self, mock_open_file, mock_get_sql_columns_dict, mock_save_json):
        """Each item gets a SCHEMA_USED entry and the result is persisted once."""
        sample_rows = [{DB_ID_KEY: 'db1', QUESTION_ID_KEY: 'q1', SQL: 'SELECT * FROM table'}]
        output_file = Path('/path/to/train_file.json')

        add_schema_used(sample_rows, DatasetType.BIRD_TRAIN, output_file)

        self.assertEqual(sample_rows[0][SCHEMA_USED], {'columns': ['col1', 'col2']})
        mock_save_json.assert_called_once()

    @patch('server.preprocess.prepare_sample_dataset.save_json_to_file')
    @patch('server.preprocess.prepare_sample_dataset.get_sql_columns_dict', side_effect=KeyboardInterrupt)
    @patch('builtins.open', new_callable=mock_open, read_data='{"db_id": "db1", "question_id": "q1", "sql": "SELECT * FROM table"}')
    def test_add_schema_used_keyboard_interrupt(self, mock_open_file, mock_get_sql_columns_dict, mock_save_json):
        """A KeyboardInterrupt must propagate and leave items unmodified."""
        sample_rows = [{DB_ID_KEY: 'db1', QUESTION_ID_KEY: 'q1', SQL: 'SELECT * FROM table'}]
        output_file = Path('/path/to/train_file.json')

        with self.assertRaises(KeyboardInterrupt):
            add_schema_used(sample_rows, DatasetType.BIRD_TRAIN, output_file)

        self.assertNotIn(SCHEMA_USED, sample_rows[0])

# Allow running this module directly (e.g. `python test_prepare_sample_dataset.py`).
if __name__ == '__main__':
    unittest.main()