diff --git a/.vscode/launch.json b/.vscode/launch.json index 812944d7..80dc36ba 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -27,7 +27,24 @@ }, "request": "launch", "program": "evalbench/client/eval_client.py", - "args": ["--insecure"], + "args": [ + "--insecure" + ], + "console": "integratedTerminal" + }, + { + "name": "Standalone", + "type": "debugpy", + "cwd": "${workspaceFolder}", + "env": { + "PYTHONPATH": "./evalbench:./evalbench/evalproto" + }, + "request": "launch", + "program": "evalbench/evalbench.py", + "args": [ + "--experiment_config", + "${workspaceFolder}/datasets/adk-tools/example_run_config.yaml" + ], "console": "integratedTerminal" } ] diff --git a/datasets/adk-tools/example_run_config.yaml b/datasets/adk-tools/example_run_config.yaml new file mode 100644 index 00000000..1535947a --- /dev/null +++ b/datasets/adk-tools/example_run_config.yaml @@ -0,0 +1,8 @@ +############################################################ +### Dataset / Eval Items +############################################################ +dataset_config: datasets/adk-tools/toolbox.evalset.json +dataset_format: adk-format +setup_directory: datasets/adk-tools/setup +agent_name: myagent +agent_module: myagent \ No newline at end of file diff --git a/datasets/adk-tools/toolbox.evalset.json b/datasets/adk-tools/toolbox.evalset.json new file mode 100644 index 00000000..1e998f91 --- /dev/null +++ b/datasets/adk-tools/toolbox.evalset.json @@ -0,0 +1,324 @@ +{ + "eval_set_id": "toolbox", + "name": "toolbox", + "eval_cases": [ + { + "eval_id": "checks", + "conversation": [ + { + "invocation_id": "e-91dc14e5-f293-4b21-a6c8-2614679446ae", + "user_content": { + "parts": [ + { + "text": "list clusters and check if there is a cluster named adk-toolbox in cloud-db-nl2sql in us-central2" + } + ], + "role": "user" + }, + "final_response": { + "parts": [ + { + "text": "Yes, there is a cluster named `adk-toolbox` in the `cloud-db-nl2sql` project in the `us-central2` location.", + "thought_signature": "CtYGAXLI2nw_UOdZpSS56kssJpDvhF6IDEQTWedupOmmqijRftni6Mp7d4f9rG70KjP3sAsy_g1TGMUFDcAp2VWxUsgdFERjVVyVDMDtI1spbIXKa0AQDPx_IVjbDVO5iMEBQa_kdEwRDExW_rPhMxUZusgyhbwF3HpZ-YJvjoBFTsg2dFjsZcMb-S8uGtHvlZpUlBoP5ukg2GtbgSpt2F_tcN6D1ZM6JPbxfkIK2IntyXO3Kt5DRFT3B-_w5tvEQNdueqfaAR7ITSXpO7bQsQ6FP5kr28mS_je0VyIX7VuCQT3mGNbohBaaVbX7f1KHYAd_MAJgGVClHqGhJaz7FVpX8BTMvPFn5yxZ0rM1jhW_g8WMjAJdEBI7B3oDo7qLVfSca6YatVcBHsGo6iENP4OM06uVnDXbpDXlQBwZZ9lBVx-yiTTUBVCx-I_q47QS3LYWPr1wG0uQPmcepODOm-NErBupDaVq440CSjeg5S0MCI5htiO3gGKEKJPPaCGX5x33wVip85fdcFbzdS7ye4RiLw0WbnEIMfhIbU91zdS1_UN7qB-FPhf_g4lygZc7R6msoait3s4-g9b5yuwzVbEUm-1estVJcyhVh62kGp7LHe3ttbIRKwvBqB9EFCG1gBiXwNwQM518_tBCnDUOtNbHgdoZotohg_-gWSbOBYLJZzAjY5JrDTknHBMtAtOR5NfLGk_KG1OxbJzuSo3hkak5yENeqeyaJ4r01asj9LDBsJVuPvnkmwz2CeNcbYEmZU-JT2hRykejkz_XVwJSIAftrQ3k_mCYDTlPA1YTsI41pxoSiYbhRqHKZzvhT3WNKoT1nWLrVQOIWQQrwXDplYAy_ayiwWx8miuDQSPfANdpNO3aRS_ZuziQX1Keh8cxsiaiiLnF5j3BjDTf1AFapvxnuEuf8Z7jI_lTW7eIABPtl1YsioLf6OzqLKsshA3JcQp99-L4JF-0Hqchf_4uX-kbEep-HqAAoJsIPPU18gE1Iv2x_NkRdk2NKTjt9Gh68jFkUX4o15CEUvDjIYitwXEqWQ2k_sbjRQdDnHzaLNleNyVMXqy0rx87dCIJVlev1P9KnsYWre6NMYsO_orvl9h7jPp88teP4pesUVYTLm4JmJYT9ZgR7FY=" + } + ], + "role": "model" + }, + "intermediate_data": { + "invocation_events": [ + { + "author": "root_agent", + "content": { + "parts": [ + { + "function_call": { + "id": "adk-f656353a-b968-4296-9a5f-c329e483454e", + "args": { + "project": "cloud-db-nl2sql", + "location": "us-central2" + }, + "name": "list_clusters" + }, + "thought_signature": "Ct8EAXLI2nwx9Vmt958q3WRKIxVOoCf0wfNBZRo07i2Cs4DTHNW6HAO4e_EAJTPJkTBDMfqBpGcXVTshaBsHOsMqFwZrt7rVKzwNzNguU412p9V-_lGhKDgYMr_ymyG_gOyTvDXuFrqHvqwUBRD2_OO0R5kn1xULhfavGopAxu5mXxfqHnek6Tpokwm7u17hjuL0v8eT3nxlI8KAjjp5BoFvfmT4i9Bb_djR_ndIVHrPopRyoPd1cMmrcfjfgHbUVmtIlTFi1WOPnjAiSssChObcLFrPiFggWKfD3TXpGNIrkKBXioxbS3PasaHiGHTWp_KYjNuE9Qt5yrZ7pUMxvL6mokQpvlAHVuKJYo90Mzo7SNP5g0xlC3Uo0Zxb9zcGEQboBKuzT3DOuPP2LWhFPiP7iumS4ddMM9EBmtwKsZ-5of2GSdIKSW53KY3ygDa1RWn2nWziGYCsykXlpEWyMQCIIbtdwLAWjwi3jQRK3J7BK1js4GKhVnOsT8GU-EvAi9yBLpDuqrNEnA8wittY7UXbAX1JoZm_jfjrXJPd8eovyBLNdcgCq4c436ugVawgfyjeHcU2KGO1sp_dntKnZtooihZKOB-3992aPxaw8IiFaFb_iqox_JftQby3lHIGwcR_YNKjx0c-pGZIUaGCOYzkB1-61K2QdVuVmps4pgutCSXgUhvJZWtKwrO-qeC0hnxSbTaQBQHJle4Cksw2Gn2xRX42T_JhQwu4mkw4TXD2lAf5pQrc5eZukPy-twGstF5afUXjt8paS1_xSDbt2kI7uy067Q-81r1Wag0ShAm80w==" + } + ], + "role": "model" + } + }, + { + "author": "root_agent", + "content": { + "parts": [ + { + "function_response": { + "id": "adk-f656353a-b968-4296-9a5f-c329e483454e", + "name": "list_clusters", + "response": { + "result": "{\"clusters\":[{\"automatedBackupPolicy\":{\"backupWindow\":\"3600s\",\"location\":\"us-central2\",\"timeBasedRetention\":{\"retentionPeriod\":\"1209600s\"},\"weeklySchedule\":{\"daysOfWeek\":[\"MONDAY\",\"TUESDAY\",\"WEDNESDAY\",\"THURSDAY\",\"FRIDAY\",\"SATURDAY\",\"SUNDAY\"],\"startTimes\":[{\"hours\":23}]}},\"clusterType\":\"PRIMARY\",\"continuousBackupConfig\":{\"enabled\":true,\"recoveryWindowDays\":14},\"continuousBackupInfo\":{\"enabledTime\":\"2025-12-16T18:58:29.364924883Z\",\"encryptionInfo\":{\"encryptionType\":\"GOOGLE_DEFAULT_ENCRYPTION\"},\"schedule\":[\"MONDAY\",\"TUESDAY\",\"WEDNESDAY\",\"THURSDAY\",\"FRIDAY\",\"SATURDAY\",\"SUNDAY\"]},\"createTime\":\"2025-12-16T18:58:29.597395684Z\",\"databaseVersion\":\"POSTGRES_16\",\"encryptionInfo\":{\"encryptionType\":\"GOOGLE_DEFAULT_ENCRYPTION\"},\"name\":\"projects/cloud-db-nl2sql/locations/us-central2/clusters/adk-toolbox\",\"network\":\"projects/898603957982/global/networks/default\",\"networkConfig\":{\"network\":\"projects/898603957982/global/networks/default\"},\"state\":\"READY\",\"subscriptionType\":\"STANDARD\",\"uid\":\"36d2bc60-e46b-41d7-b7a8-ed082a78fa01\",\"updateTime\":\"2025-12-16T19:05:18.786398365Z\"},{\"automatedBackupPolicy\":{\"backupWindow\":\"3600s\",\"location\":\"us-central2\",\"timeBasedRetention\":{\"retentionPeriod\":\"1209600s\"},\"weeklySchedule\":{\"daysOfWeek\":[\"MONDAY\",\"TUESDAY\",\"WEDNESDAY\",\"THURSDAY\",\"FRIDAY\",\"SATURDAY\",\"SUNDAY\"],\"startTimes\":[{\"hours\":8}]}},\"clusterType\":\"PRIMARY\",\"continuousBackupConfig\":{\"enabled\":true,\"recoveryWindowDays\":14},\"continuousBackupInfo\":{\"enabledTime\":\"2025-12-04T05:45:33.951947875Z\",\"encryptionInfo\":{\"encryptionType\":\"GOOGLE_DEFAULT_ENCRYPTION\"},\"schedule\":[\"MONDAY\",\"TUESDAY\",\"WEDNESDAY\",\"THURSDAY\",\"FRIDAY\",\"SATURDAY\",\"SUNDAY\"]},\"createTime\":\"2025-12-04T05:45:34.133749280Z\",\"databaseVersion\":\"POSTGRES_16\",\"encryptionInfo\":{\"encryptionType\":\"GOOGLE_DEFAULT_ENCRYPTION\"},\"name\":\"projects/cloud-db-nl2sql/locations/us-central2/clusters/postgres\",\"network\":\"projects/898603957982/global/networks/default\",\"networkConfig\":{\"network\":\"projects/898603957982/global/networks/default\"},\"state\":\"READY\",\"subscriptionType\":\"STANDARD\",\"uid\":\"75a5af5f-0050-4fb9-93ab-5e2ae3c4932a\",\"updateTime\":\"2025-12-16T18:18:07.956419930Z\"}]}" + } + } + } + ], + "role": "user" + } + } + ] + }, + "creation_timestamp": 1765911971.685792 + }, + { + "invocation_id": "e-3e6510d9-4699-4dd8-a1dd-756a33eadb30", + "user_content": { + "parts": [ + { + "text": "does adk-toolbox exist in cloud-db-nl2sql in us-central2\n" + } + ], + "role": "user" + }, + "final_response": { + "parts": [ + { + "text": "Yes, the cluster 'adk-toolbox' exists in the project 'cloud-db-nl2sql' in the 'us-central2' location." + } + ], + "role": "model" + }, + "intermediate_data": { + "invocation_events": [ + { + "author": "root_agent", + "content": { + "parts": [ + { + "function_call": { + "id": "adk-0d7054d7-c9c0-43a2-bfbe-cab8a0320a2d", + "args": { + "project": "cloud-db-nl2sql", + "cluster": "adk-toolbox", + "location": "us-central2" + }, + "name": "get_specific_cluster" + }, + "thought_signature": "CroEAXLI2ny_4mvjKQwaIjyPNcUnpiXw5FMTGQ7Jd_gU_zCgnZd8eaw78e7BXY4H03Xa9AvjntzRKTC4kCFzaUHitEW5eCopBZ-qoncfeFSBMkg0jJUuSv1Ls5dEmvAQOO_-L5bAt248ZGgBgNCYm-jfApQMUhKRuWwe7N1Zs4ZesYz2KNWaMvdeVGQRcpFw0z18w2n07I1e--iVNH-H2Xp9_J8n5-ipKxKPqX3qeV5Nxj6QvF1gxnexlm1Krfo4SDs7qCYdrjLvgaPbOFjfop-ndClCTbThJcUi67O0GiIqrPbW4_BK4vN1eFFPwUrW44zkAHJS7dBSBeT4LvpDAmrwXOWGLb8ESxxZNob-ff36skAn8wrbIBs899ofm12JGiTLApbu5xbzKifB1K4yDJ6hi2aFvNk3YI_S40N5YtpRuK7TZoTpifjOgLIauTjeeDV8XvyqEZ4rOEat_E6jjvpuzPwXY2Hkvr-qq4oYAEQ0b-4s2ILq9dBefIIfPGtI_xyLgk_4kSCXWOGwknXH6vUjI7LgurCnj-PAnJhAwVcv4ox97Zb1Th-LZGXpVxswoNw1-mx0E3DBQ5VAjg7ySsfExyU6MuR2CfBDuRqDN1RKGspFA56j3WOSLsUzLPckpOlqIOSn1TLKJ2UPPc1D4-eyY7V6SHNAXqt3pQaSEBNU3MGKNOi7KD66R_WWsXUYYrrdLne62WKN50dhOv5bgDoS5NjyaMD-fl3GMHqXTWr_31-VYuQOAyNi8izE" + } + ], + "role": "model" + } + }, + { + "author": "root_agent", + "content": { + "parts": [ + { + "function_response": { + "id": "adk-0d7054d7-c9c0-43a2-bfbe-cab8a0320a2d", + "name": "get_specific_cluster", + "response": { + "result": "{\"automatedBackupPolicy\":{\"backupWindow\":\"3600s\",\"location\":\"us-central2\",\"timeBasedRetention\":{\"retentionPeriod\":\"1209600s\"},\"weeklySchedule\":{\"daysOfWeek\":[\"MONDAY\",\"TUESDAY\",\"WEDNESDAY\",\"THURSDAY\",\"FRIDAY\",\"SATURDAY\",\"SUNDAY\"],\"startTimes\":[{\"hours\":23}]}},\"clusterType\":\"PRIMARY\",\"continuousBackupConfig\":{\"enabled\":true,\"recoveryWindowDays\":14},\"continuousBackupInfo\":{\"enabledTime\":\"2025-12-16T18:58:29.364924883Z\",\"encryptionInfo\":{\"encryptionType\":\"GOOGLE_DEFAULT_ENCRYPTION\"},\"schedule\":[\"MONDAY\",\"TUESDAY\",\"WEDNESDAY\",\"THURSDAY\",\"FRIDAY\",\"SATURDAY\",\"SUNDAY\"]},\"createTime\":\"2025-12-16T18:58:29.597395684Z\",\"databaseVersion\":\"POSTGRES_16\",\"encryptionInfo\":{\"encryptionType\":\"GOOGLE_DEFAULT_ENCRYPTION\"},\"name\":\"projects/cloud-db-nl2sql/locations/us-central2/clusters/adk-toolbox\",\"network\":\"projects/898603957982/global/networks/default\",\"networkConfig\":{\"network\":\"projects/898603957982/global/networks/default\"},\"state\":\"READY\",\"subscriptionType\":\"STANDARD\",\"uid\":\"36d2bc60-e46b-41d7-b7a8-ed082a78fa01\",\"updateTime\":\"2025-12-16T19:05:18.786398365Z\"}" + } + } + } + ], + "role": "user" + } + } + ] + }, + "creation_timestamp": 1765912006.326264 + }, + { + "invocation_id": "e-d43424e4-274a-42f0-b3f3-14f338019f89", + "user_content": { + "parts": [ + { + "text": "does instance adk-toolbox-instance exist in cluster adk-toolbox in project cloud-db-nl2sql in us-central2\n" + } + ], + "role": "user" + }, + "final_response": { + "parts": [ + { + "text": "Yes, the instance 'adk-toolbox-instance' exists in the cluster 'adk-toolbox' in the project 'cloud-db-nl2sql' in the 'us-central2' location." + } + ], + "role": "model" + }, + "intermediate_data": { + "invocation_events": [ + { + "author": "root_agent", + "content": { + "parts": [ + { + "function_call": { + "id": "adk-b90247ff-c2a4-4fca-8d08-dce336711e98", + "args": { + "location": "us-central2", + "instance": "adk-toolbox-instance", + "cluster": "adk-toolbox", + "project": "cloud-db-nl2sql" + }, + "name": "get_specific_instance" + }, + "thought_signature": "CtIHAXLI2nw-7oWAoejav-cFodP9-2zkBszj_CMx62IHNlLPc6221f5IMqOpFqgp84G_KYolHNw38h8hFPRDPJf8ysith423z4XQT0cwthVp8zp6ihJXh7muOPvlIhSYB0fGARsqMj279IOEzcNYkNmewKzYYtXa0wSvN4VV2zdnQCWbpWoBOlja2hlRFQe4SZEY5QktlyTNSz3gyLWsSECKdCqH-mr3nRNooauhzSpaLMYNmo17uH_m_KcbSNYvIzV_0ov_AIUqHQa7C6uZy_W5PFx0CsFV6_gtthRdFN9P-4c7BLAGVfsVOW3fBxlVNIFgExUN0m_gFpuTr1t8OwGoayDO3ArOfgUg-eCcsGBJXbz2blb3zvabl42rormEKoohHeTBxLDbQDZ64XWk5ARM2mY5hF_DXwVPVura3JnHMrlSvXNhz0TXIq6R0-UKUzsqOJHDOSUtnWBJ8NSmq5Mz9qgwev9Z6qftSEQ9uqnx1znx6nFNtOs1fU5JWTbQVRVd4uG6B5eZLWXpkCTmEKXp3e23AuWF0r8Nni7WrbKknXwvQKZxJdjuv0QV0Xqu5qVPR9EaRP1S2bA8ejeyoqEuW53YJ3I8NfRrXdctwaGpHqSLJKUe9VPFULsCKKKKffvl_bJmICdLGKC47KEK4d7CMqQX-GIQsIxJ-yLIZppTbq3lAV4oXLDH4Z6UJ9lVHuVja5wRRWj17AC-71So0JECcHZO5Mt9wpfCHtgSqXf8aGtAv_KWNsh5tC8KdIg832ijjIlKzzCJQRIKssOphqr_IXCdByNJldt6HbobIdyRDckxMaJRlKbwN8x_czBVW_uT5uIwC3muoFAbZpXod76QZ8V_ONSQR66NHYUkz5apRnkfQESyMVBAjmb0StzIotbA94NUqtWnvbYiJ6l7wD9oNbX2sru2JL26wOCW6N_BCIIIjSyQCq62614bL8b9uTgqh1Io8WDdB89HDc2dUQcmzWNAj8G64pHQwXrcQNaF01Yznah_wBnA4qzcZrSb4uSZQRvsMwaYXRBKQcBFbREq5-hQBJ6-2HPuAY4jHPuert_W2Tmp37GYuV21sG-BX3yA8d4n5lvKU06lNrW7YP88aJqVJNDH0bRcQ0VQYbMtKZ0Vz6_ZwS1cbBMkjfGEkaab7iEkHD-LO9nRCK0V8mZF2qL-HQtEytfCwEnNpaHG9WIYK3GmDVdkVI8Y7DsniiuPHS3hZamfizttiPow6yxvXHX3mv93sQKJOawWLNgIYOSFuJlOIh92472QxePUxuM_CTOT2vbK_ePs7XIrMhB8uSfC" + } + ], + "role": "model" + } + }, + { + "author": "root_agent", + "content": { + "parts": [ + { + "function_response": { + "id": "adk-b90247ff-c2a4-4fca-8d08-dce336711e98", + "name": "get_specific_instance", + "response": { + "result": "{\"activationPolicy\":\"ALWAYS\",\"availabilityType\":\"REGIONAL\",\"clientConnectionConfig\":{\"sslConfig\":{\"sslMode\":\"ENCRYPTED_ONLY\"}},\"createTime\":\"2025-12-16T18:58:40.730828585Z\",\"databaseFlags\":{\"password.enforce_complexity\":\"on\"},\"instanceType\":\"PRIMARY\",\"ipAddress\":\"10.19.131.17\",\"machineConfig\":{\"cpuCount\":4},\"name\":\"projects/cloud-db-nl2sql/locations/us-central2/clusters/adk-toolbox/instances/adk-toolbox-instance\",\"networkConfig\":{\"enablePublicIp\":true},\"nodes\":[{\"zoneId\":\"us-central2-b\"}],\"observabilityConfig\":{\"maxQueryStringLength\":10240,\"queryPlansPerMinute\":20,\"trackWaitEventTypes\":true,\"trackWaitEvents\":true},\"publicIpAddress\":\"35.186.100.136\",\"queryInsightsConfig\":{\"queryPlansPerMinute\":5,\"queryStringLength\":1024},\"state\":\"READY\",\"uid\":\"88ef5b4c-7690-4a04-b5a3-612fa2f93059\",\"updateTime\":\"2025-12-16T19:06:48.571852656Z\",\"writableNode\":{\"zoneId\":\"us-central2-a\"}}" + } + } + } + ], + "role": "user" + } + } + ] + }, + "creation_timestamp": 1765912017.128056 + }, + { + "invocation_id": "e-57fbdcf4-3c0f-479c-ae4a-2905f5d69962", + "user_content": { + "parts": [ + { + "text": "list users in cluster adk-toolbox in project cloud-db-nl2sql in us-central2\n" + } + ], + "role": "user" + }, + "final_response": { + "parts": [ + { + "text": "There is one user named 'postgres' in the cluster 'adk-toolbox' in the project 'cloud-db-nl2sql' in the 'us-central2' location. This user has the 'alloydbsuperuser' role and is an 'ALLOYDB_BUILT_IN' user." + } + ], + "role": "model" + }, + "intermediate_data": { + "invocation_events": [ + { + "author": "root_agent", + "content": { + "parts": [ + { + "function_call": { + "id": "adk-306b09d7-914a-4207-abf0-13d78ab0bd04", + "args": { + "project": "cloud-db-nl2sql", + "cluster": "adk-toolbox", + "location": "us-central2" + }, + "name": "list_users" + }, + "thought_signature": "CtYFAXLI2ny9eS22ESi2cGIIIO6rsniZozHgbw2rGYdujQ693fO13GYPimk9I9OFh_ueu7VRPDgK1eqbsSAzI9GBMI5gCOTMmBHyVl5LRgxMwS2mbjfrGkQCvi_IL82oftHam5JO5gzBoiQrq9LMhAvJ9RwCKfgaPzI1xc90_2erdDPSb5-mVloJhqKMZmlM1s3KQBmRc51mWN-kEe3BZd3Nr5ooGdD9OZEgOJi3rTvJxcoKObfA0_OjtCfgOWlsZHs7CYLotAhMqGk2Jh5bOcRwlIah0XpboeQ8qBYFDi84OoDosfJp1iHpzXeZTW_f2Tml70qLdzmTpIsMoS1Rk1HF00jcIzNlgrSIaWY1JxtLF4Kazr7cI4ToIJj4o-YI0YbTxRGyzlvgd3cqpQbkvxvyvDfZQiiU0VLTPduYlbRtTc-Vsyc4bmu2m64pqqZEwLS19CBZgB5Uh8wXq1oBimCtgFI4DqOgBdjva7LIyUJlgm1twoqbWKy6VYZus8upKf9FPLfrVdzj_annsWTt-NWSJTARnypeeFFvsDe6HXWPJ5bxR6oEgXEhbcJXxPy57j5pgWVV6lOm_USoWKkb1gt9mxGG76qdAFWhsswm1GYP9EZZvE0k84fLDaEQnXfe4z8T0bKCWDsSDS9MLfAE44qvDcn72jk3Xam_br3BMrekuEiOGlnlpNdardx8ZmORJDMWudLQAjczpEV6jA_vInkGZJ2zF0Y1IH4LadjriAIxuxvNGcUju-JJ5vUiJrBpXCWmElvwV-jfiQEH8G2vxr85nMmQ9xtHoCKHbKE2vWB88MTna6maalVL00-Uxa2-bKh3MGyCBik_RT8iViY5NseFI8NMZ0s7qK-KbQ_VqCDCb6PBe_LPPsBJRizdxjBw8dvtDqSd4vI_o7GgWlmV7787a-mR3awH2J0ZTb5hkw89C9leO0i5Xl2mXa-L2mmND50B_ZlxLdMx" + } + ], + "role": "model" + } + }, + { + "author": "root_agent", + "content": { + "parts": [ + { + "function_response": { + "id": "adk-306b09d7-914a-4207-abf0-13d78ab0bd04", + "name": "list_users", + "response": { + "result": "{\"users\":[{\"databaseRoles\":[\"alloydbsuperuser\"],\"name\":\"projects/cloud-db-nl2sql/locations/us-central2/clusters/adk-toolbox/users/postgres\",\"userType\":\"ALLOYDB_BUILT_IN\"}]}" + } + } + } + ], + "role": "user" + } + } + ] + }, + "creation_timestamp": 1765912024.093766 + }, + { + "invocation_id": "e-3db6d8cb-2bef-4c95-87ec-b42c36d6f6f3", + "user_content": { + "parts": [ + { + "text": "does a \"postgres\" user exist in cluster adk-toolbox in project cloud-db-nl2sql in us-central2" + } + ], + "role": "user" + }, + "final_response": { + "parts": [ + { + "text": "Yes, a user named \"postgres\" exists in the cluster \"adk-toolbox\" in the project \"cloud-db-nl2sql\" in \"us-central2\". This user has the `alloydbsuperuser` database role and is of type `ALLOYDB_BUILT_IN`." + } + ], + "role": "model" + }, + "intermediate_data": { + "invocation_events": [ + { + "author": "root_agent", + "content": { + "parts": [ + { + "function_call": { + "id": "adk-56f8e7ff-b43a-4571-bd98-bd159202f844", + "args": { + "project": "cloud-db-nl2sql", + "cluster": "adk-toolbox", + "location": "us-central2", + "user": "postgres" + }, + "name": "get_specific_user" + }, + "thought_signature": "CtECAXLI2nxGwQPKsaCM9UULuvVF0EZQTUyCDV6JjaT6pi036jVxEDZrT-9HOfltI1HYOWn4TvP4BFHJnW1Zc56WXK6hFD8HO016okPTBuNHz57wnkXTvI5AzG_xhcX7TYDaTnEvnAXv1o9yhQBASQtZAWy2qLoHe-csixat1yU0t-H-kdg72V6nqZ5FK27f_0_NF9Slgg9__hm22ErHo3Hr5Ez3uFzgSP6vqDn4Zv2SCMMds3XKBybwRDpi7cY06wlMoK6b9WrFWqGyj2SBYEdRHo17eUPo-vugIg7MXBAgiVvFDyIKdaYPaB6vgwlkfMUQCQpjcMhf-E3v2pft1kApuRb9vpTpoWGR1znDE5t18o5yD2Kj2SxRPx6R6Z_aW-rRVW8NwgY3vxitN4zZxB29ER5gSqjuXOPKQCojaGm7470bkpYO9MINAcT4tlUppRSeGw==" + } + ], + "role": "model" + } + }, + { + "author": "root_agent", + "content": { + "parts": [ + { + "function_response": { + "id": "adk-56f8e7ff-b43a-4571-bd98-bd159202f844", + "name": "get_specific_user", + "response": { + "result": "{\"databaseRoles\":[\"alloydbsuperuser\"],\"name\":\"projects/cloud-db-nl2sql/locations/us-central2/clusters/adk-toolbox/users/postgres\",\"userType\":\"ALLOYDB_BUILT_IN\"}" + } + } + } + ], + "role": "user" + } + } + ] + }, + "creation_timestamp": 1765912057.30368 + } + ], + "session_input": { + "app_name": "myagent", + "user_id": "user" + }, + "creation_timestamp": 1765912543.509523 + } + ], + "creation_timestamp": 1765912517.262793 +} \ No newline at end of file diff --git a/evalbench/agent/myagent/__init__.py b/evalbench/agent/myagent/__init__.py new file mode 100644 index 00000000..02c597e1 --- /dev/null +++ b/evalbench/agent/myagent/__init__.py @@ -0,0 +1 @@ +from . import agent diff --git a/evalbench/agent/myagent/agent.py b/evalbench/agent/myagent/agent.py new file mode 100644 index 00000000..6e3614b7 --- /dev/null +++ b/evalbench/agent/myagent/agent.py @@ -0,0 +1,20 @@ +from google.adk import Agent +from google.adk.apps import App +from toolbox_core import ToolboxSyncClient, auth_methods + +URL = "https://toolbox-zxwxgw5sma-uc.a.run.app" +auth_token_provider = auth_methods.aget_google_id_token(URL) # can also use sync method + +client = ToolboxSyncClient( + URL, + client_headers={"Authorization": auth_token_provider}, +) + +root_agent = Agent( + name="root_agent", + model="gemini-2.5-flash", + instruction="You are a helpful AI assistant designed to provide accurate and useful information.", + tools=client.load_toolset(), +) + +app = App(root_agent=root_agent, name="myagent") diff --git a/evalbench/dataset/dataset.py b/evalbench/dataset/dataset.py index 6bb6f55b..478780cd 100644 --- a/evalbench/dataset/dataset.py +++ b/evalbench/dataset/dataset.py @@ -6,6 +6,7 @@ from collections.abc import Sequence from dataset.evalinput import EvalInputRequest from dataset.evalinteractinput import EvalInteractInputRequest +from dataset.evaladkinput import EvalADKRequest from itertools import chain @@ -90,6 +91,19 @@ def load_bird_interact_dataset(json_file_path, config): return input_items +def load_adk_json(json_file_path): + all_items = [] + with open(json_file_path, "r") as json_file: + json_item = json_file.read() + item = json.loads(json_item) + eval_input = EvalADKRequest( + id=item["eval_set_id"], + payload=json_item, + ) + all_items.append(eval_input) + return all_items + + def load_json(json_file_path): all_items = [] with open(json_file_path, "r") as json_file: @@ -102,6 +116,8 @@ def load_dataset_from_json(json_file_path, config): dataset_format = config.get("dataset_format", "evalbench-standard-format") if dataset_format == "bird-interact-format": all_items = load_bird_interact_dataset(json_file_path, config) + elif dataset_format == "adk-format": + all_items = load_adk_json(json_file_path) else: all_items = load_json(json_file_path) @@ -114,11 +130,15 @@ def load_dataset_from_json(json_file_path, config): elif dataset_format == "bird-interact-format": config["orchestrator"] = "interact" input_items = all_items + elif dataset_format == "adk-format": + config["orchestrator"] = "adk" + input_items = all_items else: raise ValueError("Dataset not in any of the recognised formats") - totalEntries = sum(len(input_items.get(q, [])) for q in ["dql", "dml", "ddl"]) - logging.info(f"Converted {totalEntries} entries to EvalInput.") + if dataset_format != "adk-format": + totalEntries = sum(len(input_items.get(q, [])) for q in ["dql", "dml", "ddl"]) + logging.info(f"Converted {totalEntries} entries to EvalInput.") return input_items @@ -249,4 +269,6 @@ def breakdown_datasets(total_dataset: list[EvalInputRequest]): def flatten_dataset(dataset: dict[str, list]): + if type(dataset) == list: + return dataset return list(chain.from_iterable(dataset.values())) diff --git a/evalbench/dataset/evaladkinput.py b/evalbench/dataset/evaladkinput.py new file mode 100644 index 00000000..1c08de01 --- /dev/null +++ b/evalbench/dataset/evaladkinput.py @@ -0,0 +1,60 @@ +from google.protobuf.json_format import MessageToDict +import copy +from enum import Enum + +try: + from evalproto import eval_request_pb2 + + PROTO_IMPORTED = True +except ImportError: + PROTO_IMPORTED = False + + +class EvalADKRequest: + def __init__( + self, + id: str, + payload: dict, + job_id: str = "", + trace_id: str = "", + ): + """Initializes an EvalInputRequest object with all required fields. + + See eval_request_pb2 for types + """ + self.id = id + self.payload = payload + self.job_id = job_id + self.trace_id = trace_id + + if PROTO_IMPORTED: + + @classmethod + def init_from_proto(self, proto: eval_request_pb2.EvalInputRequest): # type: ignore + """Initializes an EvalInputRequest from eval_request_pb2 proto.""" + + request = MessageToDict(proto) + return self( + id=str(request.get("id") or -1), + payload=request.get("payload") or {}, + job_id=request.get("jobId") or "", + trace_id=request.get("traceId") or "", + ) + + def to_proto(self): # type: ignore + return eval_request_pb2.EvalInputRequest( # type: ignore + id=int(self.id), + payload=self.payload, + ) + + else: + + @classmethod + def init_from_proto(cls, proto): + raise ImportError("protobuf module not available") + + def to_proto(self): + raise ImportError("protobuf module not available") + + def copy(self): + return copy.deepcopy(self) diff --git a/evalbench/dataset/evaladkoutput.py b/evalbench/dataset/evaladkoutput.py new file mode 100644 index 00000000..bce46cd8 --- /dev/null +++ b/evalbench/dataset/evaladkoutput.py @@ -0,0 +1,9 @@ +from dataset.evaladkinput import EvalADKRequest + + +class EvalADKOutput(dict): + def __init__( + self, + evaladkinput: EvalADKRequest, + ): + self.update(evaladkinput.__dict__) diff --git a/evalbench/evaluator/__init__.py b/evalbench/evaluator/__init__.py index 2279322a..1f22cde6 100644 --- a/evalbench/evaluator/__init__.py +++ b/evalbench/evaluator/__init__.py @@ -1,6 +1,7 @@ from evaluator.orchestrator import Orchestrator from evaluator.oneshotorchestrator import OneShotOrchestrator from evaluator.interactorchestrator import InteractOrchestrator +from evaluator.adkorchestrator import ADKOrchestrator import logging @@ -11,5 +12,7 @@ def get_orchestrator(config, db_configs, setup_config, report_progress=False): return OneShotOrchestrator(config, db_configs, setup_config, report_progress) elif orchestrator_type == "interact": return InteractOrchestrator(config, db_configs, setup_config, report_progress) + elif orchestrator_type == "adk": + return ADKOrchestrator(config, db_configs, setup_config, report_progress) else: return Orchestrator(config, db_configs, setup_config, report_progress) diff --git a/evalbench/evaluator/adkevaluator.py b/evalbench/evaluator/adkevaluator.py new file mode 100644 index 00000000..3d344305 --- /dev/null +++ b/evalbench/evaluator/adkevaluator.py @@ -0,0 +1,164 @@ +from typing import Any, List +import datetime +from dataset.evaladkinput import EvalADKRequest +from dataset.evaladkoutput import EvalADKOutput +from queue import Queue +from databases import DB +import logging +import asyncio +from rich.console import Console +from google.genai.types import Part +from google.adk.evaluation.agent_evaluator import ( + AgentEvaluator, + EvalConfig, + EvalSet, + get_eval_metrics_from_config, + UserSimulatorProvider, + EvalStatus, +) +from google.adk.evaluation.eval_result import EvalCaseResult +from dotenv import load_dotenv +from typing import Optional + +console = Console() + + +class ADKEvaluator: + def __init__( + self, + config, + ): + self.config = config + self.agent_name = config["agent_name"] + self.agent_module = config["agent_module"] + self.eval_dataset_file = config["dataset_config"] + if "initial_session_file" in config: + self.initial_session_file = config["initial_session_file"] + else: + self.initial_session_file = None + + def evaluate( + self, + dataset: List[EvalADKRequest], + job_id: str, + run_time: datetime.datetime, + ): + eval_outputs: List[Any] = [] + scoring_results: List[Any] = [] + logging.info("Running ADK evaluation") + for eval_input in dataset: + self._evaluate(eval_input, eval_outputs, job_id, run_time, scoring_results) + return eval_outputs, scoring_results + + def _evaluate( + self, + eval_input: EvalADKRequest, + eval_outputs: List[Any], + job_id: str, + run_time: datetime.datetime, + scoring_results: List[Any], + num_runs: int = 1, + ): + test_files = [self.eval_dataset_file] + + initial_session = AgentEvaluator._get_initial_session(self.initial_session_file) + + for test_file in test_files: + eval_output = EvalADKOutput(eval_input) + eval_output["job_id"] = job_id + eval_output["run_time"] = run_time + eval_output["test_file"] = test_file + eval_config = AgentEvaluator.find_config_for_test_file(test_file) + eval_set = AgentEvaluator._load_eval_set_from_file( + test_file, eval_config, initial_session + ) + eval_output["eval_set"] = eval_set + asyncio.run( + self.evaluate_eval_set( + eval_output, + scoring_results, + agent_module=self.agent_module, + eval_set=eval_set, + eval_config=eval_config, + num_runs=num_runs, + agent_name=self.agent_name, + ) + ) + + async def evaluate_eval_set( + self, + eval_output: EvalADKOutput, + scoring_results: List[Any], + agent_module: str, + eval_set: EvalSet, + criteria: Optional[dict[str, float]] = None, + eval_config: Optional[EvalConfig] = None, + num_runs: int = 1, + agent_name: Optional[str] = None, + ): + agent_for_eval = await AgentEvaluator._get_agent_for_eval( + module_name=agent_module, agent_name=None + ) + eval_metrics = get_eval_metrics_from_config(eval_config) + + user_simulator_provider = UserSimulatorProvider( + user_simulator_config=eval_config.user_simulator_config + ) + + eval_results_by_eval_id = await AgentEvaluator._get_eval_results_by_eval_id( + agent_for_eval=agent_for_eval, + eval_set=eval_set, + eval_metrics=eval_metrics, + num_runs=num_runs, + user_simulator_provider=user_simulator_provider, + ) + self.process_results(eval_results_by_eval_id) + eval_output["eval_results_by_eval_id"] = eval_results_by_eval_id + return eval_results_by_eval_id + + def merge_parts(self, parts: list[Part]): + return "".join([part.text for part in parts]) + + def process_eval_result( + self, + eval_result: EvalCaseResult, + ): + for eval_metric_result in eval_result.overall_eval_metric_results: + console.print( + f"{eval_metric_result.metric_name}: {eval_metric_result.eval_status}: {eval_metric_result.score}" + ) + + for ( + eval_metric_result_per_invocation + ) in eval_result.eval_metric_result_per_invocation: + console.print( + "*************************************************************************************\n" + ) + console.print( + f"USER: {self.merge_parts(eval_metric_result_per_invocation.actual_invocation.user_content.parts)}" + ) + console.print( + f"ASSISTANT: {self.merge_parts(eval_metric_result_per_invocation.actual_invocation.final_response.parts)}" + ) + for ( + eval_metric_result + ) in eval_metric_result_per_invocation.eval_metric_results: + console.print( + f"{eval_metric_result.metric_name}: {eval_metric_result.eval_status}: {eval_metric_result.score}" + ) + console.print( + "*************************************************************************************\n" + ) + + def process_results( + self, + eval_results_by_eval_id: dict[str, list], + ): + for ( + eval_id, + eval_results, + ) in eval_results_by_eval_id.items(): + print(f"Processing eval results for eval id: {eval_id}") + for eval_result in eval_results: + print(type(eval_result)) + self.process_eval_result(eval_result) diff --git a/evalbench/evaluator/adkorchestrator.py b/evalbench/evaluator/adkorchestrator.py new file mode 100644 index 00000000..9089c281 --- /dev/null +++ b/evalbench/evaluator/adkorchestrator.py @@ -0,0 +1,30 @@ +from evaluator.orchestrator import Orchestrator +import uuid +import datetime +from dataset.evaladkinput import EvalADKRequest +import logging +from evaluator.adkevaluator import ADKEvaluator + + +class ADKOrchestrator(Orchestrator): + def __init__( + self, + config, + db_configs, + setup_config, + report_progress=False, + ): + self.config = config + self.db_configs = db_configs + self.setup_config = setup_config + self.job_id = f"{uuid.uuid4()}" + self.run_time = datetime.datetime.now() + self.total_eval_outputs = [] + self.total_scoring_results = [] + self.reporting_total_evals_done = 0 + self.report_progress = False + + def evaluate(self, dataset: list[EvalADKRequest]): + logging.info("Starting ADK evaluation") + evaluator = ADKEvaluator(self.config) + return evaluator.evaluate(dataset, self.job_id, self.run_time) diff --git a/evalbench/evaluator/interactorchestrator.py b/evalbench/evaluator/interactorchestrator.py index 6643d974..1680ea47 100644 --- a/evalbench/evaluator/interactorchestrator.py +++ b/evalbench/evaluator/interactorchestrator.py @@ -190,13 +190,11 @@ def evaluate_sub_dataset( ) total_eval_outputs.extend(eval_outputs) total_scoring_results.extend(scoring_results) - break except Exception as e: logging.info( f"Failed to evaluate {sub_dataset_len} {query_type} queries " + f"on DB {database} on {dialect}. Due to {e}" ) - break # Cleanup all the tmp creations that were built from the core connection if core_db: diff --git a/evalbench/run.sh b/evalbench/run.sh index 45243d41..e678de43 100755 --- a/evalbench/run.sh +++ b/evalbench/run.sh @@ -1,4 +1,5 @@ #!/bin/bash +EVAL_CONFIG=datasets/adk-tools/example_run_config.yaml if [[ -z "${EVAL_CONFIG}" ]]; then echo "EVAL_CONFIG is required"; @@ -9,4 +10,5 @@ fi ulimit -n 4096 export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python +export PYTHONPATH=$PYTHONPATH:evalbench/agent python3 evalbench/evalbench.py --experiment_config="$EVAL_CONFIG" \ No newline at end of file diff --git a/evalbench/scorers/adkscorer.py b/evalbench/scorers/adkscorer.py new file mode 100644 index 00000000..fc37ceb7 --- /dev/null +++ b/evalbench/scorers/adkscorer.py @@ -0,0 +1,42 @@ +"""ADKScorer implementation.""" + +from typing import Tuple, Any +from scorers import comparator + + +class ADKScorer(comparator.Comparator): + """ + ADKScorer class implements the Comparator base class using ADK evaluation. + + Attributes: + 1. name: Name of the comparator. Set to "adk_scorer" + 2. config: the scorer config defined in the run config yaml file + """ + + def __init__(self, config: dict): + super().__init__(config) + self.name = "adk_scorer" + self.config = config + + def compare( + self, + nl_prompt: Any, + golden_query: Any, + query_type: Any, + golden_execution_result: Any, + golden_eval_result: Any, + golden_error: Any, + generated_query: Any, + generated_execution_result: Any, + generated_eval_result: Any, + generated_error: Any, + ) -> Tuple[float, str]: + """compare function implements the comparison logic for ADKScorer. + + For now, this is a placeholder implementation. + """ + # Placeholder logic: return 100.0 if there are no errors, else 0.0 + if generated_error: + return 0.0, f"Generated query had an error: {generated_error}" + + return 100.0, "ADK scoring completed successfully (placeholder)." diff --git a/evalbench/scorers/score.py b/evalbench/scorers/score.py index 027efad5..2844ae1f 100644 --- a/evalbench/scorers/score.py +++ b/evalbench/scorers/score.py @@ -8,6 +8,7 @@ from scorers import llmrater from scorers import returnedsql from scorers import executablesql +from scorers import adkscorer from dataset.evaloutput import EvalOutput import logging @@ -46,6 +47,8 @@ def compare( comparators.append( executablesql.ExecutableGenerationScore(scorers["executable_sql"]) ) + if "adk_scorer" in scorers: + comparators.append(adkscorer.ADKScorer(scorers["adk_scorer"])) for comp in comparators: score = 0 diff --git a/evalbench/util/service.py b/evalbench/util/service.py index 56d4813e..bf0260c1 100644 --- a/evalbench/util/service.py +++ b/evalbench/util/service.py @@ -3,10 +3,10 @@ def load_session_configs(session): return ( - session["config"], - session["db_configs"], - session["model_config"], - session["setup_config"], + session["config"] if "config" in session else None, + session["db_configs"] if "db_configs" in session else None, + session["model_config"] if "model_config" in session else None, + session["setup_config"] if "setup_config" in session else None, ) diff --git a/requirements.txt b/requirements.txt index f83628be..10f8fbff 100644 --- a/requirements.txt +++ b/requirements.txt @@ -29,4 +29,7 @@ redis anthropic[vertex] sqlglot pymongo -mongomock \ No newline at end of file +mongomock +google-adk +toolbox-core +google-adk[eval] \ No newline at end of file