Tool use evolves from static ReAct-style prompting and supervised datasets to Tool-Integrated Reasoning (TIR) where RL teaches agents when to invoke tools, how to compose them effectively, and how to recover from failures. This outcome-driven approach moves beyond imitation to discover optimal tool strategies through environmental feedback.
- Evolution Path: ReAct prompting → SFT datasets → RL-optimized tool policies
- Dynamic Selection: Agents learn tool choice based on task context rather than fixed rules
- Composition Learning: RL discovers tool chaining and error recovery strategies
- Credit Assignment: Long-horizon TIR enables temporal credit across tool sequences
# Verify tool integration libraries
python -c "import requests, json; print('HTTP tools ready')"
python -c "import subprocess; print('System tools ready')"
python -c "import torch, transformers; print('ML integration ready')"
# Conceptual check
echo "Do you understand function calling and API interfaces?"
echo "Are you familiar with the ReAct (Reasoning + Acting) framework?"
echo "Have you completed Module 3.1 (Planning)?"

import json
import random
import time
from typing import Dict, List, Any, Optional
from dataclasses import dataclass
@dataclass
class ToolResult:
    """Outcome of a single tool invocation."""
    success: bool  # True when the tool ran without raising / failing
    result: Any  # tool-specific payload; None on failure
    execution_time: float  # wall-clock seconds spent inside the tool
    error_message: Optional[str] = None  # populated only on failure
class RuleBasedToolAgent:
    """Traditional rule-based tool selection.

    Tools are chosen by fixed keyword matching; no learning is involved.
    Serves as the static baseline that the RL-optimized agents are
    compared against.
    """

    # Maps each rule category in ``selection_rules`` to the tool it selects.
    _RULE_TO_TOOL = {
        'math_keywords': 'calculator',
        'search_keywords': 'web_search',
        'file_keywords': 'file_reader',
        'code_keywords': 'code_executor',
    }

    def __init__(self):
        # Tool name -> bound implementation.
        self.tools = {
            'calculator': self.calculator_tool,
            'web_search': self.web_search_tool,
            'file_reader': self.file_reader_tool,
            'code_executor': self.code_executor_tool
        }
        # Fixed rules for tool selection (checked in insertion order).
        self.selection_rules = {
            'math_keywords': ['calculate', 'compute', '+', '-', '*', '/', '%', 'equation'],
            'search_keywords': ['search', 'find', 'lookup', 'research', 'what is'],
            'file_keywords': ['read', 'file', 'document', 'load', 'open'],
            'code_keywords': ['run', 'execute', 'python', 'code', 'script']
        }

    def select_tool(self, task: str) -> Optional[str]:
        """Return the first tool whose keyword list matches the task, or None."""
        task_lower = task.lower()
        # Iterate the rules dict directly instead of a duplicated hardcoded
        # list; dict order is insertion order, so matching priority is kept.
        for rule_name, keywords in self.selection_rules.items():
            if any(keyword in task_lower for keyword in keywords):
                return self._RULE_TO_TOOL[rule_name]
        return None  # No tool needed

    def execute_task(self, task: str) -> Dict[str, Any]:
        """Execute task using rule-based tool selection.

        Returns a result dict with keys: action, tool_used, result,
        success, execution_time (and error when a tool was used).
        """
        start_time = time.time()
        # Select tool based on rules
        selected_tool = self.select_tool(task)
        if not selected_tool:
            return {
                'action': 'direct_response',
                'tool_used': None,
                'result': 'Responding directly without tools',
                'execution_time': time.time() - start_time,
                'success': True
            }
        # Execute selected tool
        tool_function = self.tools[selected_tool]
        tool_result = tool_function(task)
        return {
            'action': 'tool_use',
            'tool_used': selected_tool,
            'result': tool_result.result,
            'success': tool_result.success,
            'execution_time': time.time() - start_time,
            'error': tool_result.error_message
        }

    # Simplified tool implementations

    def calculator_tool(self, query: str) -> "ToolResult":
        """Evaluate the first simple binary expression in the query.

        Bug fix: the original always ADDED the first two numbers even for
        '*', '-', '/' or '%' queries. Now the first arithmetic operator
        found in the query decides the operation ('+' when none is found,
        preserving the old behavior for plain number pairs).
        """
        try:
            import re
            numbers = re.findall(r'\d+\.?\d*', query)
            if len(numbers) >= 2:
                a, b = float(numbers[0]), float(numbers[1])
                op_match = re.search(r'[+\-*/%]', query)
                op = op_match.group(0) if op_match else '+'
                if op == '+':
                    result = a + b
                elif op == '-':
                    result = a - b
                elif op == '*':
                    result = a * b
                elif op == '/':
                    result = a / b  # ZeroDivisionError caught below
                else:  # '%'
                    result = a % b
                return ToolResult(True, f"Result: {result}", 0.1)
            else:
                return ToolResult(False, None, 0.1, "Could not parse mathematical expression")
        except Exception as e:
            return ToolResult(False, None, 0.1, str(e))

    def web_search_tool(self, query: str) -> "ToolResult":
        """Simulated web search (no network access; returns mock hits)."""
        time.sleep(0.2)  # Simulate network delay
        mock_results = [
            f"Search result 1 for '{query[:20]}...'",
            f"Search result 2 for '{query[:20]}...'",
            f"Search result 3 for '{query[:20]}...'"
        ]
        return ToolResult(True, mock_results, 0.2)

    def file_reader_tool(self, query: str) -> "ToolResult":
        """Simulated file reading (no filesystem access)."""
        return ToolResult(True, f"File content for query: {query}", 0.05)

    def code_executor_tool(self, query: str) -> "ToolResult":
        """Simulated code execution (nothing is actually run)."""
        return ToolResult(True, "Code executed successfully", 0.3)
# Demo rule-based tool selection
rule_agent = RuleBasedToolAgent()
# One task per tool category, plus one requiring no tool at all.
tasks = [
    "Calculate 15 + 27",
    "Search for information about machine learning",
    "Read the contents of data.txt",
    "Execute this Python code: print('hello')",
    "Explain the concept of gravity"
]
print("=== Rule-Based Tool Selection ===")
for task in tasks:
    result = rule_agent.execute_task(task)
    print(f"\nTask: {task}")
    print(f"Action: {result['action']}")
    # tool_used is None for direct responses, so the line is skipped then.
    if result['tool_used']:
        print(f"Tool: {result['tool_used']}")
    print(f"Success: {result['success']}")
    print(f"Time: {result['execution_time']:.3f}s")

import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from collections import deque
class ToolSelectionPolicy(nn.Module):
    """Neural policy for tool selection and composition.

    Encodes the task text with an embedding + LSTM, then produces
    (a) logits over tools plus a "no tool" action, and (b), when a tool
    history vector is given, logits for the continue/terminate/retry
    composition decision.
    """

    def __init__(self, vocab_size: int, num_tools: int, hidden_dim: int = 128):
        super().__init__()
        # Text encoder for task understanding. The LSTM is the last module
        # in the Sequential, so calling it returns the LSTM's
        # (output, (h, c)) tuple directly.
        self.text_encoder = nn.Sequential(
            nn.Embedding(vocab_size, hidden_dim),
            nn.LSTM(hidden_dim, hidden_dim, batch_first=True),
        )
        # Tool selection head
        self.tool_selector = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, num_tools + 1)  # +1 for "no tool" option
        )
        # Tool composition head (for chaining)
        self.composition_head = nn.Sequential(
            nn.Linear(hidden_dim * 2, hidden_dim),  # Current state + tool result
            nn.ReLU(),
            nn.Linear(hidden_dim, 3)  # Continue, terminate, or retry
        )

    def forward(self, task_embedding, tool_history=None):
        """Forward pass for tool selection.

        Args:
            task_embedding: LongTensor of token indices, (batch, seq_len).
            tool_history: optional FloatTensor, (batch, hidden_dim) or
                (hidden_dim,), encoding previous tool results.

        Returns:
            (tool_logits, composition_logits) — the latter is None when no
            history is supplied.
        """
        # Encode task
        lstm_out, (hidden, _) = self.text_encoder(task_embedding)
        task_features = hidden[-1]  # Last layer's hidden state, (batch, hidden_dim)
        # Tool selection
        tool_logits = self.tool_selector(task_features)
        # Composition decision (if tool history provided)
        composition_logits = None
        if tool_history is not None:
            # Bug fix: callers passed a flat (hidden_dim,) history, which
            # made torch.cat fail against (batch, hidden_dim) features
            # (cat requires equal ranks). Promote 1-D histories to a
            # batch of one.
            if tool_history.dim() == 1:
                tool_history = tool_history.unsqueeze(0)
            combined = torch.cat([task_features, tool_history], dim=-1)
            composition_logits = self.composition_head(combined)
        return tool_logits, composition_logits
class RLToolAgent:
    """RL-optimized tool selection and composition agent.

    Wraps a ToolSelectionPolicy and trains it with REINFORCE on collected
    experiences. Tool implementations are borrowed from the rule-based
    agent; this class only learns *which* tools to invoke and when to
    stop chaining them.
    """

    # Must match ToolSelectionPolicy's default hidden_dim: the composition
    # head concatenates the task features with this history vector.
    HISTORY_DIM = 128

    def __init__(self, vocab_size: int = 1000, num_tools: int = 4):
        self.policy = ToolSelectionPolicy(vocab_size, num_tools)
        self.optimizer = optim.Adam(self.policy.parameters(), lr=0.001)
        self.tools = ['calculator', 'web_search', 'file_reader', 'code_executor']
        self.tool_implementations = RuleBasedToolAgent().tools
        # Experience buffer for training
        self.experience_buffer = deque(maxlen=1000)
        # Tool usage statistics for learning
        self.tool_success_rates = {tool: 0.5 for tool in self.tools}
        self.tool_usage_counts = {tool: 0 for tool in self.tools}

    def encode_task(self, task: str) -> torch.Tensor:
        """Simple task encoding (in practice: use proper tokenization).

        Hashes up to the first 10 words into vocab indices and zero-pads
        to length 10. NOTE: Python's str hash is salted per process
        (PYTHONHASHSEED), so encodings are stable within a run but not
        across runs.
        """
        words = task.lower().split()
        indices = [abs(hash(word)) % 1000 for word in words[:10]]  # Max 10 words
        # Pad to consistent length
        while len(indices) < 10:
            indices.append(0)
        return torch.tensor(indices).unsqueeze(0)  # Batch dimension

    def select_tool_sequence(self, task: str, max_tools: int = 3) -> List[Dict]:
        """RL-based tool selection and composition.

        Samples tools from the policy one at a time, executing each, until
        the "no tool" action is drawn, a tool fails, the composition head
        votes to stop, or max_tools is reached.
        """
        task_embedding = self.encode_task(task)
        tool_sequence = []
        current_state = task_embedding
        # Bug fix: the history must be batched (1, dim) — a flat (dim,)
        # vector made torch.cat inside the policy fail against the
        # (1, dim) task features.
        tool_history = torch.zeros(1, self.HISTORY_DIM)
        for step in range(max_tools):
            with torch.no_grad():
                tool_logits, composition_logits = self.policy(current_state, tool_history)
            # Sample tool selection
            tool_probs = torch.softmax(tool_logits, dim=-1)
            tool_idx = torch.multinomial(tool_probs, 1).item()
            if tool_idx == len(self.tools):  # "No tool" option
                break
            selected_tool = self.tools[tool_idx]
            tool_confidence = float(tool_probs[0, tool_idx].item())
            # Execute tool
            tool_result = self.execute_tool(selected_tool, task)
            tool_sequence.append({
                'step': step,
                'tool': selected_tool,
                'confidence': tool_confidence,
                'result': tool_result,
                'success': tool_result.success
            })
            # Update tool history for next iteration. Placeholder random
            # encoding — a real system would embed the tool result.
            tool_history = torch.randn(1, self.HISTORY_DIM)
            # Composition decision - should we continue?
            if composition_logits is not None:
                comp_probs = torch.softmax(composition_logits, dim=-1)
                continue_prob = float(comp_probs[0, 0].item())
                if continue_prob < 0.4 or not tool_result.success:
                    break
        return tool_sequence

    def execute_tool(self, tool_name: str, task: str) -> "ToolResult":
        """Execute a tool by name and update its running success statistics."""
        start_time = time.time()
        try:
            tool_func = self.tool_implementations[tool_name]
            result = tool_func(task)
            # Update success rate statistics
            self.tool_usage_counts[tool_name] += 1
            current_rate = self.tool_success_rates[tool_name]
            # Exponential moving average of the per-tool success rate.
            alpha = 0.1
            if result.success:
                self.tool_success_rates[tool_name] = current_rate * (1 - alpha) + alpha
            else:
                self.tool_success_rates[tool_name] = current_rate * (1 - alpha)
            return result
        except Exception as e:
            return ToolResult(False, None, time.time() - start_time, str(e))

    def train_on_experience(self, experiences: List[Dict]) -> float:
        """REINFORCE update of the selection policy; returns the loss.

        Batches smaller than 5 are skipped (returns 0.0) — callers must
        supply at least 5 experiences for training to occur.
        """
        if len(experiences) < 5:
            return 0.0
        # Prepare training data
        states = []
        actions = []
        rewards = []
        for exp in experiences:
            states.append(self.encode_task(exp['task']))
            actions.append(exp['tool_action'])
            rewards.append(exp['reward'])
        states = torch.cat(states)
        actions = torch.tensor(actions)
        rewards = torch.tensor(rewards, dtype=torch.float32)
        # Policy gradient update
        tool_logits, _ = self.policy(states)
        log_probs = torch.log_softmax(tool_logits, dim=-1)
        selected_log_probs = log_probs.gather(1, actions.unsqueeze(1)).squeeze()
        # REINFORCE loss: maximize reward-weighted log-likelihood.
        loss = -(selected_log_probs * rewards).mean()
        # Update policy
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        return loss.item()

    def get_tool_analytics(self) -> Dict:
        """Return usage counts, success rates, and the current best tool."""
        return {
            'success_rates': self.tool_success_rates.copy(),
            'usage_counts': self.tool_usage_counts.copy(),
            'total_uses': sum(self.tool_usage_counts.values()),
            'best_tool': max(self.tool_success_rates.items(), key=lambda x: x[1])[0]
        }
# Demo RL tool agent (reuses the `tasks` list from the rule-based demo).
rl_agent = RLToolAgent()
print("\n=== RL-Optimized Tool Selection ===")
for task in tasks:
    tool_sequence = rl_agent.select_tool_sequence(task)
    print(f"\nTask: {task}")
    if not tool_sequence:
        print("No tools selected - direct response")
    else:
        print(f"Tool sequence ({len(tool_sequence)} steps):")
        for tool_step in tool_sequence:
            print(f" Step {tool_step['step']}: {tool_step['tool']} "
                  f"(confidence: {tool_step['confidence']:.3f}, "
                  f"success: {tool_step['success']})")
# Show analytics
analytics = rl_agent.get_tool_analytics()
print(f"\nTool Analytics:")
print(f"Best performing tool: {analytics['best_tool']}")
print(f"Success rates: {analytics['success_rates']}")


class ToolChainAgent:
    """Advanced tool chaining with error recovery"""

    def __init__(self):
        # Delegates tool selection/execution to the RL agent.
        self.rl_agent = RLToolAgent()
        self.max_retries = 2  # maximum number of full execution attempts
        # Strategy name -> bound method implementing that composition style.
        self.composition_strategies = {
            'sequential': self.sequential_composition,
            'parallel': self.parallel_composition,
            'conditional': self.conditional_composition
        }

    def execute_with_recovery(self, task: str) -> Dict:
        """Execute task with automatic error recovery"""
        attempts = 0
        results_history = []
        while attempts < self.max_retries:
            attempt_result = self.attempt_execution(task, results_history)
            results_history.append(attempt_result)
            if attempt_result['success']:
                return {
                    'final_result': attempt_result,
                    'attempts': attempts + 1,
                    'recovery_used': attempts > 0,  # True when a retry succeeded
                    'history': results_history
                }
            attempts += 1
        # All attempts exhausted without success.
        return {
            'final_result': results_history[-1],
            'attempts': attempts,
            'recovery_failed': True,
            'history': results_history
        }

    def attempt_execution(self, task: str, previous_attempts: List) -> Dict:
        """Single execution attempt with learning from failures"""
        # Analyze previous failures
        failure_analysis = self.analyze_failures(previous_attempts)
        # Adjust strategy based on failures
        strategy = self.select_composition_strategy(task, failure_analysis)
        # Execute with selected strategy
        return strategy(task, failure_analysis)

    def analyze_failures(self, previous_attempts: List) -> Dict:
        """Analyze why previous attempts failed"""
        if not previous_attempts:
            return {'failed_tools': [], 'error_types': [], 'patterns': []}
        failed_tools = []
        error_types = []
        for attempt in previous_attempts:
            if not attempt.get('success', False):
                # Only sequential-strategy results carry a per-step sequence.
                if 'tool_sequence' in attempt:
                    for step in attempt['tool_sequence']:
                        if not step.get('success', True):
                            failed_tools.append(step['tool'])
                            # step['result'] is a ToolResult; collect its error text.
                            if 'result' in step and hasattr(step['result'], 'error_message'):
                                error_types.append(step['result'].error_message)
        return {
            'failed_tools': failed_tools,
            'error_types': error_types,
            'patterns': self.detect_failure_patterns(failed_tools, error_types)
        }

    def detect_failure_patterns(self, failed_tools: List, error_types: List) -> List:
        """Detect patterns in tool failures"""
        patterns = []
        # Tool-specific patterns: a tool failing more than once is suspect.
        if failed_tools.count('calculator') > 1:
            patterns.append('calculator_unreliable')
        if failed_tools.count('web_search') > 1:
            patterns.append('search_failing')
        # Error-type patterns (substring match on the error messages).
        if any('timeout' in str(error).lower() for error in error_types):
            patterns.append('timeout_issues')
        if any('parse' in str(error).lower() for error in error_types):
            patterns.append('parsing_errors')
        return patterns

    def select_composition_strategy(self, task: str, failure_analysis: Dict) -> callable:
        """Select composition strategy based on task and failure history"""
        # Default to sequential
        strategy_name = 'sequential'
        # If previous tools failed, try parallel execution
        if failure_analysis['failed_tools']:
            strategy_name = 'parallel'
        # If timeouts occurred, use conditional execution
        if 'timeout_issues' in failure_analysis['patterns']:
            strategy_name = 'conditional'
        return self.composition_strategies[strategy_name]

    def sequential_composition(self, task: str, failure_analysis: Dict) -> Dict:
        """Execute tools in sequence"""
        tool_sequence = self.rl_agent.select_tool_sequence(task)
        for i, tool_step in enumerate(tool_sequence):
            if not tool_step['success']:
                # Abort on the first failed step and report where it broke.
                return {
                    'success': False,
                    'strategy': 'sequential',
                    'failed_at_step': i,
                    'tool_sequence': tool_sequence
                }
        return {
            'success': True,
            'strategy': 'sequential',
            'tool_sequence': tool_sequence,
            'final_result': tool_sequence[-1]['result'] if tool_sequence else None
        }

    def parallel_composition(self, task: str, failure_analysis: Dict) -> Dict:
        """Execute multiple tools in parallel and combine results"""
        # Simplified parallel execution (in practice: use threading/asyncio)
        potential_tools = ['calculator', 'web_search', 'file_reader']
        # Avoid previously failed tools
        available_tools = [t for t in potential_tools
                           if t not in failure_analysis.get('failed_tools', [])]
        if not available_tools:
            available_tools = potential_tools  # Fallback if all failed
        results = []
        for tool in available_tools[:2]:  # Use up to 2 tools in parallel
            result = self.rl_agent.execute_tool(tool, task)
            results.append({
                'tool': tool,
                'result': result,
                'success': result.success
            })
        # Use best result
        successful_results = [r for r in results if r['success']]
        if successful_results:
            # Heuristic: prefer the result with the longest string payload.
            best_result = max(successful_results,
                              key=lambda x: len(str(x['result'].result)) if x['result'].result else 0)
            return {
                'success': True,
                'strategy': 'parallel',
                'all_results': results,
                'selected_result': best_result
            }
        else:
            return {
                'success': False,
                'strategy': 'parallel',
                'all_results': results
            }

    def conditional_composition(self, task: str, failure_analysis: Dict) -> Dict:
        """Execute tools conditionally based on intermediate results"""
        # Start with most reliable tool (highest running success rate).
        analytics = self.rl_agent.get_tool_analytics()
        best_tool = analytics['best_tool']
        first_result = self.rl_agent.execute_tool(best_tool, task)
        if first_result.success:
            return {
                'success': True,
                'strategy': 'conditional',
                'single_tool_success': True,
                'tool_used': best_tool,
                'result': first_result
            }
        else:
            # Try backup strategy
            backup_tools = [t for t in self.rl_agent.tools if t != best_tool]
            for backup_tool in backup_tools[:1]:  # Try one backup
                backup_result = self.rl_agent.execute_tool(backup_tool, task)
                if backup_result.success:
                    return {
                        'success': True,
                        'strategy': 'conditional',
                        'backup_success': True,
                        'primary_tool': best_tool,
                        'backup_tool': backup_tool,
                        'result': backup_result
                    }
            # Both primary and backup failed.
            return {
                'success': False,
                'strategy': 'conditional',
                'primary_failed': True,
                'backup_failed': True
            }
# Demo tool chaining with recovery
chain_agent = ToolChainAgent()
print("\n=== Tool Chaining with Error Recovery ===")
# Multi-step tasks that plausibly need more than one tool.
complex_tasks = [
    "Calculate the compound interest and then search for current bank rates",
    "Read the file and execute any Python code found in it",
    "Search for Python tutorials and calculate learning time estimate"
]
for task in complex_tasks:
    result = chain_agent.execute_with_recovery(task)
    print(f"\nTask: {task}")
    print(f"Success: {result['final_result']['success']}")
    print(f"Strategy: {result['final_result'].get('strategy', 'unknown')}")
    print(f"Attempts: {result['attempts']}")
    if result.get('recovery_used'):
        print("Error recovery was used")

Tool Use Evolution Path:
Stage 1: Rule-Based Selection
┌─────────────┐ ┌─────────────┐ ┌─────────────┐
│ Task │───►│ Keyword │───►│ Single │
│ Input │ │ Matching │ │ Tool │
└─────────────┘ └─────────────┘ └─────────────┘
│
Fixed Heuristics
Stage 2: RL-Optimized Selection
┌─────────────┐ ┌─────────────┐ ┌─────────────┐
│ Task │───►│ Selection │───►│ Dynamic │
│ + Context │ │ Policy │ │ Tool │
└─────────────┘ │ π(t|s) │ │ Choice │
└─────────────┘ └─────────────┘
│
RL Training Loop
Stage 3: Tool Composition + Recovery
┌─────────────┐ ┌─────────────┐ ┌─────────────┐
│ Task │───►│ Composition │───►│ Tool │
│ + History │ │ Policy │ │ Sequence │
└─────────────┘ │ π(seq|s,h) │ │ + Recovery │
└─────────────┘ └─────────────┘
│ │
┌─────────────┐ │
│ Failure │◄─────────┘
│ Analysis │
└─────────────┘
Tool Selection Architecture:
┌─────────────────────────┐
│ Task Encoding │
│ (LSTM/Transformer) │
└─────────────────────────┘
│
┌─────────┼─────────┐
│ │ │
▼ ▼ ▼
┌─────────────┐ ┌─────────┐ ┌─────────────┐
│Tool Select │ │Compose │ │ Error │
│ Head │ │ Head │ │Recovery Head│
│ π(tool|s) │ │π(seq|s) │ │ π(retry|h) │
└─────────────┘ └─────────┘ └─────────────┘
def train_tool_policies():
    """Complete training pipeline for tool use policies.

    Runs a small REINFORCE loop: for each labelled task the agent rolls
    out a tool sequence, a scalar reward is computed against the
    ground-truth optimal tool, and the selection policy is updated from
    the batch of experiences.
    """
    agent = RLToolAgent()
    # Training tasks with ground truth labels.
    # NOTE: at least five examples are required — train_on_experience
    # skips batches smaller than 5, so the original four-task set never
    # actually updated the policy (loss was always 0.0).
    training_data = [
        {'task': 'Calculate 25 * 47', 'optimal_tool': 'calculator', 'reward': 10},
        {'task': 'Find recent news about AI', 'optimal_tool': 'web_search', 'reward': 8},
        {'task': 'Load configuration from config.json', 'optimal_tool': 'file_reader', 'reward': 7},
        {'task': 'Run unit tests', 'optimal_tool': 'code_executor', 'reward': 9},
        {'task': 'Compute the average of 3, 8 and 13', 'optimal_tool': 'calculator', 'reward': 9},
    ]
    print("=== Tool Policy Training ===")
    # Training loop
    for epoch in range(50):
        experiences = []
        for data_point in training_data:
            # Execute task and collect experience
            tool_sequence = agent.select_tool_sequence(data_point['task'])
            # Compute reward based on tool selection quality
            reward = compute_tool_reward(tool_sequence, data_point)
            # Store experience (action = index of the labelled optimal tool)
            experience = {
                'task': data_point['task'],
                'tool_action': agent.tools.index(data_point['optimal_tool']),
                'reward': reward
            }
            experiences.append(experience)
        # Train on collected experiences
        loss = agent.train_on_experience(experiences)
        if epoch % 10 == 0:
            print(f"Epoch {epoch}: Loss = {loss:.4f}")
            analytics = agent.get_tool_analytics()
            print(f"  Best tool: {analytics['best_tool']}")
    print("Tool policy training complete")
def compute_tool_reward(tool_sequence: List, ground_truth: Dict) -> float:
    """Score a rolled-out tool sequence against a labelled task.

    Scoring: 1.0 baseline; +5.0 when the first tool matches the
    ground-truth optimal tool; +3.0 when every step succeeded; -1.0 when
    more than two tools were used. The result is clamped at zero.
    """
    if not tool_sequence:
        return 1.0  # Baseline for no tool use

    first_tool_correct = tool_sequence[0]['tool'] == ground_truth['optimal_tool']
    every_step_ok = all(step['success'] for step in tool_sequence)
    too_many_tools = len(tool_sequence) > 2

    score = 1.0
    if first_tool_correct:
        score += 5.0
    if every_step_ok:
        score += 3.0
    if too_many_tools:
        score -= 1.0
    return max(score, 0.0)
# Demonstrate training
print("Starting tool policy training...")
# train_tool_policies()  # Commented out to keep the demo fast
print("Training complete")

| Aspect | Rule-Based Tools | RL-Optimized Tools |
|---|---|---|
| Selection | Keyword matching | Learned context policies |
| Composition | No chaining | Learned sequences |
| Error Handling | Retry same tool | Strategy adaptation |
| Optimization | Manual rule tuning | Automatic from feedback |
| Context Awareness | Limited keywords | Full task understanding |
| Adaptation | Static rules | Continuous learning |
# Exercise 1: Build domain-specific tool agent
def exercise_domain_tools():
    """Create tool agent for specific domain (e.g., data science)"""
    pass  # TODO: implement as an exercise


# Exercise 2: Implement parallel tool execution
def exercise_parallel_tools():
    """Add async/parallel tool execution capabilities"""
    pass  # TODO: implement as an exercise


# Exercise 3: Design tool cost optimization
def exercise_tool_costs():
    """Optimize tool usage under budget/latency constraints"""
    pass  # TODO: implement as an exercise

- Survey Reference: Section 3.2, arXiv:2509.02547
- ReAct Paper: Reasoning and Acting with Language Models
- Toolformer Paper: Teaching Language Models to Use Tools
- ToolLLM Paper: Facilitating LLMs to Master 16000+ Real-world APIs
- Function Calling Guide: OpenAI Function Calling Documentation
- 3.3 Memory: Learn RL-controlled memory systems and retrieval policies
- Integration Practice: Combine tool use with planning from previous module
- Advanced Topics: Study tool composition optimization and API integration patterns
Tool use becomes intelligent when agents learn not just which tools to use, but when, how to chain them, and how to recover from failures. RL transforms static function calling into adaptive, outcome-driven tool orchestration.