Skip to content

Latest commit

 

History

History
781 lines (627 loc) · 29.6 KB

File metadata and controls

781 lines (627 loc) · 29.6 KB

3.2 Tool Use

Tool use evolves from static ReAct-style prompting and supervised datasets to Tool-Integrated Reasoning (TIR) where RL teaches agents when to invoke tools, how to compose them effectively, and how to recover from failures. This outcome-driven approach moves beyond imitation to discover optimal tool strategies through environmental feedback.

Key Takeaways

  • Evolution Path: ReAct prompting → SFT datasets → RL-optimized tool policies
  • Dynamic Selection: Agents learn tool choice based on task context rather than fixed rules
  • Composition Learning: RL discovers tool chaining and error recovery strategies
  • Credit Assignment: Long-horizon TIR enables temporal credit across tool sequences

Prerequisites Check

# Verify tool integration libraries (each command should print its "ready" line)
python -c "import requests, json; print('HTTP tools ready')"
python -c "import subprocess; print('System tools ready')" 
python -c "import torch, transformers; print('ML integration ready')"

# Conceptual check — answer these for yourself before continuing
echo "Do you understand function calling and API interfaces?"
echo "Are you familiar with the ReAct (Reasoning + Acting) framework?"
echo "Have you completed Module 3.1 (Planning)?"

Hands-On: Tool Use Evolution

Traditional Rule-Based Tool Selection

import json
import random
import time
from typing import Dict, List, Any, Optional
from dataclasses import dataclass

@dataclass
class ToolResult:
    """Outcome of a single (simulated) tool invocation."""
    success: bool  # True when the tool produced a usable result
    result: Any  # tool payload (string, list, ...); None on failure
    execution_time: float  # wall-clock seconds the tool took (simulated here)
    error_message: Optional[str] = None  # human-readable failure reason, if any

class RuleBasedToolAgent:
    """Traditional rule-based tool selection"""
    
    def __init__(self):
        self.tools = {
            'calculator': self.calculator_tool,
            'web_search': self.web_search_tool,
            'file_reader': self.file_reader_tool,
            'code_executor': self.code_executor_tool
        }
        
        # Fixed rules for tool selection
        self.selection_rules = {
            'math_keywords': ['calculate', 'compute', '+', '-', '*', '/', '%', 'equation'],
            'search_keywords': ['search', 'find', 'lookup', 'research', 'what is'],
            'file_keywords': ['read', 'file', 'document', 'load', 'open'],
            'code_keywords': ['run', 'execute', 'python', 'code', 'script']
        }
    
    def select_tool(self, task: str) -> Optional[str]:
        """Rule-based tool selection"""
        task_lower = task.lower()
        
        # Check each rule category
        for tool_type, keywords in [
            ('calculator', self.selection_rules['math_keywords']),
            ('web_search', self.selection_rules['search_keywords']),
            ('file_reader', self.selection_rules['file_keywords']),
            ('code_executor', self.selection_rules['code_keywords'])
        ]:
            if any(keyword in task_lower for keyword in keywords):
                return tool_type
        
        return None  # No tool needed
    
    def execute_task(self, task: str) -> Dict[str, Any]:
        """Execute task using rule-based tool selection"""
        start_time = time.time()
        
        # Select tool based on rules
        selected_tool = self.select_tool(task)
        
        if not selected_tool:
            return {
                'action': 'direct_response',
                'tool_used': None,
                'result': 'Responding directly without tools',
                'execution_time': time.time() - start_time,
                'success': True
            }
        
        # Execute selected tool
        tool_function = self.tools[selected_tool]
        tool_result = tool_function(task)
        
        return {
            'action': 'tool_use',
            'tool_used': selected_tool,
            'result': tool_result.result,
            'success': tool_result.success,
            'execution_time': time.time() - start_time,
            'error': tool_result.error_message
        }
    
    # Simplified tool implementations
    def calculator_tool(self, query: str) -> ToolResult:
        """Basic calculator tool"""
        try:
            # Extract mathematical expression (simplified)
            import re
            numbers = re.findall(r'\d+\.?\d*', query)
            if len(numbers) >= 2:
                result = float(numbers[0]) + float(numbers[1])  # Simplified: just add
                return ToolResult(True, f"Result: {result}", 0.1)
            else:
                return ToolResult(False, None, 0.1, "Could not parse mathematical expression")
        except Exception as e:
            return ToolResult(False, None, 0.1, str(e))
    
    def web_search_tool(self, query: str) -> ToolResult:
        """Simulated web search"""
        time.sleep(0.2)  # Simulate network delay
        mock_results = [
            f"Search result 1 for '{query[:20]}...'",
            f"Search result 2 for '{query[:20]}...'",
            f"Search result 3 for '{query[:20]}...'"
        ]
        return ToolResult(True, mock_results, 0.2)
    
    def file_reader_tool(self, query: str) -> ToolResult:
        """Simulated file reading"""
        return ToolResult(True, f"File content for query: {query}", 0.05)
    
    def code_executor_tool(self, query: str) -> ToolResult:
        """Simulated code execution"""
        return ToolResult(True, "Code executed successfully", 0.3)

# Demo rule-based tool selection
rule_agent = RuleBasedToolAgent()

# NOTE: `tasks` is reused by the RL demo further down — keep the name.
tasks = [
    "Calculate 15 + 27",
    "Search for information about machine learning",
    "Read the contents of data.txt",
    "Execute this Python code: print('hello')",
    "Explain the concept of gravity"
]

print("=== Rule-Based Tool Selection ===")
for demo_task in tasks:
    outcome = rule_agent.execute_task(demo_task)
    print(f"\nTask: {demo_task}")
    print(f"Action: {outcome['action']}")
    if outcome['tool_used']:
        print(f"Tool: {outcome['tool_used']}")
        print(f"Success: {outcome['success']}")
        print(f"Time: {outcome['execution_time']:.3f}s")

RL-Optimized Tool Integration

import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from collections import deque

class ToolSelectionPolicy(nn.Module):
    """Neural policy for tool selection and composition.

    Produces per-tool logits (plus a "no tool" action) from an encoded
    task, and optionally continue/terminate/retry logits when a
    tool-history vector is supplied.
    """

    def __init__(self, vocab_size: int, num_tools: int, hidden_dim: int = 128):
        super().__init__()

        # Task encoder: embedding followed by an LSTM. The Sequential
        # passes the LSTM's (output, (h, c)) tuple through unchanged.
        self.text_encoder = nn.Sequential(
            nn.Embedding(vocab_size, hidden_dim),
            nn.LSTM(hidden_dim, hidden_dim, batch_first=True),
        )

        # One logit per tool, plus one extra for the "no tool" option.
        self.tool_selector = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, num_tools + 1)
        )

        # Continue / terminate / retry decision over the concatenation of
        # task features and a tool-result encoding.
        self.composition_head = nn.Sequential(
            nn.Linear(hidden_dim * 2, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 3)
        )

    def forward(self, task_embedding, tool_history=None):
        """Return (tool_logits, composition_logits-or-None)."""
        _, (final_hidden, _) = self.text_encoder(task_embedding)
        features = final_hidden[-1]  # final hidden state of the last LSTM layer

        logits = self.tool_selector(features)

        if tool_history is None:
            return logits, None

        joint = torch.cat([features, tool_history], dim=-1)
        return logits, self.composition_head(joint)

class RLToolAgent:
    """RL-optimized tool selection and composition agent"""
    
    def __init__(self, vocab_size: int = 1000, num_tools: int = 4):
        self.policy = ToolSelectionPolicy(vocab_size, num_tools)
        self.optimizer = optim.Adam(self.policy.parameters(), lr=0.001)
        
        self.tools = ['calculator', 'web_search', 'file_reader', 'code_executor']
        self.tool_implementations = RuleBasedToolAgent().tools
        
        # Experience buffer for training
        self.experience_buffer = deque(maxlen=1000)
        
        # Tool usage statistics for learning
        self.tool_success_rates = {tool: 0.5 for tool in self.tools}
        self.tool_usage_counts = {tool: 0 for tool in self.tools}
        
    def encode_task(self, task: str) -> torch.Tensor:
        """Simple task encoding (in practice: use proper tokenization)"""
        # Simplified encoding - hash to vocab indices
        words = task.lower().split()
        indices = [abs(hash(word)) % 1000 for word in words[:10]]  # Max 10 words
        
        # Pad to consistent length
        while len(indices) < 10:
            indices.append(0)
            
        return torch.tensor(indices).unsqueeze(0)  # Batch dimension
    
    def select_tool_sequence(self, task: str, max_tools: int = 3) -> List[Dict]:
        """RL-based tool selection and composition"""
        task_embedding = self.encode_task(task)
        tool_sequence = []
        
        current_state = task_embedding
        tool_history = torch.zeros(128)  # Initialize tool history
        
        for step in range(max_tools):
            with torch.no_grad():
                tool_logits, composition_logits = self.policy(current_state, tool_history)
                
                # Sample tool selection
                tool_probs = torch.softmax(tool_logits, dim=-1)
                tool_idx = torch.multinomial(tool_probs, 1).item()
                
                if tool_idx == len(self.tools):  # "No tool" option
                    break
                    
                selected_tool = self.tools[tool_idx]
                tool_confidence = float(tool_probs[0, tool_idx].item())
                
                # Execute tool
                tool_result = self.execute_tool(selected_tool, task)
                
                tool_sequence.append({
                    'step': step,
                    'tool': selected_tool,
                    'confidence': tool_confidence,
                    'result': tool_result,
                    'success': tool_result.success
                })
                
                # Update tool history for next iteration
                tool_history = torch.randn(128)  # Simplified tool result encoding
                
                # Composition decision - should we continue?
                if composition_logits is not None:
                    comp_probs = torch.softmax(composition_logits, dim=-1)
                    continue_prob = float(comp_probs[0, 0].item())
                    
                    if continue_prob < 0.4 or not tool_result.success:
                        break
        
        return tool_sequence
    
    def execute_tool(self, tool_name: str, task: str) -> ToolResult:
        """Execute tool and update statistics"""
        start_time = time.time()
        
        try:
            tool_func = self.tool_implementations[tool_name]
            result = tool_func(task)
            
            # Update success rate statistics
            self.tool_usage_counts[tool_name] += 1
            current_rate = self.tool_success_rates[tool_name]
            
            # Exponential moving average
            alpha = 0.1
            if result.success:
                self.tool_success_rates[tool_name] = current_rate * (1 - alpha) + alpha
            else:
                self.tool_success_rates[tool_name] = current_rate * (1 - alpha)
                
            return result
            
        except Exception as e:
            return ToolResult(False, None, time.time() - start_time, str(e))
    
    def train_on_experience(self, experiences: List[Dict]) -> float:
        """Train tool selection policy using RL"""
        if len(experiences) < 5:
            return 0.0
            
        # Prepare training data
        states = []
        actions = []
        rewards = []
        
        for exp in experiences:
            states.append(self.encode_task(exp['task']))
            actions.append(exp['tool_action'])
            rewards.append(exp['reward'])
        
        states = torch.cat(states)
        actions = torch.tensor(actions)
        rewards = torch.tensor(rewards, dtype=torch.float32)
        
        # Policy gradient update
        tool_logits, _ = self.policy(states)
        log_probs = torch.log_softmax(tool_logits, dim=-1)
        selected_log_probs = log_probs.gather(1, actions.unsqueeze(1)).squeeze()
        
        # REINFORCE loss
        loss = -(selected_log_probs * rewards).mean()
        
        # Update policy
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        
        return loss.item()
    
    def get_tool_analytics(self) -> Dict:
        """Get tool usage analytics"""
        return {
            'success_rates': self.tool_success_rates.copy(),
            'usage_counts': self.tool_usage_counts.copy(),
            'total_uses': sum(self.tool_usage_counts.values()),
            'best_tool': max(self.tool_success_rates.items(), key=lambda x: x[1])[0]
        }

# Demo RL tool agent
rl_agent = RLToolAgent()

print("\n=== RL-Optimized Tool Selection ===")
for demo_task in tasks:
    chain = rl_agent.select_tool_sequence(demo_task)
    print(f"\nTask: {demo_task}")

    if chain:
        print(f"Tool sequence ({len(chain)} steps):")
        for entry in chain:
            print(f"  Step {entry['step']}: {entry['tool']} "
                  f"(confidence: {entry['confidence']:.3f}, "
                  f"success: {entry['success']})")
    else:
        print("No tools selected - direct response")

# Show analytics
analytics = rl_agent.get_tool_analytics()
print("\nTool Analytics:")
print(f"Best performing tool: {analytics['best_tool']}")
print(f"Success rates: {analytics['success_rates']}")

Tool Composition and Error Recovery

Advanced Tool Chaining

class ToolChainAgent:
    """Advanced tool chaining with error recovery.

    Wraps an RLToolAgent and retries failed tasks using alternative
    composition strategies (sequential, parallel, conditional) chosen
    from an analysis of earlier failures.
    """
    
    def __init__(self):
        self.rl_agent = RLToolAgent()
        # Maximum number of execution attempts before giving up.
        self.max_retries = 2
        # Strategy registry; select_composition_strategy picks one by name.
        self.composition_strategies = {
            'sequential': self.sequential_composition,
            'parallel': self.parallel_composition,
            'conditional': self.conditional_composition
        }
    
    def execute_with_recovery(self, task: str) -> Dict:
        """Execute task with automatic error recovery.

        Returns a dict with 'final_result', 'attempts', 'history', and
        either 'recovery_used' (on success) or 'recovery_failed' (when
        all attempts are exhausted).
        """
        attempts = 0
        results_history = []
        
        while attempts < self.max_retries:
            attempt_result = self.attempt_execution(task, results_history)
            results_history.append(attempt_result)
            
            if attempt_result['success']:
                return {
                    'final_result': attempt_result,
                    'attempts': attempts + 1,
                    'recovery_used': attempts > 0,  # True only if a retry succeeded
                    'history': results_history
                }
            
            attempts += 1
            
        # Every attempt failed; surface the last attempt as the final result.
        return {
            'final_result': results_history[-1],
            'attempts': attempts,
            'recovery_failed': True,
            'history': results_history
        }
    
    def attempt_execution(self, task: str, previous_attempts: List) -> Dict:
        """Single execution attempt with learning from failures."""
        
        # Analyze previous failures
        failure_analysis = self.analyze_failures(previous_attempts)
        
        # Adjust strategy based on failures
        strategy = self.select_composition_strategy(task, failure_analysis)
        
        # Execute with selected strategy
        return strategy(task, failure_analysis)
    
    def analyze_failures(self, previous_attempts: List) -> Dict:
        """Analyze why previous attempts failed.

        Returns {'failed_tools': [...], 'error_types': [...],
        'patterns': [...]} gathered from failed steps of earlier attempts.
        """
        if not previous_attempts:
            return {'failed_tools': [], 'error_types': [], 'patterns': []}
        
        failed_tools = []
        error_types = []
        
        for attempt in previous_attempts:
            if not attempt.get('success', False):
                # Only sequential-strategy results carry a 'tool_sequence'.
                if 'tool_sequence' in attempt:
                    for step in attempt['tool_sequence']:
                        # Default True: only explicitly failed steps count.
                        if not step.get('success', True):
                            failed_tools.append(step['tool'])
                            if 'result' in step and hasattr(step['result'], 'error_message'):
                                error_types.append(step['result'].error_message)
        
        return {
            'failed_tools': failed_tools,
            'error_types': error_types,
            'patterns': self.detect_failure_patterns(failed_tools, error_types)
        }
    
    def detect_failure_patterns(self, failed_tools: List, error_types: List) -> List:
        """Detect patterns in tool failures (repeat offenders, error themes)."""
        patterns = []
        
        # Tool-specific patterns: the same tool failing more than once.
        if failed_tools.count('calculator') > 1:
            patterns.append('calculator_unreliable')
        if failed_tools.count('web_search') > 1:
            patterns.append('search_failing')
            
        # Error-type patterns, matched by substring in error messages.
        if any('timeout' in str(error).lower() for error in error_types):
            patterns.append('timeout_issues')
        if any('parse' in str(error).lower() for error in error_types):
            patterns.append('parsing_errors')
            
        return patterns
    
    def select_composition_strategy(self, task: str, failure_analysis: Dict) -> callable:
        """Select composition strategy based on task and failure history."""
        
        # Default to sequential
        strategy_name = 'sequential'
        
        # If previous tools failed, try parallel execution
        if failure_analysis['failed_tools']:
            strategy_name = 'parallel'
            
        # If timeouts occurred, use conditional execution (overrides parallel)
        if 'timeout_issues' in failure_analysis['patterns']:
            strategy_name = 'conditional'
            
        return self.composition_strategies[strategy_name]
    
    def sequential_composition(self, task: str, failure_analysis: Dict) -> Dict:
        """Execute tools in sequence; fail fast at the first failed step."""
        tool_sequence = self.rl_agent.select_tool_sequence(task)
        
        for i, tool_step in enumerate(tool_sequence):
            if not tool_step['success']:
                return {
                    'success': False,
                    'strategy': 'sequential',
                    'failed_at_step': i,
                    'tool_sequence': tool_sequence
                }
        
        # All steps succeeded — or no tool was needed at all (empty sequence).
        return {
            'success': True,
            'strategy': 'sequential', 
            'tool_sequence': tool_sequence,
            'final_result': tool_sequence[-1]['result'] if tool_sequence else None
        }
    
    def parallel_composition(self, task: str, failure_analysis: Dict) -> Dict:
        """Execute multiple tools in parallel and combine results"""
        # Simplified parallel execution (in practice: use threading/asyncio)
        
        potential_tools = ['calculator', 'web_search', 'file_reader']
        
        # Avoid previously failed tools
        available_tools = [t for t in potential_tools 
                          if t not in failure_analysis.get('failed_tools', [])]
        
        if not available_tools:
            available_tools = potential_tools  # Fallback if all failed
        
        results = []
        for tool in available_tools[:2]:  # Use up to 2 tools in parallel
            result = self.rl_agent.execute_tool(tool, task)
            results.append({
                'tool': tool,
                'result': result,
                'success': result.success
            })
        
        # Use best result — "best" here is simply the longest string payload.
        successful_results = [r for r in results if r['success']]
        if successful_results:
            best_result = max(successful_results, 
                            key=lambda x: len(str(x['result'].result)) if x['result'].result else 0)
            return {
                'success': True,
                'strategy': 'parallel',
                'all_results': results,
                'selected_result': best_result
            }
        else:
            return {
                'success': False,
                'strategy': 'parallel',
                'all_results': results
            }
    
    def conditional_composition(self, task: str, failure_analysis: Dict) -> Dict:
        """Execute tools conditionally based on intermediate results.

        Tries the historically most reliable tool first, then falls back
        to a single backup tool if the primary fails.
        """
        
        # Start with most reliable tool
        analytics = self.rl_agent.get_tool_analytics()
        best_tool = analytics['best_tool']
        
        first_result = self.rl_agent.execute_tool(best_tool, task)
        
        if first_result.success:
            return {
                'success': True,
                'strategy': 'conditional',
                'single_tool_success': True,
                'tool_used': best_tool,
                'result': first_result
            }
        else:
            # Try backup strategy
            backup_tools = [t for t in self.rl_agent.tools if t != best_tool]
            for backup_tool in backup_tools[:1]:  # Try one backup
                backup_result = self.rl_agent.execute_tool(backup_tool, task)
                if backup_result.success:
                    return {
                        'success': True,
                        'strategy': 'conditional',
                        'backup_success': True,
                        'primary_tool': best_tool,
                        'backup_tool': backup_tool,
                        'result': backup_result
                    }
            
            return {
                'success': False,
                'strategy': 'conditional',
                'primary_failed': True,
                'backup_failed': True
            }

# Demo tool chaining with recovery
chain_agent = ToolChainAgent()

print("\n=== Tool Chaining with Error Recovery ===")
complex_tasks = [
    "Calculate the compound interest and then search for current bank rates",
    "Read the file and execute any Python code found in it",
    "Search for Python tutorials and calculate learning time estimate"
]

for hard_task in complex_tasks:
    report = chain_agent.execute_with_recovery(hard_task)
    final = report['final_result']
    print(f"\nTask: {hard_task}")
    print(f"Success: {final['success']}")
    print(f"Strategy: {final.get('strategy', 'unknown')}")
    print(f"Attempts: {report['attempts']}")
    if report.get('recovery_used'):
        print("Error recovery was used")

ASCII Diagram: Tool Use Evolution

Tool Use Evolution Path:

Stage 1: Rule-Based Selection
┌─────────────┐    ┌─────────────┐    ┌─────────────┐
│    Task     │───►│   Keyword   │───►│   Single    │
│   Input     │    │   Matching  │    │   Tool      │
└─────────────┘    └─────────────┘    └─────────────┘
                          │
                   Fixed Heuristics

Stage 2: RL-Optimized Selection  
┌─────────────┐    ┌─────────────┐    ┌─────────────┐
│    Task     │───►│  Selection  │───►│  Dynamic    │
│  + Context  │    │   Policy    │    │   Tool      │
└─────────────┘    │   π(t|s)    │    │  Choice     │
                   └─────────────┘    └─────────────┘
                          │
                    RL Training Loop

Stage 3: Tool Composition + Recovery
┌─────────────┐    ┌─────────────┐    ┌─────────────┐
│    Task     │───►│ Composition │───►│   Tool      │
│  + History  │    │   Policy    │    │ Sequence    │
└─────────────┘    │ π(seq|s,h)  │    │ + Recovery  │
                   └─────────────┘    └─────────────┘
                          │                  │
                   ┌─────────────┐          │
                   │ Failure     │◄─────────┘
                   │ Analysis    │
                   └─────────────┘

Tool Selection Architecture:
       ┌─────────────────────────┐
       │    Task Encoding        │
       │   (LSTM/Transformer)    │
       └─────────────────────────┘
                   │
         ┌─────────┼─────────┐
         │         │         │
         ▼         ▼         ▼
┌─────────────┐ ┌─────────┐ ┌─────────────┐
│Tool Select  │ │Compose  │ │   Error     │
│   Head      │ │ Head    │ │Recovery Head│
│ π(tool|s)   │ │π(seq|s) │ │ π(retry|h)  │
└─────────────┘ └─────────┘ └─────────────┘

Training Tool Policies

Tool Use RL Training

def train_tool_policies():
    """Complete training pipeline for tool use policies.

    Runs 50 epochs of reward-weighted updates on a tiny labeled task
    set, printing the loss and current best tool every 10 epochs.
    """
    agent = RLToolAgent()
    # (An unused ToolChainAgent was previously constructed here; removed
    # since chaining is never exercised by this training loop.)

    # Training tasks with ground-truth optimal tools and base rewards.
    training_data = [
        {'task': 'Calculate 25 * 47', 'optimal_tool': 'calculator', 'reward': 10},
        {'task': 'Find recent news about AI', 'optimal_tool': 'web_search', 'reward': 8},
        {'task': 'Load configuration from config.json', 'optimal_tool': 'file_reader', 'reward': 7},
        {'task': 'Run unit tests', 'optimal_tool': 'code_executor', 'reward': 9}
    ]

    print("=== Tool Policy Training ===")

    # Training loop
    for epoch in range(50):
        experiences = []

        for data_point in training_data:
            # Roll out the current policy to see what it would do.
            tool_sequence = agent.select_tool_sequence(data_point['task'])

            # Score the rollout against the labeled optimum.
            reward = compute_tool_reward(tool_sequence, data_point)

            # NOTE: the stored action is the *optimal* tool index weighted
            # by the rollout's reward — a teacher-forced approximation
            # rather than pure on-policy REINFORCE.
            experience = {
                'task': data_point['task'],
                'tool_action': agent.tools.index(data_point['optimal_tool']),
                'reward': reward
            }
            experiences.append(experience)

        # Train on collected experiences
        loss = agent.train_on_experience(experiences)

        if epoch % 10 == 0:
            print(f"Epoch {epoch}: Loss = {loss:.4f}")
            analytics = agent.get_tool_analytics()
            print(f"  Best tool: {analytics['best_tool']}")

    print("Tool policy training complete")

def compute_tool_reward(tool_sequence: List, ground_truth: Dict) -> float:
    """Score a tool sequence against the known-optimal tool.

    An empty sequence earns the 1.0 baseline. Otherwise: start at 1.0,
    add 5.0 for opening with the optimal tool, add 3.0 when every step
    succeeded, subtract 1.0 for chains longer than two tools, and clamp
    the result at zero.
    """
    if not tool_sequence:
        return 1.0  # Baseline for no tool use

    score = 1.0
    target = ground_truth['optimal_tool']

    # Opened with the right tool.
    if tool_sequence[0]['tool'] == target:
        score += 5.0

    # Clean end-to-end execution.
    if all(step['success'] for step in tool_sequence):
        score += 3.0

    # Discourage overly long chains.
    if len(tool_sequence) > 2:
        score -= 1.0

    return max(score, 0.0)

# Demonstrate training
print("Starting tool policy training...")
# train_tool_policies()  # Commented out so the demo stays fast; uncomment to actually train
print("Training complete")

Key Differences Summary

| Aspect | Rule-Based Tools | RL-Optimized Tools |
|---|---|---|
| Selection | Keyword matching | Learned context policies |
| Composition | No chaining | Learned sequences |
| Error Handling | Retry same tool | Strategy adaptation |
| Optimization | Manual rule tuning | Automatic from feedback |
| Context Awareness | Limited keywords | Full task understanding |
| Adaptation | Static rules | Continuous learning |

Practical Exercises

# Exercise 1: Build domain-specific tool agent
def exercise_domain_tools():
    """Create tool agent for specific domain (e.g., data science).

    TODO: mirror RuleBasedToolAgent with domain tools (e.g. a dataframe
    loader, a plotter, a model trainer) and matching selection keywords.
    """
    pass

# Exercise 2: Implement parallel tool execution
def exercise_parallel_tools():
    """Add async/parallel tool execution capabilities.

    TODO: replace the serial loop in ToolChainAgent.parallel_composition
    with a thread pool or asyncio gather, preserving the result format.
    """
    pass

# Exercise 3: Design tool cost optimization
def exercise_tool_costs():
    """Optimize tool usage under budget/latency constraints.

    TODO: attach a cost to each tool (latency, API price) and penalize
    it in compute_tool_reward so the policy learns cost-aware selection.
    """
    pass

Resources

Next Steps

  • 3.3 Memory: Learn RL-controlled memory systems and retrieval policies
  • Integration Practice: Combine tool use with planning from previous module
  • Advanced Topics: Study tool composition optimization and API integration patterns

Tool use becomes intelligent when agents learn not just which tools to use, but when, how to chain them, and how to recover from failures. RL transforms static function calling into adaptive, outcome-driven tool orchestration.