-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path utils.py
More file actions
200 lines (167 loc) · 6.85 KB
/
utils.py
File metadata and controls
200 lines (167 loc) · 6.85 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
import os
import requests
from typing import Dict, Any
from langsmith import traceable
from tavily import TavilyClient
from openperplex import OpenperplexSync
from dotenv import load_dotenv
load_dotenv()
def deduplicate_and_format_sources(search_response, max_tokens_per_source, include_raw_content=False):
    """Deduplicate search results by URL and format them into one readable string.

    Takes either a single search response or a list of responses from search
    APIs, flattens them into one list, drops duplicate URLs (first occurrence
    wins), and renders each remaining source as a titled section.

    Args:
        search_response: Either:
            - A dict with a 'results' key containing a list of search results
            - A list of dicts, each containing search results
        max_tokens_per_source (int): Approximate token budget for each
            source's raw_content (estimated at 4 characters per token).
            Only used when include_raw_content is True.
        include_raw_content (bool): Whether to include the raw_content from
            Tavily in the formatted string.

    Returns:
        str: Formatted string with deduplicated sources.

    Raises:
        ValueError: If search_response is neither a dict with 'results' nor a list.
    """
    # Normalize input to a flat list of result dicts.
    if isinstance(search_response, dict):
        sources_list = search_response['results']
    elif isinstance(search_response, list):
        sources_list = []
        for response in search_response:
            if isinstance(response, dict) and 'results' in response:
                sources_list.extend(response['results'])
            else:
                # Assume the element is itself an iterable of result dicts.
                sources_list.extend(response)
    else:
        raise ValueError("Input must be either a dict with 'results' or a list of search results")

    # Deduplicate by URL, keeping the first occurrence of each URL.
    unique_sources = {}
    for source in sources_list:
        if source['url'] not in unique_sources:
            unique_sources[source['url']] = source

    # Rough estimate of 4 characters per token; loop-invariant, so hoisted
    # out of the formatting loop below.
    char_limit = max_tokens_per_source * 4

    # Format output. (The previous enumerate() index was unused and removed.)
    formatted_text = "Sources:\n\n"
    for source in unique_sources.values():
        formatted_text += f"Source {source['title']}:\n===\n"
        formatted_text += f"URL: {source['url']}\n===\n"
        formatted_text += f"Most relevant content from source: {source['content']}\n===\n"
        if include_raw_content:
            # Handle a missing key ('' — silent) vs an explicit None (warn).
            raw_content = source.get('raw_content', '')
            if raw_content is None:
                raw_content = ''
                print(f"Warning: No raw_content found for source {source['url']}")
            if len(raw_content) > char_limit:
                raw_content = raw_content[:char_limit] + "... [truncated]"
            formatted_text += f"Full source content limited to {max_tokens_per_source} tokens: {raw_content}\n\n"
    return formatted_text.strip()
def format_sources(search_results):
    """Render search results as a bullet list of "title : url" lines.

    Args:
        search_results (dict): Search response containing a 'results' list,
            each entry having at least 'title' and 'url' keys.

    Returns:
        str: One "* title : url" bullet per source, newline-separated.
    """
    bullets = []
    for entry in search_results['results']:
        bullets.append(f"* {entry['title']} : {entry['url']}")
    return '\n'.join(bullets)
@traceable
def tavily_search(query, include_raw_content=True, max_results=3):
    """Search the web using the Tavily API.

    Args:
        query (str): The search query to execute
        include_raw_content (bool): Whether to include the raw_content from Tavily in the formatted string
        max_results (int): Maximum number of results to return

    Returns:
        dict: Search response containing:
            - results (list): List of search result dictionaries, each containing:
                - title (str): Title of the search result
                - url (str): URL of the search result
                - content (str): Snippet/summary of the content
                - raw_content (str): Full content of the page if available
    """
    client = TavilyClient()
    response = client.search(
        query,
        max_results=max_results,
        include_raw_content=include_raw_content,
    )
    return response
@traceable
def perplexity_search(query: str,
                      perplexity_search_loop_count: int) -> Dict[str, Any]:
    """Search the web using the Perplexity API.

    Args:
        query (str): The search query to execute
        perplexity_search_loop_count (int): The loop step for perplexity search (starts at 0)

    Returns:
        dict: Search response containing:
            - results (list): List of search result dictionaries, each containing:
                - title (str): Title of the search result
                - url (str): URL of the search result
                - content (str): Snippet/summary of the content
                - raw_content (str): Full content of the page if available

    Raises:
        requests.HTTPError: If the Perplexity API returns a bad status code.
    """
    headers = {
        "accept": "application/json",
        "content-type": "application/json",
        "Authorization": f"Bearer {os.getenv('PERPLEXITY_API_KEY')}"
    }
    payload = {
        "model": "sonar",
        "messages": [
            {
                "role": "system",
                "content": "Search the web and provide factual information with sources."
            },
            {
                "role": "user",
                "content": query
            }
        ]
    }
    response = requests.post(
        "https://api.perplexity.ai/chat/completions",
        headers=headers,
        json=payload
    )
    response.raise_for_status()  # Raise exception for bad status codes

    # Parse the response
    data = response.json()
    content = data["choices"][0]["message"]["content"]

    # Perplexity returns a list of citations for a single search result
    citations = data.get("citations", ["https://perplexity.ai"])

    # Return first citation with full content, others just as references.
    # BUGFIX: the key was previously "link", but this module's consumers
    # (format_sources / deduplicate_and_format_sources) and the docstring
    # above both expect "url" — "link" would raise KeyError downstream.
    results = [{
        "title": f"Perplexity Search {perplexity_search_loop_count + 1}, Source 1",
        "url": citations[0],
        "content": content,
        "raw_content": content
    }]

    # Add additional citations without duplicating content
    for i, citation in enumerate(citations[1:], start=2):
        results.append({
            "title": f"Perplexity Search {perplexity_search_loop_count + 1}, Source {i}",
            "url": citation,
            "content": "See above for full content",
            "raw_content": None
        })

    return {"results": results}
def openperplex_search(query: str,
                       model="gemini-2.0-flash",
                       date_context="2024-08-25",
                       ):
    """Search the web using the OpenPerplex API.

    Args:
        query (str): The search query to execute.
        model (str): Model OpenPerplex should use to synthesize the answer.
        date_context (str): Date (YYYY-MM-DD) used as the temporal context
            for the search. Previously hard-coded; now a parameter with the
            same default for backward compatibility.

    Returns:
        dict: Search response containing:
            - title (str): The original query.
            - content (str): The LLM-generated answer text.
            - sources (list): Sources returned by OpenPerplex.

    Raises:
        KeyError: If OPENPERPLEX_API_KEY is not set in the environment.
    """
    client_sync = OpenperplexSync(os.environ["OPENPERPLEX_API_KEY"])
    result = client_sync.search(
        query=query,
        model=model,
        date_context=date_context,
        location="us",
        response_language="en",
        answer_type="text",
        search_type="general",
        return_citations=True,
        return_sources=True,
        return_images=False,
        recency_filter="anytime"
    )
    results = {
        "title": query,
        "content": result["llm_response"],
        "sources": result["sources"]
    }
    return results