forked from stanford-oval/WikiChat
-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathcorpora.py
More file actions
194 lines (174 loc) · 7.77 KB
/
corpora.py
File metadata and controls
194 lines (174 loc) · 7.77 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
from pydantic import BaseModel, Field
class ChatStarter(BaseModel):
    """
    A chat starter is a suggestion for the user to start a chat with the chatbot.
    It is displayed in the front-end as a button or a link that the user can click on to start a chat.
    """

    # Every field passes `...` as its default, i.e. none of them is optional.

    # Text shown on the button/link before it is clicked.
    display_label: str = Field(
        ...,
        description=(
            "Label for the chat starter. This will be displayed in the "
            "front-end before the user clicks on the chat starter"
        ),
    )

    # Where the front-end finds the icon for this starter.
    icon_path: str = Field(
        ...,
        description="Path to the icon for the chat starter",
    )

    # The message submitted to the chatbot once the starter is clicked.
    chat_message: str = Field(
        ...,
        description=(
            "Message content of the chat starter. This will be sent to the "
            "chatbot when the user clicks on the chat starter"
        ),
    )
class Corpus(BaseModel):
    # Describes one searchable corpus: its identity, how it is presented to
    # humans and to the LLM, its chat starters, and per-corpus parameter
    # overrides. Every field passes `...` as its default, i.e. none is optional.

    # --- Identity ---
    name: str = Field(
        ...,
        description="Name of the corpus",
    )
    corpus_id: str = Field(
        ...,
        description="Index ID of the corpus in the vector database",
    )

    # --- Presentation ---
    icon_path: str = Field(
        ...,
        description="Path to the icon for the corpus",
    )
    llm_corpus_description: str = Field(
        ...,
        description="Description of the corpus for LLM prompts",
    )
    human_description_markdown: str = Field(
        ...,
        description="Human-readable description of the corpus",
    )
    chat_starters: list[ChatStarter] = Field(
        ...,
        description="List of chat starters associated with the corpus",
    )

    # --- Behaviour ---
    # Plain, untyped dict of parameter name -> value.
    overwritten_parameters: dict = Field(
        ...,
        description="Corpus-specific chatbot parameters to overwrite defaults.",
    )
# Corpus objects
# Multilingual Wikipedia corpus. Keyword arguments follow the order in which
# the fields are declared on Corpus / ChatStarter.
wikipedia_corpus = Corpus(
    name="Wikipedia in 25 Languages",
    corpus_id="wikipedia_20250320",
    icon_path="/public/img/wikipedia.png",
    llm_corpus_description="Wikipedia in 25 Languages",
    # Markdown list of the 25 covered language editions; the list items must
    # stay flush-left so the string content is unchanged.
    human_description_markdown="""Includes the full text, table and infobox data from Wikipedia in the following languages:
- [English](https://en.wikipedia.org/)
- [French](https://fr.wikipedia.org/)
- [German](https://de.wikipedia.org/)
- [Spanish](https://es.wikipedia.org/)
- [Japanese](https://ja.wikipedia.org/)
- [Russian](https://ru.wikipedia.org/)
- [Portuguese](https://pt.wikipedia.org/)
- [Chinese](https://zh.wikipedia.org/)
- [Italian](https://it.wikipedia.org/)
- [Arabic](https://ar.wikipedia.org/)
- [Persian](https://fa.wikipedia.org/)
- [Polish](https://pl.wikipedia.org/)
- [Dutch](https://nl.wikipedia.org/)
- [Ukrainian](https://uk.wikipedia.org/)
- [Hebrew](https://he.wikipedia.org/)
- [Indonesian](https://id.wikipedia.org/)
- [Turkish](https://tr.wikipedia.org/)
- [Czech](https://cs.wikipedia.org/)
- [Swedish](https://sv.wikipedia.org/)
- [Korean](https://ko.wikipedia.org/)
- [Finnish](https://fi.wikipedia.org/)
- [Vietnamese](https://vi.wikipedia.org/)
- [Hungarian](https://hu.wikipedia.org/)
- [Catalan](https://ca.wikipedia.org/)
- [Thai](https://th.wikipedia.org/).""",
    chat_starters=[
        ChatStarter(
            display_label="Artificial Intelligence",
            icon_path="https://upload.wikimedia.org/wikipedia/commons/6/64/Dall-e_3_%28jan_%2724%29_artificial_intelligence_icon.png",
            chat_message="Explain Artificial Intelligence.",
        ),
        ChatStarter(
            display_label="Ethics in AI",
            icon_path="https://upload.wikimedia.org/wikipedia/commons/0/05/Head_of_Aristotle.jpg",
            chat_message="What are some ethical considerations about AI.",
        ),
    ],
    # Corpus-specific overrides of the chatbot defaults.
    overwritten_parameters={
        "engine": "gpt-4o-mini",
        "do_refine": False,
    },
)
# semantic_scholar_corpus = Corpus(
# name="Academic Papers",
# corpus_id="semantic_scholar",
# icon_path="/public/img/s2_logo.png",
# human_description_markdown="""A collection of academic papers spanning multiple disciplines, including computer science, physics, mathematics, medicine, and more.
# This corpus includes peer-reviewed journal articles, conference papers, preprints, and technical reports from [Semantic Scholar](https://www.semanticscholar.org/).
# """,
# chat_starters=[
# ChatStarter(
# display_label="LLMs",
# chat_message="How does LLaMA-3 compare against GPT-4o?",
# icon_path="", # No icon provided in the original data
# ),
# ChatStarter(
# display_label="COVID-19",
# chat_message="Tell me about the latest research on COVID-19",
# icon_path="", # No icon provided in the original data
# ),
# ],
# llm_corpus_description="Semantic Scholar, a corpus of scientific literature in the fields of computer science, physics, math, medicine etc.",
# overwritten_parameters={
# "engine": "gpt-4o-mini",
# "do_refine": False,
# },
# )
# the_african_times_corpus = Corpus(
# name="The African Times",
# corpus_id="the_african_times",
# icon_path="/public/img/the_african_times.jpg",
# human_description_markdown="The African Times was a newspaper published in London during the late 19th century. Its articles primarily consisted of correspondence from the global African diaspora.",
# chat_starters=[
# ChatStarter(
# display_label="Women in West Africa",
# chat_message="Tell me about the role of women in West Africa in the 1880s.",
# icon_path="https://upload.wikimedia.org/wikipedia/commons/1/15/Africa-countries-WAFU-UFOA.png",
# ),
# ChatStarter(
# display_label="Steamships",
# chat_message="What is the history of steamships?",
# icon_path="https://upload.wikimedia.org/wikipedia/commons/thumb/d/dc/StateLibQld_1_133053_Agamemnon_%28ship%29.jpg/420px-StateLibQld_1_133053_Agamemnon_%28ship%29.jpg",
# ),
# ],
# llm_corpus_description="The African Times, a newspaper published in the late 19th century by the global African diaspora.",
# overwritten_parameters={
# "engine": "gpt-4o-mini",
# "do_refine": False,
# },
# )
# general_history_of_africa_corpus = Corpus(
# name="General History of Africa",
# corpus_id="general_history_of_africa_volumes_VI_and_VII",
# icon_path="/public/img/general_history_of_africa.png",
# human_description_markdown="""This corpus includes volumes VI and VII of the UNESCO book series "General History of Africa".
# Includes the following two volumes:
# - [General history of Africa, VI: Africa in the nineteenth century until the 1880s](https://unesdoc.unesco.org/ark:/48223/pf0000184295)
# - [General history of Africa, VII: Africa under colonial domination, 1880-1935](https://unesdoc.unesco.org/ark:/48223/pf0000184296)
# """,
# llm_corpus_description="General History of Africa, volumes VI and VII",
# chat_starters=[],
# overwritten_parameters={
# "engine": "gpt-4o-mini",
# "do_refine": False,
# },
# )
# Registry of the corpora that are currently enabled. The lookup helpers in
# this module search this list in order.
all_corpus_objects = [
    wikipedia_corpus,
    # semantic_scholar_corpus,
    # the_african_times_corpus,
    # general_history_of_africa_corpus,
]

# Give every registered corpus a "retriever_endpoint" override whose URL is
# derived from its corpus_id. Any value already stored under that key is
# replaced.
for corpus in all_corpus_objects:
    corpus.overwritten_parameters.update(
        retriever_endpoint=f"https://search.genie.stanford.edu/{corpus.corpus_id}"
    )
def corpus_name_to_corpus_object(corpus_name: str) -> Corpus:
    """Look up a registered corpus by its exact ``name``.

    Args:
        corpus_name: Name to match against ``Corpus.name`` (exact comparison).

    Returns:
        The first entry of ``all_corpus_objects`` whose name matches.

    Raises:
        ValueError: If no registered corpus has that name.
    """
    found = next(
        (
            candidate
            for candidate in all_corpus_objects
            if candidate.name == corpus_name
        ),
        None,
    )
    if found is None:
        raise ValueError(f"Corpus with name '{corpus_name}' not found in corpora.")
    return found
def corpus_id_to_corpus_object(corpus_id: str) -> Corpus:
    """Look up a registered corpus by its exact ``corpus_id``.

    Args:
        corpus_id: ID to match against ``Corpus.corpus_id`` (exact comparison).

    Returns:
        The first entry of ``all_corpus_objects`` whose ID matches.

    Raises:
        ValueError: If no registered corpus has that ID.
    """
    found = next(
        (
            candidate
            for candidate in all_corpus_objects
            if candidate.corpus_id == corpus_id
        ),
        None,
    )
    if found is None:
        raise ValueError(f"Corpus with id '{corpus_id}' not found in corpora.")
    return found
def get_public_indices() -> tuple[list[str], list[str], list[str]]:
    """Return the IDs, names and descriptions of all registered corpora.

    Returns:
        A 3-tuple of parallel lists, each in ``all_corpus_objects`` order:
        corpus IDs, corpus names, and human-readable markdown descriptions.
    """
    corpus_ids = [entry.corpus_id for entry in all_corpus_objects]
    corpus_names = [entry.name for entry in all_corpus_objects]
    descriptions = [entry.human_description_markdown for entry in all_corpus_objects]
    return corpus_ids, corpus_names, descriptions