# llm_app/27_embedding_qdrant_test.py (behoss/llm_app, master branch)

import concurrent.futures
import json
import os
import time

import httpx
import pandas as pd
from tabulate import tabulate

# Credentials are read from the environment so that secrets are never stored
# in source control. Export HF_TOKEN and QDRANT_API_KEY before running the
# script. An unset variable falls back to an empty string, in which case the
# remote service will presumably reject the request (visible in the status
# columns of the report).
# NOTE(review): the keys that were previously committed in this file should be
# treated as leaked and rotated.
HF_TOKEN = os.environ.get("HF_TOKEN", "")
QDRANT_API_KEY = os.environ.get("QDRANT_API_KEY", "")

# Endpoint that receives the embedding request.
HF_EMBEDDING_URL = "https://py9zqfdpz8du3h4r.us-east-1.aws.endpoints.huggingface.cloud"
# Search endpoint of the "replicas" collection.
QDRANT_SEARCH_URL = "https://586ef24e-9132-423e-82f7-43b0078e7a60.us-east4-0.gcp.cloud.qdrant.io/collections/replicas/points/search"

# Query text that gets embedded, and the replica whose points are searched.
TEXT_INPUT = "Tell me about Prompting Happiness"
REPLICA_UUID = "d79a29c0-24f3-4b70-9046-8d116f9ce1bb"


def run_single_test(index):
    """Time one embedding request followed by one Qdrant vector search.

    Args:
        index: Zero-based run number; it is reported one-based in the row.

    Returns:
        A five-item list
        ``[index + 1, embed_ms, qdrant_ms, embed_status, qdrant_status]``.
        Durations are in milliseconds, rounded to two decimals. If the
        embedding request does not return a 2xx status, the search is skipped
        and ``qdrant_ms`` and ``qdrant_status`` are ``None``.

    Exceptions raised by ``httpx.post`` itself (for example on a timeout) are
    not caught here and propagate to the caller.
    """
    # Step 1: Generate embedding.
    # perf_counter is monotonic, so the measured interval cannot be skewed by
    # wall-clock adjustments the way time.time() can.
    embed_start = time.perf_counter()
    embed_resp = httpx.post(
        HF_EMBEDDING_URL,
        headers={
            "Content-Type": "application/json",
            "Authorization": f"Bearer {HF_TOKEN}",
        },
        json={"inputs": TEXT_INPUT, "options": {"normalize": True}},
        timeout=30,
    )
    embed_time = round((time.perf_counter() - embed_start) * 1000, 2)

    # A failed embedding call has no vector to search with; report the failure
    # in the row instead of raising and aborting the whole benchmark.
    if not 200 <= embed_resp.status_code < 300:
        return [index + 1, embed_time, None, embed_resp.status_code, None]

    # Assumes the response body is a JSON list whose first element is the
    # embedding vector — TODO confirm against the endpoint's response format.
    embedding = embed_resp.json()[0]

    # Step 2: Qdrant search
    qdrant_start = time.perf_counter()
    qdrant_resp = httpx.post(
        QDRANT_SEARCH_URL,
        headers={
            "Content-Type": "application/json",
            "api-key": QDRANT_API_KEY,
        },
        json={
            "vector": embedding,
            "limit": 5,
            "filter": {
                "must": [
                    {"key": "replica_uuid", "match": {"value": REPLICA_UUID}},
                    {"key": "tag", "match": {"value": "file"}},
                ]
            },
            "with_payload": True,
            "with_vectors": False,
        },
        timeout=30,
    )
    qdrant_time = round((time.perf_counter() - qdrant_start) * 1000, 2)

    return [
        index + 1,
        embed_time,
        qdrant_time,
        embed_resp.status_code,
        qdrant_resp.status_code,
    ]


# Fan 30 runs out over a 30-thread pool and print each row as soon as its
# run completes (completion order, not submission order).
row_headers = [
    "#",
    "Embedding (ms)",
    "Qdrant (ms)",
    "Embed Status",
    "Qdrant Status",
]
results = []
with concurrent.futures.ThreadPoolExecutor(max_workers=30) as pool:
    pending = [pool.submit(run_single_test, run_no) for run_no in range(30)]
    for finished in concurrent.futures.as_completed(pending):
        row = finished.result()
        results.append(row)
        single_row_table = tabulate([row], headers=row_headers, tablefmt="grid")
        print(single_row_table)

# Restore run order, then build the summary table with a trailing row that
# holds the mean of the two latency columns.
results.sort(key=lambda row: row[0])
summary_columns = [
    "#",
    "Embedding (ms)",
    "Qdrant (ms)",
    "Embed Status",
    "Qdrant Status",
]
df = pd.DataFrame(results, columns=summary_columns)
mean_embed_ms = round(df["Embedding (ms)"].astype(float).mean(), 2)
mean_qdrant_ms = round(df["Qdrant (ms)"].astype(float).mean(), 2)
# The run-number and status cells have no meaningful average, so they are
# left as empty strings.
df.loc["Average"] = ["", mean_embed_ms, mean_qdrant_ms, "", ""]

print("\nFinal Performance Summary:")
summary_table = tabulate(df, headers="keys", tablefmt="grid")
print(summary_table)
# index=False: the CSV omits the index column, including the "Average" label.
df.to_csv("embedding_qdrant_latency.csv", index=False)