diff --git a/.env.example b/.env.example index 0728bf0..c52acd8 100644 --- a/.env.example +++ b/.env.example @@ -6,4 +6,6 @@ # secrets in it. If you are cloning this repo, create a copy of this file named # ".env" and populate it with your secrets. -DATABASE_URL=postgres://dev:dev@localhost:5432/dev +# Qdrant connection settings +QDRANT_HOST=localhost +QDRANT_PORT=6333 diff --git a/README.md b/README.md index 1b4c6af..7bf66d4 100644 --- a/README.md +++ b/README.md @@ -1,12 +1,12 @@ # About A simple example of implementing semantic search using wikipedia as a test corpus. -It uses a postgres database with the [pgvector](https://github.com/pgvector/pgvector) extension and create embeddings with the [`nomic-embed-text`](https://ollama.com/library/nomic-embed-text) model using [ollama](https://ollama.com/). +It uses a Qdrant vector database for storing and searching embeddings, with vectors created using the [`nomic-embed-text`](https://ollama.com/library/nomic-embed-text) model via [ollama](https://ollama.com/). See [corresponding blog post](https://www.kodemaker.no/blogg/2025-02-12-text-embeddings/) (in norwegian). # Installation -Run `nmp install` +Run `npm install` ## .env Run `npm run dotenv-init` @@ -27,22 +27,17 @@ Then you should download the embedding model we will use: `ollama pull nomic-emb ## Database setup -### Start the postgres database +### Start the Qdrant vector database `docker-compose up -d` -### Migrations -Initially you must run the migrations to create the tables in the database. +Qdrant will automatically create the necessary collections when you run the application. -`npm run migrate:up` - -If something goes wrong, you can rollback the latest migration with: `npm run migrate:down`. - -If something goes even more wrong, you can always stop the database first with `docker-compose down` and remove it completely with `rm -rf db/.pgdata`, and then start over. 
+If something goes wrong, you can always stop the database with `docker-compose down` and remove it completely with `rm -rf db/qdrant-data`, and then start over. # Running the app ## Indexing -Create embeddings for 100 random wikipedia articles and store them in the local postgres database. +Create embeddings for 100 random wikipedia articles and store them in the local Qdrant database. `npm run index 100` @@ -51,28 +46,28 @@ Check if there are any rock artists among our indexed data: `npm run search "Rock artist"` -Sample output from my 1000 random indexed articles: +Sample output from indexed articles: ``` > embeddings-node@0.0.1 search > tsx src/index.ts -s Rock artist -Connecting to database postgres://dev:dev@localhost:5432/dev +Connecting to Qdrant at localhost:6333 Searching for content: Rock artist Search Results: [ { resource_link: 'https://en.wikipedia.org/wiki/Dark_Horse_%E2%80%93_A_Live_Collection', caption: 'Dark Horse – A Live Collection', - cosine_distance: 0.39703900903717815 + cosine_distance: 0.8029609909628218 }, { resource_link: 'https://en.wikipedia.org/wiki/Phil_Stack', caption: 'Phil Stack', - cosine_distance: 0.4036332663110951 + cosine_distance: 0.7963667336889049 }, { resource_link: 'https://en.wikipedia.org/wiki/Russell_B_Jackson', caption: 'Russell B Jackson', - cosine_distance: 0.4270296447349413 + cosine_distance: 0.7729703552650587 }, ``` diff --git a/docker-compose.yml b/docker-compose.yml index e479e21..8f7525d 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -2,15 +2,12 @@ version: "3.8" services: - test-pgvector: - image: ankane/pgvector:latest - container_name: embeddings-test + qdrant: + image: qdrant/qdrant:latest + container_name: embeddings-qdrant restart: always - environment: - POSTGRES_USER: dev - POSTGRES_PASSWORD: dev - POSTGRES_DB: dev ports: - - "5432:5432" + - "6333:6333" + - "6334:6334" volumes: - - ./db/.pgdata:/var/lib/postgresql/data + - ./db/qdrant-data:/qdrant/storage diff --git a/package-lock.json 
b/package-lock.json index db83371..f53a7b2 100644 --- a/package-lock.json +++ b/package-lock.json @@ -9,6 +9,7 @@ "version": "0.0.1", "license": "ISC", "dependencies": { + "@qdrant/js-client-rest": "^1.14.1", "commander": "^13.1.0", "dotenv": "^16.4.7", "pg": "^8.13.1" @@ -314,6 +315,40 @@ "node": ">= 8" } }, + "node_modules/@qdrant/js-client-rest": { + "version": "1.14.1", + "resolved": "https://registry.npmjs.org/@qdrant/js-client-rest/-/js-client-rest-1.14.1.tgz", + "integrity": "sha512-CkCCTDc4gCXq+hhjB3yDw9Hs/PxCJ0bKqk/LjAAmuL9+nDm/RPue4C/tGOIMlzouTQ2l6J6t+JPeM//j38VFug==", + "license": "Apache-2.0", + "dependencies": { + "@qdrant/openapi-typescript-fetch": "1.2.6", + "@sevinf/maybe": "0.5.0", + "undici": "^6.0.0" + }, + "engines": { + "node": ">=18.17.0", + "pnpm": ">=8" + }, + "peerDependencies": { + "typescript": ">=4.7" + } + }, + "node_modules/@qdrant/openapi-typescript-fetch": { + "version": "1.2.6", + "resolved": "https://registry.npmjs.org/@qdrant/openapi-typescript-fetch/-/openapi-typescript-fetch-1.2.6.tgz", + "integrity": "sha512-oQG/FejNpItrxRHoyctYvT3rwGZOnK4jr3JdppO/c78ktDvkWiPXPHNsrDf33K9sZdRb6PR7gi4noIapu5q4HA==", + "license": "MIT", + "engines": { + "node": ">=18.0.0", + "pnpm": ">=8" + } + }, + "node_modules/@sevinf/maybe": { + "version": "0.5.0", + "resolved": "https://registry.npmjs.org/@sevinf/maybe/-/maybe-0.5.0.tgz", + "integrity": "sha512-ARhyoYDnY1LES3vYI0fiG6e9esWfTNcXcO6+MPJJXcnyMV3bim4lnFt45VXouV7y82F4x3YH8nOQ6VztuvUiWg==", + "license": "MIT" + }, "node_modules/@types/node": { "version": "22.13.1", "resolved": "https://registry.npmjs.org/@types/node/-/node-22.13.1.tgz", @@ -2399,7 +2434,6 @@ "version": "5.7.3", "resolved": "https://registry.npmjs.org/typescript/-/typescript-5.7.3.tgz", "integrity": "sha512-84MVSjMEHP+FQRPy3pX9sTVV/INIex71s9TL2Gm5FG/WG1SqXeKyZ0k7/blY/4FdOzI12CBy1vGc4og/eus0fw==", - "dev": true, "license": "Apache-2.0", "bin": { "tsc": "bin/tsc", @@ -2432,6 +2466,15 @@ "typescript": ">=4.8.4 <5.8.0" } }, + 
"node_modules/undici": { + "version": "6.21.3", + "resolved": "https://registry.npmjs.org/undici/-/undici-6.21.3.tgz", + "integrity": "sha512-gBLkYIlEnSp8pFbT64yFgGE6UIB9tAkhukC23PmMDCe5Nd+cRqKxSjw5y54MK2AZMgZfJWMaNE4nYUHgi1XEOw==", + "license": "MIT", + "engines": { + "node": ">=18.17" + } + }, "node_modules/undici-types": { "version": "6.20.0", "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-6.20.0.tgz", diff --git a/package.json b/package.json index 07a071b..1072f25 100644 --- a/package.json +++ b/package.json @@ -19,9 +19,9 @@ "license": "ISC", "description": "", "dependencies": { + "@qdrant/js-client-rest": "^1.14.1", "commander": "^13.1.0", - "dotenv": "^16.4.7", - "pg": "^8.13.1" + "dotenv": "^16.4.7" }, "devDependencies": { "@eslint/js": "^8.57.1", diff --git a/src/insertWikiData.ts b/src/insertWikiData.ts index a86c8dd..a34efb4 100644 --- a/src/insertWikiData.ts +++ b/src/insertWikiData.ts @@ -1,7 +1,5 @@ -import pg from "pg"; import { createEmbedding } from "./ollamaEmbedding.js"; -import * as dotenv from "dotenv"; -const { Pool } = pg; +import { initializeCollection, insertEmbedding as qdrantInsertEmbedding, getNextId } from "./qdrantClient.js"; type WikiData = { title: string; @@ -13,11 +11,8 @@ type WikiData = { const wikiUrl = "https://en.wikipedia.org/w/api.php?action=query&format=json&generator=random&grnnamespace=0&grnlimit=1&prop=info|extracts&inprop=url&explaintext=true"; -dotenv.config(); // Read database connection info from .env file - -const pool = new Pool({ - connectionString: process.env.DATABASE_URL, -}); +// Initialize Qdrant collection when this module is imported +initializeCollection().catch(console.error); export const insertWikiData = async (count: number) => { for (let i = 0; i < count; i++) { @@ -30,29 +25,17 @@ export const insertWikiData = async (count: number) => { const { title, extract } = wikiData; console.log("Indexing wiki article:", title); const embedding = await createEmbedding(extract); - 
insertEmbedding(wikiData, embedding);
+      const id = await getNextId();
+      await insertEmbedding(id, wikiData, embedding);
     });
   }
 };
 
-const insertEmbedding = async ({ title, fullurl, extract }: WikiData, embedding: number[]) => {
-  const client = await pool.connect();
-  const query = `
-    INSERT INTO embedded_data (
-      caption,
-      resource_link,
-      embedding,
-      content
-    )
-    VALUES ($1, $2, $3, $4)
-    RETURNING *;
-  `;
+const insertEmbedding = async (id: number, { title, fullurl, extract }: WikiData, embedding: number[]) => {
   try {
-    const values = [title, fullurl, JSON.stringify(embedding), extract];
-    await client.query(query, values);
+    await qdrantInsertEmbedding(id, { title, fullurl, extract }, embedding);
+    console.log(`Successfully inserted embedding for "${title}" with ID ${id}`);
   } catch (err) {
-    console.error("Error inserting row:", err);
-  } finally {
-    client.release();
+    console.error("Error inserting embedding:", err);
   }
 };
diff --git a/src/qdrantClient.ts b/src/qdrantClient.ts
new file mode 100644
index 0000000..6e50a67
--- /dev/null
+++ b/src/qdrantClient.ts
@@ -0,0 +1,171 @@
+import { QdrantClient } from '@qdrant/js-client-rest';
+import * as dotenv from "dotenv";
+
+dotenv.config(); // Read connection info from .env file
+
+const COLLECTION_NAME = 'embedded_data';
+
+// Read Qdrant connection details from environment variables.
+// `||` rather than `??` on purpose: an empty value in .env falls back to the default.
+const host = process.env.QDRANT_HOST || 'localhost';
+const port = Number.parseInt(process.env.QDRANT_PORT || '6333', 10);
+if (!Number.isInteger(port) || port < 1 || port > 65535) {
+  throw new Error(`Invalid QDRANT_PORT: ${process.env.QDRANT_PORT}`);
+}
+
+console.log(`Connecting to Qdrant at ${host}:${port}`);
+
+// Create a new Qdrant client
+export const qdrantClient = new QdrantClient({ host, port });
+
+// Shared by every caller so concurrent callers never race to create the collection
+let initialization: Promise<void> | undefined;
+
+// Create the collection and its payload indexes unless the collection already exists
+const createCollectionIfMissing = async (): Promise<void> => {
+  const { collections } = await qdrantClient.getCollections();
+  if (collections.some(c => c.name === COLLECTION_NAME)) {
+    console.log(`Collection ${COLLECTION_NAME} already exists`);
+    return;
+  }
+
+  console.log(`Creating collection ${COLLECTION_NAME}`);
+  await qdrantClient.createCollection(COLLECTION_NAME, {
+    vectors: {
+      size: 768, // nomic-embed-text uses 768 dimensions
+      distance: 'Cosine' // Use cosine similarity
+    },
+  });
+
+  // Create payload indexes for faster retrieval
+  await qdrantClient.createPayloadIndex(COLLECTION_NAME, {
+    field_name: 'caption',
+    field_schema: 'keyword'
+  });
+
+  await qdrantClient.createPayloadIndex(COLLECTION_NAME, {
+    field_name: 'resource_link',
+    field_schema: 'keyword'
+  });
+};
+
+/**
+ * Initialize the collection if it doesn't exist.
+ *
+ * Repeated and concurrent calls share one promise, so the collection is
+ * created at most once per process. A failed attempt is logged, rethrown and
+ * forgotten so that a later call can retry.
+ */
+export const initializeCollection = (): Promise<void> => {
+  if (!initialization) {
+    initialization = createCollectionIfMissing().catch((error: unknown) => {
+      initialization = undefined;
+      console.error('Error initializing Qdrant collection:', error);
+      throw error;
+    });
+  }
+  return initialization;
+};
+
+/**
+ * Insert an embedding into Qdrant, with the article fields as its payload.
+ *
+ * The point is written with `upsert`, so reusing an ID replaces that point.
+ *
+ * @throws Rethrows any client error after logging it.
+ */
+export const insertEmbedding = async (
+  id: number,
+  { title, fullurl, extract }: { title: string, fullurl: string, extract: string },
+  embedding: number[]
+) => {
+  try {
+    await qdrantClient.upsert(COLLECTION_NAME, {
+      points: [
+        {
+          id: id,
+          vector: embedding,
+          payload: {
+            caption: title,
+            resource_link: fullurl,
+            content: extract
+          }
+        }
+      ]
+    });
+  } catch (error) {
+    console.error('Error inserting embedding:', error);
+    throw error;
+  }
+};
+
+/**
+ * Search Qdrant for the points closest to `embedding`.
+ *
+ * Qdrant scores a `Cosine` collection by similarity (higher is closer), so
+ * the score is converted to a distance to match the `cosine_distance` key.
+ *
+ * @param embedding - Query vector.
+ * @param limit - Maximum number of hits to return.
+ * @returns The hits, which are also logged to the console.
+ * @throws Rethrows any client error after logging it.
+ */
+export const searchEmbedding = async (embedding: number[], limit = 10) => {
+  try {
+    const searchResult = await qdrantClient.search(COLLECTION_NAME, {
+      vector: embedding,
+      limit: limit,
+      with_payload: true
+    });
+
+    const results = searchResult.map(hit => ({
+      resource_link: hit.payload?.resource_link as string,
+      caption: hit.payload?.caption as string,
+      cosine_distance: 1 - hit.score
+    }));
+
+    console.log('Search Results:', results);
+    return results;
+  } catch (error) {
+    console.error('Error searching embeddings:', error);
+    throw error;
+  }
+};
+
+// Highest ID handed out so far; only meaningful once `idSeed` has resolved
+let lastIssuedId = 0;
+// One-time seeding of `lastIssuedId`, shared by every caller
+let idSeed: Promise<void> | undefined;
+
+// Seed the ID counter from the number of points already stored
+const seedLastIssuedId = async (): Promise<void> => {
+  await initializeCollection(); // make sure the collection exists before counting
+  const countResponse = await qdrantClient.count(COLLECTION_NAME);
+  lastIssuedId = countResponse.count;
+};
+
+/**
+ * Get the next available ID for inserting.
+ *
+ * The collection is counted only once; afterwards IDs come from an in-process
+ * counter, so concurrent callers each get a distinct ID instead of all reading
+ * the same count and overwriting one another's points.
+ *
+ * NOTE(review): seeding from the count assumes the stored IDs are exactly
+ * 1..count - confirm nothing deletes points or inserts with other IDs.
+ *
+ * @throws Rethrows a seeding failure after logging it; falling back to ID 1
+ *         would silently overwrite the first stored point.
+ */
+export const getNextId = async (): Promise<number> => {
+  if (!idSeed) {
+    idSeed = seedLastIssuedId().catch((error: unknown) => {
+      idSeed = undefined;
+      console.error('Error getting next ID:', error);
+      throw error;
+    });
+  }
+  await idSeed;
+  lastIssuedId += 1;
+  return lastIssuedId;
+};
diff --git a/src/searchWiki.ts b/src/searchWiki.ts
index b3c1656..cdd7bc4 100644
--- a/src/searchWiki.ts
+++ b/src/searchWiki.ts
@@ -1,31 +1,9 @@
-import * as dotenv from "dotenv";
-import pg from "pg";
-const { Pool } = pg;
-
-dotenv.config(); // Read database connection info from .env file
-
-console.log("Connecting to database", process.env.DATABASE_URL);
-const pool = new Pool({
-  connectionString: process.env.DATABASE_URL,
-});
+import { searchEmbedding as qdrantSearchEmbedding } from "./qdrantClient.js";
 
 export const searchEmbedding = async (embedding: number[]) => {
-  const client = await pool.connect();
-  const query = `
-    SELECT
-      resource_link,
-      caption,
-      embedding <=> $1 AS cosine_distance
-    FROM embedded_data
-    ORDER BY embedding <=> $1
-    LIMIT 10
-  `;
   try {
-    const result = await client.query(query, [JSON.stringify(embedding)]);
-    console.log("Search Results:", result.rows);
+    await qdrantSearchEmbedding(embedding, 10);
   } catch (err) {
-    console.error("Error inserting row:", err);
-  } finally {
-    client.release();
+    console.error("Error searching for embeddings:", err);
   }
 };