Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion .env.example
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,6 @@
# secrets in it. If you are cloning this repo, create a copy of this file named
# ".env" and populate it with your secrets.

DATABASE_URL=postgres://dev:dev@localhost:5432/dev
# Qdrant connection settings
QDRANT_HOST=localhost
QDRANT_PORT=6333
27 changes: 11 additions & 16 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
# About
A simple example of implementing semantic search using Wikipedia as a test corpus.

It uses a postgres database with the [pgvector](https://github.com/pgvector/pgvector) extension and create embeddings with the [`nomic-embed-text`](https://ollama.com/library/nomic-embed-text) model using [ollama](https://ollama.com/).
It uses a Qdrant vector database for storing and searching embeddings, with vectors created using the [`nomic-embed-text`](https://ollama.com/library/nomic-embed-text) model via [ollama](https://ollama.com/).

See [corresponding blog post](https://www.kodemaker.no/blogg/2025-02-12-text-embeddings/) (in Norwegian).

# Installation
Run `nmp install`
Run `npm install`

## .env
Run `npm run dotenv-init`
Expand All @@ -27,22 +27,17 @@ Then you should download the embedding model we will use: `ollama pull nomic-emb

## Database setup

### Start the postgres database
### Start the Qdrant vector database
`docker-compose up -d`

### Migrations
Initially you must run the migrations to create the tables in the database.
Qdrant will automatically create the necessary collections when you run the application.

`npm run migrate:up`

If something goes wrong, you can rollback the latest migration with: `npm run migrate:down`.

If something goes even more wrong, you can always stop the database first with `docker-compose down` and remove it completely with `rm -rf db/.pgdata`, and then start over.
If something goes wrong, you can always stop the database with `docker-compose down` and remove it completely with `rm -rf db/qdrant-data`, and then start over.

# Running the app

## Indexing
Create embeddings for 100 random wikipedia articles and store them in the local postgres database.
Create embeddings for 100 random Wikipedia articles and store them in the local Qdrant database.

`npm run index 100`

Expand All @@ -51,28 +46,28 @@ Check if there are any rock artists among our indexed data:

`npm run search "Rock artist"`

Sample output from my 1000 random indexed articles:
Sample output from indexed articles:
```
> embeddings-node@0.0.1 search
> tsx src/index.ts -s Rock artist

Connecting to database postgres://dev:dev@localhost:5432/dev
Connecting to Qdrant at localhost:6333
Searching for content: Rock artist
Search Results: [
{
resource_link: 'https://en.wikipedia.org/wiki/Dark_Horse_%E2%80%93_A_Live_Collection',
caption: 'Dark Horse – A Live Collection',
cosine_distance: 0.39703900903717815
cosine_distance: 0.8029609909628218
},
{
resource_link: 'https://en.wikipedia.org/wiki/Phil_Stack',
caption: 'Phil Stack',
cosine_distance: 0.4036332663110951
cosine_distance: 0.7963667336889049
},
{
resource_link: 'https://en.wikipedia.org/wiki/Russell_B_Jackson',
caption: 'Russell B Jackson',
cosine_distance: 0.4270296447349413
cosine_distance: 0.7729703552650587
},
```

15 changes: 6 additions & 9 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,12 @@ version: "3.8"


services:
test-pgvector:
image: ankane/pgvector:latest
container_name: embeddings-test
qdrant:
image: qdrant/qdrant:latest
container_name: embeddings-qdrant
restart: always
environment:
POSTGRES_USER: dev
POSTGRES_PASSWORD: dev
POSTGRES_DB: dev
ports:
- "5432:5432"
- "6333:6333"
- "6334:6334"
volumes:
- ./db/.pgdata:/var/lib/postgresql/data
- ./db/qdrant-data:/qdrant/storage
45 changes: 44 additions & 1 deletion package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,9 @@
"license": "ISC",
"description": "",
"dependencies": {
"@qdrant/js-client-rest": "^1.14.1",
"commander": "^13.1.0",
"dotenv": "^16.4.7",
"pg": "^8.13.1"
"dotenv": "^16.4.7"
},
"devDependencies": {
"@eslint/js": "^8.57.1",
Expand Down
35 changes: 9 additions & 26 deletions src/insertWikiData.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
import pg from "pg";
import { createEmbedding } from "./ollamaEmbedding.js";
import * as dotenv from "dotenv";
const { Pool } = pg;
import { initializeCollection, insertEmbedding as qdrantInsertEmbedding, getNextId } from "./qdrantClient.js";

type WikiData = {
title: string;
Expand All @@ -13,11 +11,8 @@ type WikiData = {
const wikiUrl =
"https://en.wikipedia.org/w/api.php?action=query&format=json&generator=random&grnnamespace=0&grnlimit=1&prop=info|extracts&inprop=url&explaintext=true";

dotenv.config(); // Read database connection info from .env file

const pool = new Pool({
connectionString: process.env.DATABASE_URL,
});
// Initialize Qdrant collection when this module is imported
initializeCollection().catch(console.error);

export const insertWikiData = async (count: number) => {
for (let i = 0; i < count; i++) {
Expand All @@ -30,29 +25,17 @@ export const insertWikiData = async (count: number) => {
const { title, extract } = wikiData;
console.log("Indexing wiki article:", title);
const embedding = await createEmbedding(extract);
insertEmbedding(wikiData, embedding);
const id = await getNextId();
insertEmbedding(id, wikiData, embedding);
});
}
};

const insertEmbedding = async ({ title, fullurl, extract }: WikiData, embedding: number[]) => {
const client = await pool.connect();
const query = `
INSERT INTO embedded_data (
caption,
resource_link,
embedding,
content
)
VALUES ($1, $2, $3, $4)
RETURNING *;
`;
const insertEmbedding = async (id: number, { title, fullurl, extract }: WikiData, embedding: number[]) => {
try {
const values = [title, fullurl, JSON.stringify(embedding), extract];
await client.query(query, values);
await qdrantInsertEmbedding(id, { title, fullurl, extract }, embedding);
console.log(`Successfully inserted embedding for "${title}" with ID ${id}`);
} catch (err) {
console.error("Error inserting row:", err);
} finally {
client.release();
console.error("Error inserting embedding:", err);
}
};
109 changes: 109 additions & 0 deletions src/qdrantClient.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
import { QdrantClient } from '@qdrant/js-client-rest';
import * as dotenv from "dotenv";

dotenv.config(); // Read connection info from .env file

/** Name of the Qdrant collection that stores the embedded articles. */
const COLLECTION_NAME = 'embedded_data';

/** Port used when QDRANT_PORT is unset, empty, or not a usable port number. */
const DEFAULT_QDRANT_PORT = 6333;

// Read Qdrant connection details from environment variables
const host = process.env.QDRANT_HOST || 'localhost';

// Parse with an explicit radix and validate the result: a malformed value
// would otherwise become NaN and be handed to the client as "host:NaN".
const parsedPort = Number.parseInt(process.env.QDRANT_PORT || String(DEFAULT_QDRANT_PORT), 10);
const portIsValid = Number.isInteger(parsedPort) && parsedPort > 0 && parsedPort <= 65535;
if (!portIsValid) {
  console.warn(
    `Ignoring invalid QDRANT_PORT "${process.env.QDRANT_PORT}"; falling back to ${DEFAULT_QDRANT_PORT}`
  );
}
const port = portIsValid ? parsedPort : DEFAULT_QDRANT_PORT;

console.log(`Connecting to Qdrant at ${host}:${port}`);

/** Shared Qdrant client built from QDRANT_HOST / QDRANT_PORT. */
export const qdrantClient = new QdrantClient({ host, port });

/**
 * Ensures the embeddings collection exists.
 *
 * Lists the existing collections; when COLLECTION_NAME is missing it creates
 * the collection (768-dimensional vectors, Cosine distance) and then adds
 * keyword payload indexes on `caption` and `resource_link`. When the
 * collection is already present nothing is modified.
 *
 * @throws Re-throws any client error after logging it.
 */
export const initializeCollection = async () => {
  try {
    const { collections: existing } = await qdrantClient.getCollections();
    const alreadyPresent = existing.some(({ name }) => name === COLLECTION_NAME);

    if (alreadyPresent) {
      console.log(`Collection ${COLLECTION_NAME} already exists`);
      return;
    }

    console.log(`Creating collection ${COLLECTION_NAME}`);
    await qdrantClient.createCollection(COLLECTION_NAME, {
      vectors: {
        size: 768, // nomic-embed-text uses 768 dimensions
        distance: 'Cosine'
      },
    });

    // Payload indexes for faster retrieval, created one after the other
    // in the same order as before: caption first, then resource_link.
    for (const field_name of ['caption', 'resource_link']) {
      await qdrantClient.createPayloadIndex(COLLECTION_NAME, {
        field_name,
        field_schema: 'keyword'
      });
    }
  } catch (failure) {
    console.error('Error initializing Qdrant collection:', failure);
    throw failure;
  }
};

/**
 * Upserts a single article embedding into the collection.
 *
 * @param id - Point ID to write; an existing point with the same ID is replaced by the upsert.
 * @param article - Article fields stored as payload (`title` → caption,
 *   `fullurl` → resource_link, `extract` → content).
 * @param embedding - Vector stored for the point.
 * @throws Re-throws any client error after logging it.
 */
export const insertEmbedding = async (
  id: number,
  article: { title: string, fullurl: string, extract: string },
  embedding: number[]
) => {
  const point = {
    id,
    vector: embedding,
    payload: {
      caption: article.title,
      resource_link: article.fullurl,
      content: article.extract
    }
  };

  try {
    await qdrantClient.upsert(COLLECTION_NAME, { points: [point] });
  } catch (failure) {
    console.error('Error inserting embedding:', failure);
    throw failure;
  }
};

/**
 * Searches the collection for the points closest to `embedding`.
 *
 * The collection is created with `distance: 'Cosine'`, for which Qdrant
 * reports a similarity score (higher means closer). The `cosine_distance`
 * field is therefore derived as `1 - score`, so that smaller values mean
 * closer matches, as the field name promises.
 *
 * @param embedding - Query vector.
 * @param limit - Maximum number of hits to return (default 10).
 * @returns Hits in the order Qdrant returned them, each with `resource_link`,
 *   `caption` and `cosine_distance`. The results are also logged.
 * @throws Re-throws any client error after logging it.
 */
export const searchEmbedding = async (embedding: number[], limit = 10) => {
  try {
    const searchResult = await qdrantClient.search(COLLECTION_NAME, {
      vector: embedding,
      limit: limit,
      with_payload: true
    });

    const results = searchResult.map(hit => ({
      resource_link: hit.payload?.resource_link as string,
      caption: hit.payload?.caption as string,
      // hit.score is cosine similarity here; convert it to a distance.
      cosine_distance: 1 - hit.score
    }));

    console.log('Search Results:', results);
    return results;
  } catch (error) {
    console.error('Error searching embeddings:', error);
    throw error;
  }
};

// In-process ID allocation state for getNextId.
// idSeedRequest resolves to the point count observed when IDs were first requested;
// idsIssued counts how many IDs have been handed out on top of that seed.
let idSeedRequest: Promise<number> | undefined;
let idsIssued = 0;

/**
 * Returns the next point ID to use for an insert.
 *
 * The collection's point count is fetched once and used as a seed; every call
 * then receives `seed + n` for a locally incremented `n`. Re-querying the
 * count on each call would hand the same ID to callers that ask before
 * earlier upserts have landed, and those upserts would overwrite each other.
 *
 * @returns The next ID (count + 1 on the first call, then strictly increasing).
 *   If the count cannot be fetched the error is logged and 1 is returned, so
 *   the function never throws; the failed seed is discarded and the next call
 *   retries.
 */
export const getNextId = async (): Promise<number> => {
  let seedRequest: Promise<number> | undefined;
  try {
    if (idSeedRequest === undefined) {
      idSeedRequest = qdrantClient.count(COLLECTION_NAME).then(res => res.count);
    }
    seedRequest = idSeedRequest;
    const seed = await seedRequest;
    idsIssued += 1;
    return seed + idsIssued;
  } catch (error) {
    // Drop only the request that failed, so a newer seed request is not lost.
    if (idSeedRequest === seedRequest) {
      idSeedRequest = undefined;
    }
    console.error('Error getting next ID:', error);
    return 1; // Default to 1 if there's an error
  }
};
28 changes: 3 additions & 25 deletions src/searchWiki.ts
Original file line number Diff line number Diff line change
@@ -1,31 +1,9 @@
import * as dotenv from "dotenv";
import pg from "pg";
const { Pool } = pg;

dotenv.config(); // Read database connection info from .env file

console.log("Connecting to database", process.env.DATABASE_URL);
const pool = new Pool({
connectionString: process.env.DATABASE_URL,
});
import { searchEmbedding as qdrantSearchEmbedding } from "./qdrantClient.js";

export const searchEmbedding = async (embedding: number[]) => {
const client = await pool.connect();
const query = `
SELECT
resource_link,
caption,
embedding <=> $1 AS cosine_distance
FROM embedded_data
ORDER BY embedding <=> $1
LIMIT 10
`;
try {
const result = await client.query(query, [JSON.stringify(embedding)]);
console.log("Search Results:", result.rows);
await qdrantSearchEmbedding(embedding, 10);
} catch (err) {
console.error("Error inserting row:", err);
} finally {
client.release();
console.error("Error searching for embeddings:", err);
}
};