Davidyz · Davidyz · Sep 1, 2025 · Sep 1, 2025 · Sep 4, 2025 · Sep 5, 2025
diff --git a/.github/workflows/test_and_cov.yml b/.github/workflows/test_and_cov.yml
@@ -38,7 +38,7 @@ jobs:
         run: pdm config use_uv true
 
       - name: install pdm and dependencies
-        run: make deps
+        run: EXTRA_LOCK_ARGS="--group chroma0" make deps
 
       - name: Set custom HF cache directory
         run: |
@@ -47,18 +47,12 @@ jobs:
           mkdir -p "$HF_HOME"
           [ -z "$(ls "$HF_HOME")" ] || rm "${HF_HOME:?}/*" -rf && true
 
-      - name: run tests 
-        run: pdm run pytest --enable-coredumpy --coredumpy-dir ${{ env.COREDUMPY_DUMP_DIR }}
-
       - name: run coverage
         run: |
-          pdm run coverage run -m pytest
+          sh ./scripts/coverage.sh
           pdm run coverage report -m
           pdm run coverage xml -i
 
-      - name: static analysis by basedpyright
-        run: pdm run basedpyright
-
       - name: upload coverage reports to codecov
         uses: codecov/codecov-action@v5
         with:

diff --git a/Makefile b/Makefile
@@ -1,10 +1,19 @@
-.PHONY: multitest
+EXTRA_LOCK_ARGS?=
+EXTRA_DEPS?=
+EXTRA_COVERAGEPY_ARGS?=
+
+# Recipe prefix that loads ./.env when it exists. Uses POSIX `.` instead of the
+# bash-only `source`, because make runs recipes with /bin/sh by default.
+LOADED_DOT_ENV=@if [ -f .env ] ; then . ./.env; fi;
 
-DEFAULT_GROUPS=--group dev --group lsp --group mcp --group debug
+DEFAULT_GROUPS=--group dev --group lsp --group mcp --group debug $(EXTRA_LOCK_ARGS)
+
+.PHONY: multitest
 
 deps:
 	pdm lock $(DEFAULT_GROUPS) || pdm lock $(DEFAULT_GROUPS) --group legacy; \
 	pdm install
+	[ -z "$(EXTRA_DEPS)" ] || (pdm run python -m ensurepip && pdm run python -m pip install $(EXTRA_DEPS))
 
 test:
 	make deps; \
@@ -18,7 +25,7 @@ multitest:
 
 coverage:
 	make deps; \
-	pdm run coverage run -m pytest; \
+	pdm run coverage run $(EXTRA_COVERAGEPY_ARGS) -m pytest --enable-coredumpy --coredumpy-dir dumps; \
 	pdm run coverage html; \
 	pdm run coverage report -m
 

diff --git a/doc/VectorCode-cli.txt b/doc/VectorCode-cli.txt
@@ -16,7 +16,6 @@ Table of Contents                           *VectorCode-cli-table-of-contents*
 
 - |VectorCode-cli-installation|
     - |VectorCode-cli-install-from-source|
-    - |VectorCode-cli-migration-from-`pipx`|
     - |VectorCode-cli-chromadb|
     - |VectorCode-cli-for-windows-users|
     - |VectorCode-cli-legacy-environments|
@@ -66,7 +65,7 @@ virtual environments.
 After installing `uv`, run:
 
 >bash
-    uv tool install "vectorcode<1.0.0"
+    uv tool install "vectorcode[chroma0]"
 <
 
 in your shell. To specify a particular version of Python, use the `--python`
@@ -76,40 +75,25 @@ If you want a CPU-only installation without CUDA dependencies required by
 default by PyTorch, run:
 
 >bash
-    uv tool install "vectorcode<1.0.0" --index https://download.pytorch.org/whl/cpu --index-strategy unsafe-best-match
+    uv tool install "vectorcode[chroma0]" --index https://download.pytorch.org/whl/cpu --index-strategy unsafe-best-match
 <
 
 If you need to install multiple dependency group (for |VectorCode-cli-lsp| or
 |VectorCode-cli-mcp|), you can use the following syntax:
 
 >bash
-    uv tool install "vectorcode[lsp,mcp]<1.0.0"
+    uv tool install "vectorcode[lsp,mcp,chroma0]"
 <
 
 
   [!NOTE] The command only install VectorCode and `SentenceTransformer`, the
   default embedding engine. If you need to install an extra dependency, you can
-  use `uv tool install vectorcode --with <your_deps_here>`
+  use `uv tool install "vectorcode[chroma0]" --with <your_deps_here>`
 
 INSTALL FROM SOURCE ~
 
-To install from source, either `git clone` this repository and run `uv tool
-install <path_to_vectorcode_repo>`, or use `pipx`:
-
->bash
-    pipx install git+https://github.com/Davidyz/VectorCode
-<
-
-
-MIGRATION FROM PIPX ~
-
-The motivation behind the change from `pipx` to `uv tool` is mainly the
-performance. The caching mechanism in uv makes it a lot faster than `pipx` for
-a lot of operations. If you installed VectorCode via `pipx`, you can continue
-to use `pipx` to manage your VectorCode installation. If you wish to switch to
-`uv`, you need to uninstall VectorCode using `pipx` and then use `uv` to
-install it as described above. All your VectorCode configurations and database
-files will work out of the box on your new install.
+To install from source, please `git clone` this repository and run `uv tool
+install <path_to_vectorcode_repo>`.
 
 
 CHROMADB ~
@@ -124,9 +108,6 @@ instructions through docker
 significantly reduce the IO overhead and avoid potential race condition.
 
 
-  If you’re setting up a standalone ChromaDB server, I recommend sticking to
-  v0.6.3, because VectorCode is not ready for the upgrade to ChromaDB 1.0 yet.
-
 FOR WINDOWS USERS ~
 
 Windows support is not officially tested at this moment. This PR
@@ -309,7 +290,10 @@ extension, the json5 syntax will be accepted. This allows you to leave trailing
 comma in the config file, as well as writing comments (`//`). This can be very
 useful if you’re experimenting with the configs.
 
-The JSON configuration file may hold the following values: -
+The JSON configuration file may hold the following values: - `db_type`: string,
+default: `"ChromaDB0Connector"` (for chromadb 0.6.3), the database backend to
+use; - `db_params`: dictionary. See the database connector documentation
+<../src/vectorcode/database/README.md> for the default values; -
 `embedding_function`: string, one of the embedding functions supported by
 Chromadb <https://www.trychroma.com/> (find more here
 <https://docs.trychroma.com/docs/embeddings/embedding-functions> and here
@@ -329,62 +313,45 @@ model_name="nomic-embed-text")`. Default: `{}`; - `embedding_dims`: integer or
 model supports Matryoshka Representation Learning (MRL) before using this._
 Learn more about MRL here
 <https://sbert.net/examples/sentence_transformer/training/matryoshka/README.html#matryoshka-embeddings>.
-When set to `null` (or unset), the embeddings won’t be truncated; - `db_url`:
-string, the url that points to the Chromadb server. VectorCode will start an
-HTTP server for Chromadb at a randomly picked free port on `localhost` if your
-configured `http://host:port` is not accessible. Default:
-`http://127.0.0.1:8000`; - `db_path`: string, Path to local persistent
-database. If you didn’t set up a standalone Chromadb server, this is where
-the files for your database will be stored. Default:
-`~/.local/share/vectorcode/chromadb/`; - `db_log_path`: string, path to the
-_directory_ where the built-in chromadb server will write the log to. Default:
-`~/.local/share/vectorcode/`; - `chunk_size`: integer, the maximum number of
-characters per chunk. A larger value reduces the number of items in the
-database, and hence accelerates the search, but at the cost of potentially
-truncated data and lost information. Default: `2500`. To disable chunking, set
-it to a negative number; - `overlap_ratio`: float between 0 and 1, the ratio of
-overlapping/shared content between 2 adjacent chunks. A larger ratio improves
-the coherence of chunks, but at the cost of increasing number of entries in the
-database and hence slowing down the search. Default: `0.2`. _Starting from
-0.4.11, VectorCode will use treesitter to parse languages that it can
-automatically detect. It uses pygments to guess the language from filename, and
-tree-sitter-language-pack to fetch the correct parser. overlap_ratio has no
-effects when treesitter works. If VectorCode fails to find an appropriate
-parser, it’ll fallback to the legacy naive parser, in which case
-overlap_ratio works exactly in the same way as before;_ - `query_multiplier`:
-integer, when you use the `query` command to retrieve `n` documents, VectorCode
-will check `n * query_multiplier` chunks and return at most `n` documents. A
-larger value of `query_multiplier` guarantees the return of `n` documents, but
-with the risk of including too many less-relevant chunks that may affect the
-document selection. Default: `-1` (any negative value means selecting documents
-based on all indexed chunks); - `reranker`: string, the reranking method to
-use. Currently supports `NaiveReranker` (sort chunks by the "distance" between
-the embedding vectors) and `CrossEncoderReranker` (using sentence-transformers
-cross-encoder
+When set to `null` (or unset), the embeddings won’t be truncated; -
+`chunk_size`: integer, the maximum number of characters per chunk. A larger
+value reduces the number of items in the database, and hence accelerates the
+search, but at the cost of potentially truncated data and lost information.
+Default: `2500`. To disable chunking, set it to a negative number; -
+`overlap_ratio`: float between 0 and 1, the ratio of overlapping/shared content
+between 2 adjacent chunks. A larger ratio improves the coherence of chunks, but
+at the cost of increasing number of entries in the database and hence slowing
+down the search. Default: `0.2`. _Starting from 0.4.11, VectorCode will use
+treesitter to parse languages that it can automatically detect. It uses
+pygments to guess the language from filename, and tree-sitter-language-pack to
+fetch the correct parser. overlap_ratio has no effects when treesitter works.
+If VectorCode fails to find an appropriate parser, it’ll fallback to the
+legacy naive parser, in which case overlap_ratio works exactly in the same way
+as before;_ - `query_multiplier`: integer, when you use the `query` command to
+retrieve `n` documents, VectorCode will check `n * query_multiplier` chunks and
+return at most `n` documents. A larger value of `query_multiplier` guarantees
+the return of `n` documents, but with the risk of including too many
+less-relevant chunks that may affect the document selection. Default: `-1` (any
+negative value means selecting documents based on all indexed chunks); -
+`reranker`: string, the reranking method to use. Currently supports
+`NaiveReranker` (sort chunks by the "distance" between the embedding vectors)
+and `CrossEncoderReranker` (using sentence-transformers cross-encoder
 <https://sbert.net/docs/package_reference/cross_encoder/cross_encoder.html> ).
 - `reranker_params`: dictionary, similar to `embedding_params`. The options
 passed to the reranker class constructor. For `CrossEncoderReranker`, these are
 the options passed to the `CrossEncoder`
 <https://sbert.net/docs/package_reference/cross_encoder/cross_encoder.html#id1>
 class. For example, if you want to use a non-default model, you can use the
 following: `json { "reranker_params": { "model_name_or_path": "your_model_here"
-} }` - `db_settings`: dictionary, works in a similar way to `embedding_params`,
-but for Chromadb client settings so that you can configure authentication for
-remote Chromadb <https://docs.trychroma.com/production/administration/auth>; -
-`hnsw`: a dictionary of hnsw settings
-<https://cookbook.chromadb.dev/core/configuration/#hnsw-configuration> that may
-improve the query performances or avoid runtime errors during queries. **It’s
-recommended to re-vectorise the collection after modifying these options,
-because some of the options can only be set during collection creation.**
-Example (and default): `json5 "hnsw": { "hnsw:M": 64, }` - `filetype_map`:
-`dict[str, list[str]]`, a dictionary where keys are language name
+} }` - `filetype_map`: `dict[str, list[str]]`, a dictionary where keys are
+language name
 <https://github.com/Goldziher/tree-sitter-language-pack?tab=readme-ov-file#available-languages>
 and values are lists of Python regex patterns
 <https://docs.python.org/3/library/re.html> that will match file extensions.
 This allows overriding automatic language detection and specifying a treesitter
 parser for certain file types for which the language parser cannot be correctly
 identified (e.g., `.phtml` files containing both php and html). Example
-configuration: `json5 "filetype_map": { "php": ["^phtml$"] }`
+configuration: `json5 { "filetype_map": { "php": ["^phtml$"], }, }`
 
 - `chunk_filters`: `dict[str, list[str]]`, a dictionary where the keys are
     language name
@@ -395,10 +362,12 @@ configuration: `json5 "filetype_map": { "php": ["^phtml$"] }`
     treesitter chunker. By default, no filters will be added. Example
     configuration:
     >json5
-        "chunk_filters": {
-          "python": ["^[^a-zA-Z0-9]+$"], // multiple patterns will be merged (unioned)
-          // or you can use wildcard to match any languages that has no dedicated filters:
-          "*": ["^[^a-zA-Z0-9]+$"],
+        {
+          "chunk_filters": {
+            "python": ["^[^a-zA-Z0-9]+$"], // multiple patterns will be merged (unioned)
+            // or you can use wildcard to match any languages that has no dedicated filters:
+            "*": ["^[^a-zA-Z0-9]+$"],
+          },
         }
     <
 - `encoding`: string, alternative encoding used for this project. By default this
@@ -743,13 +712,13 @@ A JSON array of collection information of the following format will be printed:
 
 >json
     {
-      "project_root": str,
-      "user": str,
-      "hostname": str,
-      "collection_name": str,
-      "size": int,
-      "num_files": int,
-      "embedding_function": str
+      "project_root": "project_root",
+      "user": "user",
+      "hostname": "host",
+      "collection_name": "fuerbvo13571943ofuib",
+      "size": 10,
+      "num_files": 100,
+      "embedding_function": "SomeEmbeddingFunction"
     }
 <
 

diff --git a/docs/CONTRIBUTING.md b/docs/CONTRIBUTING.md
@@ -40,6 +40,41 @@ You may also find it helpful to
 [enable logging](https://github.com/Davidyz/VectorCode/blob/main/docs/cli.md#debugging-and-diagnosing) 
 for the CLI when developing new features or working on fixes.
 
+### Local Dependencies
+
+Sometimes you need `make deps` to install non-default dependencies. The
+`Makefile` provides easy ways to do that.
+
+When you want to install a dependency group of VectorCode: 
+
+```bash 
+EXTRA_LOCK_ARGS="--group chroma0" make deps
+```
+
+When you want to install a library that is not declared in any of the dependency 
+groups (like `openai`):
+
+```bash 
+EXTRA_DEPS="openai\<2.0.0" make deps
+```
+
+Both environment variables apply to `make deps`, `make test` and `make coverage`.
+
+### Database Connectors
+
+Please take a look at [the database documentation](../src/vectorcode/database/README.md), 
+which contains a brief introduction to the API design that explains what you'd need 
+to do to add support for a new database.
+
+### Coverage Across Multiple Runs
+
+If, for some reason, you need to run the tests multiple times to get full coverage 
+(maybe when there are conflicting dependency groups like chromadb 0.6.3 vs chromadb 1.x), you can pass the `--append` flag to the `coverage run` command.
+If you're using `make coverage`, you can set this flag via the `EXTRA_COVERAGEPY_ARGS` environment variable:
+```bash 
+EXTRA_COVERAGEPY_ARGS="--append" make coverage
+```
+
 ## Neovim Plugin
 
 At the moment, there isn't much to cover on here. As long as the code is