Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@ set(CMAKE_CXX_STANDARD_REQUIRED True)
# Set extension name here
set(TARGET_NAME duckdb_rdkit)

# Find Boost with components before RDKit, so that Boost::system etc. targets
# are created (vcpkg's BoostConfig only creates targets for requested components)
find_package(Boost REQUIRED COMPONENTS system serialization iostreams)
find_package(RDKit REQUIRED)

set(EXTENSION_NAME ${TARGET_NAME}_extension)
Expand Down
2 changes: 1 addition & 1 deletion duckdb
Submodule duckdb updated 7066 files
2 changes: 1 addition & 1 deletion extension-ci-tools
123 changes: 123 additions & 0 deletions notes/rdkit-vcpkg-port.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
# RDKit vcpkg Port Notes

## Summary

RDKit is not currently available in vcpkg. Creating a C++-only port (without the Python wrappers) is feasible.
Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you help me understand what this markdown file is for?


## Current Build Issue

The project uses:
- **vcpkg toolchain** for DuckDB extension build system
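- **Spack-installed RDKit**, found through a local Spack environment view (on the author's machine: `/mnt/aux-data/teague/Dev/spack/var/spack/environments/duckdb/.spack-env/view/`; this path is machine-specific)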
Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This path seems to be specific to your system. Is this intended? The same applies to other portions of this file, for example line 34.


The conflict: RDKit's CMake config (`rdkit-targets.cmake:61-64`) declares dependencies on `Boost::system`, `Boost::serialization`, and `Boost::iostreams`. When vcpkg's toolchain intercepts `find_package(Boost)`, it looks in vcpkg's install tree, where Boost is not installed.

## RDKit Dependencies (from Spack recipe)

### Required
- `boost` (+system +serialization +iostreams) - **in vcpkg**
- `sqlite` - **in vcpkg**

### Optional (for full build)
- `freetype` - **in vcpkg**
- `eigen3` (for 3D descriptors) - **in vcpkg**
- `coordgen` - **NOT in vcpkg** (would need port)
- `maeparser` - **NOT in vcpkg** (would need port)
- `freesasa` - **NOT in vcpkg**
- Python/NumPy (for wrappers) - not needed for C++ only

### For C++ Only Build
A C++-only build needs only boost and sqlite, plus optionally eigen3 and freetype. All of these are available in vcpkg.

## vcpkg Port Structure

A port requires two files in the `ports/rdkit/` directory of the vcpkg checkout (on the author's machine: `/home/teague/Dev/vcpkg/ports/rdkit/`):

### vcpkg.json
```json
{
"name": "rdkit",
"version": "2024.03.3",
"description": "RDKit: Open-Source Cheminformatics Software",
"homepage": "https://www.rdkit.org",
"license": "BSD-3-Clause",
"dependencies": [
"boost-system",
"boost-serialization",
"boost-iostreams",
"sqlite3",
{
"name": "vcpkg-cmake",
"host": true
},
{
"name": "vcpkg-cmake-config",
"host": true
}
],
"features": {
"freetype": {
"description": "Build with FreeType support",
"dependencies": ["freetype"]
},
"3d": {
"description": "Build 3D descriptor calculators",
"dependencies": ["eigen3"]
}
}
}
```

### portfile.cmake (skeleton)
```cmake
vcpkg_from_github(
OUT_SOURCE_PATH SOURCE_PATH
REPO rdkit/rdkit
REF Release_2024_03_3
SHA512 <calculate-sha512>
HEAD_REF master
)

vcpkg_cmake_configure(
SOURCE_PATH "${SOURCE_PATH}"
OPTIONS
-DRDK_INSTALL_INTREE=OFF
-DRDK_BUILD_PYTHON_WRAPPERS=OFF
-DRDK_BUILD_COORDGEN_SUPPORT=OFF
-DRDK_BUILD_MAEPARSER_SUPPORT=OFF
-DRDK_BUILD_FREESASA_SUPPORT=OFF
-DRDK_BUILD_YAEHMOP_SUPPORT=OFF
-DRDK_BUILD_XYZ2MOL_SUPPORT=OFF
)

vcpkg_cmake_install()
vcpkg_cmake_config_fixup(CONFIG_PATH lib/cmake/rdkit)
vcpkg_copy_pdbs()

file(REMOVE_RECURSE "${CURRENT_PACKAGES_DIR}/debug/include")
vcpkg_install_copyright(FILE_LIST "${SOURCE_PATH}/license.txt")
```

## Effort Estimate (C++ Only)

| Task | Time |
|------|------|
| Create vcpkg.json | 15 min |
| Create portfile.cmake | 30 min |
| Debug build issues | 1-2 hours |
| Test on Linux | 30 min |
| **Total** | **2-4 hours** |

## Alternative: Fix Current Build

Instead of creating a port, we could:
1. Add boost to vcpkg manifest so `Boost::system` target exists
2. Disable vcpkg toolchain's find_package override for specific packages
3. Use Spack entirely (remove vcpkg toolchain)

## References

- Spack recipe: `spack edit rdkit`
- RDKit CMake modules: https://github.com/rdkit/rdkit/tree/master/Code/cmake/Modules/
- vcpkg port tutorial: https://learn.microsoft.com/en-us/vcpkg/get_started/get-started-adding-to-registry
- Example complex port: `ports/eigen3/` in the vcpkg checkout (on the author's machine: `/home/teague/Dev/vcpkg/ports/eigen3/`)
13 changes: 5 additions & 8 deletions src/cast.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
#include "duckdb/common/types/string_type.hpp"
#include "duckdb/common/types/vector.hpp"
#include "duckdb/function/cast/default_casts.hpp"
#include "duckdb/main/extension_util.hpp"
#include "mol_formats.hpp"
#include "types.hpp"
#include "umbra_mol.hpp"
Expand Down Expand Up @@ -74,14 +73,12 @@ bool MolToVarcharCast(Vector &source, Vector &result, idx_t count,
return true;
}

// Registers the cast functions between VARCHAR and the Mol type, in both
// directions, using VarcharToMolCast and MolToVarcharCast as the
// implementations.
// NOTE(review): this hunk is rendered without +/- markers. The lines that take
// `DatabaseInstance &instance` and call ExtensionUtil::RegisterCastFunction are
// the removed version; the lines that take `ExtensionLoader &loader` and call
// loader.RegisterCastFunction are the added version.
void RegisterCasts(DatabaseInstance &instance) {
ExtensionUtil::RegisterCastFunction(instance, LogicalType::VARCHAR,
::duckdb_rdkit::Mol(),
BoundCastInfo(VarcharToMolCast), 1);
void RegisterCasts(ExtensionLoader &loader) {
// VARCHAR -> Mol
loader.RegisterCastFunction(LogicalType::VARCHAR, ::duckdb_rdkit::Mol(),
BoundCastInfo(VarcharToMolCast), 1);

// Mol -> VARCHAR
ExtensionUtil::RegisterCastFunction(instance, duckdb_rdkit::Mol(),
LogicalType::VARCHAR,
BoundCastInfo(MolToVarcharCast), 1);
loader.RegisterCastFunction(duckdb_rdkit::Mol(), LogicalType::VARCHAR,
BoundCastInfo(MolToVarcharCast), 1);
}

} // namespace duckdb_rdkit
28 changes: 16 additions & 12 deletions src/duckdb_rdkit_extension.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

#define DUCKDB_EXTENSION_MAIN
#include "cast.hpp"
#include "duckdb/main/extension_util.hpp"
#include "duckdb/main/extension/extension_loader.hpp"
#include "duckdb_rdkit_extension.hpp"
#include "mol_compare.hpp"
#include "mol_formats.hpp"
Expand All @@ -20,39 +20,43 @@

namespace duckdb {

// Registers everything the extension provides: the types, casts, and the
// format, compare and descriptor functions from duckdb_rdkit, followed by the
// SDF table functions, and finally the SDF replacement scan.
// NOTE(review): this hunk is rendered without +/- markers. The lines that take
// `DatabaseInstance &instance` or call ExtensionUtil::RegisterFunction are the
// removed version; the lines that use `ExtensionLoader &loader` are the added
// version.
static void LoadInternal(DatabaseInstance &instance) {
duckdb_rdkit::RegisterTypes(instance);
duckdb_rdkit::RegisterCasts(instance);
duckdb_rdkit::RegisterFormatFunctions(instance);
duckdb_rdkit::RegisterCompareFunctions(instance);
duckdb_rdkit::RegisterDescriptorFunctions(instance);
static void LoadInternal(ExtensionLoader &loader) {
duckdb_rdkit::RegisterTypes(loader);
duckdb_rdkit::RegisterCasts(loader);
duckdb_rdkit::RegisterFormatFunctions(loader);
duckdb_rdkit::RegisterCompareFunctions(loader);
duckdb_rdkit::RegisterDescriptorFunctions(loader);

// Register every table function returned by SDFFunctions::GetTableFunctions().
for (auto &fun : SDFFunctions::GetTableFunctions()) {
ExtensionUtil::RegisterFunction(instance, fun);
loader.RegisterFunction(fun);
}

// SDF replacement scan
// The replacement-scan list is reached through DBConfig, which is looked up
// from the DatabaseInstance, so the instance is obtained from the loader here.
auto &instance = loader.GetDatabaseInstance();
auto &config = DBConfig::GetConfig(instance);
config.replacement_scans.emplace_back(SDFFunctions::ReadSDFReplacement);
}

// Extension::Load override; delegates all registration work to LoadInternal.
// The first line below is the removed DuckDB&-based signature, the following
// lines are the added ExtensionLoader&-based one.
void DuckdbRdkitExtension::Load(DuckDB &db) { LoadInternal(*db.instance); }
void DuckdbRdkitExtension::Load(ExtensionLoader &loader) {
LoadInternal(loader);
}

// Returns the extension's name, "duckdb_rdkit".
std::string DuckdbRdkitExtension::Name() { return "duckdb_rdkit"; }

} // namespace duckdb

#ifdef DUCKDB_BUILD_LOADABLE_EXTENSION
extern "C" {

DUCKDB_EXTENSION_API void duckdb_rdkit_init(duckdb::DatabaseInstance &db) {
duckdb::DuckDB db_wrapper(db);
db_wrapper.LoadExtension<duckdb::DuckdbRdkitExtension>();
DUCKDB_CPP_EXTENSION_ENTRY(duckdb_rdkit, loader) {
Copy link
Copy Markdown
Owner

@bodowd bodowd Dec 9, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I noticed the parquet extension uses this macro guard, `#ifdef DUCKDB_BUILD_LOADABLE_EXTENSION`, but the extension template does not.

I'm not sure when to use one approach or the other. Is there an advantage of one approach over the other?

duckdb::LoadInternal(loader);
}

DUCKDB_EXTENSION_API const char *duckdb_rdkit_version() {
return duckdb::DuckDB::LibraryVersion();
}
}
#endif

#ifndef DUCKDB_EXTENSION_MAIN
#error DUCKDB_EXTENSION_MAIN not defined
Expand Down
2 changes: 1 addition & 1 deletion src/include/cast.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,6 @@ bool VarcharToMolCast(Vector &source, Vector &result, idx_t count,
void MolToVarchar(Vector &source, Vector &result, idx_t count);
bool MolToVarcharCast(Vector &source, Vector &result, idx_t count,
CastParameters &parameters);
void RegisterCasts(DatabaseInstance &instance);
void RegisterCasts(ExtensionLoader &loader);

} // namespace duckdb_rdkit
2 changes: 1 addition & 1 deletion src/include/common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

#include "duckdb.hpp"
#include "duckdb/common/helper.hpp"
#include "duckdb/main/extension_util.hpp"
#include "duckdb/main/extension/extension_loader.hpp"

// including common.hpp into the other files makes it so that
// it is not necessary to put duckdb::FUNCTION. Brings in the namespace
Expand Down
4 changes: 3 additions & 1 deletion src/include/duckdb_rdkit_extension.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,11 @@

namespace duckdb {

class ExtensionLoader;

class DuckdbRdkitExtension : public Extension {
public:
void Load(DuckDB &db) override;
void Load(ExtensionLoader &loader) override;
std::string Name() override;
};

Expand Down
2 changes: 1 addition & 1 deletion src/include/mol_compare.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#pragma once
#include "common.hpp"
namespace duckdb_rdkit {
void RegisterCompareFunctions(DatabaseInstance &instance);
void RegisterCompareFunctions(ExtensionLoader &loader);
} // namespace duckdb_rdkit
2 changes: 1 addition & 1 deletion src/include/mol_descriptors.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#pragma once
#include "common.hpp"
namespace duckdb_rdkit {
void RegisterDescriptorFunctions(DatabaseInstance &instance);
void RegisterDescriptorFunctions(ExtensionLoader &loader);
}
2 changes: 1 addition & 1 deletion src/include/mol_formats.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,5 +12,5 @@ std::string rdkit_mol_to_binary_mol(const RDKit::ROMol mol);
std::unique_ptr<RDKit::ROMol> rdkit_binary_mol_to_mol(std::string bmol);
std::string rdkit_mol_to_smiles(RDKit::ROMol mol);

void RegisterFormatFunctions(DatabaseInstance &instance);
void RegisterFormatFunctions(ExtensionLoader &loader);
} // namespace duckdb_rdkit
2 changes: 1 addition & 1 deletion src/include/sdf_scanner/sdf_scan.hpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#pragma once
#include "GraphMol/FileParsers/MolSupplier.h"
#include "duckdb/common/multi_file_reader.hpp"
#include "duckdb/common/multi_file/multi_file_reader.hpp"
#include "duckdb/common/unique_ptr.hpp"
#include "duckdb/execution/execution_context.hpp"
#include "duckdb/function/function.hpp"
Expand Down
2 changes: 1 addition & 1 deletion src/include/types.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,5 +8,5 @@
namespace duckdb_rdkit {

LogicalType Mol();
void RegisterTypes(DatabaseInstance &instance);
void RegisterTypes(ExtensionLoader &loader);
} // namespace duckdb_rdkit
7 changes: 3 additions & 4 deletions src/mol_compare.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
#include "duckdb/common/types/vector.hpp"
#include "duckdb/execution/expression_executor_state.hpp"
#include "duckdb/function/scalar_function.hpp"
#include "duckdb/main/extension_util.hpp"
#include "mol_formats.hpp"
#include "types.hpp"
#include "umbra_mol.hpp"
Expand Down Expand Up @@ -145,18 +144,18 @@ static void is_substruct(DataChunk &args, ExpressionState &state,
});
}

// Registers the two molecule comparison scalar functions, is_exact_match and
// is_substruct. Each takes two Mol arguments and returns BOOLEAN.
// NOTE(review): this hunk is rendered without +/- markers. The
// `DatabaseInstance &instance` signature and the
// ExtensionUtil::RegisterFunction(instance, ...) calls are the removed version;
// the `ExtensionLoader &loader` signature and the loader.RegisterFunction(...)
// calls are the added version.
void RegisterCompareFunctions(DatabaseInstance &instance) {
void RegisterCompareFunctions(ExtensionLoader &loader) {
ScalarFunctionSet set("is_exact_match");
// left type and right type
set.AddFunction(ScalarFunction({duckdb_rdkit::Mol(), duckdb_rdkit::Mol()},
LogicalType::BOOLEAN, is_exact_match));
ExtensionUtil::RegisterFunction(instance, set);
loader.RegisterFunction(set);

// Same (Mol, Mol) -> BOOLEAN shape, backed by is_substruct.
ScalarFunctionSet set_is_substruct("is_substruct");
set_is_substruct.AddFunction(
ScalarFunction({duckdb_rdkit::Mol(), duckdb_rdkit::Mol()},
LogicalType::BOOLEAN, is_substruct));
ExtensionUtil::RegisterFunction(instance, set_is_substruct);
loader.RegisterFunction(set_is_substruct);
}

} // namespace duckdb_rdkit
19 changes: 9 additions & 10 deletions src/mol_descriptors.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
#include "duckdb/common/vector_operations/unary_executor.hpp"
#include "duckdb/execution/expression_executor_state.hpp"
#include "duckdb/function/function_set.hpp"
#include "duckdb/main/extension_util.hpp"
#include "mol_formats.hpp"
#include "qed.hpp"
#include "types.hpp"
Expand Down Expand Up @@ -131,45 +130,45 @@ void mol_num_rotatable_bonds(DataChunk &args, ExpressionState &state, Vector &re
});
}

// Registers one scalar function set per molecular descriptor. Every function
// takes a single Mol argument. mol_amw, mol_exactmw, mol_tpsa, mol_qed and
// mol_logp return FLOAT; mol_hbd, mol_hba and mol_num_rotatable_bonds return
// INTEGER.
// NOTE(review): this hunk is rendered without +/- markers. The
// `DatabaseInstance &instance` signature and each
// ExtensionUtil::RegisterFunction(instance, ...) call are the removed version;
// the `ExtensionLoader &loader` signature and each loader.RegisterFunction(...)
// call are the added version.
void RegisterDescriptorFunctions(DatabaseInstance &instance) {
void RegisterDescriptorFunctions(ExtensionLoader &loader) {
// FLOAT-valued descriptors
ScalarFunctionSet set_mol_amw("mol_amw");
set_mol_amw.AddFunction(
ScalarFunction({duckdb_rdkit::Mol()}, LogicalType::FLOAT, mol_amw));
ExtensionUtil::RegisterFunction(instance, set_mol_amw);
loader.RegisterFunction(set_mol_amw);

ScalarFunctionSet set_mol_exactmw("mol_exactmw");
set_mol_exactmw.AddFunction(
ScalarFunction({duckdb_rdkit::Mol()}, LogicalType::FLOAT, mol_exactmw));
ExtensionUtil::RegisterFunction(instance, set_mol_exactmw);
loader.RegisterFunction(set_mol_exactmw);

ScalarFunctionSet set_mol_tpsa("mol_tpsa");
set_mol_tpsa.AddFunction(
ScalarFunction({duckdb_rdkit::Mol()}, LogicalType::FLOAT, mol_tpsa));
ExtensionUtil::RegisterFunction(instance, set_mol_tpsa);
loader.RegisterFunction(set_mol_tpsa);

ScalarFunctionSet set_mol_qed("mol_qed");
set_mol_qed.AddFunction(
ScalarFunction({duckdb_rdkit::Mol()}, LogicalType::FLOAT, mol_qed));
ExtensionUtil::RegisterFunction(instance, set_mol_qed);
loader.RegisterFunction(set_mol_qed);

ScalarFunctionSet set_mol_logp("mol_logp");
set_mol_logp.AddFunction(
ScalarFunction({duckdb_rdkit::Mol()}, LogicalType::FLOAT, mol_logp));
ExtensionUtil::RegisterFunction(instance, set_mol_logp);
loader.RegisterFunction(set_mol_logp);

// INTEGER-valued descriptors
ScalarFunctionSet set_mol_hbd("mol_hbd");
set_mol_hbd.AddFunction(
ScalarFunction({duckdb_rdkit::Mol()}, LogicalType::INTEGER, mol_hbd));
ExtensionUtil::RegisterFunction(instance, set_mol_hbd);
loader.RegisterFunction(set_mol_hbd);

ScalarFunctionSet set_mol_hba("mol_hba");
set_mol_hba.AddFunction(
ScalarFunction({duckdb_rdkit::Mol()}, LogicalType::INTEGER, mol_hba));
ExtensionUtil::RegisterFunction(instance, set_mol_hba);
loader.RegisterFunction(set_mol_hba);

ScalarFunctionSet set_mol_num_rotatable_bonds("mol_num_rotatable_bonds");
set_mol_num_rotatable_bonds.AddFunction(
ScalarFunction({duckdb_rdkit::Mol()}, LogicalType::INTEGER, mol_num_rotatable_bonds));
ExtensionUtil::RegisterFunction(instance, set_mol_num_rotatable_bonds);
loader.RegisterFunction(set_mol_num_rotatable_bonds);
}
} // namespace duckdb_rdkit
Loading