Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 0 additions & 4 deletions data/visualization/predictions/default/.gitignore

This file was deleted.

40 changes: 40 additions & 0 deletions molgrapher/docling/demo.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
import logging
from pathlib import Path

from docling_core.types.doc import (
PictureItem,
)
from docling.datamodel.base_models import InputFormat
from docling.document_converter import DocumentConverter, PdfFormatOption

from molgrapher.docling.picture_molecule_annotator_pipeline import PictureMoleculeAnnotatorPipelineOptions, PictureMoleculeAnnotatorPipeline

def main():
    """Convert a sample PDF and print the metadata of each picture.

    The document is converted with ``PictureMoleculeAnnotatorPipeline``; for
    every ``PictureItem`` in the result, its reference and ``meta`` are printed.
    """
    logging.basicConfig(level=logging.INFO)

    # The sample document is located relative to this file.
    sample_dir = Path(__file__).parent / "../../vw-tests/docling/"
    input_doc_path = sample_dir / "CN119912385A-5.pdf"

    options = PictureMoleculeAnnotatorPipelineOptions()
    options.images_scale = 2.0
    options.generate_picture_images = True

    pdf_format = PdfFormatOption(
        pipeline_cls=PictureMoleculeAnnotatorPipeline,
        pipeline_options=options,
    )
    converter = DocumentConverter(format_options={InputFormat.PDF: pdf_format})
    conversion = converter.convert(input_doc_path)

    for element, _depth in conversion.document.iterate_items():
        if not isinstance(element, PictureItem):
            continue
        print(
            f"The model populated the `data` portion of picture {element.self_ref}:\n{element.meta}"
        )


if __name__ == "__main__":
    main()
157 changes: 157 additions & 0 deletions molgrapher/docling/picture_molecule_annotator.py
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How do these docling integrations differ from the existing ones in https://github.com/DS4SD/MolGrapher/tree/main/molgrapher/scripts/annotate/docling?

Original file line number Diff line number Diff line change
@@ -0,0 +1,157 @@
from collections.abc import Iterable
from pathlib import Path
from typing import List, Literal, Optional, Union

import numpy as np
from docling_core.types.doc import (
DoclingDocument,
NodeItem,
PictureItem,
PictureMeta,
MoleculeMetaField,
)
from PIL import Image
from pydantic import BaseModel

from docling.datamodel.accelerator_options import AcceleratorOptions
from docling.datamodel.base_models import ItemAndImageEnrichmentElement
from docling.models.base_model import BaseItemAndImageEnrichmentModel
from docling.models.utils.hf_model_download import download_hf_model
from docling.utils.accelerator_utils import decide_device

from molgrapher.models.molgrapher_model import MolgrapherModel


class PictureMoleculeAnnotatorOptions(BaseModel):
    """
    Configuration model for the PictureMoleculeAnnotator.

    Attributes
    ----------
    kind : Literal["picture_molecule_annotator"]
        Fixed tag naming this annotator; its single permitted value is also
        the default.
    """

    kind: Literal["picture_molecule_annotator"] = "picture_molecule_annotator"


class PictureMoleculeAnnotator(BaseItemAndImageEnrichmentModel):
    """
    Enrichment model that annotates document pictures with predicted molecules.

    Each picture image of a batch is passed to a ``MolgrapherModel``. When the
    prediction carries a usable SMILES string, it is stored on the picture as
    ``meta.molecule``.

    Attributes
    ----------
    enabled : bool
        Whether the annotator is enabled for use.
    options : PictureMoleculeAnnotatorOptions
        Configuration options for the annotator.
    picture_molecule_annotator : MolgrapherModel
        The prediction model; only created when ``enabled`` is True.

    Methods
    -------
    __init__(enabled, options, accelerator_options)
        Initializes the annotator with specified configurations.
    is_processable(doc, element)
        Checks if the given element can be processed by the annotator.
    __call__(doc, element_batch)
        Processes a batch of elements and adds molecule annotations.
    """

    # Scale used when the base class has to crop a picture from its page image.
    # NOTE(review): docling's BaseItemAndImageEnrichmentModel appears to declare
    # `images_scale` without a default, so subclasses are expected to set it;
    # 2.0 mirrors the `images_scale` used by this package's demo. Confirm
    # against the installed docling version.
    images_scale = 2.0

    # Predictions with one of these SMILES values are not recorded.
    # NOTE(review): "C" is presumably what the model returns for pictures that
    # are not molecules -- confirm with the model authors.
    _UNUSABLE_SMILES = ("", "C")

    def __init__(
        self,
        enabled: bool,
        options: PictureMoleculeAnnotatorOptions,
        accelerator_options: AcceleratorOptions,
    ):
        """
        Initializes the PictureMoleculeAnnotator.

        Parameters
        ----------
        enabled : bool
            Indicates whether the annotator is enabled.
        options : PictureMoleculeAnnotatorOptions
            Configuration options for the annotator.
        accelerator_options : AcceleratorOptions
            Options for configuring the device and parallelism.

        Raises
        ------
        ValueError
            If the annotator is enabled and the resolved device is not "cpu".
        """
        self.enabled = enabled
        self.options = options

        if self.enabled:
            device = decide_device(accelerator_options.device)

            # Only CPU is supported for the moment. An explicit raise is used
            # because `assert` statements are removed when Python runs with -O.
            if device != "cpu":
                raise ValueError(
                    f"PictureMoleculeAnnotator only supports the 'cpu' device, got {device!r}"
                )

            self.picture_molecule_annotator = MolgrapherModel()

    def is_processable(self, doc: DoclingDocument, element: NodeItem) -> bool:
        """
        Determines if the given element can be processed by the annotator.

        Parameters
        ----------
        doc : DoclingDocument
            The document containing the element.
        element : NodeItem
            The element to be checked.

        Returns
        -------
        bool
            True if the element is a PictureItem and processing is enabled; False otherwise.
        """
        return self.enabled and isinstance(element, PictureItem)

    def __call__(
        self,
        doc: DoclingDocument,
        element_batch: Iterable[ItemAndImageEnrichmentElement],
    ) -> Iterable[NodeItem]:
        """
        Processes a batch of elements and enriches them with molecule annotation predictions.

        Parameters
        ----------
        doc : DoclingDocument
            The document containing the elements to be processed.
        element_batch : Iterable[ItemAndImageEnrichmentElement]
            A batch of pictures to annotate.

        Returns
        -------
        Iterable[NodeItem]
            Every picture item of the batch, in order. Items for which a usable
            SMILES was predicted have 'meta.molecule' set; the others are
            yielded unchanged.
        """
        if not self.enabled:
            for element in element_batch:
                yield element.item
            return

        images: List[Union[Image.Image, np.ndarray]] = []
        elements: List[PictureItem] = []
        for el in element_batch:
            assert isinstance(el.item, PictureItem)
            elements.append(el.item)
            images.append(el.image)

        # Nothing to predict; avoid calling the model on an empty batch.
        if not elements:
            return

        outputs = self.picture_molecule_annotator.predict(images)

        for item, output in zip(elements, outputs):
            smiles = output.get("smi", "")
            if smiles not in self._UNUSABLE_SMILES:
                if item.meta is None:
                    item.meta = PictureMeta()
                item.meta.molecule = MoleculeMetaField(
                    smi=smiles,
                    confidence=output.get("conf", 0.0),
                    created_by=output.get("annotator", {}).get("program", ""),
                )

            # Yield every item, annotated or not, so that no picture of the
            # batch is dropped from the returned iterable.
            yield item
36 changes: 36 additions & 0 deletions molgrapher/docling/picture_molecule_annotator_pipeline.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import logging
from pathlib import Path

from docling_core.types.doc import (
PictureItem,
)
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.models.base_model import BaseEnrichmentModel
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions

from molgrapher.docling.picture_molecule_annotator import PictureMoleculeAnnotator, PictureMoleculeAnnotatorOptions


class PictureMoleculeAnnotatorPipelineOptions(PdfPipelineOptions):
    """PDF pipeline options extended with a switch for the molecule annotator."""

    # Forwarded as `enabled` to the PictureMoleculeAnnotator built by
    # PictureMoleculeAnnotatorPipeline.
    do_picture_annotator: bool = True


class PictureMoleculeAnnotatorPipeline(StandardPdfPipeline):
    """Standard PDF pipeline whose enrichment stage is the molecule annotator."""

    def __init__(self, pipeline_options: PictureMoleculeAnnotatorPipelineOptions):
        """
        Build the pipeline and install the molecule annotator.

        Parameters
        ----------
        pipeline_options : PictureMoleculeAnnotatorPipelineOptions
            Pipeline options; ``do_picture_annotator`` decides whether the
            annotator is enabled.
        """
        super().__init__(pipeline_options)
        # Annotation only (no assignment): narrows the attribute's type to the
        # options class of this pipeline for type checkers.
        self.pipeline_options: PictureMoleculeAnnotatorPipelineOptions

        # This assignment replaces any enrichment models the parent
        # constructor may have configured.
        # NOTE(review): docling's custom-enrichment example also sets
        # `self.keep_backend = True` when its model is enabled, so pictures can
        # be cropped from page images -- check whether that is needed here when
        # `generate_picture_images` is off.
        self.enrichment_pipe = [
            PictureMoleculeAnnotator(
                enabled=pipeline_options.do_picture_annotator,
                options=PictureMoleculeAnnotatorOptions(),
                # The device is fixed to CPU here regardless of the
                # accelerator settings carried by `pipeline_options`.
                accelerator_options=AcceleratorOptions(device=AcceleratorDevice.CPU),
            )
        ]

    @classmethod
    def get_default_options(cls) -> PictureMoleculeAnnotatorPipelineOptions:
        """Return a default-constructed options object for this pipeline."""
        return PictureMoleculeAnnotatorPipelineOptions()
5 changes: 5 additions & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
[aliases]
test=pytest
[tool:pytest]
addopts = -q
testpaths = tests
19 changes: 14 additions & 5 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,10 +71,16 @@ def install_paddle(package: str, version: str = ""):
long_description_content_type="text/markdown",
url="https://github.com/DS4SD/MolGrapher",
packages=setuptools.find_packages(exclude=["tests.*", "tests"]),
setup_requires=[
'pytest-runner',
],
tests_require=[
'pytest',
],
install_requires=[
"mol-depict @ git+https://git@github.com/DS4SD/MolDepictor.git",
"pytorch-lightning==2.1.3",
"torch_geometric==2.4.0",
"pytorch-lightning", # "pytorch-lightning==2.1.3",
"torch_geometric", # "torch_geometric==2.4.0",
"scikit-learn",
"seaborn",
"timm",
Expand Down Expand Up @@ -111,9 +117,12 @@ def install_paddle(package: str, version: str = ""):
install_paddle("paddlepaddle-gpu", "2.6.0"),
],
"cpu": [
install_torch("torch", "2.1.2", cpu=True),
install_torch("torchvision", "0.16.2", cpu=True),
"paddlepaddle",
install_torch('torch', '2.6.0', cpu=True),
install_torch('torchvision', '0.21.0', cpu=True),
"paddlepaddle==2.6.2",
],
"docling": [
"docling>=2.62.0",
],
},
)
54 changes: 54 additions & 0 deletions tests/docl/test_docling_1.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
"""Test methods in module molgrapher."""

import unittest

from PIL import Image
from pathlib import Path
try:
from docling.datamodel.base_models import InputFormat
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling_core.types.doc import (
PictureItem,
)
from molgrapher.docling.picture_molecule_annotator_pipeline import PictureMoleculeAnnotatorPipelineOptions, PictureMoleculeAnnotatorPipeline
has_docling = True
except ImportError:
has_docling = False


@unittest.skipIf(not has_docling, "docling package not installed")
class TestDocling(unittest.TestCase):
    """Test the docling molecule-annotation pipeline of molgrapher."""

    def test_docl_1(self):
        """Convert the sample PDF and compare the predicted SMILES to references."""
        input_doc_path = Path(__file__).parent / "../images/CN119912385A-5.pdf"

        pipeline_options = PictureMoleculeAnnotatorPipelineOptions()
        pipeline_options.images_scale = 2.0
        pipeline_options.generate_picture_images = True

        doc_converter = DocumentConverter(
            format_options={
                InputFormat.PDF: PdfFormatOption(
                    pipeline_cls=PictureMoleculeAnnotatorPipeline,
                    pipeline_options=pipeline_options,
                )
            }
        )
        result = doc_converter.convert(input_doc_path)

        smis = []
        for element, _level in result.document.iterate_items():
            if not isinstance(element, PictureItem):
                continue
            print(
                f"The model populated the `data` portion of picture {element.self_ref}:\n{element.meta}"
            )
            # A picture may carry `meta` without a molecule; only collect
            # pictures that actually received a molecule annotation.
            if element.meta is not None and element.meta.molecule is not None:
                smis.append(element.meta.molecule.smi)

        refs = ["[H]C1=CC=C(C=CC(=O)C2=CC=C3C(=C2)C2=C(C=CC(C(=O)C=CC4=CC=C(F)C=C4)=C2)N3CC)C=C1", "CC(=O)C1=CC=N2=[I-](=C1)C1=C(C=CC(C(C)=O)=C1)N21CC1"]
        # zip() stops at the shorter sequence, so without this check the test
        # would pass even if fewer (or no) molecules had been predicted.
        self.assertEqual(len(smis), len(refs))
        for smi, ref in zip(smis, refs):
            print(smi)
            self.assertEqual(smi, ref)

Binary file added tests/images/CN119912385A-5-1.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added tests/images/CN119912385A-5-2.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added tests/images/CN119912385A-5.pdf
Binary file not shown.
23 changes: 23 additions & 0 deletions tests/molg/test_molg_1.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
"""Test methods in module molgrapher."""

import unittest
import json

from molgrapher.models.molgrapher_model import MolgrapherModel
from PIL import Image
from pathlib import Path


class TestMolG(unittest.TestCase):
    """Test direct predictions of the MolgrapherModel."""

    def test_molg_1(self):
        """Predict on the two sample images and compare the SMILES strings."""
        model = MolgrapherModel()
        image_paths = [
            Path(__file__).parent / "../images/CN119912385A-5-1.png",
            Path(__file__).parent / "../images/CN119912385A-5-2.png",
        ]
        batch = [Image.open(image_path) for image_path in image_paths]
        try:
            annos = model.predict(batch)
        finally:
            # Image.open keeps the underlying file open; release the handles
            # once the prediction is done, even if it raised.
            for image in batch:
                image.close()

        self.assertEqual(annos[0].get("smi"), "C")
        self.assertEqual(annos[1].get("smi"), "C=CC(=CC1=CN2(CC2)N2=CC=C(C(C)=O)C=C=21)C(C)=O")