From 93561b1e906afad521f55c7c65e3f3d49ae818f8 Mon Sep 17 00:00:00 2001 From: Slobodan Stanojevikj Date: Fri, 12 Dec 2025 10:46:32 +0100 Subject: [PATCH 01/12] -wikidata SPARQL query adjustments -added fetching from related articles and keyword extraction -added local llm execution in order to categorize items --- Makefile | 8 + requirements.txt | 5 + web/categorizer/README.md | 139 ++++++++++++ web/categorizer/__init__.py | 0 web/categorizer/categorizer_service.py | 196 ++++++++++++++++ web/categorizer/llm_service.py | 209 ++++++++++++++++++ web/categorizer/management/__init__.py | 0 .../management/commands/__init__.py | 0 .../management/commands/categorize.py | 33 +++ web/concepts/admin.py | 17 +- web/concepts/migrations/0011_item_keywords.py | 18 ++ .../migrations/0012_item_article_text.py | 18 ++ web/concepts/migrations/0013_item_aliases.py | 18 ++ .../migrations/0014_categorizerresult.py | 58 +++++ web/concepts/models.py | 30 +++ web/requirements.txt | 14 ++ web/slurper/keyword_util.py | 25 +++ web/slurper/source_wikidata.py | 95 +++++++- web/slurper/wd_raw_item.py | 27 +++ web/web/settings.py | 1 + web/web/urls.py | 1 + 21 files changed, 908 insertions(+), 4 deletions(-) create mode 100644 Makefile create mode 100644 requirements.txt create mode 100644 web/categorizer/README.md create mode 100644 web/categorizer/__init__.py create mode 100644 web/categorizer/categorizer_service.py create mode 100644 web/categorizer/llm_service.py create mode 100644 web/categorizer/management/__init__.py create mode 100644 web/categorizer/management/commands/__init__.py create mode 100644 web/categorizer/management/commands/categorize.py create mode 100644 web/concepts/migrations/0011_item_keywords.py create mode 100644 web/concepts/migrations/0012_item_article_text.py create mode 100644 web/concepts/migrations/0013_item_aliases.py create mode 100644 web/concepts/migrations/0014_categorizerresult.py create mode 100644 web/slurper/keyword_util.py diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..3cfe1f6 --- /dev/null +++ b/Makefile @@ -0,0 +1,8 @@ +install: + pip install -r requirements.txt + +start: + python ./web/manage.py runserver + +compute-concepts: + python ./web/manage.py compute_concepts diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..e86dbb8 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,5 @@ +black~=25.9.0 +isort~=4.2.5 +flake8~=7.3.0 + +-r ./web/requirements.txt diff --git a/web/categorizer/README.md b/web/categorizer/README.md new file mode 100644 index 0000000..40514d3 --- /dev/null +++ b/web/categorizer/README.md @@ -0,0 +1,139 @@ +# Categorizer Module + +The categorizer module provides LLM-powered categorization of mathematical concepts. + +## Setup + +### 1. Install Required Dependencies + +**For FREE local models (recommended):** +```bash +pip install langchain-huggingface langchain-community transformers torch accelerate +``` + +**For paid API models (optional):** + +For OpenAI: +```bash +pip install openai +``` + +For Anthropic Claude: +```bash +pip install anthropic +``` + +**For Ollama (free local alternative):** +1. Install Ollama from https://ollama.ai +2. Install langchain-community: `pip install langchain-community` +3. Pull a model: `ollama pull llama2` + +### 2. 
Configure API Keys (only for paid models) + +Set the appropriate environment variable for your chosen LLM provider: + +**For OpenAI:** +```bash +export OPENAI_API_KEY="your-openai-api-key-here" +``` + +**For Anthropic Claude:** +```bash +export ANTHROPIC_API_KEY="your-anthropic-api-key-here" +``` + +**For Ollama (optional):** +```bash +export OLLAMA_MODEL="llama2" # Default is llama2 +``` + +You can also add these to a `.env` file or your shell configuration file (`.bashrc`, `.zshrc`, etc.). + +## Usage + +### Basic Usage + +Categorize all items using the default FREE LLM (HuggingFace FLAN-T5): +```bash +python manage.py categorize +``` + +### With Options + +Categorize a limited number of items: +```bash +python manage.py categorize --limit 10 +``` + +Use a specific LLM provider: + +**FREE models (run locally):** +```bash +# Use HuggingFace FLAN-T5 (default, free, good for instruction following) +python manage.py categorize --llm huggingface_flan_t5 + +# Use HuggingFace GPT-2 (free, generative model) +python manage.py categorize --llm huggingface_gpt2 + +# Use HuggingFace DialoGPT (free, conversational model) +python manage.py categorize --llm huggingface_dialogpt + +# Use Ollama (free, requires Ollama installed) +python manage.py categorize --llm ollama +``` + +**Paid API models:** +```bash +# Use OpenAI GPT-4 (requires API key) +python manage.py categorize --llm openai_gpt4 + +# Use OpenAI GPT-3.5 Turbo (requires API key) +python manage.py categorize --llm openai_gpt35 + +# Use Anthropic Claude (requires API key) +python manage.py categorize --llm anthropic_claude +``` + +Combine options: +```bash +python manage.py categorize --limit 5 --llm huggingface_flan_t5 +``` + +## Architecture + +- `categorizer_service.py` - Main service for categorizing items +- `llm_service.py` - Service for calling various LLM APIs +- `management/commands/categorize.py` - Django management command + +## Supported LLMs + +### Free Models (No API Key Required) +1. **HuggingFace FLAN-T5** - Google's instruction-following model (recommended for tasks) +2. **HuggingFace GPT-2** - OpenAI's classic generative model +3. **HuggingFace DialoGPT** - Microsoft's conversational model +4. **Ollama** - Run any Ollama model locally (llama2, mistral, etc.) + +### Paid API Models (Require API Key) +1. **OpenAI GPT-4** - Most capable, but expensive +2. **OpenAI GPT-3.5 Turbo** - Fast and cheaper than GPT-4 +3. **Anthropic Claude** - High quality, good reasoning + +## Performance Notes + +- **Free models** run locally and don't require internet/API keys, but: + - First run downloads the model (~1-3GB depending on model) + - Requires sufficient RAM (4-8GB+ recommended) + - Slower than API models (especially without GPU) + +- **API models** are faster but cost money per request + +- **Ollama** is a good middle ground - free, local, and supports many models + +## Extending + +To add support for additional LLM providers: + +1. Add a new entry to the `LLMType` enum in `llm_service.py` +2. Implement a new private method (e.g., `_call_new_provider`) in the `LLMService` class +3. Add the new provider to the `call_llm` method's conditional logic +4. 
Update the command choices in `management/commands/categorize.py` diff --git a/web/categorizer/__init__.py b/web/categorizer/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/web/categorizer/categorizer_service.py b/web/categorizer/categorizer_service.py new file mode 100644 index 0000000..2a4e40e --- /dev/null +++ b/web/categorizer/categorizer_service.py @@ -0,0 +1,196 @@ +import json +import logging +import re +from concepts.models import Item, CategorizerResult +from categorizer.llm_service import LLMService, LLMType + +# Free LLM types to use for categorization +LLM_JUDGE_POOL = [ + LLMType.HUGGINGFACE_FLAN_T5, + LLMType.HUGGINGFACE_GPT2, + LLMType.HUGGINGFACE_DIALOGPT, +] + + +class CategorizerService: + """ + Service for categorizing mathematical concepts. + """ + + def __init__(self): + self.logger = logging.getLogger(__name__) + self.llm_service = LLMService() + + def categorize_items(self, limit=None): + """ + Categorize items from the database using all free LLM types. + + Args: + limit: Optional limit on number of items to process + """ + queryset = Item.objects.all() + if limit: + queryset = queryset[:limit] + + total = queryset.count() + self.logger.info(f"Categorizing {total} items using {len(LLM_JUDGE_POOL)} free LLMs") + + for i, item in enumerate(queryset): + self.logger.info( + f"Processing item {i + 1}/{total}: {item.identifier}" + ) + self.categorize_item(item) + + self.logger.info("Categorization complete") + + def categorize_item( + self, + item, + predicate: str = "Is the given concept a mathematical concept, given the name, description, keywords, and article text?" + ): + """ + Categorize a single item using all free LLM types. + + Args: + item: Item instance to categorize + predicate: The question to evaluate (default: checks if it's a mathematical concept) + + Returns: + List of categorization results from all LLMs + """ + self.logger.debug(f"Categorizing: {item.name}") + + prompt = self._build_categorization_prompt(item, predicate) + + results = [] + + for llm_type in LLM_JUDGE_POOL: + try: + self.logger.info(f"Calling {llm_type.value} for {item.name}") + raw_result = self.llm_service.call_llm(llm_type, prompt) + self.logger.info(f"Categorized {item.name} with {llm_type.value}: {raw_result[:100]}...") + + parsed_result = self._parse_categorization_result(raw_result) + + categorizer_result = CategorizerResult.objects.create( + item=item, + llm_type=llm_type.value, + raw_result=raw_result, + result_answer=parsed_result["answer"], + result_confidence=parsed_result["confidence"], + ) + categorizer_result.save() + + self.logger.info( + f"Saved categorization result for {item.name} ({llm_type.value}): " + f"answer={parsed_result['answer']}, " + f"confidence={parsed_result['confidence']}" + ) + + results.append(parsed_result) + except Exception as e: + self.logger.error(f"Failed to categorize {item.name} with {llm_type.value}: {e}") + # Continue with other LLMs even if one fails? + continue + + return results + + def _build_categorization_prompt(self, item, predicate: str): + """ + Build a prompt for evaluating a concept against a predicate. + + Args: + item: Item instance to categorize + predicate: The question/predicate to evaluate + + Returns: + Formatted prompt string + """ + system_prompt = """You are a categorization judge. Your task is to evaluate whether a given concept satisfies a specific predicate. + +You must respond with a structured answer containing: +1. answer: true or false (boolean) +2. 
confidence: a number from 0 to 100 (representing your confidence percentage) + +Format your response as JSON: +{ + "answer": true, + "confidence": 85 +}""" + + item_info_parts = [f"Name: {item.name}"] + + if item.description: + item_info_parts.append(f"Description: {item.description}") + + if item.keywords: + item_info_parts.append(f"Keywords: {item.keywords}") + + if item.article_text: + # Truncate article text to 5000 characters + article_text = item.article_text[:5000] + item_info_parts.append(f"Article text: {article_text}") + + item_info = "\n".join(item_info_parts) + + prompt = f"""{system_prompt} + +--- + +CONCEPT INFORMATION: +{item_info} + +--- + +PREDICATE TO EVALUATE: +{predicate} + +--- + +Please provide your evaluation in JSON format.""" + + return prompt + + def _parse_categorization_result(self, result: str) -> dict: + """ + Parse the LLM's JSON response. + + Args: + result: The raw response from the LLM + + Returns: + Dictionary with 'answer' (bool) and 'confidence' (int) keys + + Raises: + ValueError: If the response cannot be parsed + """ + try: + json_match = re.search(r'\{[^}]*"answer"[^}]*\}', result, re.DOTALL) + if json_match: + json_str = json_match.group(0) + parsed = json.loads(json_str) + else: + parsed = json.loads(result) + + if "answer" not in parsed or "confidence" not in parsed: + raise ValueError("Response missing required fields 'answer' or 'confidence'") + + answer = parsed["answer"] + if isinstance(answer, str): + answer = answer.lower() in ("true", "yes", "1") + + confidence = int(parsed["confidence"]) + if not 0 <= confidence <= 100: + raise ValueError(f"Confidence must be between 0-100, got {confidence}") + + return { + "answer": bool(answer), + "confidence": confidence + } + + except json.JSONDecodeError as e: + self.logger.error(f"Failed to parse JSON response: {result}") + raise ValueError(f"Invalid JSON response from LLM: {e}") + except (KeyError, ValueError) as e: + self.logger.error(f"Invalid response format: {result}") + raise ValueError(f"Invalid response format: {e}") diff --git a/web/categorizer/llm_service.py b/web/categorizer/llm_service.py new file mode 100644 index 0000000..cb13d23 --- /dev/null +++ b/web/categorizer/llm_service.py @@ -0,0 +1,209 @@ +import logging +import os +from enum import Enum + + +class LLMType(Enum): + """Supported LLM types""" + + # Paid API-based models + OPENAI_GPT4 = "openai_gpt4" + OPENAI_GPT35 = "openai_gpt35" + ANTHROPIC_CLAUDE = "anthropic_claude" + + # Free HuggingFace models (run locally) + HUGGINGFACE_FLAN_T5 = "huggingface_flan_t5" + HUGGINGFACE_GPT2 = "huggingface_gpt2" + HUGGINGFACE_DIALOGPT = "huggingface_dialogpt" + + # Ollama (free local models) + OLLAMA = "ollama" + + +class LLMService: + """ + Service for calling various LLM providers. 
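+
+    Illustrative usage (a sketch; assumes the optional model packages from
+    the README are installed):
+
+        service = LLMService()
+        reply = service.call_llm(LLMType.HUGGINGFACE_FLAN_T5, "Is 7 prime?")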
+ """ + + def __init__(self): + self.logger = logging.getLogger(__name__) + self.llm_handlers = { + LLMType.OPENAI_GPT4: lambda llm_type, prompt: self._call_openai(llm_type, prompt), + LLMType.OPENAI_GPT35: lambda llm_type, prompt: self._call_openai(llm_type, prompt), + LLMType.ANTHROPIC_CLAUDE: lambda llm_type, prompt: self._call_anthropic(prompt), + LLMType.HUGGINGFACE_FLAN_T5: lambda llm_type, prompt: self._call_huggingface("google/flan-t5-base", prompt), + LLMType.HUGGINGFACE_GPT2: lambda llm_type, prompt: self._call_huggingface("gpt2", prompt), + LLMType.HUGGINGFACE_DIALOGPT: lambda llm_type, prompt: self._call_huggingface("microsoft/DialoGPT-medium", prompt), + LLMType.OLLAMA: lambda llm_type, prompt: self._call_ollama(prompt), + } + + def call_llm(self, llm_type: LLMType, prompt: str) -> str: + """ + Call an LLM with the given prompt. + + Args: + llm_type: The type of LLM to use (LLMType enum) + prompt: The prompt to send to the LLM + + Returns: + The LLM's response as a string + + Raises: + ValueError: If the LLM type is not supported or API key is missing + Exception: If the API call fails + """ + self.logger.info(f"Calling {llm_type.value} with prompt length: {len(prompt)}") + + handler = self.llm_handlers.get(llm_type) + + if handler: + return handler(llm_type, prompt) + else: + raise ValueError(f"Unsupported LLM type: {llm_type}") + + def _call_openai(self, llm_type: LLMType, prompt: str) -> str: + """Call OpenAI API""" + try: + import openai + except ImportError: + raise ImportError( + "openai package is required. Install it with: pip install openai" + ) + + api_key = os.getenv("OPENAI_API_KEY") + if not api_key: + raise ValueError( + "OPENAI_API_KEY environment variable is not set. " + "Please set it to your OpenAI API key." + ) + + openai.api_key = api_key + + model = "gpt-4" if llm_type == LLMType.OPENAI_GPT4 else "gpt-3.5-turbo" + + try: + response = openai.ChatCompletion.create( + model=model, + messages=[{"role": "user", "content": prompt}], + temperature=0.7, + ) + return response.choices[0].message.content + except Exception as e: + self.logger.error(f"OpenAI API call failed: {e}") + raise + + def _call_anthropic(self, prompt: str) -> str: + """Call Anthropic Claude API""" + try: + import anthropic + except ImportError: + raise ImportError( + "anthropic package is required. Install it with: pip install anthropic" + ) + + api_key = os.getenv("ANTHROPIC_API_KEY") + if not api_key: + raise ValueError( + "ANTHROPIC_API_KEY environment variable is not set. " + "Please set it to your Anthropic API key." + ) + + client = anthropic.Anthropic(api_key=api_key) + + try: + response = client.messages.create( + model="claude-3-5-sonnet-20241022", + max_tokens=1024, + messages=[{"role": "user", "content": prompt}], + ) + return response.content[0].text + except Exception as e: + self.logger.error(f"Anthropic API call failed: {e}") + raise + + def _call_huggingface(self, model_id: str, prompt: str) -> str: + """ + Call HuggingFace models using langchain. + + Args: + model_id: HuggingFace model ID (e.g., "google/flan-t5-base") + prompt: The prompt to send to the model + + Returns: + The model's response + """ + try: + from langchain_huggingface import HuggingFacePipeline + except ImportError: + raise ImportError( + "langchain-huggingface package is required. 
" + "Install it with: pip install langchain-huggingface" + ) + + self.logger.info(f"Loading HuggingFace model: {model_id}") + + try: + pipeline_kwargs = { + "max_new_tokens": 512, + "temperature": 0.7, + } + + # TODO SST: Remove the whole model + # Add pad_token_id for DialoGPT + if "DialoGPT" in model_id or "gpt2" in model_id: + pipeline_kwargs["pad_token_id"] = 50256 + + # Create the HuggingFace pipeline + hf = HuggingFacePipeline.from_model_id( + model_id=model_id, + task="text-generation" if "gpt" in model_id.lower() else "text2text-generation", + pipeline_kwargs=pipeline_kwargs, + ) + + response = hf.invoke(prompt) + + self.logger.info(f"HuggingFace model response length: {len(response)}") + return response + + except Exception as e: + self.logger.error(f"HuggingFace model call failed: {e}") + raise + + def _call_ollama(self, prompt: str, model: str = "llama2") -> str: + """ + Call Ollama for local LLM inference. + + Args: + prompt: The prompt to send to the model + model: Ollama model name (default: llama2) + + Returns: + The model's response + + Note: + Requires Ollama to be installed and running locally. + Install from: https://ollama.ai + """ + try: + from langchain_community.llms import Ollama + except ImportError: + raise ImportError( + "langchain-community package is required. " + "Install it with: pip install langchain-community" + ) + + # Allow model override via environment variable + model = os.getenv("OLLAMA_MODEL", model) + + self.logger.info(f"Calling Ollama with model: {model}") + + try: + llm = Ollama(model=model) + response = llm.invoke(prompt) + return response + except Exception as e: + self.logger.error( + f"Ollama call failed: {e}. " + "Make sure Ollama is installed and running (https://ollama.ai)" + ) + raise diff --git a/web/categorizer/management/__init__.py b/web/categorizer/management/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/web/categorizer/management/commands/__init__.py b/web/categorizer/management/commands/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/web/categorizer/management/commands/categorize.py b/web/categorizer/management/commands/categorize.py new file mode 100644 index 0000000..af9694b --- /dev/null +++ b/web/categorizer/management/commands/categorize.py @@ -0,0 +1,33 @@ +from django.core.management.base import BaseCommand +from categorizer.categorizer_service import CategorizerService + + +class Command(BaseCommand): + help = "Categorize mathematical concepts using all free LLMs (HuggingFace models)" + + def add_arguments(self, parser): + parser.add_argument( + "--limit", + type=int, + default=None, + help="Limit the number of items to categorize", + ) + + def handle(self, *args, **options): + limit = options.get("limit") + + service = CategorizerService() + + self.stdout.write("Using all free LLMs: huggingface_flan_t5, huggingface_gpt2, huggingface_dialogpt") + if limit: + self.stdout.write(f"Categorizing up to {limit} items...") + else: + self.stdout.write("Categorizing all items...") + + try: + service.categorize_items(limit=limit) + self.stdout.write(self.style.SUCCESS("Categorization complete!")) + except Exception as e: + self.stdout.write( + self.style.ERROR(f"Categorization failed: {e}") + ) diff --git a/web/concepts/admin.py b/web/concepts/admin.py index 60e9bc5..83cb3c7 100644 --- a/web/concepts/admin.py +++ b/web/concepts/admin.py @@ -1,6 +1,6 @@ from django.contrib import admin -from .models import Item +from .models import CategorizerResult, Item class ItemAdmin(admin.ModelAdmin): @@ 
-9,4 +9,19 @@ class ItemAdmin(admin.ModelAdmin): list_filter = ["source"] +class CategorizerResultAdmin(admin.ModelAdmin): + list_display = [ + "item", + "llm_type", + "result_answer", + "result_confidence", + "created_at", + ] + search_fields = ["item__name", "item__identifier"] + list_filter = ["llm_type", "result_answer", "created_at"] + readonly_fields = ["created_at", "updated_at"] + ordering = ["-created_at"] + + admin.site.register(Item, ItemAdmin) +admin.site.register(CategorizerResult, CategorizerResultAdmin) diff --git a/web/concepts/migrations/0011_item_keywords.py b/web/concepts/migrations/0011_item_keywords.py new file mode 100644 index 0000000..1773323 --- /dev/null +++ b/web/concepts/migrations/0011_item_keywords.py @@ -0,0 +1,18 @@ +# Generated by Django 4.2.25 on 2025-12-11 18:55 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ("concepts", "0010_alter_item_source"), + ] + + operations = [ + migrations.AddField( + model_name="item", + name="keywords", + field=models.TextField(blank=True, null=True), + ), + ] diff --git a/web/concepts/migrations/0012_item_article_text.py b/web/concepts/migrations/0012_item_article_text.py new file mode 100644 index 0000000..4c1998d --- /dev/null +++ b/web/concepts/migrations/0012_item_article_text.py @@ -0,0 +1,18 @@ +# Generated by Django 4.2.25 on 2025-12-11 20:41 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ("concepts", "0011_item_keywords"), + ] + + operations = [ + migrations.AddField( + model_name="item", + name="article_text", + field=models.TextField(blank=True, null=True), + ), + ] diff --git a/web/concepts/migrations/0013_item_aliases.py b/web/concepts/migrations/0013_item_aliases.py new file mode 100644 index 0000000..510997c --- /dev/null +++ b/web/concepts/migrations/0013_item_aliases.py @@ -0,0 +1,18 @@ +# Generated by Django 4.2.25 on 2025-12-11 21:19 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ("concepts", "0012_item_article_text"), + ] + + operations = [ + migrations.AddField( + model_name="item", + name="aliases", + field=models.TextField(blank=True, null=True), + ), + ] diff --git a/web/concepts/migrations/0014_categorizerresult.py b/web/concepts/migrations/0014_categorizerresult.py new file mode 100644 index 0000000..96dc102 --- /dev/null +++ b/web/concepts/migrations/0014_categorizerresult.py @@ -0,0 +1,58 @@ +# Generated by Django 4.2.25 on 2025-12-11 22:37 + +from django.db import migrations, models +import django.db.models.deletion + + +class Migration(migrations.Migration): + + dependencies = [ + ("concepts", "0013_item_aliases"), + ] + + operations = [ + migrations.CreateModel( + name="CategorizerResult", + fields=[ + ( + "id", + models.BigAutoField( + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", + ), + ), + ("llm_type", models.CharField(max_length=50)), + ("raw_result", models.TextField()), + ("result_answer", models.BooleanField()), + ("result_confidence", models.IntegerField()), + ("created_at", models.DateTimeField(auto_now_add=True)), + ("updated_at", models.DateTimeField(auto_now=True)), + ( + "item", + models.ForeignKey( + on_delete=django.db.models.deletion.CASCADE, + related_name="categorizer_results", + to="concepts.item", + ), + ), + ], + options={ + "ordering": ["-created_at"], + "indexes": [ + models.Index( + fields=["item", "llm_type"], + name="concepts_ca_item_id_c24595_idx", 
+                    ),
+                    models.Index(
+                        fields=["result_answer"], name="concepts_ca_result__a4c7a5_idx"
+                    ),
+                    models.Index(
+                        fields=["result_confidence"],
+                        name="concepts_ca_result__d25f96_idx",
+                    ),
+                ],
+            },
+        ),
+    ]
diff --git a/web/concepts/models.py b/web/concepts/models.py
index 697eb61..0545806 100644
--- a/web/concepts/models.py
+++ b/web/concepts/models.py
@@ -88,6 +88,9 @@ def key():
     url = models.URLField(max_length=200)
     name = models.CharField(max_length=200, null=True)
     description = models.TextField(null=True)
+    keywords = models.TextField(null=True, blank=True)
+    article_text = models.TextField(null=True, blank=True)
+    aliases = models.TextField(null=True, blank=True)
     concept = models.ForeignKey(
         Concept,
         models.SET_NULL,
@@ -159,3 +162,30 @@ def save_new(source: Item, destination: Item, label: Label):
 
     def __str__(self):
         return f"{self.source} -[{self.get_label_display()}]-> {self.destination}"
+
+
+class CategorizerResult(models.Model):
+    """
+    Stores the result of categorizing an item using an LLM.
+    """
+
+    item = models.ForeignKey(
+        Item, on_delete=models.CASCADE, related_name="categorizer_results"
+    )
+    llm_type = models.CharField(max_length=50)
+    raw_result = models.TextField()
+    result_answer = models.BooleanField()
+    result_confidence = models.IntegerField()
+    created_at = models.DateTimeField(auto_now_add=True)
+    updated_at = models.DateTimeField(auto_now=True)
+
+    class Meta:
+        ordering = ["-created_at"]
+        indexes = [
+            models.Index(fields=["item", "llm_type"]),
+            models.Index(fields=["result_answer"]),
+            models.Index(fields=["result_confidence"]),
+        ]
+
+    def __str__(self):
+        return f"{self.item} - {self.llm_type}: {self.result_answer} ({self.result_confidence}%)"
\ No newline at end of file
diff --git a/web/requirements.txt b/web/requirements.txt
index 6b64bf7..69be5c7 100644
--- a/web/requirements.txt
+++ b/web/requirements.txt
@@ -1,2 +1,16 @@
 Django~=4.2.6
 requests~=2.31.0
+spacy~=3.7.0
+scispacy~=0.5.4
+
+# LLM dependencies (optional, install based on which LLM you want to use)
+# For paid APIs:
+# openai<1.0  # Uncomment for OpenAI GPT models (code uses the pre-1.0 ChatCompletion API)
+# anthropic>=0.7.0  # Uncomment for Anthropic Claude
+
+# For free local models (recommended):
+langchain-huggingface>=0.0.1  # For HuggingFace models
+langchain-community>=0.0.1  # For Ollama and other local models
+transformers>=4.35.0  # Required by HuggingFace models
+torch>=2.0.0  # Required by HuggingFace models
+accelerate>=0.24.0  # Speeds up model loading
diff --git a/web/slurper/keyword_util.py b/web/slurper/keyword_util.py
new file mode 100644
index 0000000..aaeb919
--- /dev/null
+++ b/web/slurper/keyword_util.py
@@ -0,0 +1,25 @@
+import spacy
+
+# TODO SST: Move to readme.md
+# TODO SST: Also it should be lazy-loaded
+# Load the scientific English model from scispacy
+# Note: You need to download this model first with:
+# pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_core_sci_lg-0.5.4.tar.gz
+nlp = spacy.load("en_core_sci_lg")
+
+
+def extract_keywords(text):
+    """
+    Extract keywords from text using spaCy's named entity recognition. 
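+
+    Illustrative example (a sketch; the extracted entities vary with the
+    scispacy model in use):
+
+        >>> ents = extract_keywords("A Banach space is a complete normed vector space.")
+        >>> [ent.text.lower() for ent in ents]  # e.g. ['banach space', 'normed vector space']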
+ + Args: + text: The text to extract keywords from + + Returns: + A list of recognized entities (keywords) from the text + """ + if not text: + return [] + + doc = nlp(text) + return doc.ents diff --git a/web/slurper/source_wikidata.py b/web/slurper/source_wikidata.py index 06b6a97..9669a04 100644 --- a/web/slurper/source_wikidata.py +++ b/web/slurper/source_wikidata.py @@ -6,6 +6,22 @@ from slurper.wd_raw_item import WD_OTHER_SOURCES, BaseWdRawItem +# Wikidata entities to exclude from queries (natural numbers and positive integers) +# TODO SST: Ask Katja: whether to add all found +# 1. Should I put all found? Most likely yes +# 2. Use categorization results to exclude them in further uses +KNOWN_EXCLUDED_CATEGORIES = ["wd:Q21199", "wd:Q28920044"] + + +# These are added to every query: +# - Optional image: Fetches image if available +# - Optional Wikipedia link: Gets English Wikipedia article +# - Excludes natural numbers (FILTER NOT EXISTS) +# - Excludes humans (FILTER NOT EXISTS) +# - Label service: Automatically fetches English labels and descriptions +# +# The class fetches mathematical concepts from Wikidata while filtering out unwanted items like people and natural numbers. + class WikidataSlurper: SPARQL_URL = "https://query.wikidata.org/sparql" @@ -18,9 +34,12 @@ class WikidataSlurper: schema:isPartOf ; schema:about ?item . } - # except for natural numbers - MINUS { - ?item wdt:P31 wd:Q21199 . + OPTIONAL + { ?item skos:altLabel ?itemAltLabel . FILTER (lang(?itemAltLabel) = "en") } + # except for natural numbers and positive integers + FILTER NOT EXISTS { + VALUES ?excludedType { """ + " ".join(KNOWN_EXCLUDED_CATEGORIES) + """ } + ?item wdt:P31 ?excludedType . } # except for humans FILTER NOT EXISTS{ ?item wdt:P31 wd:Q5 . } @@ -35,6 +54,7 @@ def __init__(self, source, query, limit=None): """ SELECT DISTINCT ?item ?itemLabel ?itemDescription ?image ?wp_en + (GROUP_CONCAT(DISTINCT ?itemAltLabel; separator=", ") AS ?aliases) """ + self._sparql_source_vars_select() + """ @@ -43,9 +63,18 @@ def __init__(self, source, query, limit=None): + query + self._sparql_source_vars_triples() + self.SPARQL_QUERY_OPTIONS + + """ +GROUP BY ?item ?itemLabel ?itemDescription ?image ?wp_en """ + + " ".join( + [f"?{src['json_key']}" for src in WD_OTHER_SOURCES.values()] + ) + + """ +""" + (f"LIMIT {limit}" if limit is not None else "") ) self.raw_data = self.fetch_json() + self.article_text = self.fetch_articles() + def _sparql_source_vars_select(self): def to_var(source_dict): @@ -70,8 +99,68 @@ def fetch_json(self): ) return response.json()["results"]["bindings"] + def fetch_articles(self): + """Fetch Wikipedia article text for items with wp_en links.""" + article_texts = {} + + for json_item in self.raw_data: + # Only fetch if Wikipedia link exists + if "wp_en" not in json_item: + continue + + wp_url = json_item["wp_en"]["value"] + article_title = wp_url.split("/wiki/")[-1] + + api_url = "https://en.wikipedia.org/w/api.php" + params = { + "action": "query", + "format": "json", + "titles": article_title, + "prop": "extracts", + "explaintext": True, + "exsectionformat": "plain", + } + headers = { + "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", + "Accept": "application/json", + "Accept-Language": "en-US,en;q=0.9", + } + + try: + response = requests.get(api_url, params=params, headers=headers) + response.raise_for_status() + + if not response.text: + logging.log( + logging.WARNING, + f"Empty response for Wikipedia 
article: {article_title}", + ) + continue + + data = response.json() + pages = data.get("query", {}).get("pages", {}) + + # Get the first (and only) page + for page_id, page_data in pages.items(): + if "extract" in page_data: + # Use Wikidata ID as key + wd_id = json_item["item"]["value"] + article_texts[wd_id] = page_data["extract"] + break + except Exception as e: + logging.log( + logging.WARNING, + f"Failed to fetch Wikipedia article for {article_title}: {e}", + ) + + return article_texts + def get_items(self): for json_item in self.raw_data: + wd_id = json_item["item"]["value"] + if wd_id in self.article_text: + json_item["article_text"] = {"value": self.article_text[wd_id]} + raw_item = BaseWdRawItem.raw_item(self.source, json_item) yield raw_item.to_item() if self.source != Item.Source.WIKIDATA: diff --git a/web/slurper/wd_raw_item.py b/web/slurper/wd_raw_item.py index cc71823..29a5539 100644 --- a/web/slurper/wd_raw_item.py +++ b/web/slurper/wd_raw_item.py @@ -1,6 +1,7 @@ from typing import Optional from concepts.models import Item, Link +from slurper.keyword_util import extract_keywords WD_OTHER_SOURCES = { Item.Source.NLAB: { @@ -42,6 +43,18 @@ def name(self): def description(self): return None + def aliases(self): + """Get aliases (alternative labels) if available.""" + if "aliases" in self.raw and self.raw["aliases"]["value"]: + return self.raw["aliases"]["value"] + return None + + def article_text(self): + """Get the Wikipedia article text if available.""" + if "article_text" in self.raw: + return self.raw["article_text"]["value"] + return None + def has_source(self, source): if source == Item.Source.WIKIPEDIA_EN: return "wp_en" in self.raw @@ -52,12 +65,26 @@ def switch_source_to(self, source): return BaseWdRawItem.raw_item(source, self.raw) def to_item(self) -> Optional[Item]: + # Extract keywords from article text if available + article = self.article_text() + keywords = None + + if article: + # Extract entities using spaCy + entities = extract_keywords(article) + # Convert to lowercase and create comma-separated string + keyword_list = [entity.text.lower() for entity in entities] + keywords = ", ".join(keyword_list) if keyword_list else None + return Item( source=self.source, identifier=self.identifier(), url=self.url(), name=self.name(), description=self.description(), + keywords=keywords, + article_text=article, + aliases=self.aliases(), ) def _get_item_queryset(self): diff --git a/web/web/settings.py b/web/web/settings.py index 31317db..199c794 100644 --- a/web/web/settings.py +++ b/web/web/settings.py @@ -38,6 +38,7 @@ "django.contrib.sessions", "django.contrib.messages", "django.contrib.staticfiles", + "categorizer", "concepts", "slurper", "web", diff --git a/web/web/urls.py b/web/web/urls.py index a50d3a4..958bd85 100644 --- a/web/web/urls.py +++ b/web/web/urls.py @@ -14,6 +14,7 @@ 1. Import the include() function: from django.urls import include, path 2. 
Add a URL to urlpatterns: path('blog/', include('blog.urls')) """ + from concepts import views from django.conf import settings from django.conf.urls.static import static From 7fd7197effb29b6fd88f59d4189e1f96f11d7947 Mon Sep 17 00:00:00 2001 From: Slobodan Stanojevikj Date: Fri, 12 Dec 2025 23:42:05 +0100 Subject: [PATCH 02/12] -moved fetch to be per item, not fetch all then --- Makefile | 3 + web/slurper/keyword_util.py | 18 +++- web/slurper/source_wikidata.py | 156 +++++++++++++++++++++++---------- 3 files changed, 128 insertions(+), 49 deletions(-) diff --git a/Makefile b/Makefile index 3cfe1f6..a479b89 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,9 @@ install: pip install -r requirements.txt +install-scispacy: + pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_core_sci_lg-0.5.4.tar.gz + start: python ./web/manage.py runserver diff --git a/web/slurper/keyword_util.py b/web/slurper/keyword_util.py index aaeb919..8b8472d 100644 --- a/web/slurper/keyword_util.py +++ b/web/slurper/keyword_util.py @@ -1,11 +1,22 @@ import spacy # TODO SST: Move to readme.md -# TODO SST: Also it should be lazy-loaded # Load the scientific English model from scispacy # Note: You need to download this model first with: -# pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_core_sci_lg-0.5.4.tar.gz -nlp = spacy.load("en_core_sci_lg") +# make install-scispacy +# Or directly: +# pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_core_sci_lg-0.5.4.tar.gz + +# Lazy-loaded spaCy model +_nlp = None + + +def _get_nlp(): + """Lazy-load the spaCy model only when needed.""" + global _nlp + if _nlp is None: + _nlp = spacy.load("en_core_sci_lg") + return _nlp def extract_keywords(text): @@ -21,5 +32,6 @@ def extract_keywords(text): if not text: return [] + nlp = _get_nlp() doc = nlp(text) return doc.ents diff --git a/web/slurper/source_wikidata.py b/web/slurper/source_wikidata.py index 9669a04..8cc1504 100644 --- a/web/slurper/source_wikidata.py +++ b/web/slurper/source_wikidata.py @@ -1,4 +1,6 @@ import logging +import time +import urllib.parse import requests from concepts.models import Item @@ -6,12 +8,16 @@ from slurper.wd_raw_item import WD_OTHER_SOURCES, BaseWdRawItem +# Wikipedia API contact email (required by Wikipedia API guidelines) +# Set to None to disable Wikipedia article fetching +WIKIPEDIA_CONTACT_EMAIL = None + # Wikidata entities to exclude from queries (natural numbers and positive integers) -# TODO SST: Ask Katja: whether to add all found -# 1. Should I put all found? Most likely yes -# 2. 
Use categorization results to exclude them in further uses KNOWN_EXCLUDED_CATEGORIES = ["wd:Q21199", "wd:Q28920044"] +# Flag to track if we've logged the missing email warning +_missing_email_logged = False + # These are added to every query: # - Optional image: Fetches image if available @@ -73,7 +79,6 @@ def __init__(self, source, query, limit=None): + (f"LIMIT {limit}" if limit is not None else "") ) self.raw_data = self.fetch_json() - self.article_text = self.fetch_articles() def _sparql_source_vars_select(self): @@ -99,43 +104,85 @@ def fetch_json(self): ) return response.json()["results"]["bindings"] - def fetch_articles(self): - """Fetch Wikipedia article text for items with wp_en links.""" - article_texts = {} + def fetch_article(self, json_item, index=None, total=None): + global _missing_email_logged - for json_item in self.raw_data: - # Only fetch if Wikipedia link exists - if "wp_en" not in json_item: - continue - - wp_url = json_item["wp_en"]["value"] - article_title = wp_url.split("/wiki/")[-1] - - api_url = "https://en.wikipedia.org/w/api.php" - params = { - "action": "query", - "format": "json", - "titles": article_title, - "prop": "extracts", - "explaintext": True, - "exsectionformat": "plain", - } - headers = { - "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", - "Accept": "application/json", - "Accept-Language": "en-US,en;q=0.9", - } + # Check if contact email is configured + if WIKIPEDIA_CONTACT_EMAIL is None: + if not _missing_email_logged: + logging.log( + logging.WARNING, + "WIKIPEDIA_CONTACT_EMAIL is not set. Wikipedia article fetching is disabled. " + "Please set WIKIPEDIA_CONTACT_EMAIL at the top of source_wikidata.py to enable article fetching.", + ) + _missing_email_logged = True + return None + + wp_url = json_item["wp_en"]["value"] + # Decode URL-encoded characters (e.g., %E2%80%93 becomes –) + article_title = urllib.parse.unquote(wp_url.split("/wiki/")[-1]) + if index is not None and total is not None: + logging.log( + logging.INFO, + f"Fetching Wikipedia article [{index}/{total}]: {article_title}", + ) + else: + logging.log( + logging.INFO, + f"Fetching Wikipedia article: {article_title}", + ) + api_url = "https://en.wikipedia.org/w/api.php" + params = { + "action": "query", + "format": "json", + "titles": article_title, + "prop": "extracts", + "explaintext": True, + "exsectionformat": "plain", + } + headers = { + "User-Agent": f"MathSwitch/1.0 ({WIKIPEDIA_CONTACT_EMAIL})", + "Accept": "application/json", + "Accept-Language": "en-US,en;q=0.9", + } + # Retry logic with exponential backoff + max_retries = 3 + retry_delay = 1 # Start with 1 second + success = False + for attempt in range(max_retries): try: - response = requests.get(api_url, params=params, headers=headers) + # Rate limiting: delay between requests (100 req/s max) + time.sleep(0.01) + + # Timeout: (connect_timeout, read_timeout) in seconds + response = requests.get(api_url, params=params, headers=headers, timeout=(5, 30)) + + # Handle rate limiting + if response.status_code in (429, 403): + if attempt < max_retries - 1: + logging.log( + logging.WARNING, + f"Rate limited for {article_title}, retrying in {retry_delay}s (attempt {attempt + 1}/{max_retries})", + ) + time.sleep(retry_delay) + retry_delay *= 2 # Exponential backoff + continue + else: + logging.log( + logging.ERROR, + f"Failed to fetch {article_title} after {max_retries} attempts (rate limited). 
Skipping article.", + ) + break + response.raise_for_status() if not response.text: logging.log( logging.WARNING, - f"Empty response for Wikipedia article: {article_title}", + f"Empty response for Wikipedia article: {article_title}. Skipping article.", ) - continue + break data = response.json() pages = data.get("query", {}).get("pages", {}) @@ -143,24 +190,35 @@ def fetch_articles(self): # Get the first (and only) page for page_id, page_data in pages.items(): if "extract" in page_data: - # Use Wikidata ID as key - wd_id = json_item["item"]["value"] - article_texts[wd_id] = page_data["extract"] - break - except Exception as e: - logging.log( - logging.WARNING, - f"Failed to fetch Wikipedia article for {article_title}: {e}", - ) + success = True + return page_data["extract"] - return article_texts + # Success, break retry loop + break + + except requests.exceptions.RequestException as e: + if attempt < max_retries - 1: + logging.log( + logging.WARNING, + f"Request failed for {article_title}: {e}, retrying in {retry_delay}s", + ) + time.sleep(retry_delay) + retry_delay *= 2 + else: + logging.log( + logging.ERROR, + f"Failed to fetch {article_title} after {max_retries} attempts: {e}. Skipping article.", + ) + if not success and "wp_en" in json_item: + logging.log( + logging.INFO, + f"Article {article_title} will have null value (fetch failed or empty)", + ) + + return None def get_items(self): for json_item in self.raw_data: - wd_id = json_item["item"]["value"] - if wd_id in self.article_text: - json_item["article_text"] = {"value": self.article_text[wd_id]} - raw_item = BaseWdRawItem.raw_item(self.source, json_item) yield raw_item.to_item() if self.source != Item.Source.WIKIDATA: @@ -168,6 +226,12 @@ def get_items(self): if not raw_item_wd.item_exists(): yield raw_item_wd.to_item() if raw_item.has_source(Item.Source.WIKIPEDIA_EN): + # Fetch Wikipedia article if available + if "wp_en" in json_item and "article_text" not in json_item: + article_text = self.fetch_article(json_item) + if article_text is not None: + json_item["article_text"] = {"value": article_text} + raw_item_wp_en = raw_item.switch_source_to(Item.Source.WIKIPEDIA_EN) if not raw_item_wp_en.item_exists(): yield raw_item_wp_en.to_item() From f82c7f80d34cd4062ad7017f0c99394a27312fda Mon Sep 17 00:00:00 2001 From: Slobodan Stanojevikj Date: Sat, 13 Dec 2025 00:13:00 +0100 Subject: [PATCH 03/12] -moved fetch to be per item, not fetch all then --- Makefile | 5 +++ requirements.txt | 2 +- web/categorizer/categorizer_service.py | 39 ++++++++++------- web/categorizer/llm_service.py | 34 +++++++++++---- .../management/commands/categorize.py | 11 ++--- web/concepts/models.py | 5 ++- web/slurper/keyword_util.py | 4 +- web/slurper/source_wikidata.py | 43 ++++++++++++------- 8 files changed, 93 insertions(+), 50 deletions(-) diff --git a/Makefile b/Makefile index a479b89..3d20f73 100644 --- a/Makefile +++ b/Makefile @@ -9,3 +9,8 @@ start: compute-concepts: python ./web/manage.py compute_concepts + +fix-files: + python3 -m black . + python3 -m isort . + python3 -m flake8 . 
diff --git a/requirements.txt b/requirements.txt index e86dbb8..6fbba70 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ black~=25.9.0 -isort~=4.2.5 +isort~=5.12.0 flake8~=7.3.0 -r ./web/requirements.txt diff --git a/web/categorizer/categorizer_service.py b/web/categorizer/categorizer_service.py index 2a4e40e..97186af 100644 --- a/web/categorizer/categorizer_service.py +++ b/web/categorizer/categorizer_service.py @@ -1,8 +1,9 @@ import json import logging import re -from concepts.models import Item, CategorizerResult + from categorizer.llm_service import LLMService, LLMType +from concepts.models import CategorizerResult, Item # Free LLM types to use for categorization LLM_JUDGE_POOL = [ @@ -33,12 +34,12 @@ def categorize_items(self, limit=None): queryset = queryset[:limit] total = queryset.count() - self.logger.info(f"Categorizing {total} items using {len(LLM_JUDGE_POOL)} free LLMs") + self.logger.info( + f"Categorizing {total} items using {len(LLM_JUDGE_POOL)} free LLMs" + ) for i, item in enumerate(queryset): - self.logger.info( - f"Processing item {i + 1}/{total}: {item.identifier}" - ) + self.logger.info(f"Processing item {i + 1}/{total}: {item.identifier}") self.categorize_item(item) self.logger.info("Categorization complete") @@ -46,14 +47,17 @@ def categorize_items(self, limit=None): def categorize_item( self, item, - predicate: str = "Is the given concept a mathematical concept, given the name, description, keywords, and article text?" + predicate: str = "Is the given concept a mathematical concept," + " given the name, description, " + "keywords, and article text?", ): """ Categorize a single item using all free LLM types. Args: item: Item instance to categorize - predicate: The question to evaluate (default: checks if it's a mathematical concept) + predicate: The question to evaluate (default: checks if it's + a mathematical concept) Returns: List of categorization results from all LLMs @@ -68,7 +72,10 @@ def categorize_item( try: self.logger.info(f"Calling {llm_type.value} for {item.name}") raw_result = self.llm_service.call_llm(llm_type, prompt) - self.logger.info(f"Categorized {item.name} with {llm_type.value}: {raw_result[:100]}...") + self.logger.info( + f"Categorized {item.name} with {llm_type.value}: " + f"{raw_result[:100]}..." + ) parsed_result = self._parse_categorization_result(raw_result) @@ -89,7 +96,9 @@ def categorize_item( results.append(parsed_result) except Exception as e: - self.logger.error(f"Failed to categorize {item.name} with {llm_type.value}: {e}") + self.logger.error( + f"Failed to categorize {item.name} with {llm_type.value}: {e}" + ) # Continue with other LLMs even if one fails? continue @@ -106,7 +115,8 @@ def _build_categorization_prompt(self, item, predicate: str): Returns: Formatted prompt string """ - system_prompt = """You are a categorization judge. Your task is to evaluate whether a given concept satisfies a specific predicate. + system_prompt = """You are a categorization judge. Your task is to + evaluate whether a given concept satisfies a specific predicate. You must respond with a structured answer containing: 1. 
answer: true or false (boolean) @@ -173,7 +183,9 @@ def _parse_categorization_result(self, result: str) -> dict: parsed = json.loads(result) if "answer" not in parsed or "confidence" not in parsed: - raise ValueError("Response missing required fields 'answer' or 'confidence'") + raise ValueError( + "Response missing required fields 'answer' or 'confidence'" + ) answer = parsed["answer"] if isinstance(answer, str): @@ -183,10 +195,7 @@ def _parse_categorization_result(self, result: str) -> dict: if not 0 <= confidence <= 100: raise ValueError(f"Confidence must be between 0-100, got {confidence}") - return { - "answer": bool(answer), - "confidence": confidence - } + return {"answer": bool(answer), "confidence": confidence} except json.JSONDecodeError as e: self.logger.error(f"Failed to parse JSON response: {result}") diff --git a/web/categorizer/llm_service.py b/web/categorizer/llm_service.py index cb13d23..190da36 100644 --- a/web/categorizer/llm_service.py +++ b/web/categorizer/llm_service.py @@ -28,12 +28,24 @@ class LLMService: def __init__(self): self.logger = logging.getLogger(__name__) self.llm_handlers = { - LLMType.OPENAI_GPT4: lambda llm_type, prompt: self._call_openai(llm_type, prompt), - LLMType.OPENAI_GPT35: lambda llm_type, prompt: self._call_openai(llm_type, prompt), - LLMType.ANTHROPIC_CLAUDE: lambda llm_type, prompt: self._call_anthropic(prompt), - LLMType.HUGGINGFACE_FLAN_T5: lambda llm_type, prompt: self._call_huggingface("google/flan-t5-base", prompt), - LLMType.HUGGINGFACE_GPT2: lambda llm_type, prompt: self._call_huggingface("gpt2", prompt), - LLMType.HUGGINGFACE_DIALOGPT: lambda llm_type, prompt: self._call_huggingface("microsoft/DialoGPT-medium", prompt), + LLMType.OPENAI_GPT4: lambda llm_type, prompt: self._call_openai( + llm_type, prompt + ), + LLMType.OPENAI_GPT35: lambda llm_type, prompt: self._call_openai( + llm_type, prompt + ), + LLMType.ANTHROPIC_CLAUDE: lambda llm_type, prompt: self._call_anthrpc( + prompt + ), + LLMType.HUGGINGFACE_FLAN_T5: lambda llm_type, prompt: self._call_hgf( + "google/flan-t5-base", prompt + ), + LLMType.HUGGINGFACE_GPT2: lambda llm_type, prompt: self._call_hgf( + "gpt2", prompt + ), + LLMType.HUGGINGFACE_DIALOGPT: lambda llm_type, prompt: self._call_hgf( + "microsoft/DialoGPT-medium", prompt + ), LLMType.OLLAMA: lambda llm_type, prompt: self._call_ollama(prompt), } @@ -92,7 +104,7 @@ def _call_openai(self, llm_type: LLMType, prompt: str) -> str: self.logger.error(f"OpenAI API call failed: {e}") raise - def _call_anthropic(self, prompt: str) -> str: + def _call_anthrpc(self, prompt: str) -> str: """Call Anthropic Claude API""" try: import anthropic @@ -121,7 +133,7 @@ def _call_anthropic(self, prompt: str) -> str: self.logger.error(f"Anthropic API call failed: {e}") raise - def _call_huggingface(self, model_id: str, prompt: str) -> str: + def _call_hgf(self, model_id: str, prompt: str) -> str: """ Call HuggingFace models using langchain. 
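+
+        Note: the first call downloads the model weights (roughly 1-3 GB,
+        depending on the model) and caches them locally, so the first run
+        can be slow.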
@@ -156,7 +168,11 @@ def _call_huggingface(self, model_id: str, prompt: str) -> str: # Create the HuggingFace pipeline hf = HuggingFacePipeline.from_model_id( model_id=model_id, - task="text-generation" if "gpt" in model_id.lower() else "text2text-generation", + task=( + "text-generation" + if "gpt" in model_id.lower() + else "text2text-generation" + ), pipeline_kwargs=pipeline_kwargs, ) diff --git a/web/categorizer/management/commands/categorize.py b/web/categorizer/management/commands/categorize.py index af9694b..21a6794 100644 --- a/web/categorizer/management/commands/categorize.py +++ b/web/categorizer/management/commands/categorize.py @@ -1,5 +1,5 @@ -from django.core.management.base import BaseCommand from categorizer.categorizer_service import CategorizerService +from django.core.management.base import BaseCommand class Command(BaseCommand): @@ -18,7 +18,10 @@ def handle(self, *args, **options): service = CategorizerService() - self.stdout.write("Using all free LLMs: huggingface_flan_t5, huggingface_gpt2, huggingface_dialogpt") + self.stdout.write( + "Using all free LLMs: huggingface_flan_t5, " + "huggingface_gpt2, huggingface_dialogpt" + ) if limit: self.stdout.write(f"Categorizing up to {limit} items...") else: @@ -28,6 +31,4 @@ def handle(self, *args, **options): service.categorize_items(limit=limit) self.stdout.write(self.style.SUCCESS("Categorization complete!")) except Exception as e: - self.stdout.write( - self.style.ERROR(f"Categorization failed: {e}") - ) + self.stdout.write(self.style.ERROR(f"Categorization failed: {e}")) diff --git a/web/concepts/models.py b/web/concepts/models.py index 0545806..f15c95e 100644 --- a/web/concepts/models.py +++ b/web/concepts/models.py @@ -188,4 +188,7 @@ class Meta: ] def __str__(self): - return f"{self.item} - {self.llm_type}: {self.result_answer} ({self.result_confidence}%)" \ No newline at end of file + return ( + f"{self.item} - {self.llm_type}: " + f"{self.result_answer} ({self.result_confidence}%)" + ) diff --git a/web/slurper/keyword_util.py b/web/slurper/keyword_util.py index 8b8472d..0523e47 100644 --- a/web/slurper/keyword_util.py +++ b/web/slurper/keyword_util.py @@ -3,9 +3,7 @@ # TODO SST: Move to readme.md # Load the scientific English model from scispacy # Note: You need to download this model first with: -# make install-scispacy -# Or directly: -# pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_core_sci_lg-0.5.4.tar.gz +# make install-scispacy # Lazy-loaded spaCy model _nlp = None diff --git a/web/slurper/source_wikidata.py b/web/slurper/source_wikidata.py index 8cc1504..9cc7272 100644 --- a/web/slurper/source_wikidata.py +++ b/web/slurper/source_wikidata.py @@ -7,7 +7,6 @@ from django.db.utils import IntegrityError from slurper.wd_raw_item import WD_OTHER_SOURCES, BaseWdRawItem - # Wikipedia API contact email (required by Wikipedia API guidelines) # Set to None to disable Wikipedia article fetching WIKIPEDIA_CONTACT_EMAIL = None @@ -26,12 +25,15 @@ # - Excludes humans (FILTER NOT EXISTS) # - Label service: Automatically fetches English labels and descriptions # -# The class fetches mathematical concepts from Wikidata while filtering out unwanted items like people and natural numbers. +# The class fetches mathematical concepts from Wikidata while +# filtering out unwanted items like people and natural numbers. + class WikidataSlurper: SPARQL_URL = "https://query.wikidata.org/sparql" - SPARQL_QUERY_OPTIONS = """ + SPARQL_QUERY_OPTIONS = ( + """ OPTIONAL { ?item wdt:P18 ?image . 
} OPTIONAL @@ -44,7 +46,9 @@ class WikidataSlurper: { ?item skos:altLabel ?itemAltLabel . FILTER (lang(?itemAltLabel) = "en") } # except for natural numbers and positive integers FILTER NOT EXISTS { - VALUES ?excludedType { """ + " ".join(KNOWN_EXCLUDED_CATEGORIES) + """ } + VALUES ?excludedType { """ + + " ".join(KNOWN_EXCLUDED_CATEGORIES) + + """ } ?item wdt:P31 ?excludedType . } # except for humans @@ -53,6 +57,7 @@ class WikidataSlurper: SERVICE wikibase:label { bd:serviceParam wikibase:language "en". } } """ + ) def __init__(self, source, query, limit=None): self.source = source @@ -71,16 +76,13 @@ def __init__(self, source, query, limit=None): + self.SPARQL_QUERY_OPTIONS + """ GROUP BY ?item ?itemLabel ?itemDescription ?image ?wp_en """ - + " ".join( - [f"?{src['json_key']}" for src in WD_OTHER_SOURCES.values()] - ) + + " ".join([f"?{src['json_key']}" for src in WD_OTHER_SOURCES.values()]) + """ """ + (f"LIMIT {limit}" if limit is not None else "") ) self.raw_data = self.fetch_json() - def _sparql_source_vars_select(self): def to_var(source_dict): return " ?" + source_dict["json_key"] @@ -112,8 +114,10 @@ def fetch_article(self, json_item, index=None, total=None): if not _missing_email_logged: logging.log( logging.WARNING, - "WIKIPEDIA_CONTACT_EMAIL is not set. Wikipedia article fetching is disabled. " - "Please set WIKIPEDIA_CONTACT_EMAIL at the top of source_wikidata.py to enable article fetching.", + "WIKIPEDIA_CONTACT_EMAIL is not set. " + "Wikipedia article fetching is disabled. " + "Please set WIKIPEDIA_CONTACT_EMAIL at the top of " + "source_wikidata.py to enable article fetching.", ) _missing_email_logged = True return None @@ -156,14 +160,17 @@ def fetch_article(self, json_item, index=None, total=None): time.sleep(0.01) # Timeout: (connect_timeout, read_timeout) in seconds - response = requests.get(api_url, params=params, headers=headers, timeout=(5, 30)) + response = requests.get( + api_url, params=params, headers=headers, timeout=(5, 30) + ) # Handle rate limiting if response.status_code in (429, 403): if attempt < max_retries - 1: logging.log( logging.WARNING, - f"Rate limited for {article_title}, retrying in {retry_delay}s (attempt {attempt + 1}/{max_retries})", + f"Rate limited for {article_title}, retrying in " + f"{retry_delay}s (attempt {attempt + 1}/{max_retries})", ) time.sleep(retry_delay) retry_delay *= 2 # Exponential backoff @@ -171,7 +178,8 @@ def fetch_article(self, json_item, index=None, total=None): else: logging.log( logging.ERROR, - f"Failed to fetch {article_title} after {max_retries} attempts (rate limited). Skipping article.", + f"Failed to fetch {article_title} after " + f"{max_retries} attempts (rate limited). Skipping article.", ) break @@ -180,7 +188,8 @@ def fetch_article(self, json_item, index=None, total=None): if not response.text: logging.log( logging.WARNING, - f"Empty response for Wikipedia article: {article_title}. Skipping article.", + f"Empty response for Wikipedia article: " + f"{article_title}. Skipping article.", ) break @@ -200,14 +209,16 @@ def fetch_article(self, json_item, index=None, total=None): if attempt < max_retries - 1: logging.log( logging.WARNING, - f"Request failed for {article_title}: {e}, retrying in {retry_delay}s", + f"Request failed for {article_title}: " + f"{e}, retrying in {retry_delay}s", ) time.sleep(retry_delay) retry_delay *= 2 else: logging.log( logging.ERROR, - f"Failed to fetch {article_title} after {max_retries} attempts: {e}. 
Skipping article.", + f"Failed to fetch {article_title}" + f" after {max_retries} attempts: {e}. Skipping article.", ) if not success and "wp_en" in json_item: logging.log( From 245d4f8231b9a00efcf862e078de897fbd643593 Mon Sep 17 00:00:00 2001 From: Slobodan Stanojevikj Date: Sat, 13 Dec 2025 00:17:42 +0100 Subject: [PATCH 04/12] -fixed test workflow to run on push to main and pull requests --- .github/workflows/test.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index b77ce29..0115b76 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -1,6 +1,10 @@ name: Test -on: [push, pull_request] +on: + push: + branches: + - main + pull_request: jobs: test: From 055f78c92ef6eeaa82a6cb7275f1d6e602a202bb Mon Sep 17 00:00:00 2001 From: Slobodan Stanojevikj Date: Sun, 14 Dec 2025 11:21:51 +0100 Subject: [PATCH 05/12] -added excluded categories based on the categorizer results -added a fix to clear up gpt2 prompt output -fixed dependency issues --- Makefile | 3 + web/categorizer/categorizer_service.py | 98 +++++++++++++++++--------- web/categorizer/llm_service.py | 21 +++++- web/requirements.txt | 12 ++-- web/slurper/source_wikidata.py | 60 ++++++++++++++-- 5 files changed, 150 insertions(+), 44 deletions(-) diff --git a/Makefile b/Makefile index 3d20f73..317c9fd 100644 --- a/Makefile +++ b/Makefile @@ -10,6 +10,9 @@ start: compute-concepts: python ./web/manage.py compute_concepts +categorize: + python ./web/manage.py categorize --limit 10 + fix-files: python3 -m black . python3 -m isort . diff --git a/web/categorizer/categorizer_service.py b/web/categorizer/categorizer_service.py index 97186af..363e1e1 100644 --- a/web/categorizer/categorizer_service.py +++ b/web/categorizer/categorizer_service.py @@ -1,6 +1,4 @@ -import json import logging -import re from categorizer.llm_service import LLMService, LLMType from concepts.models import CategorizerResult, Item @@ -77,14 +75,19 @@ def categorize_item( f"{raw_result[:100]}..." ) + print(f"{raw_result}") parsed_result = self._parse_categorization_result(raw_result) + confidence = parsed_result["confidence"] + if confidence is None: + confidence = 50 + categorizer_result = CategorizerResult.objects.create( item=item, llm_type=llm_type.value, raw_result=raw_result, result_answer=parsed_result["answer"], - result_confidence=parsed_result["confidence"], + result_confidence=confidence, ) categorizer_result.save() @@ -122,11 +125,9 @@ def _build_categorization_prompt(self, item, predicate: str): 1. answer: true or false (boolean) 2. confidence: a number from 0 to 100 (representing your confidence percentage) -Format your response as JSON: -{ - "answer": true, - "confidence": 85 -}""" +IMPORTANT: Format your response as comma-separated string: +yes,85 +""" item_info_parts = [f"Name: {item.name}"] @@ -157,16 +158,17 @@ def _build_categorization_prompt(self, item, predicate: str): --- -Please provide your evaluation in JSON format.""" +Please provide your evaluation in the comma-separated format specified above.""" return prompt def _parse_categorization_result(self, result: str) -> dict: """ - Parse the LLM's JSON response. + Parse the LLM's comma-separated response. 
Args: - result: The raw response from the LLM + result: The raw response from the LLM (expected format: "yes,85" + or "no,75", "yes ---") Returns: Dictionary with 'answer' (bool) and 'confidence' (int) keys @@ -175,31 +177,63 @@ def _parse_categorization_result(self, result: str) -> dict: ValueError: If the response cannot be parsed """ try: - json_match = re.search(r'\{[^}]*"answer"[^}]*\}', result, re.DOTALL) - if json_match: - json_str = json_match.group(0) - parsed = json.loads(json_str) + # Clean the result string + result = result.strip() + + # Split by comma (with or without space) or just space + # Try separators in order of specificity: ", ", ",", " " + if ", " in result: + parts = result.split(", ", 1) + elif "," in result: + parts = result.split(",", 1) + else: + parts = result.split(" ", 1) + + if len(parts) == 1: + # Only answer provided, no confidence + answer_str = parts[0].strip().lower() + confidence = None + elif len(parts) == 2: + # Both answer and confidence provided + answer_str = parts[0].strip().lower() + confidence_str = parts[1].strip() + + # Parse confidence if provided + if confidence_str: + try: + confidence = int(confidence_str) + if not 0 <= confidence <= 100: + self.logger.warning( + f"Confidence {confidence} out of range [0-100], " + f"setting to None" + ) + confidence = None + except ValueError: + self.logger.warning( + f"Invalid confidence value '{confidence_str}', " + f"setting to None" + ) + confidence = None + else: + confidence = None else: - parsed = json.loads(result) - - if "answer" not in parsed or "confidence" not in parsed: raise ValueError( - "Response missing required fields 'answer' or 'confidence'" + f"Expected format 'answer' or 'answer,confidence', got: {result}" ) - answer = parsed["answer"] - if isinstance(answer, str): - answer = answer.lower() in ("true", "yes", "1") - - confidence = int(parsed["confidence"]) - if not 0 <= confidence <= 100: - raise ValueError(f"Confidence must be between 0-100, got {confidence}") + # Parse answer - accept yes/true/1 as True, no/false/0 as False + if answer_str in ("yes", "true", "1"): + answer = True + elif answer_str in ("no", "false", "0"): + answer = False + else: + raise ValueError( + f"Invalid answer value: {answer_str}. 
" + f"Expected yes/no, true/false, or 1/0" + ) - return {"answer": bool(answer), "confidence": confidence} + return {"answer": answer, "confidence": confidence} - except json.JSONDecodeError as e: - self.logger.error(f"Failed to parse JSON response: {result}") - raise ValueError(f"Invalid JSON response from LLM: {e}") - except (KeyError, ValueError) as e: - self.logger.error(f"Invalid response format: {result}") + except (ValueError, IndexError) as e: + self.logger.error(f"Failed to parse response: {result}") raise ValueError(f"Invalid response format: {e}") diff --git a/web/categorizer/llm_service.py b/web/categorizer/llm_service.py index 190da36..0d8d364 100644 --- a/web/categorizer/llm_service.py +++ b/web/categorizer/llm_service.py @@ -160,8 +160,7 @@ def _call_hgf(self, model_id: str, prompt: str) -> str: "temperature": 0.7, } - # TODO SST: Remove the whole model - # Add pad_token_id for DialoGPT + # Add pad_token_id for DialoGPT and GPT2 if "DialoGPT" in model_id or "gpt2" in model_id: pipeline_kwargs["pad_token_id"] = 50256 @@ -178,6 +177,24 @@ def _call_hgf(self, model_id: str, prompt: str) -> str: response = hf.invoke(prompt) + if "gpt2" in model_id.lower(): + response = response.removeprefix(prompt).strip() + + lines = response.split("\n") + cleaned_lines = [] + for line in lines: + if line.strip() and line.strip() != "---": + cleaned_lines.append(line) + + response = "\n".join(cleaned_lines).strip() + + # If we got nothing useful, return a default response + if not response: + self.logger.warning( + "GPT2 produced no useful output, " "returning default: 'no, 0'" + ) + response = "no, 0" + self.logger.info(f"HuggingFace model response length: {len(response)}") return response diff --git a/web/requirements.txt b/web/requirements.txt index 69be5c7..087e587 100644 --- a/web/requirements.txt +++ b/web/requirements.txt @@ -1,7 +1,7 @@ Django~=4.2.6 requests~=2.31.0 -spacy~=3.7.0 -scispacy~=0.5.4 +spacy~=3.7.0 --prefer-binary +scispacy~=0.6.2 # LLM dependencies (optional, install based on which LLM you want to use) # For paid APIs: @@ -11,6 +11,8 @@ scispacy~=0.5.4 # For free local models (recommended): langchain-huggingface>=0.0.1 # For HuggingFace models langchain-community>=0.0.1 # For Ollama and other local models -transformers>=4.35.0 # Required by HuggingFace models -torch>=2.0.0 # Required by HuggingFace models -accelerate>=0.24.0 # Speeds up model loading +# Required by HuggingFace models +transformers>=4.35.0 +torch>=2.0.0 +# Speeds up model loading +accelerate>=0.24.0 diff --git a/web/slurper/source_wikidata.py b/web/slurper/source_wikidata.py index 9cc7272..85e5136 100644 --- a/web/slurper/source_wikidata.py +++ b/web/slurper/source_wikidata.py @@ -10,12 +10,62 @@ # Wikipedia API contact email (required by Wikipedia API guidelines) # Set to None to disable Wikipedia article fetching WIKIPEDIA_CONTACT_EMAIL = None +_missing_email_logged = False -# Wikidata entities to exclude from queries (natural numbers and positive integers) -KNOWN_EXCLUDED_CATEGORIES = ["wd:Q21199", "wd:Q28920044"] +# Wikidata entities to exclude from queries +KNOWN_EXCLUDED_CATEGORIES = [ + # Natural numbers + "wd:Q21199", + # positive integers + "wd:Q28920044", + # countries + "wd:Q6256", + # philosophical concepts + "wd:Q714737", +] -# Flag to track if we've logged the missing email warning -_missing_email_logged = False + +def _load_excluded_categories_from_results(): + """ + Load Wikidata identifiers of items that have been categorized as "no" + with confidence > 49%, to be excluded from future 
queries. + + Returns a list of Wikidata entity IDs in the format ["wd:Q12345", ...]. + """ + try: + from concepts.models import CategorizerResult + from django.db.models import Avg + + excluded_items = ( + CategorizerResult.objects.filter( + result_answer=False, result_confidence__gt=49 + ) + .values("item__identifier", "item__source") + .annotate(avg_confidence=Avg("result_confidence")) + .filter(avg_confidence__gt=49, item__source=Item.Source.WIKIDATA) + .distinct() + ) + + categories = [f"wd:{item['item__identifier']}" for item in excluded_items] + + if categories: + logging.log( + logging.INFO, + f"Loaded {len(categories)} excluded categories " + f"from categorizer results", + ) + + return categories + except Exception as e: + logging.log( + logging.DEBUG, f"Could not load excluded categories from results: {e}" + ) + return [] + + +RESULT_EXCLUDED_CATEGORIES = _load_excluded_categories_from_results() + +EXCLUDED_CATEGORIES = KNOWN_EXCLUDED_CATEGORIES + RESULT_EXCLUDED_CATEGORIES # These are added to every query: @@ -47,7 +97,7 @@ class WikidataSlurper: # except for natural numbers and positive integers FILTER NOT EXISTS { VALUES ?excludedType { """ - + " ".join(KNOWN_EXCLUDED_CATEGORIES) + + " ".join(EXCLUDED_CATEGORIES) + """ } ?item wdt:P31 ?excludedType . } From 35124cd75d4789f06faf56ce324863e62b0da5ad Mon Sep 17 00:00:00 2001 From: Slobodan Stanojevikj Date: Sun, 14 Dec 2025 11:51:28 +0100 Subject: [PATCH 06/12] -adapted readmes and makefile --- Makefile | 15 +++++ README.md | 115 +++++++++++++++++++++++++++----------- web/categorizer/README.md | 4 +- 3 files changed, 101 insertions(+), 33 deletions(-) diff --git a/Makefile b/Makefile index 317c9fd..d2cff24 100644 --- a/Makefile +++ b/Makefile @@ -1,12 +1,26 @@ install: + pip install -r web/requirements.txt + +install-dev: pip install -r requirements.txt install-scispacy: pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_core_sci_lg-0.5.4.tar.gz +prepare-db: + pip install -r web/requirements.txt + python manage.py migrate + python manage.py createsuperuser + start: python ./web/manage.py runserver +populate-db: + python manage.py import_wikidata + +clear-db: + python manage.py clear_wikidata + compute-concepts: python ./web/manage.py compute_concepts @@ -14,6 +28,7 @@ categorize: python ./web/manage.py categorize --limit 10 fix-files: + pip install -r requirements.txt python3 -m black . python3 -m isort . python3 -m flake8 . 
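An aside for reviewers of the slurper change in PATCH 05 above: the exclusion mechanism is easiest to see with the string splice written out. Below is a minimal sketch, not part of the patch, assuming only the hard-coded `KNOWN_EXCLUDED_CATEGORIES` list (the database-derived `RESULT_EXCLUDED_CATEGORIES` would simply extend the same list) of the `FILTER NOT EXISTS` clause the query ends up containing:

```python
# Sketch only: mirrors how WikidataSlurper splices the exclusion list into
# its SPARQL query in PATCH 05; the entity IDs are the ones hard-coded in
# KNOWN_EXCLUDED_CATEGORIES in that patch.
KNOWN_EXCLUDED_CATEGORIES = [
    "wd:Q21199",  # natural numbers
    "wd:Q28920044",  # positive integers
    "wd:Q6256",  # countries
    "wd:Q714737",  # philosophical concepts
]

filter_clause = (
    "FILTER NOT EXISTS {\n"
    "  VALUES ?excludedType { " + " ".join(KNOWN_EXCLUDED_CATEGORIES) + " }\n"
    "  ?item wdt:P31 ?excludedType .\n"
    "}"
)

print(filter_clause)
# FILTER NOT EXISTS {
#   VALUES ?excludedType { wd:Q21199 wd:Q28920044 wd:Q6256 wd:Q714737 }
#   ?item wdt:P31 ?excludedType .
# }
```

Any item whose `wdt:P31` (instance of) value appears in that `VALUES` set is dropped from the SPARQL results before it ever reaches the categorizer.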
diff --git a/README.md b/README.md
index 0517efd..bb558e3 100644
--- a/README.md
+++ b/README.md
@@ -8,35 +8,84 @@ For a demonstration of a page with at least one link, see for example `{baseurl}
 
 To install all the necessary Python packages, run:
 
-    pip install -r requirements.txt
+```bash
+make install
+# OR
+pip install -r web/requirements.txt
+```
 
 Next, to create a database, run:
 
-    python manage.py migrate
+```bash
+make prepare-db # which migrates db and creates superuser
+# OR
+python manage.py migrate
+```
 
 In order to use the administrative interface, you need to create an admin user:
 
-    python manage.py createsuperuser
+```bash
+python manage.py createsuperuser
+```
 
 Finally, to populate the database, run
 
-    python manage.py import_wikidata
+```bash
+python manage.py import_wikidata
+# OR
+make populate-db
+```
+
+ * In order to fetch wikipedia articles and extract keywords from them:
+   ```bash
+   make install-scispacy
+   ```
+   then configure your email `WIKIPEDIA_CONTACT_EMAIL` in [source_wikidata.py](web/slurper/source_wikidata.py)
+   * This is needed for article fetching and keyword extraction
+   * Then run the database population (make sure your db is cleared)
+
+
 If you ever want to repopulate the database, you can clear it using
 
-    python manage.py clear_wikidata
+```bash
+python manage.py clear_wikidata
+```
+
+### Running the categorizer
+The categorizer is set up to work with several models, divided into free and paid.
+All of them run locally, so expect some performance hits. The models are downloaded the first time the
+categorizer runs, and by default the free models are used.
+
+The database needs to be populated before running the categorizer, so:
+```bash
+make populate-db
+```
+then
+```bash
+make categorize
+```
+
+There are some known issues that have inline fixes, such as `gpt2` getting stuck
+and echoing the prompt back, then emitting `---\n\n\n---` a few times.
+
+For more details see the [categorizer readme](web/categorizer/README.md).
 
 ## Notes for developers
 
 In order to contribute, install [Black](https://github.com/psf/black) and [isort](https://pycqa.github.io/isort/) autoformatters and [Flake8](https://flake8.pycqa.org/) linter.
-
-    pip install black isort flake8
+```bash
+make install-dev
+```
 
 You can run all three with
-
-    isort .
-    black .
-    flake8
+```bash
+make fix-files
+# Or manually
+isort .
+black .
+flake8
+```
 
 or set up a Git pre-commit hook by creating `.git/hooks/pre-commit` with the following contents:
 
@@ -47,35 +96,37 @@ black . && isort . && flake8
 ```
 
 Each time after you change a model, make sure to create the appropriate migrations:
-
-    python manage.py makemigrations
+```bash
+python manage.py makemigrations
+```
 
 To update the database with the new model, run:
-
+```bash
 python manage.py migrate
+```
 
 ## Instructions for Katja to update the live version
-
-    sudo systemctl stop mathswitch
-    cd mathswitch
-    git pull
-    source venv/bin/activate
-    cd web
-    ./manage.py rebuild_db
-    sudo systemctl start mathswitch
-
+```bash
+sudo systemctl stop mathswitch
+cd mathswitch
+git pull
+source venv/bin/activate
+cd web
+./manage.py rebuild_db
+sudo systemctl start mathswitch
+```
 
 ## WD item JSON example
 
-```
+```json
 {
-    'item': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q192276'},
-    'art': {'type': 'uri', 'value': 'https://en.wikipedia.org/wiki/Measure_(mathematics)'},
-    'image': {'type': 'uri', 'value': 'http://commons.wikimedia.org/wiki/Special:FilePath/Measure%20illustration%20%28Vector%29.svg'},
-    'mwID': {'type': 'literal', 'value': 'Measure'},
-    'itemLabel': {'xml:lang': 'en', 'type': 'literal', 'value': 'measure'},
-    'itemDescription': {'xml:lang': 'en', 'type': 'literal', 'value': 'function assigning numbers to some subsets of a set, which could be seen as a generalization of length, area, volume and integral'},
-    'eomID': {'type': 'literal', 'value': 'measure'},
-    'pwID': {'type': 'literal', 'value': 'Definition:Measure_(Measure_Theory)'
+    "item": {"type": "uri", "value": "http://www.wikidata.org/entity/Q192276"},
+    "art": {"type": "uri", "value": "https://en.wikipedia.org/wiki/Measure_(mathematics)"},
+    "image": {"type": "uri", "value": "http://commons.wikimedia.org/wiki/Special:FilePath/Measure%20illustration%20%28Vector%29.svg"},
+    "mwID": {"type": "literal", "value": "Measure"},
+    "itemLabel": {"xml:lang": "en", "type": "literal", "value": "measure"},
+    "itemDescription": {"xml:lang": "en", "type": "literal", "value": "function assigning numbers to some subsets of a set, which could be seen as a generalization of length, area, volume and integral"},
+    "eomID": {"type": "literal", "value": "measure"},
+    "pwID": {"type": "literal", "value": "Definition:Measure_(Measure_Theory)"}
 }
 ```
diff --git a/web/categorizer/README.md b/web/categorizer/README.md
index 40514d3..7f44469 100644
--- a/web/categorizer/README.md
+++ b/web/categorizer/README.md
@@ -8,7 +8,7 @@ The categorizer module provides LLM-powered categorization of mathematical conce
 
 **For FREE local models (recommended):**
 ```bash
-pip install langchain-huggingface langchain-community transformers torch accelerate
+make install
 ```
 
 **For paid API models (optional):**
@@ -63,6 +63,8 @@ python manage.py categorize
 Categorize a limited number of items:
 ```bash
 python manage.py categorize --limit 10
+# OR
+make categorize
 ```
 
 Use a specific LLM provider:

From 605ff7fdf2a598d159dfe9f5a7106ec342d04848 Mon Sep 17 00:00:00 2001
From: Slobodan Stanojevikj
Date: Sun, 14 Dec 2025 13:53:30 +0100
Subject: [PATCH 07/12] -upgraded github actions to latest
 -added static python version 3.12.7

---
 .github/workflows/test.yml | 6 ++++--
 .python-version            | 1 +
 2 files changed, 5 insertions(+), 2 deletions(-)
 create mode 100644 .python-version

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 0115b76..25739e6 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -10,8 +10,10 @@ jobs:
   test:
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v2
-      - uses: actions/setup-python@v2
+      - uses: actions/checkout@v5
+      - uses: actions/setup-python@v6
+        with:
+          python-version: '3.12.7'
       - run: pip install -r web/requirements.txt
       - run: pip install black isort flake8
       - run: python3 -m black --check .
diff --git a/.python-version b/.python-version
new file mode 100644
index 0000000..56bb660
--- /dev/null
+++ b/.python-version
@@ -0,0 +1 @@
+3.12.7

From 67e052cbee06fef824104cbfd947b1df1fafc3c0 Mon Sep 17 00:00:00 2001
From: Slobodan Stanojevikj
Date: Sun, 14 Dec 2025 14:40:27 +0100
Subject: [PATCH 08/12] -added static versions to dependencies

---
 web/requirements.txt | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/web/requirements.txt b/web/requirements.txt
index 087e587..3a83c77 100644
--- a/web/requirements.txt
+++ b/web/requirements.txt
@@ -1,5 +1,5 @@
 Django~=4.2.6
-requests~=2.31.0
+requests~=2.32.5
 spacy~=3.7.0 --prefer-binary
 scispacy~=0.6.2
 
@@ -8,11 +8,11 @@ scispacy~=0.6.2
 # openai>=1.0.0  # Uncomment for OpenAI GPT models
 # anthropic>=0.7.0  # Uncomment for Anthropic Claude
 
-# For free local models (recommended):
-langchain-huggingface>=0.0.1  # For HuggingFace models
-langchain-community>=0.0.1  # For Ollama and other local models
+# For free local models:
+langchain-huggingface==0.3.1  # For HuggingFace models
+langchain-community==0.3.27  # For Ollama and other local models
 # Required by HuggingFace models
-transformers>=4.35.0
-torch>=2.0.0
+transformers~=4.57.0
+torch~=2.9.0
 # Speeds up model loading
-accelerate>=0.24.0
+accelerate~=1.12.0

From 94730dc7c2c2b9b8434ac989ee64e653741b7861 Mon Sep 17 00:00:00 2001
From: Slobodan Stanojevikj
Date: Sun, 14 Dec 2025 16:17:03 +0100
Subject: [PATCH 09/12] -added configuration parser and config file (.env.example)
 -minor Makefile cleanup and updated Readme

---
 .gitignore                     |  3 ++-
 Makefile                       | 10 ++++------
 README.md                      |  9 ++++++---
 web/.env.example               |  2 ++
 web/requirements.txt           |  1 +
 web/slurper/source_wikidata.py |  2 +-
 web/web/settings.py            |  5 ++++-
 7 files changed, 20 insertions(+), 12 deletions(-)
 create mode 100644 web/.env.example

diff --git a/.gitignore b/.gitignore
index 70b914e..908cac1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
 venv
 __pycache__
-web/db.sqlite3
+web/**/*.sqlite3
+**/.env
diff --git a/Makefile b/Makefile
index d2cff24..5f6ba68 100644
--- a/Makefile
+++ b/Makefile
@@ -1,5 +1,8 @@
-install:
+prepare-web:
 	pip install -r web/requirements.txt
+	cp web/.env.example web/.env
+	python manage.py migrate
+	python manage.py createsuperuser
 
 install-dev:
 	pip install -r requirements.txt
@@ -7,11 +10,6 @@ install-dev:
 install-scispacy:
 	pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_core_sci_lg-0.5.4.tar.gz
 
-prepare-db:
-	pip install -r web/requirements.txt
-	python manage.py migrate
-	python manage.py createsuperuser
-
 start:
 	python ./web/manage.py runserver
 
diff --git a/README.md b/README.md
index bb558e3..97beebb 100644
--- a/README.md
+++ b/README.md
@@ -9,16 +9,19 @@ For a demonstration of a page with at least one link, see for example `{baseurl}
 To install all the necessary Python packages, run:
 
 ```bash
-make install
+make prepare-web # sets up the env file, migrates the db, and creates a superuser
 # OR
 pip install -r web/requirements.txt
 ```
 
+Prepare an environment file:
+```bash
+cp web/.env.example web/.env
+```
+
 Next, to create a database, run:
 
 ```bash
-make prepare-db # which migrates db and creates superuser
-# OR
 python manage.py migrate
 ```
 
diff --git a/web/.env.example b/web/.env.example
new file mode 100644
index 0000000..40e274f
--- /dev/null
+++ b/web/.env.example
@@ -0,0 +1,2 @@
+SECRET_KEY="django-insecure-9wy9w#vf^tde0262doyy_j19=64c()_qub!1)f+fh-b^=7ndw*" +WIKIPEDIA_CONTACT_EMAIL=my@email.com \ No newline at end of file diff --git a/web/requirements.txt b/web/requirements.txt index 3a83c77..4d6c286 100644 --- a/web/requirements.txt +++ b/web/requirements.txt @@ -2,6 +2,7 @@ Django~=4.2.6 requests~=2.32.5 spacy~=3.7.0 --prefer-binary scispacy~=0.6.2 +python-decouple~=3.8 # LLM dependencies (optional, install based on which LLM you want to use) # For paid APIs: diff --git a/web/slurper/source_wikidata.py b/web/slurper/source_wikidata.py index 85e5136..1004d4e 100644 --- a/web/slurper/source_wikidata.py +++ b/web/slurper/source_wikidata.py @@ -6,10 +6,10 @@ from concepts.models import Item from django.db.utils import IntegrityError from slurper.wd_raw_item import WD_OTHER_SOURCES, BaseWdRawItem +from web.settings import WIKIPEDIA_CONTACT_EMAIL # Wikipedia API contact email (required by Wikipedia API guidelines) # Set to None to disable Wikipedia article fetching -WIKIPEDIA_CONTACT_EMAIL = None _missing_email_logged = False # Wikidata entities to exclude from queries diff --git a/web/web/settings.py b/web/web/settings.py index 199c794..8195f10 100644 --- a/web/web/settings.py +++ b/web/web/settings.py @@ -12,6 +12,7 @@ from os import path from pathlib import Path +from decouple import config # Build paths inside the project like this: BASE_DIR / 'subdir'. BASE_DIR = Path(__file__).resolve().parent.parent @@ -21,7 +22,7 @@ # See https://docs.djangoproject.com/en/4.2/howto/deployment/checklist/ # SECURITY WARNING: keep the secret key used in production secret! -SECRET_KEY = "django-insecure-9wy9w#vf^tde0262doyy_j19=64c()_qub!1)f+fh-b^=7ndw*" +SECRET_KEY = config('SECRET_KEY', default="django-insecure-9wy9w#vf^tde0262doyy_j19=64c()_qub!1)f+fh-b^=7ndw*") # SECURITY WARNING: don't run with debug turned on in production! DEBUG = True @@ -128,3 +129,5 @@ # https://docs.djangoproject.com/en/4.2/ref/settings/#default-auto-field DEFAULT_AUTO_FIELD = "django.db.models.BigAutoField" + +WIKIPEDIA_CONTACT_EMAIL = config('WIKIPEDIA_CONTACT_EMAIL', default="my@email.com") From d1acf435fd21b79b2fa69e57df6bd2bf1dd21544 Mon Sep 17 00:00:00 2001 From: Slobodan Stanojevikj Date: Sun, 14 Dec 2025 16:36:04 +0100 Subject: [PATCH 10/12] -code restyling -excluded venv from flake8 --- .flake8 | 2 +- web/slurper/source_wikidata.py | 1 + web/web/settings.py | 8 ++++++-- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/.flake8 b/.flake8 index 8e952af..7391d15 100644 --- a/.flake8 +++ b/.flake8 @@ -1,3 +1,3 @@ [flake8] -exclude = experiments,migrations,settings.py +exclude = experiments,migrations,settings.py,venv/ max-line-length = 88 diff --git a/web/slurper/source_wikidata.py b/web/slurper/source_wikidata.py index 1004d4e..758f346 100644 --- a/web/slurper/source_wikidata.py +++ b/web/slurper/source_wikidata.py @@ -6,6 +6,7 @@ from concepts.models import Item from django.db.utils import IntegrityError from slurper.wd_raw_item import WD_OTHER_SOURCES, BaseWdRawItem + from web.settings import WIKIPEDIA_CONTACT_EMAIL # Wikipedia API contact email (required by Wikipedia API guidelines) diff --git a/web/web/settings.py b/web/web/settings.py index 8195f10..8b33153 100644 --- a/web/web/settings.py +++ b/web/web/settings.py @@ -12,6 +12,7 @@ from os import path from pathlib import Path + from decouple import config # Build paths inside the project like this: BASE_DIR / 'subdir'. 
@@ -22,7 +23,10 @@ # See https://docs.djangoproject.com/en/4.2/howto/deployment/checklist/ # SECURITY WARNING: keep the secret key used in production secret! -SECRET_KEY = config('SECRET_KEY', default="django-insecure-9wy9w#vf^tde0262doyy_j19=64c()_qub!1)f+fh-b^=7ndw*") +SECRET_KEY = config( + "SECRET_KEY", + default="django-insecure-9wy9w#vf^tde0262doyy_j19=64c()_qub!1)f+fh-b^=7ndw*", +) # SECURITY WARNING: don't run with debug turned on in production! DEBUG = True @@ -130,4 +134,4 @@ DEFAULT_AUTO_FIELD = "django.db.models.BigAutoField" -WIKIPEDIA_CONTACT_EMAIL = config('WIKIPEDIA_CONTACT_EMAIL', default="my@email.com") +WIKIPEDIA_CONTACT_EMAIL = config("WIKIPEDIA_CONTACT_EMAIL", default="my@email.com") From 31354339c727cdf73dec7a645831e4235ffe250f Mon Sep 17 00:00:00 2001 From: Slobodan Stanojevikj Date: Sun, 14 Dec 2025 17:41:52 +0100 Subject: [PATCH 11/12] -minor fixes in Makefile commands --- Makefile | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Makefile b/Makefile index 5f6ba68..7436995 100644 --- a/Makefile +++ b/Makefile @@ -1,8 +1,8 @@ prepare-web: pip install -r web/requirements.txt cp web/.env.example web/.env - python manage.py migrate - python manage.py createsuperuser + python ./web/manage.py migrate + python ./web/manage.py createsuperuser install-dev: pip install -r requirements.txt @@ -14,10 +14,10 @@ start: python ./web/manage.py runserver populate-db: - python manage.py import_wikidata + python ./web/manage.py import_wikidata clear-db: - python manage.py clear_wikidata + python ./web/manage.py clear_wikidata compute-concepts: python ./web/manage.py compute_concepts From 8b1f7387dbb415b8ed088e39d696f49ded06c343 Mon Sep 17 00:00:00 2001 From: Slobodan Stanojevikj Date: Sun, 14 Dec 2025 19:48:07 +0100 Subject: [PATCH 12/12] -limited llm input for context window --- web/categorizer/categorizer_service.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/web/categorizer/categorizer_service.py b/web/categorizer/categorizer_service.py index 363e1e1..008289b 100644 --- a/web/categorizer/categorizer_service.py +++ b/web/categorizer/categorizer_service.py @@ -132,14 +132,14 @@ def _build_categorization_prompt(self, item, predicate: str): item_info_parts = [f"Name: {item.name}"] if item.description: - item_info_parts.append(f"Description: {item.description}") + item_info_parts.append(f"Description: {item.description[:100]}") if item.keywords: - item_info_parts.append(f"Keywords: {item.keywords}") + item_info_parts.append(f"Keywords: {item.keywords[:200]}") if item.article_text: - # Truncate article text to 5000 characters - article_text = item.article_text[:5000] + # Truncate article text to 1000 characters + article_text = item.article_text[:1000] item_info_parts.append(f"Article text: {article_text}") item_info = "\n".join(item_info_parts)
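A closing aside on the response contract: PATCH 05 switched the categorizer from JSON to a comma-separated `answer,confidence` reply, and PATCH 12 above keeps the prompt small enough for local models to honor it. Below is a minimal, self-contained sketch of that parsing logic, not part of the series itself; the real method is `CategorizerService._parse_categorization_result`, which logs warnings where this version silently falls back to `None`:

```python
# Sketch only: a standalone re-implementation of the comma-separated
# parsing contract from PATCH 05, runnable outside Django.
def parse_categorization_result(result: str) -> dict:
    result = result.strip()

    # Try separators in order of specificity: ", ", ",", " "
    if ", " in result:
        parts = result.split(", ", 1)
    elif "," in result:
        parts = result.split(",", 1)
    else:
        parts = result.split(" ", 1)

    answer_str = parts[0].strip().lower()

    confidence = None
    if len(parts) == 2 and parts[1].strip():
        try:
            confidence = int(parts[1].strip())
        except ValueError:
            confidence = None  # non-numeric confidence is treated as missing
        else:
            if not 0 <= confidence <= 100:
                confidence = None  # out-of-range confidence is treated as missing

    # Accept yes/true/1 as True and no/false/0 as False
    if answer_str in ("yes", "true", "1"):
        answer = True
    elif answer_str in ("no", "false", "0"):
        answer = False
    else:
        raise ValueError(f"Invalid answer value: {answer_str}")

    return {"answer": answer, "confidence": confidence}


assert parse_categorization_result("yes,85") == {"answer": True, "confidence": 85}
assert parse_categorization_result("no, 150") == {"answer": False, "confidence": None}
assert parse_categorization_result("true") == {"answer": True, "confidence": None}
```

In `categorize_item`, a missing confidence is then defaulted to 50 before the `CategorizerResult` row is saved, so the sketch's `None` values never reach the database.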