From 93561b1e906afad521f55c7c65e3f3d49ae818f8 Mon Sep 17 00:00:00 2001 From: Slobodan Stanojevikj Date: Fri, 12 Dec 2025 10:46:32 +0100 Subject: [PATCH 01/12] -wikidata SPARQL query adjustments -added fetching from related articles and keyword extraction -added local llm execution in order to categorize items --- Makefile | 8 + requirements.txt | 5 + web/categorizer/README.md | 139 ++++++++++++ web/categorizer/__init__.py | 0 web/categorizer/categorizer_service.py | 196 ++++++++++++++++ web/categorizer/llm_service.py | 209 ++++++++++++++++++ web/categorizer/management/__init__.py | 0 .../management/commands/__init__.py | 0 .../management/commands/categorize.py | 33 +++ web/concepts/admin.py | 17 +- web/concepts/migrations/0011_item_keywords.py | 18 ++ .../migrations/0012_item_article_text.py | 18 ++ web/concepts/migrations/0013_item_aliases.py | 18 ++ .../migrations/0014_categorizerresult.py | 58 +++++ web/concepts/models.py | 30 +++ web/requirements.txt | 14 ++ web/slurper/keyword_util.py | 25 +++ web/slurper/source_wikidata.py | 95 +++++++- web/slurper/wd_raw_item.py | 27 +++ web/web/settings.py | 1 + web/web/urls.py | 1 + 21 files changed, 908 insertions(+), 4 deletions(-) create mode 100644 Makefile create mode 100644 requirements.txt create mode 100644 web/categorizer/README.md create mode 100644 web/categorizer/__init__.py create mode 100644 web/categorizer/categorizer_service.py create mode 100644 web/categorizer/llm_service.py create mode 100644 web/categorizer/management/__init__.py create mode 100644 web/categorizer/management/commands/__init__.py create mode 100644 web/categorizer/management/commands/categorize.py create mode 100644 web/concepts/migrations/0011_item_keywords.py create mode 100644 web/concepts/migrations/0012_item_article_text.py create mode 100644 web/concepts/migrations/0013_item_aliases.py create mode 100644 web/concepts/migrations/0014_categorizerresult.py create mode 100644 web/slurper/keyword_util.py diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..3cfe1f6 --- /dev/null +++ b/Makefile @@ -0,0 +1,8 @@ +install: + pip install -r requirements.txt + +start: + python ./web/manage.py runserver + +compute-concepts: + python ./web/manage.py compute_concepts diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..e86dbb8 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,5 @@ +black~=25.9.0 +isort~=4.2.5 +flake8~=7.3.0 + +-r ./web/requirements.txt diff --git a/web/categorizer/README.md b/web/categorizer/README.md new file mode 100644 index 0000000..40514d3 --- /dev/null +++ b/web/categorizer/README.md @@ -0,0 +1,139 @@ +# Categorizer Module + +The categorizer module provides LLM-powered categorization of mathematical concepts. + +## Setup + +### 1. Install Required Dependencies + +**For FREE local models (recommended):** +```bash +pip install langchain-huggingface langchain-community transformers torch accelerate +``` + +**For paid API models (optional):** + +For OpenAI: +```bash +pip install openai +``` + +For Anthropic Claude: +```bash +pip install anthropic +``` + +**For Ollama (free local alternative):** +1. Install Ollama from https://ollama.ai +2. Install langchain-community: `pip install langchain-community` +3. Pull a model: `ollama pull llama2` + +### 2. 
Configure API Keys (only for paid models) + +Set the appropriate environment variable for your chosen LLM provider: + +**For OpenAI:** +```bash +export OPENAI_API_KEY="your-openai-api-key-here" +``` + +**For Anthropic Claude:** +```bash +export ANTHROPIC_API_KEY="your-anthropic-api-key-here" +``` + +**For Ollama (optional):** +```bash +export OLLAMA_MODEL="llama2" # Default is llama2 +``` + +You can also add these to a `.env` file or your shell configuration file (`.bashrc`, `.zshrc`, etc.). + +## Usage + +### Basic Usage + +Categorize all items using the default FREE LLM (HuggingFace FLAN-T5): +```bash +python manage.py categorize +``` + +### With Options + +Categorize a limited number of items: +```bash +python manage.py categorize --limit 10 +``` + +Use a specific LLM provider: + +**FREE models (run locally):** +```bash +# Use HuggingFace FLAN-T5 (default, free, good for instruction following) +python manage.py categorize --llm huggingface_flan_t5 + +# Use HuggingFace GPT-2 (free, generative model) +python manage.py categorize --llm huggingface_gpt2 + +# Use HuggingFace DialoGPT (free, conversational model) +python manage.py categorize --llm huggingface_dialogpt + +# Use Ollama (free, requires Ollama installed) +python manage.py categorize --llm ollama +``` + +**Paid API models:** +```bash +# Use OpenAI GPT-4 (requires API key) +python manage.py categorize --llm openai_gpt4 + +# Use OpenAI GPT-3.5 Turbo (requires API key) +python manage.py categorize --llm openai_gpt35 + +# Use Anthropic Claude (requires API key) +python manage.py categorize --llm anthropic_claude +``` + +Combine options: +```bash +python manage.py categorize --limit 5 --llm huggingface_flan_t5 +``` + +## Architecture + +- `categorizer_service.py` - Main service for categorizing items +- `llm_service.py` - Service for calling various LLM APIs +- `management/commands/categorize.py` - Django management command + +## Supported LLMs + +### Free Models (No API Key Required) +1. **HuggingFace FLAN-T5** - Google's instruction-following model (recommended for tasks) +2. **HuggingFace GPT-2** - OpenAI's classic generative model +3. **HuggingFace DialoGPT** - Microsoft's conversational model +4. **Ollama** - Run any Ollama model locally (llama2, mistral, etc.) + +### Paid API Models (Require API Key) +1. **OpenAI GPT-4** - Most capable, but expensive +2. **OpenAI GPT-3.5 Turbo** - Fast and cheaper than GPT-4 +3. **Anthropic Claude** - High quality, good reasoning + +## Performance Notes + +- **Free models** run locally and don't require internet/API keys, but: + - First run downloads the model (~1-3GB depending on model) + - Requires sufficient RAM (4-8GB+ recommended) + - Slower than API models (especially without GPU) + +- **API models** are faster but cost money per request + +- **Ollama** is a good middle ground - free, local, and supports many models + +## Extending + +To add support for additional LLM providers: + +1. Add a new entry to the `LLMType` enum in `llm_service.py` +2. Implement a new private method (e.g., `_call_new_provider`) in the `LLMService` class +3. Add the new provider to the `call_llm` method's conditional logic +4. 
Update the command choices in `management/commands/categorize.py` diff --git a/web/categorizer/__init__.py b/web/categorizer/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/web/categorizer/categorizer_service.py b/web/categorizer/categorizer_service.py new file mode 100644 index 0000000..2a4e40e --- /dev/null +++ b/web/categorizer/categorizer_service.py @@ -0,0 +1,196 @@ +import json +import logging +import re +from concepts.models import Item, CategorizerResult +from categorizer.llm_service import LLMService, LLMType + +# Free LLM types to use for categorization +LLM_JUDGE_POOL = [ + LLMType.HUGGINGFACE_FLAN_T5, + LLMType.HUGGINGFACE_GPT2, + LLMType.HUGGINGFACE_DIALOGPT, +] + + +class CategorizerService: + """ + Service for categorizing mathematical concepts. + """ + + def __init__(self): + self.logger = logging.getLogger(__name__) + self.llm_service = LLMService() + + def categorize_items(self, limit=None): + """ + Categorize items from the database using all free LLM types. + + Args: + limit: Optional limit on number of items to process + """ + queryset = Item.objects.all() + if limit: + queryset = queryset[:limit] + + total = queryset.count() + self.logger.info(f"Categorizing {total} items using {len(LLM_JUDGE_POOL)} free LLMs") + + for i, item in enumerate(queryset): + self.logger.info( + f"Processing item {i + 1}/{total}: {item.identifier}" + ) + self.categorize_item(item) + + self.logger.info("Categorization complete") + + def categorize_item( + self, + item, + predicate: str = "Is the given concept a mathematical concept, given the name, description, keywords, and article text?" + ): + """ + Categorize a single item using all free LLM types. + + Args: + item: Item instance to categorize + predicate: The question to evaluate (default: checks if it's a mathematical concept) + + Returns: + List of categorization results from all LLMs + """ + self.logger.debug(f"Categorizing: {item.name}") + + prompt = self._build_categorization_prompt(item, predicate) + + results = [] + + for llm_type in LLM_JUDGE_POOL: + try: + self.logger.info(f"Calling {llm_type.value} for {item.name}") + raw_result = self.llm_service.call_llm(llm_type, prompt) + self.logger.info(f"Categorized {item.name} with {llm_type.value}: {raw_result[:100]}...") + + parsed_result = self._parse_categorization_result(raw_result) + + categorizer_result = CategorizerResult.objects.create( + item=item, + llm_type=llm_type.value, + raw_result=raw_result, + result_answer=parsed_result["answer"], + result_confidence=parsed_result["confidence"], + ) + categorizer_result.save() + + self.logger.info( + f"Saved categorization result for {item.name} ({llm_type.value}): " + f"answer={parsed_result['answer']}, " + f"confidence={parsed_result['confidence']}" + ) + + results.append(parsed_result) + except Exception as e: + self.logger.error(f"Failed to categorize {item.name} with {llm_type.value}: {e}") + # Continue with other LLMs even if one fails? + continue + + return results + + def _build_categorization_prompt(self, item, predicate: str): + """ + Build a prompt for evaluating a concept against a predicate. + + Args: + item: Item instance to categorize + predicate: The question/predicate to evaluate + + Returns: + Formatted prompt string + """ + system_prompt = """You are a categorization judge. Your task is to evaluate whether a given concept satisfies a specific predicate. + +You must respond with a structured answer containing: +1. answer: true or false (boolean) +2. 
confidence: a number from 0 to 100 (representing your confidence percentage) + +Format your response as JSON: +{ + "answer": true, + "confidence": 85 +}""" + + item_info_parts = [f"Name: {item.name}"] + + if item.description: + item_info_parts.append(f"Description: {item.description}") + + if item.keywords: + item_info_parts.append(f"Keywords: {item.keywords}") + + if item.article_text: + # Truncate article text to 5000 characters + article_text = item.article_text[:5000] + item_info_parts.append(f"Article text: {article_text}") + + item_info = "\n".join(item_info_parts) + + prompt = f"""{system_prompt} + +--- + +CONCEPT INFORMATION: +{item_info} + +--- + +PREDICATE TO EVALUATE: +{predicate} + +--- + +Please provide your evaluation in JSON format.""" + + return prompt + + def _parse_categorization_result(self, result: str) -> dict: + """ + Parse the LLM's JSON response. + + Args: + result: The raw response from the LLM + + Returns: + Dictionary with 'answer' (bool) and 'confidence' (int) keys + + Raises: + ValueError: If the response cannot be parsed + """ + try: + json_match = re.search(r'\{[^}]*"answer"[^}]*\}', result, re.DOTALL) + if json_match: + json_str = json_match.group(0) + parsed = json.loads(json_str) + else: + parsed = json.loads(result) + + if "answer" not in parsed or "confidence" not in parsed: + raise ValueError("Response missing required fields 'answer' or 'confidence'") + + answer = parsed["answer"] + if isinstance(answer, str): + answer = answer.lower() in ("true", "yes", "1") + + confidence = int(parsed["confidence"]) + if not 0 <= confidence <= 100: + raise ValueError(f"Confidence must be between 0-100, got {confidence}") + + return { + "answer": bool(answer), + "confidence": confidence + } + + except json.JSONDecodeError as e: + self.logger.error(f"Failed to parse JSON response: {result}") + raise ValueError(f"Invalid JSON response from LLM: {e}") + except (KeyError, ValueError) as e: + self.logger.error(f"Invalid response format: {result}") + raise ValueError(f"Invalid response format: {e}") diff --git a/web/categorizer/llm_service.py b/web/categorizer/llm_service.py new file mode 100644 index 0000000..cb13d23 --- /dev/null +++ b/web/categorizer/llm_service.py @@ -0,0 +1,209 @@ +import logging +import os +from enum import Enum + + +class LLMType(Enum): + """Supported LLM types""" + + # Paid API-based models + OPENAI_GPT4 = "openai_gpt4" + OPENAI_GPT35 = "openai_gpt35" + ANTHROPIC_CLAUDE = "anthropic_claude" + + # Free HuggingFace models (run locally) + HUGGINGFACE_FLAN_T5 = "huggingface_flan_t5" + HUGGINGFACE_GPT2 = "huggingface_gpt2" + HUGGINGFACE_DIALOGPT = "huggingface_dialogpt" + + # Ollama (free local models) + OLLAMA = "ollama" + + +class LLMService: + """ + Service for calling various LLM providers. 
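+
+    Illustrative usage (a sketch; assumes the optional model packages from
+    the README are installed):
+
+        service = LLMService()
+        reply = service.call_llm(LLMType.HUGGINGFACE_FLAN_T5, "Is 7 prime?")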
+ """ + + def __init__(self): + self.logger = logging.getLogger(__name__) + self.llm_handlers = { + LLMType.OPENAI_GPT4: lambda llm_type, prompt: self._call_openai(llm_type, prompt), + LLMType.OPENAI_GPT35: lambda llm_type, prompt: self._call_openai(llm_type, prompt), + LLMType.ANTHROPIC_CLAUDE: lambda llm_type, prompt: self._call_anthropic(prompt), + LLMType.HUGGINGFACE_FLAN_T5: lambda llm_type, prompt: self._call_huggingface("google/flan-t5-base", prompt), + LLMType.HUGGINGFACE_GPT2: lambda llm_type, prompt: self._call_huggingface("gpt2", prompt), + LLMType.HUGGINGFACE_DIALOGPT: lambda llm_type, prompt: self._call_huggingface("microsoft/DialoGPT-medium", prompt), + LLMType.OLLAMA: lambda llm_type, prompt: self._call_ollama(prompt), + } + + def call_llm(self, llm_type: LLMType, prompt: str) -> str: + """ + Call an LLM with the given prompt. + + Args: + llm_type: The type of LLM to use (LLMType enum) + prompt: The prompt to send to the LLM + + Returns: + The LLM's response as a string + + Raises: + ValueError: If the LLM type is not supported or API key is missing + Exception: If the API call fails + """ + self.logger.info(f"Calling {llm_type.value} with prompt length: {len(prompt)}") + + handler = self.llm_handlers.get(llm_type) + + if handler: + return handler(llm_type, prompt) + else: + raise ValueError(f"Unsupported LLM type: {llm_type}") + + def _call_openai(self, llm_type: LLMType, prompt: str) -> str: + """Call OpenAI API""" + try: + import openai + except ImportError: + raise ImportError( + "openai package is required. Install it with: pip install openai" + ) + + api_key = os.getenv("OPENAI_API_KEY") + if not api_key: + raise ValueError( + "OPENAI_API_KEY environment variable is not set. " + "Please set it to your OpenAI API key." + ) + + openai.api_key = api_key + + model = "gpt-4" if llm_type == LLMType.OPENAI_GPT4 else "gpt-3.5-turbo" + + try: + response = openai.ChatCompletion.create( + model=model, + messages=[{"role": "user", "content": prompt}], + temperature=0.7, + ) + return response.choices[0].message.content + except Exception as e: + self.logger.error(f"OpenAI API call failed: {e}") + raise + + def _call_anthropic(self, prompt: str) -> str: + """Call Anthropic Claude API""" + try: + import anthropic + except ImportError: + raise ImportError( + "anthropic package is required. Install it with: pip install anthropic" + ) + + api_key = os.getenv("ANTHROPIC_API_KEY") + if not api_key: + raise ValueError( + "ANTHROPIC_API_KEY environment variable is not set. " + "Please set it to your Anthropic API key." + ) + + client = anthropic.Anthropic(api_key=api_key) + + try: + response = client.messages.create( + model="claude-3-5-sonnet-20241022", + max_tokens=1024, + messages=[{"role": "user", "content": prompt}], + ) + return response.content[0].text + except Exception as e: + self.logger.error(f"Anthropic API call failed: {e}") + raise + + def _call_huggingface(self, model_id: str, prompt: str) -> str: + """ + Call HuggingFace models using langchain. + + Args: + model_id: HuggingFace model ID (e.g., "google/flan-t5-base") + prompt: The prompt to send to the model + + Returns: + The model's response + """ + try: + from langchain_huggingface import HuggingFacePipeline + except ImportError: + raise ImportError( + "langchain-huggingface package is required. 
" + "Install it with: pip install langchain-huggingface" + ) + + self.logger.info(f"Loading HuggingFace model: {model_id}") + + try: + pipeline_kwargs = { + "max_new_tokens": 512, + "temperature": 0.7, + } + + # TODO SST: Remove the whole model + # Add pad_token_id for DialoGPT + if "DialoGPT" in model_id or "gpt2" in model_id: + pipeline_kwargs["pad_token_id"] = 50256 + + # Create the HuggingFace pipeline + hf = HuggingFacePipeline.from_model_id( + model_id=model_id, + task="text-generation" if "gpt" in model_id.lower() else "text2text-generation", + pipeline_kwargs=pipeline_kwargs, + ) + + response = hf.invoke(prompt) + + self.logger.info(f"HuggingFace model response length: {len(response)}") + return response + + except Exception as e: + self.logger.error(f"HuggingFace model call failed: {e}") + raise + + def _call_ollama(self, prompt: str, model: str = "llama2") -> str: + """ + Call Ollama for local LLM inference. + + Args: + prompt: The prompt to send to the model + model: Ollama model name (default: llama2) + + Returns: + The model's response + + Note: + Requires Ollama to be installed and running locally. + Install from: https://ollama.ai + """ + try: + from langchain_community.llms import Ollama + except ImportError: + raise ImportError( + "langchain-community package is required. " + "Install it with: pip install langchain-community" + ) + + # Allow model override via environment variable + model = os.getenv("OLLAMA_MODEL", model) + + self.logger.info(f"Calling Ollama with model: {model}") + + try: + llm = Ollama(model=model) + response = llm.invoke(prompt) + return response + except Exception as e: + self.logger.error( + f"Ollama call failed: {e}. " + "Make sure Ollama is installed and running (https://ollama.ai)" + ) + raise diff --git a/web/categorizer/management/__init__.py b/web/categorizer/management/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/web/categorizer/management/commands/__init__.py b/web/categorizer/management/commands/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/web/categorizer/management/commands/categorize.py b/web/categorizer/management/commands/categorize.py new file mode 100644 index 0000000..af9694b --- /dev/null +++ b/web/categorizer/management/commands/categorize.py @@ -0,0 +1,33 @@ +from django.core.management.base import BaseCommand +from categorizer.categorizer_service import CategorizerService + + +class Command(BaseCommand): + help = "Categorize mathematical concepts using all free LLMs (HuggingFace models)" + + def add_arguments(self, parser): + parser.add_argument( + "--limit", + type=int, + default=None, + help="Limit the number of items to categorize", + ) + + def handle(self, *args, **options): + limit = options.get("limit") + + service = CategorizerService() + + self.stdout.write("Using all free LLMs: huggingface_flan_t5, huggingface_gpt2, huggingface_dialogpt") + if limit: + self.stdout.write(f"Categorizing up to {limit} items...") + else: + self.stdout.write("Categorizing all items...") + + try: + service.categorize_items(limit=limit) + self.stdout.write(self.style.SUCCESS("Categorization complete!")) + except Exception as e: + self.stdout.write( + self.style.ERROR(f"Categorization failed: {e}") + ) diff --git a/web/concepts/admin.py b/web/concepts/admin.py index 60e9bc5..83cb3c7 100644 --- a/web/concepts/admin.py +++ b/web/concepts/admin.py @@ -1,6 +1,6 @@ from django.contrib import admin -from .models import Item +from .models import CategorizerResult, Item class ItemAdmin(admin.ModelAdmin): @@ 
-9,4 +9,19 @@ class ItemAdmin(admin.ModelAdmin): list_filter = ["source"] +class CategorizerResultAdmin(admin.ModelAdmin): + list_display = [ + "item", + "llm_type", + "result_answer", + "result_confidence", + "created_at", + ] + search_fields = ["item__name", "item__identifier"] + list_filter = ["llm_type", "result_answer", "created_at"] + readonly_fields = ["created_at", "updated_at"] + ordering = ["-created_at"] + + admin.site.register(Item, ItemAdmin) +admin.site.register(CategorizerResult, CategorizerResultAdmin) diff --git a/web/concepts/migrations/0011_item_keywords.py b/web/concepts/migrations/0011_item_keywords.py new file mode 100644 index 0000000..1773323 --- /dev/null +++ b/web/concepts/migrations/0011_item_keywords.py @@ -0,0 +1,18 @@ +# Generated by Django 4.2.25 on 2025-12-11 18:55 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ("concepts", "0010_alter_item_source"), + ] + + operations = [ + migrations.AddField( + model_name="item", + name="keywords", + field=models.TextField(blank=True, null=True), + ), + ] diff --git a/web/concepts/migrations/0012_item_article_text.py b/web/concepts/migrations/0012_item_article_text.py new file mode 100644 index 0000000..4c1998d --- /dev/null +++ b/web/concepts/migrations/0012_item_article_text.py @@ -0,0 +1,18 @@ +# Generated by Django 4.2.25 on 2025-12-11 20:41 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ("concepts", "0011_item_keywords"), + ] + + operations = [ + migrations.AddField( + model_name="item", + name="article_text", + field=models.TextField(blank=True, null=True), + ), + ] diff --git a/web/concepts/migrations/0013_item_aliases.py b/web/concepts/migrations/0013_item_aliases.py new file mode 100644 index 0000000..510997c --- /dev/null +++ b/web/concepts/migrations/0013_item_aliases.py @@ -0,0 +1,18 @@ +# Generated by Django 4.2.25 on 2025-12-11 21:19 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ("concepts", "0012_item_article_text"), + ] + + operations = [ + migrations.AddField( + model_name="item", + name="aliases", + field=models.TextField(blank=True, null=True), + ), + ] diff --git a/web/concepts/migrations/0014_categorizerresult.py b/web/concepts/migrations/0014_categorizerresult.py new file mode 100644 index 0000000..96dc102 --- /dev/null +++ b/web/concepts/migrations/0014_categorizerresult.py @@ -0,0 +1,58 @@ +# Generated by Django 4.2.25 on 2025-12-11 22:37 + +from django.db import migrations, models +import django.db.models.deletion + + +class Migration(migrations.Migration): + + dependencies = [ + ("concepts", "0013_item_aliases"), + ] + + operations = [ + migrations.CreateModel( + name="CategorizerResult", + fields=[ + ( + "id", + models.BigAutoField( + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", + ), + ), + ("llm_type", models.CharField(max_length=50)), + ("raw_result", models.TextField()), + ("result_answer", models.BooleanField()), + ("result_confidence", models.IntegerField()), + ("created_at", models.DateTimeField(auto_now_add=True)), + ("updated_at", models.DateTimeField(auto_now=True)), + ( + "item", + models.ForeignKey( + on_delete=django.db.models.deletion.CASCADE, + related_name="categorizer_results", + to="concepts.item", + ), + ), + ], + options={ + "ordering": ["-created_at"], + "indexes": [ + models.Index( + fields=["item", "llm_type"], + name="concepts_ca_item_id_c24595_idx", 
+                    ),
+                    models.Index(
+                        fields=["result_answer"], name="concepts_ca_result__a4c7a5_idx"
+                    ),
+                    models.Index(
+                        fields=["result_confidence"],
+                        name="concepts_ca_result__d25f96_idx",
+                    ),
+                ],
+            },
+        ),
+    ]
diff --git a/web/concepts/models.py b/web/concepts/models.py
index 697eb61..0545806 100644
--- a/web/concepts/models.py
+++ b/web/concepts/models.py
@@ -88,6 +88,9 @@ def key():
     url = models.URLField(max_length=200)
     name = models.CharField(max_length=200, null=True)
     description = models.TextField(null=True)
+    keywords = models.TextField(null=True, blank=True)
+    article_text = models.TextField(null=True, blank=True)
+    aliases = models.TextField(null=True, blank=True)
     concept = models.ForeignKey(
         Concept,
         models.SET_NULL,
@@ -159,3 +162,30 @@ def save_new(source: Item, destination: Item, label: Label):
 
     def __str__(self):
         return f"{self.source} -[{self.get_label_display()}]-> {self.destination}"
+
+
+class CategorizerResult(models.Model):
+    """
+    Stores the result of categorizing an item using an LLM.
+    """
+
+    item = models.ForeignKey(
+        Item, on_delete=models.CASCADE, related_name="categorizer_results"
+    )
+    llm_type = models.CharField(max_length=50)
+    raw_result = models.TextField()
+    result_answer = models.BooleanField()
+    result_confidence = models.IntegerField()
+    created_at = models.DateTimeField(auto_now_add=True)
+    updated_at = models.DateTimeField(auto_now=True)
+
+    class Meta:
+        ordering = ["-created_at"]
+        indexes = [
+            models.Index(fields=["item", "llm_type"]),
+            models.Index(fields=["result_answer"]),
+            models.Index(fields=["result_confidence"]),
+        ]
+
+    def __str__(self):
+        return f"{self.item} - {self.llm_type}: {self.result_answer} ({self.result_confidence}%)"
\ No newline at end of file
diff --git a/web/requirements.txt b/web/requirements.txt
index 6b64bf7..69be5c7 100644
--- a/web/requirements.txt
+++ b/web/requirements.txt
@@ -1,2 +1,16 @@
 Django~=4.2.6
 requests~=2.31.0
+spacy~=3.7.0
+scispacy~=0.5.4
+
+# LLM dependencies (optional, install based on which LLM you want to use)
+# For paid APIs:
+# openai<1.0  # Uncomment for OpenAI GPT models (code uses the pre-1.0 ChatCompletion API)
+# anthropic>=0.7.0  # Uncomment for Anthropic Claude
+
+# For free local models (recommended):
+langchain-huggingface>=0.0.1  # For HuggingFace models
+langchain-community>=0.0.1  # For Ollama and other local models
+transformers>=4.35.0  # Required by HuggingFace models
+torch>=2.0.0  # Required by HuggingFace models
+accelerate>=0.24.0  # Speeds up model loading
diff --git a/web/slurper/keyword_util.py b/web/slurper/keyword_util.py
new file mode 100644
index 0000000..aaeb919
--- /dev/null
+++ b/web/slurper/keyword_util.py
@@ -0,0 +1,25 @@
+import spacy
+
+# TODO SST: Move to readme.md
+# TODO SST: Also it should be lazy-loaded
+# Load the scientific English model from scispacy
+# Note: You need to download this model first with:
+# pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_core_sci_lg-0.5.4.tar.gz
+nlp = spacy.load("en_core_sci_lg")
+
+
+def extract_keywords(text):
+    """
+    Extract keywords from text using spaCy's named entity recognition. 
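+
+    Illustrative example (a sketch; the extracted entities vary with the
+    scispacy model in use):
+
+        >>> ents = extract_keywords("A Banach space is a complete normed vector space.")
+        >>> [ent.text.lower() for ent in ents]  # e.g. ['banach space', 'normed vector space']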
+ + Args: + text: The text to extract keywords from + + Returns: + A list of recognized entities (keywords) from the text + """ + if not text: + return [] + + doc = nlp(text) + return doc.ents diff --git a/web/slurper/source_wikidata.py b/web/slurper/source_wikidata.py index 06b6a97..9669a04 100644 --- a/web/slurper/source_wikidata.py +++ b/web/slurper/source_wikidata.py @@ -6,6 +6,22 @@ from slurper.wd_raw_item import WD_OTHER_SOURCES, BaseWdRawItem +# Wikidata entities to exclude from queries (natural numbers and positive integers) +# TODO SST: Ask Katja: whether to add all found +# 1. Should I put all found? Most likely yes +# 2. Use categorization results to exclude them in further uses +KNOWN_EXCLUDED_CATEGORIES = ["wd:Q21199", "wd:Q28920044"] + + +# These are added to every query: +# - Optional image: Fetches image if available +# - Optional Wikipedia link: Gets English Wikipedia article +# - Excludes natural numbers (FILTER NOT EXISTS) +# - Excludes humans (FILTER NOT EXISTS) +# - Label service: Automatically fetches English labels and descriptions +# +# The class fetches mathematical concepts from Wikidata while filtering out unwanted items like people and natural numbers. + class WikidataSlurper: SPARQL_URL = "https://query.wikidata.org/sparql" @@ -18,9 +34,12 @@ class WikidataSlurper: schema:isPartOf ; schema:about ?item . } - # except for natural numbers - MINUS { - ?item wdt:P31 wd:Q21199 . + OPTIONAL + { ?item skos:altLabel ?itemAltLabel . FILTER (lang(?itemAltLabel) = "en") } + # except for natural numbers and positive integers + FILTER NOT EXISTS { + VALUES ?excludedType { """ + " ".join(KNOWN_EXCLUDED_CATEGORIES) + """ } + ?item wdt:P31 ?excludedType . } # except for humans FILTER NOT EXISTS{ ?item wdt:P31 wd:Q5 . } @@ -35,6 +54,7 @@ def __init__(self, source, query, limit=None): """ SELECT DISTINCT ?item ?itemLabel ?itemDescription ?image ?wp_en + (GROUP_CONCAT(DISTINCT ?itemAltLabel; separator=", ") AS ?aliases) """ + self._sparql_source_vars_select() + """ @@ -43,9 +63,18 @@ def __init__(self, source, query, limit=None): + query + self._sparql_source_vars_triples() + self.SPARQL_QUERY_OPTIONS + + """ +GROUP BY ?item ?itemLabel ?itemDescription ?image ?wp_en """ + + " ".join( + [f"?{src['json_key']}" for src in WD_OTHER_SOURCES.values()] + ) + + """ +""" + (f"LIMIT {limit}" if limit is not None else "") ) self.raw_data = self.fetch_json() + self.article_text = self.fetch_articles() + def _sparql_source_vars_select(self): def to_var(source_dict): @@ -70,8 +99,68 @@ def fetch_json(self): ) return response.json()["results"]["bindings"] + def fetch_articles(self): + """Fetch Wikipedia article text for items with wp_en links.""" + article_texts = {} + + for json_item in self.raw_data: + # Only fetch if Wikipedia link exists + if "wp_en" not in json_item: + continue + + wp_url = json_item["wp_en"]["value"] + article_title = wp_url.split("/wiki/")[-1] + + api_url = "https://en.wikipedia.org/w/api.php" + params = { + "action": "query", + "format": "json", + "titles": article_title, + "prop": "extracts", + "explaintext": True, + "exsectionformat": "plain", + } + headers = { + "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", + "Accept": "application/json", + "Accept-Language": "en-US,en;q=0.9", + } + + try: + response = requests.get(api_url, params=params, headers=headers) + response.raise_for_status() + + if not response.text: + logging.log( + logging.WARNING, + f"Empty response for Wikipedia 
article: {article_title}", + ) + continue + + data = response.json() + pages = data.get("query", {}).get("pages", {}) + + # Get the first (and only) page + for page_id, page_data in pages.items(): + if "extract" in page_data: + # Use Wikidata ID as key + wd_id = json_item["item"]["value"] + article_texts[wd_id] = page_data["extract"] + break + except Exception as e: + logging.log( + logging.WARNING, + f"Failed to fetch Wikipedia article for {article_title}: {e}", + ) + + return article_texts + def get_items(self): for json_item in self.raw_data: + wd_id = json_item["item"]["value"] + if wd_id in self.article_text: + json_item["article_text"] = {"value": self.article_text[wd_id]} + raw_item = BaseWdRawItem.raw_item(self.source, json_item) yield raw_item.to_item() if self.source != Item.Source.WIKIDATA: diff --git a/web/slurper/wd_raw_item.py b/web/slurper/wd_raw_item.py index cc71823..29a5539 100644 --- a/web/slurper/wd_raw_item.py +++ b/web/slurper/wd_raw_item.py @@ -1,6 +1,7 @@ from typing import Optional from concepts.models import Item, Link +from slurper.keyword_util import extract_keywords WD_OTHER_SOURCES = { Item.Source.NLAB: { @@ -42,6 +43,18 @@ def name(self): def description(self): return None + def aliases(self): + """Get aliases (alternative labels) if available.""" + if "aliases" in self.raw and self.raw["aliases"]["value"]: + return self.raw["aliases"]["value"] + return None + + def article_text(self): + """Get the Wikipedia article text if available.""" + if "article_text" in self.raw: + return self.raw["article_text"]["value"] + return None + def has_source(self, source): if source == Item.Source.WIKIPEDIA_EN: return "wp_en" in self.raw @@ -52,12 +65,26 @@ def switch_source_to(self, source): return BaseWdRawItem.raw_item(source, self.raw) def to_item(self) -> Optional[Item]: + # Extract keywords from article text if available + article = self.article_text() + keywords = None + + if article: + # Extract entities using spaCy + entities = extract_keywords(article) + # Convert to lowercase and create comma-separated string + keyword_list = [entity.text.lower() for entity in entities] + keywords = ", ".join(keyword_list) if keyword_list else None + return Item( source=self.source, identifier=self.identifier(), url=self.url(), name=self.name(), description=self.description(), + keywords=keywords, + article_text=article, + aliases=self.aliases(), ) def _get_item_queryset(self): diff --git a/web/web/settings.py b/web/web/settings.py index 31317db..199c794 100644 --- a/web/web/settings.py +++ b/web/web/settings.py @@ -38,6 +38,7 @@ "django.contrib.sessions", "django.contrib.messages", "django.contrib.staticfiles", + "categorizer", "concepts", "slurper", "web", diff --git a/web/web/urls.py b/web/web/urls.py index a50d3a4..958bd85 100644 --- a/web/web/urls.py +++ b/web/web/urls.py @@ -14,6 +14,7 @@ 1. Import the include() function: from django.urls import include, path 2. 
Add a URL to urlpatterns: path('blog/', include('blog.urls')) """ + from concepts import views from django.conf import settings from django.conf.urls.static import static From 7fd7197effb29b6fd88f59d4189e1f96f11d7947 Mon Sep 17 00:00:00 2001 From: Slobodan Stanojevikj Date: Fri, 12 Dec 2025 23:42:05 +0100 Subject: [PATCH 02/12] -moved fetch to be per item, not fetch all then --- Makefile | 3 + web/slurper/keyword_util.py | 18 +++- web/slurper/source_wikidata.py | 156 +++++++++++++++++++++++---------- 3 files changed, 128 insertions(+), 49 deletions(-) diff --git a/Makefile b/Makefile index 3cfe1f6..a479b89 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,9 @@ install: pip install -r requirements.txt +install-scispacy: + pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_core_sci_lg-0.5.4.tar.gz + start: python ./web/manage.py runserver diff --git a/web/slurper/keyword_util.py b/web/slurper/keyword_util.py index aaeb919..8b8472d 100644 --- a/web/slurper/keyword_util.py +++ b/web/slurper/keyword_util.py @@ -1,11 +1,22 @@ import spacy # TODO SST: Move to readme.md -# TODO SST: Also it should be lazy-loaded # Load the scientific English model from scispacy # Note: You need to download this model first with: -# pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_core_sci_lg-0.5.4.tar.gz -nlp = spacy.load("en_core_sci_lg") +# make install-scispacy +# Or directly: +# pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_core_sci_lg-0.5.4.tar.gz + +# Lazy-loaded spaCy model +_nlp = None + + +def _get_nlp(): + """Lazy-load the spaCy model only when needed.""" + global _nlp + if _nlp is None: + _nlp = spacy.load("en_core_sci_lg") + return _nlp def extract_keywords(text): @@ -21,5 +32,6 @@ def extract_keywords(text): if not text: return [] + nlp = _get_nlp() doc = nlp(text) return doc.ents diff --git a/web/slurper/source_wikidata.py b/web/slurper/source_wikidata.py index 9669a04..8cc1504 100644 --- a/web/slurper/source_wikidata.py +++ b/web/slurper/source_wikidata.py @@ -1,4 +1,6 @@ import logging +import time +import urllib.parse import requests from concepts.models import Item @@ -6,12 +8,16 @@ from slurper.wd_raw_item import WD_OTHER_SOURCES, BaseWdRawItem +# Wikipedia API contact email (required by Wikipedia API guidelines) +# Set to None to disable Wikipedia article fetching +WIKIPEDIA_CONTACT_EMAIL = None + # Wikidata entities to exclude from queries (natural numbers and positive integers) -# TODO SST: Ask Katja: whether to add all found -# 1. Should I put all found? Most likely yes -# 2. 
Use categorization results to exclude them in further uses KNOWN_EXCLUDED_CATEGORIES = ["wd:Q21199", "wd:Q28920044"] +# Flag to track if we've logged the missing email warning +_missing_email_logged = False + # These are added to every query: # - Optional image: Fetches image if available @@ -73,7 +79,6 @@ def __init__(self, source, query, limit=None): + (f"LIMIT {limit}" if limit is not None else "") ) self.raw_data = self.fetch_json() - self.article_text = self.fetch_articles() def _sparql_source_vars_select(self): @@ -99,43 +104,85 @@ def fetch_json(self): ) return response.json()["results"]["bindings"] - def fetch_articles(self): - """Fetch Wikipedia article text for items with wp_en links.""" - article_texts = {} + def fetch_article(self, json_item, index=None, total=None): + global _missing_email_logged - for json_item in self.raw_data: - # Only fetch if Wikipedia link exists - if "wp_en" not in json_item: - continue - - wp_url = json_item["wp_en"]["value"] - article_title = wp_url.split("/wiki/")[-1] - - api_url = "https://en.wikipedia.org/w/api.php" - params = { - "action": "query", - "format": "json", - "titles": article_title, - "prop": "extracts", - "explaintext": True, - "exsectionformat": "plain", - } - headers = { - "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", - "Accept": "application/json", - "Accept-Language": "en-US,en;q=0.9", - } + # Check if contact email is configured + if WIKIPEDIA_CONTACT_EMAIL is None: + if not _missing_email_logged: + logging.log( + logging.WARNING, + "WIKIPEDIA_CONTACT_EMAIL is not set. Wikipedia article fetching is disabled. " + "Please set WIKIPEDIA_CONTACT_EMAIL at the top of source_wikidata.py to enable article fetching.", + ) + _missing_email_logged = True + return None + + wp_url = json_item["wp_en"]["value"] + # Decode URL-encoded characters (e.g., %E2%80%93 becomes –) + article_title = urllib.parse.unquote(wp_url.split("/wiki/")[-1]) + if index is not None and total is not None: + logging.log( + logging.INFO, + f"Fetching Wikipedia article [{index}/{total}]: {article_title}", + ) + else: + logging.log( + logging.INFO, + f"Fetching Wikipedia article: {article_title}", + ) + api_url = "https://en.wikipedia.org/w/api.php" + params = { + "action": "query", + "format": "json", + "titles": article_title, + "prop": "extracts", + "explaintext": True, + "exsectionformat": "plain", + } + headers = { + "User-Agent": f"MathSwitch/1.0 ({WIKIPEDIA_CONTACT_EMAIL})", + "Accept": "application/json", + "Accept-Language": "en-US,en;q=0.9", + } + # Retry logic with exponential backoff + max_retries = 3 + retry_delay = 1 # Start with 1 second + success = False + for attempt in range(max_retries): try: - response = requests.get(api_url, params=params, headers=headers) + # Rate limiting: delay between requests (100 req/s max) + time.sleep(0.01) + + # Timeout: (connect_timeout, read_timeout) in seconds + response = requests.get(api_url, params=params, headers=headers, timeout=(5, 30)) + + # Handle rate limiting + if response.status_code in (429, 403): + if attempt < max_retries - 1: + logging.log( + logging.WARNING, + f"Rate limited for {article_title}, retrying in {retry_delay}s (attempt {attempt + 1}/{max_retries})", + ) + time.sleep(retry_delay) + retry_delay *= 2 # Exponential backoff + continue + else: + logging.log( + logging.ERROR, + f"Failed to fetch {article_title} after {max_retries} attempts (rate limited). 
Skipping article.", + ) + break + response.raise_for_status() if not response.text: logging.log( logging.WARNING, - f"Empty response for Wikipedia article: {article_title}", + f"Empty response for Wikipedia article: {article_title}. Skipping article.", ) - continue + break data = response.json() pages = data.get("query", {}).get("pages", {}) @@ -143,24 +190,35 @@ def fetch_articles(self): # Get the first (and only) page for page_id, page_data in pages.items(): if "extract" in page_data: - # Use Wikidata ID as key - wd_id = json_item["item"]["value"] - article_texts[wd_id] = page_data["extract"] - break - except Exception as e: - logging.log( - logging.WARNING, - f"Failed to fetch Wikipedia article for {article_title}: {e}", - ) + success = True + return page_data["extract"] - return article_texts + # Success, break retry loop + break + + except requests.exceptions.RequestException as e: + if attempt < max_retries - 1: + logging.log( + logging.WARNING, + f"Request failed for {article_title}: {e}, retrying in {retry_delay}s", + ) + time.sleep(retry_delay) + retry_delay *= 2 + else: + logging.log( + logging.ERROR, + f"Failed to fetch {article_title} after {max_retries} attempts: {e}. Skipping article.", + ) + if not success and "wp_en" in json_item: + logging.log( + logging.INFO, + f"Article {article_title} will have null value (fetch failed or empty)", + ) + + return None def get_items(self): for json_item in self.raw_data: - wd_id = json_item["item"]["value"] - if wd_id in self.article_text: - json_item["article_text"] = {"value": self.article_text[wd_id]} - raw_item = BaseWdRawItem.raw_item(self.source, json_item) yield raw_item.to_item() if self.source != Item.Source.WIKIDATA: @@ -168,6 +226,12 @@ def get_items(self): if not raw_item_wd.item_exists(): yield raw_item_wd.to_item() if raw_item.has_source(Item.Source.WIKIPEDIA_EN): + # Fetch Wikipedia article if available + if "wp_en" in json_item and "article_text" not in json_item: + article_text = self.fetch_article(json_item) + if article_text is not None: + json_item["article_text"] = {"value": article_text} + raw_item_wp_en = raw_item.switch_source_to(Item.Source.WIKIPEDIA_EN) if not raw_item_wp_en.item_exists(): yield raw_item_wp_en.to_item() From f82c7f80d34cd4062ad7017f0c99394a27312fda Mon Sep 17 00:00:00 2001 From: Slobodan Stanojevikj Date: Sat, 13 Dec 2025 00:13:00 +0100 Subject: [PATCH 03/12] -moved fetch to be per item, not fetch all then --- Makefile | 5 +++ requirements.txt | 2 +- web/categorizer/categorizer_service.py | 39 ++++++++++------- web/categorizer/llm_service.py | 34 +++++++++++---- .../management/commands/categorize.py | 11 ++--- web/concepts/models.py | 5 ++- web/slurper/keyword_util.py | 4 +- web/slurper/source_wikidata.py | 43 ++++++++++++------- 8 files changed, 93 insertions(+), 50 deletions(-) diff --git a/Makefile b/Makefile index a479b89..3d20f73 100644 --- a/Makefile +++ b/Makefile @@ -9,3 +9,8 @@ start: compute-concepts: python ./web/manage.py compute_concepts + +fix-files: + python3 -m black . + python3 -m isort . + python3 -m flake8 . 
diff --git a/requirements.txt b/requirements.txt index e86dbb8..6fbba70 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ black~=25.9.0 -isort~=4.2.5 +isort~=5.12.0 flake8~=7.3.0 -r ./web/requirements.txt diff --git a/web/categorizer/categorizer_service.py b/web/categorizer/categorizer_service.py index 2a4e40e..97186af 100644 --- a/web/categorizer/categorizer_service.py +++ b/web/categorizer/categorizer_service.py @@ -1,8 +1,9 @@ import json import logging import re -from concepts.models import Item, CategorizerResult + from categorizer.llm_service import LLMService, LLMType +from concepts.models import CategorizerResult, Item # Free LLM types to use for categorization LLM_JUDGE_POOL = [ @@ -33,12 +34,12 @@ def categorize_items(self, limit=None): queryset = queryset[:limit] total = queryset.count() - self.logger.info(f"Categorizing {total} items using {len(LLM_JUDGE_POOL)} free LLMs") + self.logger.info( + f"Categorizing {total} items using {len(LLM_JUDGE_POOL)} free LLMs" + ) for i, item in enumerate(queryset): - self.logger.info( - f"Processing item {i + 1}/{total}: {item.identifier}" - ) + self.logger.info(f"Processing item {i + 1}/{total}: {item.identifier}") self.categorize_item(item) self.logger.info("Categorization complete") @@ -46,14 +47,17 @@ def categorize_items(self, limit=None): def categorize_item( self, item, - predicate: str = "Is the given concept a mathematical concept, given the name, description, keywords, and article text?" + predicate: str = "Is the given concept a mathematical concept," + " given the name, description, " + "keywords, and article text?", ): """ Categorize a single item using all free LLM types. Args: item: Item instance to categorize - predicate: The question to evaluate (default: checks if it's a mathematical concept) + predicate: The question to evaluate (default: checks if it's + a mathematical concept) Returns: List of categorization results from all LLMs @@ -68,7 +72,10 @@ def categorize_item( try: self.logger.info(f"Calling {llm_type.value} for {item.name}") raw_result = self.llm_service.call_llm(llm_type, prompt) - self.logger.info(f"Categorized {item.name} with {llm_type.value}: {raw_result[:100]}...") + self.logger.info( + f"Categorized {item.name} with {llm_type.value}: " + f"{raw_result[:100]}..." + ) parsed_result = self._parse_categorization_result(raw_result) @@ -89,7 +96,9 @@ def categorize_item( results.append(parsed_result) except Exception as e: - self.logger.error(f"Failed to categorize {item.name} with {llm_type.value}: {e}") + self.logger.error( + f"Failed to categorize {item.name} with {llm_type.value}: {e}" + ) # Continue with other LLMs even if one fails? continue @@ -106,7 +115,8 @@ def _build_categorization_prompt(self, item, predicate: str): Returns: Formatted prompt string """ - system_prompt = """You are a categorization judge. Your task is to evaluate whether a given concept satisfies a specific predicate. + system_prompt = """You are a categorization judge. Your task is to + evaluate whether a given concept satisfies a specific predicate. You must respond with a structured answer containing: 1. 
answer: true or false (boolean) @@ -173,7 +183,9 @@ def _parse_categorization_result(self, result: str) -> dict: parsed = json.loads(result) if "answer" not in parsed or "confidence" not in parsed: - raise ValueError("Response missing required fields 'answer' or 'confidence'") + raise ValueError( + "Response missing required fields 'answer' or 'confidence'" + ) answer = parsed["answer"] if isinstance(answer, str): @@ -183,10 +195,7 @@ def _parse_categorization_result(self, result: str) -> dict: if not 0 <= confidence <= 100: raise ValueError(f"Confidence must be between 0-100, got {confidence}") - return { - "answer": bool(answer), - "confidence": confidence - } + return {"answer": bool(answer), "confidence": confidence} except json.JSONDecodeError as e: self.logger.error(f"Failed to parse JSON response: {result}") diff --git a/web/categorizer/llm_service.py b/web/categorizer/llm_service.py index cb13d23..190da36 100644 --- a/web/categorizer/llm_service.py +++ b/web/categorizer/llm_service.py @@ -28,12 +28,24 @@ class LLMService: def __init__(self): self.logger = logging.getLogger(__name__) self.llm_handlers = { - LLMType.OPENAI_GPT4: lambda llm_type, prompt: self._call_openai(llm_type, prompt), - LLMType.OPENAI_GPT35: lambda llm_type, prompt: self._call_openai(llm_type, prompt), - LLMType.ANTHROPIC_CLAUDE: lambda llm_type, prompt: self._call_anthropic(prompt), - LLMType.HUGGINGFACE_FLAN_T5: lambda llm_type, prompt: self._call_huggingface("google/flan-t5-base", prompt), - LLMType.HUGGINGFACE_GPT2: lambda llm_type, prompt: self._call_huggingface("gpt2", prompt), - LLMType.HUGGINGFACE_DIALOGPT: lambda llm_type, prompt: self._call_huggingface("microsoft/DialoGPT-medium", prompt), + LLMType.OPENAI_GPT4: lambda llm_type, prompt: self._call_openai( + llm_type, prompt + ), + LLMType.OPENAI_GPT35: lambda llm_type, prompt: self._call_openai( + llm_type, prompt + ), + LLMType.ANTHROPIC_CLAUDE: lambda llm_type, prompt: self._call_anthrpc( + prompt + ), + LLMType.HUGGINGFACE_FLAN_T5: lambda llm_type, prompt: self._call_hgf( + "google/flan-t5-base", prompt + ), + LLMType.HUGGINGFACE_GPT2: lambda llm_type, prompt: self._call_hgf( + "gpt2", prompt + ), + LLMType.HUGGINGFACE_DIALOGPT: lambda llm_type, prompt: self._call_hgf( + "microsoft/DialoGPT-medium", prompt + ), LLMType.OLLAMA: lambda llm_type, prompt: self._call_ollama(prompt), } @@ -92,7 +104,7 @@ def _call_openai(self, llm_type: LLMType, prompt: str) -> str: self.logger.error(f"OpenAI API call failed: {e}") raise - def _call_anthropic(self, prompt: str) -> str: + def _call_anthrpc(self, prompt: str) -> str: """Call Anthropic Claude API""" try: import anthropic @@ -121,7 +133,7 @@ def _call_anthropic(self, prompt: str) -> str: self.logger.error(f"Anthropic API call failed: {e}") raise - def _call_huggingface(self, model_id: str, prompt: str) -> str: + def _call_hgf(self, model_id: str, prompt: str) -> str: """ Call HuggingFace models using langchain. 
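+
+        Note: the first call downloads the model weights (roughly 1-3 GB,
+        depending on the model) and caches them locally, so the first run
+        can be slow.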
@@ -156,7 +168,11 @@ def _call_huggingface(self, model_id: str, prompt: str) -> str: # Create the HuggingFace pipeline hf = HuggingFacePipeline.from_model_id( model_id=model_id, - task="text-generation" if "gpt" in model_id.lower() else "text2text-generation", + task=( + "text-generation" + if "gpt" in model_id.lower() + else "text2text-generation" + ), pipeline_kwargs=pipeline_kwargs, ) diff --git a/web/categorizer/management/commands/categorize.py b/web/categorizer/management/commands/categorize.py index af9694b..21a6794 100644 --- a/web/categorizer/management/commands/categorize.py +++ b/web/categorizer/management/commands/categorize.py @@ -1,5 +1,5 @@ -from django.core.management.base import BaseCommand from categorizer.categorizer_service import CategorizerService +from django.core.management.base import BaseCommand class Command(BaseCommand): @@ -18,7 +18,10 @@ def handle(self, *args, **options): service = CategorizerService() - self.stdout.write("Using all free LLMs: huggingface_flan_t5, huggingface_gpt2, huggingface_dialogpt") + self.stdout.write( + "Using all free LLMs: huggingface_flan_t5, " + "huggingface_gpt2, huggingface_dialogpt" + ) if limit: self.stdout.write(f"Categorizing up to {limit} items...") else: @@ -28,6 +31,4 @@ def handle(self, *args, **options): service.categorize_items(limit=limit) self.stdout.write(self.style.SUCCESS("Categorization complete!")) except Exception as e: - self.stdout.write( - self.style.ERROR(f"Categorization failed: {e}") - ) + self.stdout.write(self.style.ERROR(f"Categorization failed: {e}")) diff --git a/web/concepts/models.py b/web/concepts/models.py index 0545806..f15c95e 100644 --- a/web/concepts/models.py +++ b/web/concepts/models.py @@ -188,4 +188,7 @@ class Meta: ] def __str__(self): - return f"{self.item} - {self.llm_type}: {self.result_answer} ({self.result_confidence}%)" \ No newline at end of file + return ( + f"{self.item} - {self.llm_type}: " + f"{self.result_answer} ({self.result_confidence}%)" + ) diff --git a/web/slurper/keyword_util.py b/web/slurper/keyword_util.py index 8b8472d..0523e47 100644 --- a/web/slurper/keyword_util.py +++ b/web/slurper/keyword_util.py @@ -3,9 +3,7 @@ # TODO SST: Move to readme.md # Load the scientific English model from scispacy # Note: You need to download this model first with: -# make install-scispacy -# Or directly: -# pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_core_sci_lg-0.5.4.tar.gz +# make install-scispacy # Lazy-loaded spaCy model _nlp = None diff --git a/web/slurper/source_wikidata.py b/web/slurper/source_wikidata.py index 8cc1504..9cc7272 100644 --- a/web/slurper/source_wikidata.py +++ b/web/slurper/source_wikidata.py @@ -7,7 +7,6 @@ from django.db.utils import IntegrityError from slurper.wd_raw_item import WD_OTHER_SOURCES, BaseWdRawItem - # Wikipedia API contact email (required by Wikipedia API guidelines) # Set to None to disable Wikipedia article fetching WIKIPEDIA_CONTACT_EMAIL = None @@ -26,12 +25,15 @@ # - Excludes humans (FILTER NOT EXISTS) # - Label service: Automatically fetches English labels and descriptions # -# The class fetches mathematical concepts from Wikidata while filtering out unwanted items like people and natural numbers. +# The class fetches mathematical concepts from Wikidata while +# filtering out unwanted items like people and natural numbers. + class WikidataSlurper: SPARQL_URL = "https://query.wikidata.org/sparql" - SPARQL_QUERY_OPTIONS = """ + SPARQL_QUERY_OPTIONS = ( + """ OPTIONAL { ?item wdt:P18 ?image . 
} OPTIONAL @@ -44,7 +46,9 @@ class WikidataSlurper: { ?item skos:altLabel ?itemAltLabel . FILTER (lang(?itemAltLabel) = "en") } # except for natural numbers and positive integers FILTER NOT EXISTS { - VALUES ?excludedType { """ + " ".join(KNOWN_EXCLUDED_CATEGORIES) + """ } + VALUES ?excludedType { """ + + " ".join(KNOWN_EXCLUDED_CATEGORIES) + + """ } ?item wdt:P31 ?excludedType . } # except for humans @@ -53,6 +57,7 @@ class WikidataSlurper: SERVICE wikibase:label { bd:serviceParam wikibase:language "en". } } """ + ) def __init__(self, source, query, limit=None): self.source = source @@ -71,16 +76,13 @@ def __init__(self, source, query, limit=None): + self.SPARQL_QUERY_OPTIONS + """ GROUP BY ?item ?itemLabel ?itemDescription ?image ?wp_en """ - + " ".join( - [f"?{src['json_key']}" for src in WD_OTHER_SOURCES.values()] - ) + + " ".join([f"?{src['json_key']}" for src in WD_OTHER_SOURCES.values()]) + """ """ + (f"LIMIT {limit}" if limit is not None else "") ) self.raw_data = self.fetch_json() - def _sparql_source_vars_select(self): def to_var(source_dict): return " ?" + source_dict["json_key"] @@ -112,8 +114,10 @@ def fetch_article(self, json_item, index=None, total=None): if not _missing_email_logged: logging.log( logging.WARNING, - "WIKIPEDIA_CONTACT_EMAIL is not set. Wikipedia article fetching is disabled. " - "Please set WIKIPEDIA_CONTACT_EMAIL at the top of source_wikidata.py to enable article fetching.", + "WIKIPEDIA_CONTACT_EMAIL is not set. " + "Wikipedia article fetching is disabled. " + "Please set WIKIPEDIA_CONTACT_EMAIL at the top of " + "source_wikidata.py to enable article fetching.", ) _missing_email_logged = True return None @@ -156,14 +160,17 @@ def fetch_article(self, json_item, index=None, total=None): time.sleep(0.01) # Timeout: (connect_timeout, read_timeout) in seconds - response = requests.get(api_url, params=params, headers=headers, timeout=(5, 30)) + response = requests.get( + api_url, params=params, headers=headers, timeout=(5, 30) + ) # Handle rate limiting if response.status_code in (429, 403): if attempt < max_retries - 1: logging.log( logging.WARNING, - f"Rate limited for {article_title}, retrying in {retry_delay}s (attempt {attempt + 1}/{max_retries})", + f"Rate limited for {article_title}, retrying in " + f"{retry_delay}s (attempt {attempt + 1}/{max_retries})", ) time.sleep(retry_delay) retry_delay *= 2 # Exponential backoff @@ -171,7 +178,8 @@ def fetch_article(self, json_item, index=None, total=None): else: logging.log( logging.ERROR, - f"Failed to fetch {article_title} after {max_retries} attempts (rate limited). Skipping article.", + f"Failed to fetch {article_title} after " + f"{max_retries} attempts (rate limited). Skipping article.", ) break @@ -180,7 +188,8 @@ def fetch_article(self, json_item, index=None, total=None): if not response.text: logging.log( logging.WARNING, - f"Empty response for Wikipedia article: {article_title}. Skipping article.", + f"Empty response for Wikipedia article: " + f"{article_title}. Skipping article.", ) break @@ -200,14 +209,16 @@ def fetch_article(self, json_item, index=None, total=None): if attempt < max_retries - 1: logging.log( logging.WARNING, - f"Request failed for {article_title}: {e}, retrying in {retry_delay}s", + f"Request failed for {article_title}: " + f"{e}, retrying in {retry_delay}s", ) time.sleep(retry_delay) retry_delay *= 2 else: logging.log( logging.ERROR, - f"Failed to fetch {article_title} after {max_retries} attempts: {e}. 
Skipping article.", + f"Failed to fetch {article_title}" + f" after {max_retries} attempts: {e}. Skipping article.", ) if not success and "wp_en" in json_item: logging.log( From 245d4f8231b9a00efcf862e078de897fbd643593 Mon Sep 17 00:00:00 2001 From: Slobodan Stanojevikj Date: Sat, 13 Dec 2025 00:17:42 +0100 Subject: [PATCH 04/12] -fixed test workflow to run on push to main and pull requests --- .github/workflows/test.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index b77ce29..0115b76 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -1,6 +1,10 @@ name: Test -on: [push, pull_request] +on: + push: + branches: + - main + pull_request: jobs: test: From 055f78c92ef6eeaa82a6cb7275f1d6e602a202bb Mon Sep 17 00:00:00 2001 From: Slobodan Stanojevikj Date: Sun, 14 Dec 2025 11:21:51 +0100 Subject: [PATCH 05/12] -added excluded categories based on the categorizer results -added a fix to clear up gpt2 prompt output -fixed dependency issues --- Makefile | 3 + web/categorizer/categorizer_service.py | 98 +++++++++++++++++--------- web/categorizer/llm_service.py | 21 +++++- web/requirements.txt | 12 ++-- web/slurper/source_wikidata.py | 60 ++++++++++++++-- 5 files changed, 150 insertions(+), 44 deletions(-) diff --git a/Makefile b/Makefile index 3d20f73..317c9fd 100644 --- a/Makefile +++ b/Makefile @@ -10,6 +10,9 @@ start: compute-concepts: python ./web/manage.py compute_concepts +categorize: + python ./web/manage.py categorize --limit 10 + fix-files: python3 -m black . python3 -m isort . diff --git a/web/categorizer/categorizer_service.py b/web/categorizer/categorizer_service.py index 97186af..363e1e1 100644 --- a/web/categorizer/categorizer_service.py +++ b/web/categorizer/categorizer_service.py @@ -1,6 +1,4 @@ -import json import logging -import re from categorizer.llm_service import LLMService, LLMType from concepts.models import CategorizerResult, Item @@ -77,14 +75,19 @@ def categorize_item( f"{raw_result[:100]}..." ) + print(f"{raw_result}") parsed_result = self._parse_categorization_result(raw_result) + confidence = parsed_result["confidence"] + if confidence is None: + confidence = 50 + categorizer_result = CategorizerResult.objects.create( item=item, llm_type=llm_type.value, raw_result=raw_result, result_answer=parsed_result["answer"], - result_confidence=parsed_result["confidence"], + result_confidence=confidence, ) categorizer_result.save() @@ -122,11 +125,9 @@ def _build_categorization_prompt(self, item, predicate: str): 1. answer: true or false (boolean) 2. confidence: a number from 0 to 100 (representing your confidence percentage) -Format your response as JSON: -{ - "answer": true, - "confidence": 85 -}""" +IMPORTANT: Format your response as comma-separated string: +yes,85 +""" item_info_parts = [f"Name: {item.name}"] @@ -157,16 +158,17 @@ def _build_categorization_prompt(self, item, predicate: str): --- -Please provide your evaluation in JSON format.""" +Please provide your evaluation in the comma-separated format specified above.""" return prompt def _parse_categorization_result(self, result: str) -> dict: """ - Parse the LLM's JSON response. + Parse the LLM's comma-separated response. 
Args: - result: The raw response from the LLM + result: The raw response from the LLM (expected format: "yes,85" + or "no,75", "yes ---") Returns: Dictionary with 'answer' (bool) and 'confidence' (int) keys @@ -175,31 +177,63 @@ def _parse_categorization_result(self, result: str) -> dict: ValueError: If the response cannot be parsed """ try: - json_match = re.search(r'\{[^}]*"answer"[^}]*\}', result, re.DOTALL) - if json_match: - json_str = json_match.group(0) - parsed = json.loads(json_str) + # Clean the result string + result = result.strip() + + # Split by comma (with or without space) or just space + # Try separators in order of specificity: ", ", ",", " " + if ", " in result: + parts = result.split(", ", 1) + elif "," in result: + parts = result.split(",", 1) + else: + parts = result.split(" ", 1) + + if len(parts) == 1: + # Only answer provided, no confidence + answer_str = parts[0].strip().lower() + confidence = None + elif len(parts) == 2: + # Both answer and confidence provided + answer_str = parts[0].strip().lower() + confidence_str = parts[1].strip() + + # Parse confidence if provided + if confidence_str: + try: + confidence = int(confidence_str) + if not 0 <= confidence <= 100: + self.logger.warning( + f"Confidence {confidence} out of range [0-100], " + f"setting to None" + ) + confidence = None + except ValueError: + self.logger.warning( + f"Invalid confidence value '{confidence_str}', " + f"setting to None" + ) + confidence = None + else: + confidence = None else: - parsed = json.loads(result) - - if "answer" not in parsed or "confidence" not in parsed: raise ValueError( - "Response missing required fields 'answer' or 'confidence'" + f"Expected format 'answer' or 'answer,confidence', got: {result}" ) - answer = parsed["answer"] - if isinstance(answer, str): - answer = answer.lower() in ("true", "yes", "1") - - confidence = int(parsed["confidence"]) - if not 0 <= confidence <= 100: - raise ValueError(f"Confidence must be between 0-100, got {confidence}") + # Parse answer - accept yes/true/1 as True, no/false/0 as False + if answer_str in ("yes", "true", "1"): + answer = True + elif answer_str in ("no", "false", "0"): + answer = False + else: + raise ValueError( + f"Invalid answer value: {answer_str}. 
" + f"Expected yes/no, true/false, or 1/0" + ) - return {"answer": bool(answer), "confidence": confidence} + return {"answer": answer, "confidence": confidence} - except json.JSONDecodeError as e: - self.logger.error(f"Failed to parse JSON response: {result}") - raise ValueError(f"Invalid JSON response from LLM: {e}") - except (KeyError, ValueError) as e: - self.logger.error(f"Invalid response format: {result}") + except (ValueError, IndexError) as e: + self.logger.error(f"Failed to parse response: {result}") raise ValueError(f"Invalid response format: {e}") diff --git a/web/categorizer/llm_service.py b/web/categorizer/llm_service.py index 190da36..0d8d364 100644 --- a/web/categorizer/llm_service.py +++ b/web/categorizer/llm_service.py @@ -160,8 +160,7 @@ def _call_hgf(self, model_id: str, prompt: str) -> str: "temperature": 0.7, } - # TODO SST: Remove the whole model - # Add pad_token_id for DialoGPT + # Add pad_token_id for DialoGPT and GPT2 if "DialoGPT" in model_id or "gpt2" in model_id: pipeline_kwargs["pad_token_id"] = 50256 @@ -178,6 +177,24 @@ def _call_hgf(self, model_id: str, prompt: str) -> str: response = hf.invoke(prompt) + if "gpt2" in model_id.lower(): + response = response.removeprefix(prompt).strip() + + lines = response.split("\n") + cleaned_lines = [] + for line in lines: + if line.strip() and line.strip() != "---": + cleaned_lines.append(line) + + response = "\n".join(cleaned_lines).strip() + + # If we got nothing useful, return a default response + if not response: + self.logger.warning( + "GPT2 produced no useful output, " "returning default: 'no, 0'" + ) + response = "no, 0" + self.logger.info(f"HuggingFace model response length: {len(response)}") return response diff --git a/web/requirements.txt b/web/requirements.txt index 69be5c7..087e587 100644 --- a/web/requirements.txt +++ b/web/requirements.txt @@ -1,7 +1,7 @@ Django~=4.2.6 requests~=2.31.0 -spacy~=3.7.0 -scispacy~=0.5.4 +spacy~=3.7.0 --prefer-binary +scispacy~=0.6.2 # LLM dependencies (optional, install based on which LLM you want to use) # For paid APIs: @@ -11,6 +11,8 @@ scispacy~=0.5.4 # For free local models (recommended): langchain-huggingface>=0.0.1 # For HuggingFace models langchain-community>=0.0.1 # For Ollama and other local models -transformers>=4.35.0 # Required by HuggingFace models -torch>=2.0.0 # Required by HuggingFace models -accelerate>=0.24.0 # Speeds up model loading +# Required by HuggingFace models +transformers>=4.35.0 +torch>=2.0.0 +# Speeds up model loading +accelerate>=0.24.0 diff --git a/web/slurper/source_wikidata.py b/web/slurper/source_wikidata.py index 9cc7272..85e5136 100644 --- a/web/slurper/source_wikidata.py +++ b/web/slurper/source_wikidata.py @@ -10,12 +10,62 @@ # Wikipedia API contact email (required by Wikipedia API guidelines) # Set to None to disable Wikipedia article fetching WIKIPEDIA_CONTACT_EMAIL = None +_missing_email_logged = False -# Wikidata entities to exclude from queries (natural numbers and positive integers) -KNOWN_EXCLUDED_CATEGORIES = ["wd:Q21199", "wd:Q28920044"] +# Wikidata entities to exclude from queries +KNOWN_EXCLUDED_CATEGORIES = [ + # Natural numbers + "wd:Q21199", + # positive integers + "wd:Q28920044", + # countries + "wd:Q6256", + # philosophical concepts + "wd:Q714737", +] -# Flag to track if we've logged the missing email warning -_missing_email_logged = False + +def _load_excluded_categories_from_results(): + """ + Load Wikidata identifiers of items that have been categorized as "no" + with confidence > 49%, to be excluded from future 
queries. + + Returns a list of Wikidata entity IDs in the format ["wd:Q12345", ...]. + """ + try: + from concepts.models import CategorizerResult + from django.db.models import Avg + + excluded_items = ( + CategorizerResult.objects.filter( + result_answer=False, result_confidence__gt=49 + ) + .values("item__identifier", "item__source") + .annotate(avg_confidence=Avg("result_confidence")) + .filter(avg_confidence__gt=49, item__source=Item.Source.WIKIDATA) + .distinct() + ) + + categories = [f"wd:{item['item__identifier']}" for item in excluded_items] + + if categories: + logging.log( + logging.INFO, + f"Loaded {len(categories)} excluded categories " + f"from categorizer results", + ) + + return categories + except Exception as e: + logging.log( + logging.DEBUG, f"Could not load excluded categories from results: {e}" + ) + return [] + + +RESULT_EXCLUDED_CATEGORIES = _load_excluded_categories_from_results() + +EXCLUDED_CATEGORIES = KNOWN_EXCLUDED_CATEGORIES + RESULT_EXCLUDED_CATEGORIES # These are added to every query: @@ -47,7 +97,7 @@ class WikidataSlurper: # except for natural numbers and positive integers FILTER NOT EXISTS { VALUES ?excludedType { """ - + " ".join(KNOWN_EXCLUDED_CATEGORIES) + + " ".join(EXCLUDED_CATEGORIES) + """ } ?item wdt:P31 ?excludedType . } From 35124cd75d4789f06faf56ce324863e62b0da5ad Mon Sep 17 00:00:00 2001 From: Slobodan Stanojevikj Date: Sun, 14 Dec 2025 11:51:28 +0100 Subject: [PATCH 06/12] -adapted readmes and makefile --- Makefile | 15 +++++ README.md | 115 +++++++++++++++++++++++++++----------- web/categorizer/README.md | 4 +- 3 files changed, 101 insertions(+), 33 deletions(-) diff --git a/Makefile b/Makefile index 317c9fd..d2cff24 100644 --- a/Makefile +++ b/Makefile @@ -1,12 +1,26 @@ install: + pip install -r web/requirements.txt + +install-dev: pip install -r requirements.txt install-scispacy: pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_core_sci_lg-0.5.4.tar.gz +prepare-db: + pip install -r web/requirements.txt + python manage.py migrate + python manage.py createsuperuser + start: python ./web/manage.py runserver +populate-db: + python manage.py import_wikidata + +clear-db: + python manage.py clear_wikidata + compute-concepts: python ./web/manage.py compute_concepts @@ -14,6 +28,7 @@ categorize: python ./web/manage.py categorize --limit 10 fix-files: + pip install -r requirements.txt python3 -m black . python3 -m isort . python3 -m flake8 . 
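An aside for reviewers of the slurper change in PATCH 05 above: the exclusion mechanism is easiest to see with the string splice written out. Below is a minimal sketch, not part of the patch, assuming only the hard-coded `KNOWN_EXCLUDED_CATEGORIES` list (the database-derived `RESULT_EXCLUDED_CATEGORIES` would simply extend the same list) of the `FILTER NOT EXISTS` clause the query ends up containing:

```python
# Sketch only: mirrors how WikidataSlurper splices the exclusion list into
# its SPARQL query in PATCH 05; the entity IDs are the ones hard-coded in
# KNOWN_EXCLUDED_CATEGORIES in that patch.
KNOWN_EXCLUDED_CATEGORIES = [
    "wd:Q21199",  # natural numbers
    "wd:Q28920044",  # positive integers
    "wd:Q6256",  # countries
    "wd:Q714737",  # philosophical concepts
]

filter_clause = (
    "FILTER NOT EXISTS {\n"
    "  VALUES ?excludedType { " + " ".join(KNOWN_EXCLUDED_CATEGORIES) + " }\n"
    "  ?item wdt:P31 ?excludedType .\n"
    "}"
)

print(filter_clause)
# FILTER NOT EXISTS {
#   VALUES ?excludedType { wd:Q21199 wd:Q28920044 wd:Q6256 wd:Q714737 }
#   ?item wdt:P31 ?excludedType .
# }
```

Any item whose `wdt:P31` (instance of) value appears in that `VALUES` set is dropped from the SPARQL results before it ever reaches the categorizer.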
diff --git a/README.md b/README.md
index 0517efd..bb558e3 100644
--- a/README.md
+++ b/README.md
@@ -8,35 +8,84 @@ For a demonstration of a page with at least one link, see for example `{baseurl}
 
 To install all the necessary Python packages, run:
 
-    pip install -r requirements.txt
+```bash
+make install
+# OR
+pip install -r web/requirements.txt
+```
 
 Next, to create a database, run:
 
-    python manage.py migrate
+```bash
+make prepare-db # which migrates db and creates superuser
+# OR
+python manage.py migrate
+```
 
 In order to use the administrative interface, you need to create an admin user:
 
-    python manage.py createsuperuser
+```bash
+python manage.py createsuperuser
+```
 
 Finally, to populate the database, run
 
-    python manage.py import_wikidata
+```bash
+python manage.py import_wikidata
+# OR
+make populate-db
+```
+
+ * In order to fetch wikipedia articles and extract keywords from them:
+   ```bash
+   make install-scispacy
+   ```
+   then configure your email `WIKIPEDIA_CONTACT_EMAIL` in [source_wikidata.py](web/slurper/source_wikidata.py)
+   * This is needed for article fetching and keyword extraction
+   * Then run the database population (make sure your db is cleared)
+
+
 If you ever want to repopulate the database, you can clear it using
 
-    python manage.py clear_wikidata
+```bash
+python manage.py clear_wikidata
+```
+
+### Running the categorizer
+The categorizer is set up to work with several models, divided into free and paid.
+All of them run locally, so expect some performance hits. The models are downloaded the first time the
+categorizer runs, and by default the free models are used.
+
+The database needs to be populated before running the categorizer, so:
+```bash
+make populate-db
+```
+then
+```bash
+make categorize
+```
+
+There are some known issues that have inline fixes, such as `gpt2` getting stuck
+and echoing the prompt back, then emitting `---\n\n\n---` a few times.
+
+For more details see the [categorizer readme](web/categorizer/README.md).
 
 ## Notes for developers
 
 In order to contribute, install [Black](https://github.com/psf/black) and [isort](https://pycqa.github.io/isort/) autoformatters and [Flake8](https://flake8.pycqa.org/) linter.
-
-    pip install black isort flake8
+```bash
+make install-dev
+```
 
 You can run all three with
-
-    isort .
-    black .
-    flake8
+```bash
+make fix-files
+# Or manually
+isort .
+black .
+flake8
+```
 
 or set up a Git pre-commit hook by creating `.git/hooks/pre-commit` with the following contents:
 
@@ -47,35 +96,37 @@ black . && isort . && flake8
 ```
 
 Each time after you change a model, make sure to create the appropriate migrations:
-
-    python manage.py makemigrations
+```bash
+python manage.py makemigrations
+```
 
 To update the database with the new model, run:
-
+```bash
 python manage.py migrate
+```
 
 ## Instructions for Katja to update the live version
-
-    sudo systemctl stop mathswitch
-    cd mathswitch
-    git pull
-    source venv/bin/activate
-    cd web
-    ./manage.py rebuild_db
-    sudo systemctl start mathswitch
-
+```bash
+sudo systemctl stop mathswitch
+cd mathswitch
+git pull
+source venv/bin/activate
+cd web
+./manage.py rebuild_db
+sudo systemctl start mathswitch
+```
 
 ## WD item JSON example
 
-```
+```json
 {
-    'item': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q192276'},
-    'art': {'type': 'uri', 'value': 'https://en.wikipedia.org/wiki/Measure_(mathematics)'},
-    'image': {'type': 'uri', 'value': 'http://commons.wikimedia.org/wiki/Special:FilePath/Measure%20illustration%20%28Vector%29.svg'},
-    'mwID': {'type': 'literal', 'value': 'Measure'},
-    'itemLabel': {'xml:lang': 'en', 'type': 'literal', 'value': 'measure'},
-    'itemDescription': {'xml:lang': 'en', 'type': 'literal', 'value': 'function assigning numbers to some subsets of a set, which could be seen as a generalization of length, area, volume and integral'},
-    'eomID': {'type': 'literal', 'value': 'measure'},
-    'pwID': {'type': 'literal', 'value': 'Definition:Measure_(Measure_Theory)'
+    "item": {"type": "uri", "value": "http://www.wikidata.org/entity/Q192276"},
+    "art": {"type": "uri", "value": "https://en.wikipedia.org/wiki/Measure_(mathematics)"},
+    "image": {"type": "uri", "value": "http://commons.wikimedia.org/wiki/Special:FilePath/Measure%20illustration%20%28Vector%29.svg"},
+    "mwID": {"type": "literal", "value": "Measure"},
+    "itemLabel": {"xml:lang": "en", "type": "literal", "value": "measure"},
+    "itemDescription": {"xml:lang": "en", "type": "literal", "value": "function assigning numbers to some subsets of a set, which could be seen as a generalization of length, area, volume and integral"},
+    "eomID": {"type": "literal", "value": "measure"},
+    "pwID": {"type": "literal", "value": "Definition:Measure_(Measure_Theory)"}
 }
 ```
diff --git a/web/categorizer/README.md b/web/categorizer/README.md
index 40514d3..7f44469 100644
--- a/web/categorizer/README.md
+++ b/web/categorizer/README.md
@@ -8,7 +8,7 @@ The categorizer module provides LLM-powered categorization of mathematical conce
 
 **For FREE local models (recommended):**
 ```bash
-pip install langchain-huggingface langchain-community transformers torch accelerate
+make install
 ```
 
 **For paid API models (optional):**
@@ -63,6 +63,8 @@ python manage.py categorize
 Categorize a limited number of items:
 ```bash
 python manage.py categorize --limit 10
+# OR
+make categorize
 ```
 
 Use a specific LLM provider:

From 605ff7fdf2a598d159dfe9f5a7106ec342d04848 Mon Sep 17 00:00:00 2001
From: Slobodan Stanojevikj
Date: Sun, 14 Dec 2025 13:53:30 +0100
Subject: [PATCH 07/12] -upgraded github actions to latest
 -added static python version 3.12.7

---
 .github/workflows/test.yml | 6 ++++--
 .python-version            | 1 +
 2 files changed, 5 insertions(+), 2 deletions(-)
 create mode 100644 .python-version

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 0115b76..25739e6 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -10,8 +10,10 @@ jobs:
   test:
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v2
-      - uses: actions/setup-python@v2
+      - uses: actions/checkout@v5
+      - uses: actions/setup-python@v6
+        with:
+          python-version: '3.12.7'
       - run: pip install -r web/requirements.txt
       - run: pip install black isort flake8
       - run: python3 -m black --check .
diff --git a/.python-version b/.python-version
new file mode 100644
index 0000000..56bb660
--- /dev/null
+++ b/.python-version
@@ -0,0 +1 @@
+3.12.7

From 67e052cbee06fef824104cbfd947b1df1fafc3c0 Mon Sep 17 00:00:00 2001
From: Slobodan Stanojevikj
Date: Sun, 14 Dec 2025 14:40:27 +0100
Subject: [PATCH 08/12] -added static versions to dependencies

---
 web/requirements.txt | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/web/requirements.txt b/web/requirements.txt
index 087e587..3a83c77 100644
--- a/web/requirements.txt
+++ b/web/requirements.txt
@@ -1,5 +1,5 @@
 Django~=4.2.6
-requests~=2.31.0
+requests~=2.32.5
 spacy~=3.7.0 --prefer-binary
 scispacy~=0.6.2
 
@@ -8,11 +8,11 @@ scispacy~=0.6.2
 # openai>=1.0.0  # Uncomment for OpenAI GPT models
 # anthropic>=0.7.0  # Uncomment for Anthropic Claude
 
-# For free local models (recommended):
-langchain-huggingface>=0.0.1  # For HuggingFace models
-langchain-community>=0.0.1  # For Ollama and other local models
+# For free local models:
+langchain-huggingface==0.3.1  # For HuggingFace models
+langchain-community==0.3.27  # For Ollama and other local models
 # Required by HuggingFace models
-transformers>=4.35.0
-torch>=2.0.0
+transformers~=4.57.0
+torch~=2.9.0
 # Speeds up model loading
-accelerate>=0.24.0
+accelerate~=1.12.0

From 94730dc7c2c2b9b8434ac989ee64e653741b7861 Mon Sep 17 00:00:00 2001
From: Slobodan Stanojevikj
Date: Sun, 14 Dec 2025 16:17:03 +0100
Subject: [PATCH 09/12] -added configuration parser and config file (.env.example)
 -minor Makefile cleanup and updated Readme

---
 .gitignore                     |  3 ++-
 Makefile                       | 10 ++++------
 README.md                      |  9 ++++++---
 web/.env.example               |  2 ++
 web/requirements.txt           |  1 +
 web/slurper/source_wikidata.py |  2 +-
 web/web/settings.py            |  5 ++++-
 7 files changed, 20 insertions(+), 12 deletions(-)
 create mode 100644 web/.env.example

diff --git a/.gitignore b/.gitignore
index 70b914e..908cac1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
 venv
 __pycache__
-web/db.sqlite3
+web/**/*.sqlite3
+**/.env
diff --git a/Makefile b/Makefile
index d2cff24..5f6ba68 100644
--- a/Makefile
+++ b/Makefile
@@ -1,5 +1,8 @@
-install:
+prepare-web:
 	pip install -r web/requirements.txt
+	cp web/.env.example web/.env
+	python manage.py migrate
+	python manage.py createsuperuser
 
 install-dev:
 	pip install -r requirements.txt
@@ -7,11 +10,6 @@ install-dev:
 install-scispacy:
 	pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_core_sci_lg-0.5.4.tar.gz
 
-prepare-db:
-	pip install -r web/requirements.txt
-	python manage.py migrate
-	python manage.py createsuperuser
-
 start:
 	python ./web/manage.py runserver
 
diff --git a/README.md b/README.md
index bb558e3..97beebb 100644
--- a/README.md
+++ b/README.md
@@ -9,16 +9,19 @@ For a demonstration of a page with at least one link, see for example `{baseurl}
 To install all the necessary Python packages, run:
 
 ```bash
-make install
+make prepare-web # sets up the env file, migrates the db, and creates a superuser
 # OR
 pip install -r web/requirements.txt
 ```
 
+Prepare an environment file:
+```bash
+cp web/.env.example web/.env
+```
+
 Next, to create a database, run:
 
 ```bash
-make prepare-db # which migrates db and creates superuser
-# OR
 python manage.py migrate
 ```
 
diff --git a/web/.env.example b/web/.env.example
new file mode 100644
index 0000000..40e274f
--- /dev/null
+++ b/web/.env.example
@@ -0,0 +1,2 @@
+SECRET_KEY="django-insecure-9wy9w#vf^tde0262doyy_j19=64c()_qub!1)f+fh-b^=7ndw*" +WIKIPEDIA_CONTACT_EMAIL=my@email.com \ No newline at end of file diff --git a/web/requirements.txt b/web/requirements.txt index 3a83c77..4d6c286 100644 --- a/web/requirements.txt +++ b/web/requirements.txt @@ -2,6 +2,7 @@ Django~=4.2.6 requests~=2.32.5 spacy~=3.7.0 --prefer-binary scispacy~=0.6.2 +python-decouple~=3.8 # LLM dependencies (optional, install based on which LLM you want to use) # For paid APIs: diff --git a/web/slurper/source_wikidata.py b/web/slurper/source_wikidata.py index 85e5136..1004d4e 100644 --- a/web/slurper/source_wikidata.py +++ b/web/slurper/source_wikidata.py @@ -6,10 +6,10 @@ from concepts.models import Item from django.db.utils import IntegrityError from slurper.wd_raw_item import WD_OTHER_SOURCES, BaseWdRawItem +from web.settings import WIKIPEDIA_CONTACT_EMAIL # Wikipedia API contact email (required by Wikipedia API guidelines) # Set to None to disable Wikipedia article fetching -WIKIPEDIA_CONTACT_EMAIL = None _missing_email_logged = False # Wikidata entities to exclude from queries diff --git a/web/web/settings.py b/web/web/settings.py index 199c794..8195f10 100644 --- a/web/web/settings.py +++ b/web/web/settings.py @@ -12,6 +12,7 @@ from os import path from pathlib import Path +from decouple import config # Build paths inside the project like this: BASE_DIR / 'subdir'. BASE_DIR = Path(__file__).resolve().parent.parent @@ -21,7 +22,7 @@ # See https://docs.djangoproject.com/en/4.2/howto/deployment/checklist/ # SECURITY WARNING: keep the secret key used in production secret! -SECRET_KEY = "django-insecure-9wy9w#vf^tde0262doyy_j19=64c()_qub!1)f+fh-b^=7ndw*" +SECRET_KEY = config('SECRET_KEY', default="django-insecure-9wy9w#vf^tde0262doyy_j19=64c()_qub!1)f+fh-b^=7ndw*") # SECURITY WARNING: don't run with debug turned on in production! DEBUG = True @@ -128,3 +129,5 @@ # https://docs.djangoproject.com/en/4.2/ref/settings/#default-auto-field DEFAULT_AUTO_FIELD = "django.db.models.BigAutoField" + +WIKIPEDIA_CONTACT_EMAIL = config('WIKIPEDIA_CONTACT_EMAIL', default="my@email.com") From d1acf435fd21b79b2fa69e57df6bd2bf1dd21544 Mon Sep 17 00:00:00 2001 From: Slobodan Stanojevikj Date: Sun, 14 Dec 2025 16:36:04 +0100 Subject: [PATCH 10/12] -code restyling -excluded venv from flake8 --- .flake8 | 2 +- web/slurper/source_wikidata.py | 1 + web/web/settings.py | 8 ++++++-- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/.flake8 b/.flake8 index 8e952af..7391d15 100644 --- a/.flake8 +++ b/.flake8 @@ -1,3 +1,3 @@ [flake8] -exclude = experiments,migrations,settings.py +exclude = experiments,migrations,settings.py,venv/ max-line-length = 88 diff --git a/web/slurper/source_wikidata.py b/web/slurper/source_wikidata.py index 1004d4e..758f346 100644 --- a/web/slurper/source_wikidata.py +++ b/web/slurper/source_wikidata.py @@ -6,6 +6,7 @@ from concepts.models import Item from django.db.utils import IntegrityError from slurper.wd_raw_item import WD_OTHER_SOURCES, BaseWdRawItem + from web.settings import WIKIPEDIA_CONTACT_EMAIL # Wikipedia API contact email (required by Wikipedia API guidelines) diff --git a/web/web/settings.py b/web/web/settings.py index 8195f10..8b33153 100644 --- a/web/web/settings.py +++ b/web/web/settings.py @@ -12,6 +12,7 @@ from os import path from pathlib import Path + from decouple import config # Build paths inside the project like this: BASE_DIR / 'subdir'. 
@@ -22,7 +23,10 @@ # See https://docs.djangoproject.com/en/4.2/howto/deployment/checklist/ # SECURITY WARNING: keep the secret key used in production secret! -SECRET_KEY = config('SECRET_KEY', default="django-insecure-9wy9w#vf^tde0262doyy_j19=64c()_qub!1)f+fh-b^=7ndw*") +SECRET_KEY = config( + "SECRET_KEY", + default="django-insecure-9wy9w#vf^tde0262doyy_j19=64c()_qub!1)f+fh-b^=7ndw*", +) # SECURITY WARNING: don't run with debug turned on in production! DEBUG = True @@ -130,4 +134,4 @@ DEFAULT_AUTO_FIELD = "django.db.models.BigAutoField" -WIKIPEDIA_CONTACT_EMAIL = config('WIKIPEDIA_CONTACT_EMAIL', default="my@email.com") +WIKIPEDIA_CONTACT_EMAIL = config("WIKIPEDIA_CONTACT_EMAIL", default="my@email.com") From 31354339c727cdf73dec7a645831e4235ffe250f Mon Sep 17 00:00:00 2001 From: Slobodan Stanojevikj Date: Sun, 14 Dec 2025 17:41:52 +0100 Subject: [PATCH 11/12] -minor fixes in Makefile commands --- Makefile | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Makefile b/Makefile index 5f6ba68..7436995 100644 --- a/Makefile +++ b/Makefile @@ -1,8 +1,8 @@ prepare-web: pip install -r web/requirements.txt cp web/.env.example web/.env - python manage.py migrate - python manage.py createsuperuser + python ./web/manage.py migrate + python ./web/manage.py createsuperuser install-dev: pip install -r requirements.txt @@ -14,10 +14,10 @@ start: python ./web/manage.py runserver populate-db: - python manage.py import_wikidata + python ./web/manage.py import_wikidata clear-db: - python manage.py clear_wikidata + python ./web/manage.py clear_wikidata compute-concepts: python ./web/manage.py compute_concepts From 8b1f7387dbb415b8ed088e39d696f49ded06c343 Mon Sep 17 00:00:00 2001 From: Slobodan Stanojevikj Date: Sun, 14 Dec 2025 19:48:07 +0100 Subject: [PATCH 12/12] -limited llm input for context window --- web/categorizer/categorizer_service.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/web/categorizer/categorizer_service.py b/web/categorizer/categorizer_service.py index 363e1e1..008289b 100644 --- a/web/categorizer/categorizer_service.py +++ b/web/categorizer/categorizer_service.py @@ -132,14 +132,14 @@ def _build_categorization_prompt(self, item, predicate: str): item_info_parts = [f"Name: {item.name}"] if item.description: - item_info_parts.append(f"Description: {item.description}") + item_info_parts.append(f"Description: {item.description[:100]}") if item.keywords: - item_info_parts.append(f"Keywords: {item.keywords}") + item_info_parts.append(f"Keywords: {item.keywords[:200]}") if item.article_text: - # Truncate article text to 5000 characters - article_text = item.article_text[:5000] + # Truncate article text to 1000 characters + article_text = item.article_text[:1000] item_info_parts.append(f"Article text: {article_text}") item_info = "\n".join(item_info_parts)
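A closing aside on the response contract: PATCH 05 switched the categorizer from JSON to a comma-separated `answer,confidence` reply, and PATCH 12 above keeps the prompt small enough for local models to honor it. Below is a minimal, self-contained sketch of that parsing logic, not part of the series itself; the real method is `CategorizerService._parse_categorization_result`, which logs warnings where this version silently falls back to `None`:

```python
# Sketch only: a standalone re-implementation of the comma-separated
# parsing contract from PATCH 05, runnable outside Django.
def parse_categorization_result(result: str) -> dict:
    result = result.strip()

    # Try separators in order of specificity: ", ", ",", " "
    if ", " in result:
        parts = result.split(", ", 1)
    elif "," in result:
        parts = result.split(",", 1)
    else:
        parts = result.split(" ", 1)

    answer_str = parts[0].strip().lower()

    confidence = None
    if len(parts) == 2 and parts[1].strip():
        try:
            confidence = int(parts[1].strip())
        except ValueError:
            confidence = None  # non-numeric confidence is treated as missing
        else:
            if not 0 <= confidence <= 100:
                confidence = None  # out-of-range confidence is treated as missing

    # Accept yes/true/1 as True and no/false/0 as False
    if answer_str in ("yes", "true", "1"):
        answer = True
    elif answer_str in ("no", "false", "0"):
        answer = False
    else:
        raise ValueError(f"Invalid answer value: {answer_str}")

    return {"answer": answer, "confidence": confidence}


assert parse_categorization_result("yes,85") == {"answer": True, "confidence": 85}
assert parse_categorization_result("no, 150") == {"answer": False, "confidence": None}
assert parse_categorization_result("true") == {"answer": True, "confidence": None}
```

In `categorize_item`, a missing confidence is then defaulted to 50 before the `CategorizerResult` row is saved, so the sketch's `None` values never reach the database.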