diff --git a/.flake8 b/.flake8
index 8e952af..7391d15 100644
--- a/.flake8
+++ b/.flake8
@@ -1,3 +1,3 @@
 [flake8]
-exclude = experiments,migrations,settings.py
+exclude = experiments,migrations,settings.py,venv/
 max-line-length = 88
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index b77ce29..25739e6 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -1,13 +1,19 @@
 name: Test
-on: [push, pull_request]
+on:
+  push:
+    branches:
+      - main
+  pull_request:
 jobs:
   test:
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v2
-      - uses: actions/setup-python@v2
+      - uses: actions/checkout@v5
+      - uses: actions/setup-python@v6
+        with:
+          python-version: '3.12.7'
       - run: pip install -r web/requirements.txt
       - run: pip install black isort flake8
      - run: python3 -m black --check .
diff --git a/.gitignore b/.gitignore
index 70b914e..908cac1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
 venv
 __pycache__
-web/db.sqlite3
+web/**/*.sqlite3
+**/.env
diff --git a/.python-version b/.python-version
new file mode 100644
index 0000000..56bb660
--- /dev/null
+++ b/.python-version
@@ -0,0 +1 @@
+3.12.7
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..7436995
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,32 @@
+prepare-web:
+	pip install -r web/requirements.txt
+	cp web/.env.example web/.env
+	python ./web/manage.py migrate
+	python ./web/manage.py createsuperuser
+
+install-dev:
+	pip install -r requirements.txt
+
+install-scispacy:
+	pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_core_sci_lg-0.5.4.tar.gz
+
+start:
+	python ./web/manage.py runserver
+
+populate-db:
+	python ./web/manage.py import_wikidata
+
+clear-db:
+	python ./web/manage.py clear_wikidata
+
+compute-concepts:
+	python ./web/manage.py compute_concepts
+
+categorize:
+	python ./web/manage.py categorize --limit 10
+
+fix-files:
+	pip install -r requirements.txt
+	python3 -m black .
+	python3 -m isort .
+	python3 -m flake8 .
diff --git a/README.md b/README.md
index 0517efd..97beebb 100644
--- a/README.md
+++ b/README.md
@@ -8,35 +8,87 @@ For a demonstration of a page with at least one link, see for example `{baseurl}
 
 To install all the necessary Python packages, run:
 
-    pip install -r requirements.txt
+```bash
+make prepare-web  # sets up the env file, database, and superuser in one go
+# OR
+pip install -r web/requirements.txt
+```
+
+Prepare the environment file:
+```bash
+cp web/.env.example web/.env
+```
 
 Next, to create a database, run:
 
-    python manage.py migrate
+```bash
+python manage.py migrate
+```
 
 In order to use the administrative interface, you need to create an admin user:
 
-    python manage.py createsuperuser
+```bash
+python manage.py createsuperuser
+```
 
 Finally, to populate the database, run
 
-    python manage.py import_wikidata
+```bash
+python manage.py import_wikidata
+# OR
+make populate-db
+```
+
+* In order to fetch Wikipedia articles and extract keywords from them, first
+  install the scispacy model:
+  ```bash
+  make install-scispacy
+  ```
+  then configure your email `WIKIPEDIA_CONTACT_EMAIL` in `web/.env` (see the
+  example below); it is required for fetching articles from the Wikipedia API.
+  * Then run the database population (make sure your db is cleared first).
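+
+A minimal `web/.env` (the values below are placeholders; `web/.env.example`
+ships a working template):
+```bash
+SECRET_KEY="your-django-secret-key"
+WIKIPEDIA_CONTACT_EMAIL=you@example.com
+```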
 
 If you ever want to repopulate the database, you can clear it using
 
-    python manage.py clear_wikidata
+```bash
+python manage.py clear_wikidata
+```
+
+### To run the categorizer
+The categorizer is set up to work with several models, divided into free and paid.
+The free models run locally, so expect some performance hits.
+The models are downloaded the first time the categorizer runs, and by default
+the free models are used.
+
+The database needs to be filled in before running it, so:
+```bash
+make populate-db
+```
+then
+```bash
+make categorize
+```
+
+There are some known issues with inline workarounds, such as `gpt2` getting
+stuck and returning the prompt itself, followed by a few runs of `---\n\n\n---`.
+
+For more details see the [categorizer readme](web/categorizer/README.md).
 
 ## Notes for developers
 
 In order to contribute, install [Black](https://github.com/psf/black) and
 [isort](https://pycqa.github.io/isort/) autoformatters and the
 [Flake8](https://flake8.pycqa.org/) linter.
-
-    pip install black isort flake8
+```bash
+make install-dev
+```
 
 You can run all three with
-
-    isort .
-    black .
-    flake8
+```bash
+make fix-files
+# Or manually
+isort .
+black .
+flake8
+```
 
 or set up a Git pre-commit hook by creating `.git/hooks/pre-commit` with the
 following contents:
@@ -47,35 +99,37 @@ black . && isort . && flake8
 ```
 
 Each time after you change a model, make sure to create the appropriate migrations:
-
-    python manage.py makemigrations
+```bash
+python manage.py makemigrations
+```
 
 To update the database with the new model, run:
-
+```bash
 python manage.py migrate
+```
 
 ## Instructions for Katja to update the live version
-
-    sudo systemctl stop mathswitch
-    cd mathswitch
-    git pull
-    source venv/bin/activate
-    cd web
-    ./manage.py rebuild_db
-    sudo systemctl start mathswitch
-
+```bash
+sudo systemctl stop mathswitch
+cd mathswitch
+git pull
+source venv/bin/activate
+cd web
+./manage.py rebuild_db
+sudo systemctl start mathswitch
+```
 
 ## WD item JSON example
 
-```
+```json
 {
-    'item': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q192276'},
-    'art': {'type': 'uri', 'value': 'https://en.wikipedia.org/wiki/Measure_(mathematics)'},
-    'image': {'type': 'uri', 'value': 'http://commons.wikimedia.org/wiki/Special:FilePath/Measure%20illustration%20%28Vector%29.svg'},
-    'mwID': {'type': 'literal', 'value': 'Measure'},
-    'itemLabel': {'xml:lang': 'en', 'type': 'literal', 'value': 'measure'},
-    'itemDescription': {'xml:lang': 'en', 'type': 'literal', 'value': 'function assigning numbers to some subsets of a set, which could be seen as a generalization of length, area, volume and integral'},
-    'eomID': {'type': 'literal', 'value': 'measure'},
-    'pwID': {'type': 'literal', 'value': 'Definition:Measure_(Measure_Theory)'
+    "item": {"type": "uri", "value": "http://www.wikidata.org/entity/Q192276"},
+    "art": {"type": "uri", "value": "https://en.wikipedia.org/wiki/Measure_(mathematics)"},
+    "image": {"type": "uri", "value": "http://commons.wikimedia.org/wiki/Special:FilePath/Measure%20illustration%20%28Vector%29.svg"},
+    "mwID": {"type": "literal", "value": "Measure"},
+    "itemLabel": {"xml:lang": "en", "type": "literal", "value": "measure"},
+    "itemDescription": {"xml:lang": "en", "type": "literal", "value": "function assigning numbers to some subsets of a set, which could be seen as a generalization of length, area, volume and integral"},
+    "eomID": {"type": "literal", "value": "measure"},
+    "pwID": {"type": "literal", "value": "Definition:Measure_(Measure_Theory)"}
 }
 ```
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..6fbba70
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,5 @@
+black~=25.9.0
+isort~=5.12.0
+flake8~=7.3.0
+
+-r ./web/requirements.txt
diff --git a/web/.env.example b/web/.env.example
new file mode 100644
index 0000000..40e274f
--- /dev/null
+++ b/web/.env.example
@@ -0,0 +1,2 @@
+SECRET_KEY="django-insecure-9wy9w#vf^tde0262doyy_j19=64c()_qub!1)f+fh-b^=7ndw*"
+WIKIPEDIA_CONTACT_EMAIL=my@email.com
\ No newline at end of file
diff --git a/web/categorizer/README.md b/web/categorizer/README.md
new file mode 100644
index 0000000..7f44469
--- /dev/null
+++ b/web/categorizer/README.md
@@ -0,0 +1,141 @@
+# Categorizer Module
+
+The categorizer module provides LLM-powered categorization of mathematical concepts.
+
+## Setup
+
+### 1. Install Required Dependencies
+
+**For FREE local models (recommended):**
+```bash
+make install-dev
+```
+
+**For paid API models (optional):**
+
+For OpenAI:
+```bash
+pip install openai
+```
+
+For Anthropic Claude:
+```bash
+pip install anthropic
+```
+
+**For Ollama (free local alternative):**
+1. Install Ollama from https://ollama.ai
+2. Install langchain-community: `pip install langchain-community`
+3. Pull a model: `ollama pull llama2`
+
+### 2. Configure API Keys (only for paid models)
+
+Set the appropriate environment variable for your chosen LLM provider:
+
+**For OpenAI:**
+```bash
+export OPENAI_API_KEY="your-openai-api-key-here"
+```
+
+**For Anthropic Claude:**
+```bash
+export ANTHROPIC_API_KEY="your-anthropic-api-key-here"
+```
+
+**For Ollama (optional):**
+```bash
+export OLLAMA_MODEL="llama2"  # Default is llama2
+```
+
+You can also add these to a `.env` file or your shell configuration file
+(`.bashrc`, `.zshrc`, etc.).
+
+## Usage
+
+### Basic Usage
+
+Categorize all items using the free local LLMs (HuggingFace FLAN-T5, GPT-2,
+and DialoGPT):
+```bash
+python manage.py categorize
+```
+
+### With Options
+
+Categorize a limited number of items:
+```bash
+python manage.py categorize --limit 10
+# OR
+make categorize
+```
+
+Use a specific LLM provider:
+
+**FREE models (run locally):**
+```bash
+# Use HuggingFace FLAN-T5 (free, good at instruction following)
+python manage.py categorize --llm huggingface_flan_t5
+
+# Use HuggingFace GPT-2 (free, generative model)
+python manage.py categorize --llm huggingface_gpt2
+
+# Use HuggingFace DialoGPT (free, conversational model)
+python manage.py categorize --llm huggingface_dialogpt
+
+# Use Ollama (free, requires Ollama installed)
+python manage.py categorize --llm ollama
+```
+
+**Paid API models:**
+```bash
+# Use OpenAI GPT-4 (requires API key)
+python manage.py categorize --llm openai_gpt4
+
+# Use OpenAI GPT-3.5 Turbo (requires API key)
+python manage.py categorize --llm openai_gpt35
+
+# Use Anthropic Claude (requires API key)
+python manage.py categorize --llm anthropic_claude
+```
+
+Combine options:
+```bash
+python manage.py categorize --limit 5 --llm huggingface_flan_t5
+```
+
+## Architecture
+
+- `categorizer_service.py` - Main service for categorizing items
+- `llm_service.py` - Service for calling various LLM APIs
+- `management/commands/categorize.py` - Django management command
+
+## Supported LLMs
+
+### Free Models (No API Key Required)
+1. **HuggingFace FLAN-T5** - Google's instruction-following model (recommended for this task)
+2. **HuggingFace GPT-2** - OpenAI's classic generative model
+3. **HuggingFace DialoGPT** - Microsoft's conversational model
+4. **Ollama** - Run any Ollama model locally (llama2, mistral, etc.)
+
+### Paid API Models (Require API Key)
+1. **OpenAI GPT-4** - Most capable, but expensive
+2. **OpenAI GPT-3.5 Turbo** - Fast and cheaper than GPT-4
+3. **Anthropic Claude** - High quality, good reasoning
+
+## Performance Notes
+
+- **Free models** run locally and don't require internet access or API keys, but:
+  - The first run downloads the model (~1-3 GB depending on the model)
+  - They require sufficient RAM (4-8 GB+ recommended)
+  - They are slower than API models (especially without a GPU)
+
+- **API models** are faster but cost money per request
+
+- **Ollama** is a good middle ground - free, local, and supports many models
+
+## Extending
+
+To add support for additional LLM providers (a sketch follows this list):
+
+1. Add a new entry to the `LLMType` enum in `llm_service.py`
+2. Implement a new private method (e.g., `_call_new_provider`) in the `LLMService` class
+3. Register the provider in the `llm_handlers` mapping in `LLMService.__init__`
+4. Update the command choices in `management/commands/categorize.py`
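+
+A minimal sketch of steps 1-3 (illustrative only; `MY_PROVIDER` and
+`_call_my_provider` are placeholder names, not part of the codebase):
+
+```python
+class LLMType(Enum):
+    # ... existing entries ...
+    MY_PROVIDER = "my_provider"  # step 1: new enum entry
+
+
+class LLMService:
+    def __init__(self):
+        # ... existing handlers ...
+        # step 3: register the handler (the lambda adapts the signature)
+        self.llm_handlers[LLMType.MY_PROVIDER] = (
+            lambda llm_type, prompt: self._call_my_provider(prompt)
+        )
+
+    def _call_my_provider(self, prompt: str) -> str:
+        # step 2: provider-specific call that returns the raw response text
+        raise NotImplementedError
+```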
diff --git a/web/categorizer/__init__.py b/web/categorizer/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/web/categorizer/categorizer_service.py b/web/categorizer/categorizer_service.py
new file mode 100644
index 0000000..008289b
--- /dev/null
+++ b/web/categorizer/categorizer_service.py
@@ -0,0 +1,239 @@
+import logging
+
+from categorizer.llm_service import LLMService, LLMType
+from concepts.models import CategorizerResult, Item
+
+# Free LLM types used for categorization by default
+LLM_JUDGE_POOL = [
+    LLMType.HUGGINGFACE_FLAN_T5,
+    LLMType.HUGGINGFACE_GPT2,
+    LLMType.HUGGINGFACE_DIALOGPT,
+]
+
+
+class CategorizerService:
+    """
+    Service for categorizing mathematical concepts.
+    """
+
+    def __init__(self):
+        self.logger = logging.getLogger(__name__)
+        self.llm_service = LLMService()
+
+    def categorize_items(self, limit=None, llm_types=None):
+        """
+        Categorize items from the database.
+
+        Args:
+            limit: Optional limit on the number of items to process
+            llm_types: Optional list of LLMType members to use
+                (defaults to the free models in LLM_JUDGE_POOL)
+        """
+        llm_types = llm_types or LLM_JUDGE_POOL
+        queryset = Item.objects.all()
+        if limit:
+            queryset = queryset[:limit]
+
+        total = queryset.count()
+        self.logger.info(f"Categorizing {total} items using {len(llm_types)} LLMs")
+
+        for i, item in enumerate(queryset):
+            self.logger.info(f"Processing item {i + 1}/{total}: {item.identifier}")
+            self.categorize_item(item, llm_types=llm_types)
+
+        self.logger.info("Categorization complete")
+
+    def categorize_item(
+        self,
+        item,
+        predicate: str = "Is the given concept a mathematical concept,"
+        " given the name, description, "
+        "keywords, and article text?",
+        llm_types=None,
+    ):
+        """
+        Categorize a single item.
+
+        Args:
+            item: Item instance to categorize
+            predicate: The question to evaluate (default: checks if it's
+                a mathematical concept)
+            llm_types: Optional list of LLMType members to use
+                (defaults to the free models in LLM_JUDGE_POOL)
+
+        Returns:
+            List of categorization results from all LLMs used
+        """
+        self.logger.debug(f"Categorizing: {item.name}")
+
+        prompt = self._build_categorization_prompt(item, predicate)
+
+        results = []
+
+        for llm_type in llm_types or LLM_JUDGE_POOL:
+            try:
+                self.logger.info(f"Calling {llm_type.value} for {item.name}")
+                raw_result = self.llm_service.call_llm(llm_type, prompt)
+                self.logger.info(
+                    f"Categorized {item.name} with {llm_type.value}: "
+                    f"{raw_result[:100]}..."
+                )
+
+                parsed_result = self._parse_categorization_result(raw_result)
+
+                confidence = parsed_result["confidence"]
+                if confidence is None:
+                    confidence = 50
+
+                # create() already saves the row; no separate save() is needed
+                CategorizerResult.objects.create(
+                    item=item,
+                    llm_type=llm_type.value,
+                    raw_result=raw_result,
+                    result_answer=parsed_result["answer"],
+                    result_confidence=confidence,
+                )
+
+                self.logger.info(
+                    f"Saved categorization result for {item.name} ({llm_type.value}): "
+                    f"answer={parsed_result['answer']}, "
+                    f"confidence={parsed_result['confidence']}"
+                )
+
+                results.append(parsed_result)
+            except Exception as e:
+                self.logger.error(
+                    f"Failed to categorize {item.name} with {llm_type.value}: {e}"
+                )
+                # Continue with the remaining LLMs even if one fails
+                continue
+
+        return results
+
+    def _build_categorization_prompt(self, item, predicate: str):
+        """
+        Build a prompt for evaluating a concept against a predicate.
+
+        Args:
+            item: Item instance to categorize
+            predicate: The question/predicate to evaluate
+
+        Returns:
+            Formatted prompt string
+        """
+        system_prompt = """You are a categorization judge. Your task is to
+evaluate whether a given concept satisfies a specific predicate.
+
+You must respond with a structured answer containing:
+1. answer: true or false (boolean)
+2. confidence: a number from 0 to 100 (representing your confidence percentage)
+
+IMPORTANT: Format your response as a comma-separated string, for example:
+yes,85
+"""
+
+        item_info_parts = [f"Name: {item.name}"]
+
+        if item.description:
+            item_info_parts.append(f"Description: {item.description[:100]}")
+
+        if item.keywords:
+            item_info_parts.append(f"Keywords: {item.keywords[:200]}")
+
+        if item.article_text:
+            # Truncate the article text to 1000 characters
+            article_text = item.article_text[:1000]
+            item_info_parts.append(f"Article text: {article_text}")
+
+        item_info = "\n".join(item_info_parts)
+
+        prompt = f"""{system_prompt}
+
+---
+
+CONCEPT INFORMATION:
+{item_info}
+
+---
+
+PREDICATE TO EVALUATE:
+{predicate}
+
+---
+
+Please provide your evaluation in the comma-separated format specified above."""
+
+        return prompt
+
+    def _parse_categorization_result(self, result: str) -> dict:
+        """
+        Parse the LLM's comma-separated response.
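+
+        Examples of accepted responses (illustrative):
+            "yes,85" -> {"answer": True, "confidence": 85}
+            "no"     -> {"answer": False, "confidence": None}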
+
+        Args:
+            result: The raw response from the LLM. Expected format: "yes,85"
+                or "no,75"; bare or noisy answers such as "yes ---" are also
+                tolerated.
+
+        Returns:
+            Dictionary with 'answer' (bool) and 'confidence' (int or None) keys
+
+        Raises:
+            ValueError: If the response cannot be parsed
+        """
+        try:
+            # Clean the result string
+            result = result.strip()
+
+            # Split by comma (with or without space) or just space.
+            # Try separators in order of specificity: ", ", ",", " "
+            if ", " in result:
+                parts = result.split(", ", 1)
+            elif "," in result:
+                parts = result.split(",", 1)
+            else:
+                parts = result.split(" ", 1)
+
+            if len(parts) == 1:
+                # Only an answer provided, no confidence
+                answer_str = parts[0].strip().lower()
+                confidence = None
+            elif len(parts) == 2:
+                # Both answer and confidence provided
+                answer_str = parts[0].strip().lower()
+                confidence_str = parts[1].strip()
+
+                # Parse the confidence if provided
+                if confidence_str:
+                    try:
+                        confidence = int(confidence_str)
+                        if not 0 <= confidence <= 100:
+                            self.logger.warning(
+                                f"Confidence {confidence} out of range [0-100], "
+                                f"setting to None"
+                            )
+                            confidence = None
+                    except ValueError:
+                        self.logger.warning(
+                            f"Invalid confidence value '{confidence_str}', "
+                            f"setting to None"
+                        )
+                        confidence = None
+                else:
+                    confidence = None
+            else:
+                raise ValueError(
+                    f"Expected format 'answer' or 'answer,confidence', got: {result}"
+                )
+
+            # Parse the answer - accept yes/true/1 as True, no/false/0 as False
+            if answer_str in ("yes", "true", "1"):
+                answer = True
+            elif answer_str in ("no", "false", "0"):
+                answer = False
+            else:
+                raise ValueError(
+                    f"Invalid answer value: {answer_str}. "
+                    f"Expected yes/no, true/false, or 1/0"
+                )
+
+            return {"answer": answer, "confidence": confidence}
+
+        except (ValueError, IndexError) as e:
+            self.logger.error(f"Failed to parse response: {result}")
+            raise ValueError(f"Invalid response format: {e}")
diff --git a/web/categorizer/llm_service.py b/web/categorizer/llm_service.py
new file mode 100644
index 0000000..0d8d364
--- /dev/null
+++ b/web/categorizer/llm_service.py
@@ -0,0 +1,242 @@
+import logging
+import os
+from enum import Enum
+
+
+class LLMType(Enum):
+    """Supported LLM types"""
+
+    # Paid API-based models
+    OPENAI_GPT4 = "openai_gpt4"
+    OPENAI_GPT35 = "openai_gpt35"
+    ANTHROPIC_CLAUDE = "anthropic_claude"
+
+    # Free HuggingFace models (run locally)
+    HUGGINGFACE_FLAN_T5 = "huggingface_flan_t5"
+    HUGGINGFACE_GPT2 = "huggingface_gpt2"
+    HUGGINGFACE_DIALOGPT = "huggingface_dialogpt"
+
+    # Ollama (free local models)
+    OLLAMA = "ollama"
+
+
+class LLMService:
+    """
+    Service for calling various LLM providers.
+    """
+
+    def __init__(self):
+        self.logger = logging.getLogger(__name__)
+        # Cache of loaded HuggingFace pipelines, keyed by model ID, so that
+        # each local model is loaded at most once per service instance
+        self._hf_pipelines = {}
+        # Every handler takes (llm_type, prompt); lambdas adapt the
+        # provider-specific methods that need fewer or different arguments
+        self.llm_handlers = {
+            LLMType.OPENAI_GPT4: self._call_openai,
+            LLMType.OPENAI_GPT35: self._call_openai,
+            LLMType.ANTHROPIC_CLAUDE: lambda llm_type, prompt: self._call_anthrpc(
+                prompt
+            ),
+            LLMType.HUGGINGFACE_FLAN_T5: lambda llm_type, prompt: self._call_hgf(
+                "google/flan-t5-base", prompt
+            ),
+            LLMType.HUGGINGFACE_GPT2: lambda llm_type, prompt: self._call_hgf(
+                "gpt2", prompt
+            ),
+            LLMType.HUGGINGFACE_DIALOGPT: lambda llm_type, prompt: self._call_hgf(
+                "microsoft/DialoGPT-medium", prompt
+            ),
+            LLMType.OLLAMA: lambda llm_type, prompt: self._call_ollama(prompt),
+        }
+
+    def call_llm(self, llm_type: LLMType, prompt: str) -> str:
+        """
+        Call an LLM with the given prompt.
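+
+        Example (illustrative):
+            service = LLMService()
+            raw = service.call_llm(
+                LLMType.HUGGINGFACE_FLAN_T5, "Answer yes or no: ..."
+            )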
+
+        Args:
+            llm_type: The type of LLM to use (LLMType enum)
+            prompt: The prompt to send to the LLM
+
+        Returns:
+            The LLM's response as a string
+
+        Raises:
+            ValueError: If the LLM type is not supported or an API key is missing
+            Exception: If the API call fails
+        """
+        self.logger.info(f"Calling {llm_type.value} with prompt length: {len(prompt)}")
+
+        handler = self.llm_handlers.get(llm_type)
+
+        if handler:
+            return handler(llm_type, prompt)
+        else:
+            raise ValueError(f"Unsupported LLM type: {llm_type}")
+
+    def _call_openai(self, llm_type: LLMType, prompt: str) -> str:
+        """Call the OpenAI API"""
+        try:
+            import openai
+        except ImportError:
+            raise ImportError(
+                "openai package is required. Install it with: pip install openai"
+            )
+
+        api_key = os.getenv("OPENAI_API_KEY")
+        if not api_key:
+            raise ValueError(
+                "OPENAI_API_KEY environment variable is not set. "
+                "Please set it to your OpenAI API key."
+            )
+
+        # Client interface for openai>=1.0 (the module-level
+        # openai.ChatCompletion API was removed in 1.0)
+        client = openai.OpenAI(api_key=api_key)
+
+        model = "gpt-4" if llm_type == LLMType.OPENAI_GPT4 else "gpt-3.5-turbo"
+
+        try:
+            response = client.chat.completions.create(
+                model=model,
+                messages=[{"role": "user", "content": prompt}],
+                temperature=0.7,
+            )
+            return response.choices[0].message.content
+        except Exception as e:
+            self.logger.error(f"OpenAI API call failed: {e}")
+            raise
+
+    def _call_anthrpc(self, prompt: str) -> str:
+        """Call the Anthropic Claude API"""
+        try:
+            import anthropic
+        except ImportError:
+            raise ImportError(
+                "anthropic package is required. Install it with: pip install anthropic"
+            )
+
+        api_key = os.getenv("ANTHROPIC_API_KEY")
+        if not api_key:
+            raise ValueError(
+                "ANTHROPIC_API_KEY environment variable is not set. "
+                "Please set it to your Anthropic API key."
+            )
+
+        client = anthropic.Anthropic(api_key=api_key)
+
+        try:
+            response = client.messages.create(
+                model="claude-3-5-sonnet-20241022",
+                max_tokens=1024,
+                messages=[{"role": "user", "content": prompt}],
+            )
+            return response.content[0].text
+        except Exception as e:
+            self.logger.error(f"Anthropic API call failed: {e}")
+            raise
+
+    def _call_hgf(self, model_id: str, prompt: str) -> str:
+        """
+        Call HuggingFace models using langchain.
+
+        Args:
+            model_id: HuggingFace model ID (e.g., "google/flan-t5-base")
+            prompt: The prompt to send to the model
+
+        Returns:
+            The model's response
+        """
+        try:
+            from langchain_huggingface import HuggingFacePipeline
+        except ImportError:
+            raise ImportError(
+                "langchain-huggingface package is required. "
+                "Install it with: pip install langchain-huggingface"
+            )
+
+        try:
+            hf = self._hf_pipelines.get(model_id)
+            if hf is None:
+                self.logger.info(f"Loading HuggingFace model: {model_id}")
+
+                pipeline_kwargs = {
+                    "max_new_tokens": 512,
+                    "temperature": 0.7,
+                }
+
+                # Add pad_token_id for DialoGPT and GPT2
+                if "DialoGPT" in model_id or "gpt2" in model_id:
+                    pipeline_kwargs["pad_token_id"] = 50256
+
+                # Create the HuggingFace pipeline and cache it for later calls
+                hf = HuggingFacePipeline.from_model_id(
+                    model_id=model_id,
+                    task=(
+                        "text-generation"
+                        if "gpt" in model_id.lower()
+                        else "text2text-generation"
+                    ),
+                    pipeline_kwargs=pipeline_kwargs,
+                )
+                self._hf_pipelines[model_id] = hf
+
+            response = hf.invoke(prompt)
+
+            # GPT-2 echoes the prompt; strip it from the output
+            if "gpt2" in model_id.lower():
+                response = response.removeprefix(prompt).strip()
+
+            # Drop empty lines and the "---" separators some models emit
+            lines = response.split("\n")
+            cleaned_lines = []
+            for line in lines:
+                if line.strip() and line.strip() != "---":
+                    cleaned_lines.append(line)
+
+            response = "\n".join(cleaned_lines).strip()
+
+            # If we got nothing useful, return a default response
+            if not response:
+                self.logger.warning(
+                    f"Model {model_id} produced no useful output, "
+                    f"returning default: 'no, 0'"
+                )
+                response = "no, 0"
+
+            self.logger.info(f"HuggingFace model response length: {len(response)}")
+            return response
+
+        except Exception as e:
+            self.logger.error(f"HuggingFace model call failed: {e}")
+            raise
+
+    def _call_ollama(self, prompt: str, model: str = "llama2") -> str:
+        """
+        Call Ollama for local LLM inference.
+
+        Args:
+            prompt: The prompt to send to the model
+            model: Ollama model name (default: llama2)
+
+        Returns:
+            The model's response
+
+        Note:
+            Requires Ollama to be installed and running locally.
+            Install from: https://ollama.ai
+        """
+        try:
+            from langchain_community.llms import Ollama
+        except ImportError:
+            raise ImportError(
+                "langchain-community package is required. "
+                "Install it with: pip install langchain-community"
+            )
+
+        # Allow a model override via environment variable
+        model = os.getenv("OLLAMA_MODEL", model)
+
+        self.logger.info(f"Calling Ollama with model: {model}")
+
+        try:
+            llm = Ollama(model=model)
+            response = llm.invoke(prompt)
+            return response
+        except Exception as e:
+            self.logger.error(
+                f"Ollama call failed: {e}. "
+                "Make sure Ollama is installed and running (https://ollama.ai)"
+            )
+            raise
diff --git a/web/categorizer/management/__init__.py b/web/categorizer/management/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/web/categorizer/management/commands/__init__.py b/web/categorizer/management/commands/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/web/categorizer/management/commands/categorize.py b/web/categorizer/management/commands/categorize.py
new file mode 100644
index 0000000..21a6794
--- /dev/null
+++ b/web/categorizer/management/commands/categorize.py
@@ -0,0 +1,47 @@
+from categorizer.categorizer_service import CategorizerService
+from categorizer.llm_service import LLMType
+from django.core.management.base import BaseCommand
+
+
+class Command(BaseCommand):
+    help = "Categorize mathematical concepts using LLMs (all free ones by default)"
+
+    def add_arguments(self, parser):
+        parser.add_argument(
+            "--limit",
+            type=int,
+            default=None,
+            help="Limit the number of items to categorize",
+        )
+        parser.add_argument(
+            "--llm",
+            type=str,
+            default=None,
+            choices=[llm_type.value for llm_type in LLMType],
+            help="Use a single LLM provider instead of all free ones",
+        )
+
+    def handle(self, *args, **options):
+        limit = options.get("limit")
+        llm = options.get("llm")
+        llm_types = [LLMType(llm)] if llm else None
+
+        service = CategorizerService()
+
+        if llm:
+            self.stdout.write(f"Using LLM: {llm}")
+        else:
+            self.stdout.write(
+                "Using all free LLMs: huggingface_flan_t5, "
+                "huggingface_gpt2, huggingface_dialogpt"
+            )
+        if limit:
+            self.stdout.write(f"Categorizing up to {limit} items...")
+        else:
+            self.stdout.write("Categorizing all items...")
+
+        try:
+            service.categorize_items(limit=limit, llm_types=llm_types)
+            self.stdout.write(self.style.SUCCESS("Categorization complete!"))
+        except Exception as e:
+            self.stdout.write(self.style.ERROR(f"Categorization failed: {e}"))
diff --git a/web/concepts/admin.py b/web/concepts/admin.py
index 60e9bc5..83cb3c7 100644
--- a/web/concepts/admin.py
+++ b/web/concepts/admin.py
@@ -1,6 +1,6 @@
 from django.contrib import admin
 
-from .models import Item
+from .models import CategorizerResult, Item
 
 
 class ItemAdmin(admin.ModelAdmin):
@@ -9,4 +9,19 @@ class ItemAdmin(admin.ModelAdmin):
     list_filter = ["source"]
 
 
+class CategorizerResultAdmin(admin.ModelAdmin):
+    list_display = [
+        "item",
+        "llm_type",
+        "result_answer",
+        "result_confidence",
+        "created_at",
+    ]
+    search_fields = ["item__name", "item__identifier"]
+    list_filter = ["llm_type", "result_answer", "created_at"]
+    readonly_fields = ["created_at", "updated_at"]
+    ordering = ["-created_at"]
+
+
 admin.site.register(Item, ItemAdmin)
+admin.site.register(CategorizerResult, CategorizerResultAdmin)
diff --git a/web/concepts/migrations/0011_item_keywords.py b/web/concepts/migrations/0011_item_keywords.py
new file mode 100644
index 0000000..1773323
--- /dev/null
+++ b/web/concepts/migrations/0011_item_keywords.py
@@ -0,0 +1,18 @@
+# Generated by Django 4.2.25 on 2025-12-11 18:55
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ("concepts", "0010_alter_item_source"),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name="item",
+            name="keywords",
+            field=models.TextField(blank=True, null=True),
+        ),
+    ]
diff --git a/web/concepts/migrations/0012_item_article_text.py b/web/concepts/migrations/0012_item_article_text.py
new file mode 100644
index 0000000..4c1998d
--- /dev/null
+++ b/web/concepts/migrations/0012_item_article_text.py
@@ -0,0 +1,18 @@
+# Generated by Django 4.2.25 on 2025-12-11 20:41
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ("concepts", "0011_item_keywords"),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name="item",
+            name="article_text",
+            field=models.TextField(blank=True, null=True),
+        ),
+    ]
diff --git a/web/concepts/migrations/0013_item_aliases.py b/web/concepts/migrations/0013_item_aliases.py
new file mode 100644
index 0000000..510997c
--- /dev/null
+++ b/web/concepts/migrations/0013_item_aliases.py
@@ -0,0 +1,18 @@
+# Generated by Django 4.2.25 on 2025-12-11 21:19
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ("concepts", "0012_item_article_text"),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name="item",
+            name="aliases",
+            field=models.TextField(blank=True, null=True),
+        ),
+    ]
diff --git a/web/concepts/migrations/0014_categorizerresult.py b/web/concepts/migrations/0014_categorizerresult.py
new file mode 100644
index 0000000..96dc102
--- /dev/null
+++ b/web/concepts/migrations/0014_categorizerresult.py
@@ -0,0 +1,58 @@
+# Generated by Django 4.2.25 on 2025-12-11 22:37
+
+import django.db.models.deletion
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ("concepts", "0013_item_aliases"),
+    ]
+
+    operations = [
+        migrations.CreateModel(
+            name="CategorizerResult",
+            fields=[
+                (
+                    "id",
+                    models.BigAutoField(
+                        auto_created=True,
+                        primary_key=True,
+                        serialize=False,
+                        verbose_name="ID",
+                    ),
+                ),
+                ("llm_type", models.CharField(max_length=100)),
+                ("raw_result", models.TextField()),
+                ("result_answer", models.BooleanField()),
+                ("result_confidence", models.IntegerField()),
+                ("created_at", models.DateTimeField(auto_now_add=True)),
+                ("updated_at", models.DateTimeField(auto_now=True)),
+                (
+                    "item",
+                    models.ForeignKey(
+                        on_delete=django.db.models.deletion.CASCADE,
+                        related_name="categorizer_results",
+                        to="concepts.item",
+                    ),
+                ),
+            ],
+            options={
+                "ordering": ["-created_at"],
+                "indexes": [
+                    models.Index(
+                        fields=["item", "llm_type"],
+                        name="concepts_ca_item_id_c24595_idx",
+                    ),
+                    models.Index(
+                        fields=["result_answer"], name="concepts_ca_result__a4c7a5_idx"
+                    ),
+                    models.Index(
+                        fields=["result_confidence"],
+                        name="concepts_ca_result__d25f96_idx",
+                    ),
+                ],
+            },
+        ),
+    ]
diff --git a/web/concepts/models.py b/web/concepts/models.py
index 697eb61..f15c95e 100644
--- a/web/concepts/models.py
+++ b/web/concepts/models.py
@@ -88,6 +88,9 @@ def key():
     url = models.URLField(max_length=200)
     name = models.CharField(max_length=200, null=True)
     description = models.TextField(null=True)
+    keywords = models.TextField(null=True, blank=True)
+    article_text = models.TextField(null=True, blank=True)
+    aliases = models.TextField(null=True, blank=True)
     concept = models.ForeignKey(
         Concept,
         models.SET_NULL,
@@ -159,3 +162,33 @@ def save_new(source: Item, destination: Item, label: Label):
 
     def __str__(self):
         return f"{self.source} -[{self.get_label_display()}]-> {self.destination}"
+
+
+class CategorizerResult(models.Model):
+    """
+    Stores the result of categorizing an item using an LLM.
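+
+    One row is written per (item, LLM) pair on each categorizer run, e.g.
+    answer=True with confidence=85 for a concept judged to be mathematical.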
+ """ + + item = models.ForeignKey( + Item, on_delete=models.CASCADE, related_name="categorizer_results" + ) + llm_type = models.CharField(max_length=100) + raw_result = models.TextField() + result_answer = models.BooleanField() + result_confidence = models.IntegerField() + created_at = models.DateTimeField(auto_now_add=True) + updated_at = models.DateTimeField(auto_now=True) + + class Meta: + ordering = ["-created_at"] + indexes = [ + models.Index(fields=["item", "llm_type"]), + models.Index(fields=["result_answer"]), + models.Index(fields=["result_confidence"]), + ] + + def __str__(self): + return ( + f"{self.item} - {self.llm_type}: " + f"{self.result_answer} ({self.result_confidence}%)" + ) diff --git a/web/requirements.txt b/web/requirements.txt index 6b64bf7..4d6c286 100644 --- a/web/requirements.txt +++ b/web/requirements.txt @@ -1,2 +1,19 @@ Django~=4.2.6 -requests~=2.31.0 +requests~=2.32.5 +spacy~=3.7.0 --prefer-binary +scispacy~=0.6.2 +python-decouple~=3.8 + +# LLM dependencies (optional, install based on which LLM you want to use) +# For paid APIs: +# openai>=1.0.0 # Uncomment for OpenAI GPT models +# anthropic>=0.7.0 # Uncomment for Anthropic Claude + +# For free local models: +langchain-huggingface==0.3.1 # For HuggingFace models +langchain-community==0.3.27 # For Ollama and other local models +# Required by HuggingFace models +transformers~=4.57.0 +torch~=2.9.0 +# Speeds up model loading +accelerate~=1.12.0 diff --git a/web/slurper/keyword_util.py b/web/slurper/keyword_util.py new file mode 100644 index 0000000..0523e47 --- /dev/null +++ b/web/slurper/keyword_util.py @@ -0,0 +1,35 @@ +import spacy + +# TODO SST: Move to readme.md +# Load the scientific English model from scispacy +# Note: You need to download this model first with: +# make install-scispacy + +# Lazy-loaded spaCy model +_nlp = None + + +def _get_nlp(): + """Lazy-load the spaCy model only when needed.""" + global _nlp + if _nlp is None: + _nlp = spacy.load("en_core_sci_lg") + return _nlp + + +def extract_keywords(text): + """ + Extract keywords from text using spaCy's named entity recognition. + + Args: + text: The text to extract keywords from + + Returns: + A list of recognized entities (keywords) from the text + """ + if not text: + return [] + + nlp = _get_nlp() + doc = nlp(text) + return doc.ents diff --git a/web/slurper/source_wikidata.py b/web/slurper/source_wikidata.py index 06b6a97..758f346 100644 --- a/web/slurper/source_wikidata.py +++ b/web/slurper/source_wikidata.py @@ -1,15 +1,90 @@ import logging +import time +import urllib.parse import requests from concepts.models import Item from django.db.utils import IntegrityError from slurper.wd_raw_item import WD_OTHER_SOURCES, BaseWdRawItem +from web.settings import WIKIPEDIA_CONTACT_EMAIL + +# Wikipedia API contact email (required by Wikipedia API guidelines) +# Set to None to disable Wikipedia article fetching +_missing_email_logged = False + +# Wikidata entities to exclude from queries +KNOWN_EXCLUDED_CATEGORIES = [ + # Natural numbers + "wd:Q21199", + # positive integers + "wd:Q28920044", + # countries + "wd:Q6256", + # philosophical concepts + "wd:Q714737", +] + + +def _load_excluded_categories_from_results(): + """ + Load Wikidata identifiers of items that have been categorized as "no" + with confidence > 49%, to be excluded from future queries. + + Returns a list of Wikidata entity IDs in the format ["wd:Q12345", ...]. 
+ """ + try: + from concepts.models import CategorizerResult + from django.db.models import Avg + + excluded_items = ( + CategorizerResult.objects.filter( + result_answer=False, result_confidence__gt=49 + ) + .values("item__identifier", "item__source") + .annotate(avg_confidence=Avg("result_confidence")) + .filter(avg_confidence__gt=49, item__source=Item.Source.WIKIDATA) + .distinct() + ) + + categories = [f"wd:{item['item__identifier']}" for item in excluded_items] + + if categories: + logging.log( + logging.INFO, + f"Loaded {len(categories)} excluded categories " + f"from categorizer results", + ) + + return categories + except Exception as e: + logging.log( + logging.DEBUG, f"Could not load excluded categories from results: {e}" + ) + return [] + + +RESULT_EXCLUDED_CATEGORIES = _load_excluded_categories_from_results() + +EXCLUDED_CATEGORIES = KNOWN_EXCLUDED_CATEGORIES + RESULT_EXCLUDED_CATEGORIES + + +# These are added to every query: +# - Optional image: Fetches image if available +# - Optional Wikipedia link: Gets English Wikipedia article +# - Excludes natural numbers (FILTER NOT EXISTS) +# - Excludes humans (FILTER NOT EXISTS) +# - Label service: Automatically fetches English labels and descriptions +# +# The class fetches mathematical concepts from Wikidata while +# filtering out unwanted items like people and natural numbers. + class WikidataSlurper: SPARQL_URL = "https://query.wikidata.org/sparql" - SPARQL_QUERY_OPTIONS = """ + SPARQL_QUERY_OPTIONS = ( + """ OPTIONAL { ?item wdt:P18 ?image . } OPTIONAL @@ -18,9 +93,14 @@ class WikidataSlurper: schema:isPartOf ; schema:about ?item . } - # except for natural numbers - MINUS { - ?item wdt:P31 wd:Q21199 . + OPTIONAL + { ?item skos:altLabel ?itemAltLabel . FILTER (lang(?itemAltLabel) = "en") } + # except for natural numbers and positive integers + FILTER NOT EXISTS { + VALUES ?excludedType { """ + + " ".join(EXCLUDED_CATEGORIES) + + """ } + ?item wdt:P31 ?excludedType . } # except for humans FILTER NOT EXISTS{ ?item wdt:P31 wd:Q5 . } @@ -28,6 +108,7 @@ class WikidataSlurper: SERVICE wikibase:label { bd:serviceParam wikibase:language "en". } } """ + ) def __init__(self, source, query, limit=None): self.source = source @@ -35,6 +116,7 @@ def __init__(self, source, query, limit=None): """ SELECT DISTINCT ?item ?itemLabel ?itemDescription ?image ?wp_en + (GROUP_CONCAT(DISTINCT ?itemAltLabel; separator=", ") AS ?aliases) """ + self._sparql_source_vars_select() + """ @@ -43,6 +125,11 @@ def __init__(self, source, query, limit=None): + query + self._sparql_source_vars_triples() + self.SPARQL_QUERY_OPTIONS + + """ +GROUP BY ?item ?itemLabel ?itemDescription ?image ?wp_en """ + + " ".join([f"?{src['json_key']}" for src in WD_OTHER_SOURCES.values()]) + + """ +""" + (f"LIMIT {limit}" if limit is not None else "") ) self.raw_data = self.fetch_json() @@ -70,6 +157,128 @@ def fetch_json(self): ) return response.json()["results"]["bindings"] + def fetch_article(self, json_item, index=None, total=None): + global _missing_email_logged + + # Check if contact email is configured + if WIKIPEDIA_CONTACT_EMAIL is None: + if not _missing_email_logged: + logging.log( + logging.WARNING, + "WIKIPEDIA_CONTACT_EMAIL is not set. " + "Wikipedia article fetching is disabled. 
" + "Please set WIKIPEDIA_CONTACT_EMAIL at the top of " + "source_wikidata.py to enable article fetching.", + ) + _missing_email_logged = True + return None + + wp_url = json_item["wp_en"]["value"] + # Decode URL-encoded characters (e.g., %E2%80%93 becomes –) + article_title = urllib.parse.unquote(wp_url.split("/wiki/")[-1]) + + if index is not None and total is not None: + logging.log( + logging.INFO, + f"Fetching Wikipedia article [{index}/{total}]: {article_title}", + ) + else: + logging.log( + logging.INFO, + f"Fetching Wikipedia article: {article_title}", + ) + api_url = "https://en.wikipedia.org/w/api.php" + params = { + "action": "query", + "format": "json", + "titles": article_title, + "prop": "extracts", + "explaintext": True, + "exsectionformat": "plain", + } + headers = { + "User-Agent": f"MathSwitch/1.0 ({WIKIPEDIA_CONTACT_EMAIL})", + "Accept": "application/json", + "Accept-Language": "en-US,en;q=0.9", + } + # Retry logic with exponential backoff + max_retries = 3 + retry_delay = 1 # Start with 1 second + success = False + for attempt in range(max_retries): + try: + # Rate limiting: delay between requests (100 req/s max) + time.sleep(0.01) + + # Timeout: (connect_timeout, read_timeout) in seconds + response = requests.get( + api_url, params=params, headers=headers, timeout=(5, 30) + ) + + # Handle rate limiting + if response.status_code in (429, 403): + if attempt < max_retries - 1: + logging.log( + logging.WARNING, + f"Rate limited for {article_title}, retrying in " + f"{retry_delay}s (attempt {attempt + 1}/{max_retries})", + ) + time.sleep(retry_delay) + retry_delay *= 2 # Exponential backoff + continue + else: + logging.log( + logging.ERROR, + f"Failed to fetch {article_title} after " + f"{max_retries} attempts (rate limited). Skipping article.", + ) + break + + response.raise_for_status() + + if not response.text: + logging.log( + logging.WARNING, + f"Empty response for Wikipedia article: " + f"{article_title}. Skipping article.", + ) + break + + data = response.json() + pages = data.get("query", {}).get("pages", {}) + + # Get the first (and only) page + for page_id, page_data in pages.items(): + if "extract" in page_data: + success = True + return page_data["extract"] + + # Success, break retry loop + break + + except requests.exceptions.RequestException as e: + if attempt < max_retries - 1: + logging.log( + logging.WARNING, + f"Request failed for {article_title}: " + f"{e}, retrying in {retry_delay}s", + ) + time.sleep(retry_delay) + retry_delay *= 2 + else: + logging.log( + logging.ERROR, + f"Failed to fetch {article_title}" + f" after {max_retries} attempts: {e}. 
Skipping article.", + ) + if not success and "wp_en" in json_item: + logging.log( + logging.INFO, + f"Article {article_title} will have null value (fetch failed or empty)", + ) + + return None + def get_items(self): for json_item in self.raw_data: raw_item = BaseWdRawItem.raw_item(self.source, json_item) @@ -79,6 +288,12 @@ def get_items(self): if not raw_item_wd.item_exists(): yield raw_item_wd.to_item() if raw_item.has_source(Item.Source.WIKIPEDIA_EN): + # Fetch Wikipedia article if available + if "wp_en" in json_item and "article_text" not in json_item: + article_text = self.fetch_article(json_item) + if article_text is not None: + json_item["article_text"] = {"value": article_text} + raw_item_wp_en = raw_item.switch_source_to(Item.Source.WIKIPEDIA_EN) if not raw_item_wp_en.item_exists(): yield raw_item_wp_en.to_item() diff --git a/web/slurper/wd_raw_item.py b/web/slurper/wd_raw_item.py index cc71823..29a5539 100644 --- a/web/slurper/wd_raw_item.py +++ b/web/slurper/wd_raw_item.py @@ -1,6 +1,7 @@ from typing import Optional from concepts.models import Item, Link +from slurper.keyword_util import extract_keywords WD_OTHER_SOURCES = { Item.Source.NLAB: { @@ -42,6 +43,18 @@ def name(self): def description(self): return None + def aliases(self): + """Get aliases (alternative labels) if available.""" + if "aliases" in self.raw and self.raw["aliases"]["value"]: + return self.raw["aliases"]["value"] + return None + + def article_text(self): + """Get the Wikipedia article text if available.""" + if "article_text" in self.raw: + return self.raw["article_text"]["value"] + return None + def has_source(self, source): if source == Item.Source.WIKIPEDIA_EN: return "wp_en" in self.raw @@ -52,12 +65,26 @@ def switch_source_to(self, source): return BaseWdRawItem.raw_item(source, self.raw) def to_item(self) -> Optional[Item]: + # Extract keywords from article text if available + article = self.article_text() + keywords = None + + if article: + # Extract entities using spaCy + entities = extract_keywords(article) + # Convert to lowercase and create comma-separated string + keyword_list = [entity.text.lower() for entity in entities] + keywords = ", ".join(keyword_list) if keyword_list else None + return Item( source=self.source, identifier=self.identifier(), url=self.url(), name=self.name(), description=self.description(), + keywords=keywords, + article_text=article, + aliases=self.aliases(), ) def _get_item_queryset(self): diff --git a/web/web/settings.py b/web/web/settings.py index 31317db..8b33153 100644 --- a/web/web/settings.py +++ b/web/web/settings.py @@ -13,6 +13,8 @@ from os import path from pathlib import Path +from decouple import config + # Build paths inside the project like this: BASE_DIR / 'subdir'. BASE_DIR = Path(__file__).resolve().parent.parent @@ -21,7 +23,10 @@ # See https://docs.djangoproject.com/en/4.2/howto/deployment/checklist/ # SECURITY WARNING: keep the secret key used in production secret! -SECRET_KEY = "django-insecure-9wy9w#vf^tde0262doyy_j19=64c()_qub!1)f+fh-b^=7ndw*" +SECRET_KEY = config( + "SECRET_KEY", + default="django-insecure-9wy9w#vf^tde0262doyy_j19=64c()_qub!1)f+fh-b^=7ndw*", +) # SECURITY WARNING: don't run with debug turned on in production! 
 DEBUG = True
 
@@ -38,6 +43,7 @@
     "django.contrib.sessions",
     "django.contrib.messages",
     "django.contrib.staticfiles",
+    "categorizer",
     "concepts",
     "slurper",
     "web",
@@ -127,3 +133,6 @@
 # https://docs.djangoproject.com/en/4.2/ref/settings/#default-auto-field
 
 DEFAULT_AUTO_FIELD = "django.db.models.BigAutoField"
+
+# Contact email sent to the Wikipedia API; article fetching is disabled when unset
+WIKIPEDIA_CONTACT_EMAIL = config("WIKIPEDIA_CONTACT_EMAIL", default=None)
diff --git a/web/web/urls.py b/web/web/urls.py
index a50d3a4..958bd85 100644
--- a/web/web/urls.py
+++ b/web/web/urls.py
@@ -14,6 +14,7 @@
 1. Import the include() function: from django.urls import include, path
 2. Add a URL to urlpatterns: path('blog/', include('blog.urls'))
 """
+
 from concepts import views
 from django.conf import settings
 from django.conf.urls.static import static