From f4c4a09d1ef3ef29fc8683dc6784896225d18750 Mon Sep 17 00:00:00 2001 From: Dennis Stopa Date: Wed, 21 Aug 2024 03:11:44 +0200 Subject: [PATCH] Update snippets.json Using TextRazor API to extract entities from URLs. It extracts the entities, count, Freebase and Wikipedia link (if existing). It has an option to override the language. Create your own API key at textrazor.com, 500 requests free per day. --- snippets.json | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/snippets.json b/snippets.json index 9f22f4c..050416e 100644 --- a/snippets.json +++ b/snippets.json @@ -45,6 +45,15 @@ "type": "EXTRACTION", "actionTimeoutSecs": 1, "contentTypes": "text/html" + }, + { + "version": 1, + "javascript": "// TextRazor Entity Extraction\\n// IMPORTANT:\\n// You will need to supply your API key below which will be stored\\n// as part of your SEO Spider configuration in plain text. \\n// TextRazor's free plan is limited to 500 requests per day.\\n// TextRazor's free plan is capped to two concurrent requests, a delay is built in.\\n\\n// Dennis Stopa / Nodeyn :: https://dennisstopa.com / https://nodeyn.nl\\n\\n// API key and language override variables (keep your API key static in Screaming Frog)\\nconst TEXTRAZOR_API_KEY = '';\\nconst languageOverride = ''; // Leave blank if you don't want to override. 
Check https://www.textrazor.com/languages for supported languages\\n\\n// No more modifications needed\\nconst userContent = document.body.innerText; \\n\\nlet requestCounter = 0; // Initialize request counter\\nconst maxRequestsPerDay = 500; // Set the maximum requests per day, the free plan has a 500 requests per day\\n\\n// The free plan has a limit of two concurrent requests, the delay will handle this\\nfunction delay(ms) {\\n return new Promise(resolve => setTimeout(resolve, ms));\\n}\\n\\nasync function extractEntitiesWithDelay() {\\n const entities = [];\\n const chunkSize = 5000;\\n const textChunks = [];\\n\\n for (let i = 0; i < userContent.length; i += chunkSize) {\\n textChunks.push(userContent.substring(i, i + chunkSize));\\n }\\n\\n for (let i = 0; i < textChunks.length; i++) {\\n if (requestCounter >= maxRequestsPerDay) {\\n console.log('Reached the maximum number of requests for the day.');\\n break;\\n }\\n\\n const text = textChunks[i];\\n console.log('Sending text chunk to TextRazor:', text.slice(0, 200)); \\n\\n const bodyParams = new URLSearchParams({\\n text: text,\\n extractors: 'entities,topics',\\n });\\n\\n // Conditionally add the language override if it's provided\\n if (languageOverride) {\\n bodyParams.append('languageOverride', languageOverride);\\n }\\n\\n const response = await fetch('https://api.textrazor.com/', {\\n method: 'POST',\\n headers: {\\n 'x-textrazor-key': TEXTRAZOR_API_KEY, \\n 'Content-Type': 'application/x-www-form-urlencoded'\\n },\\n body: bodyParams.toString()\\n });\\n\\n if (response.ok) {\\n const data = await response.json();\\n console.log('TextRazor response:', data); // Log the response for debugging\\n\\n if (data.response.entities) {\\n entities.push(...data.response.entities);\\n requestCounter++;\\n }\\n } else {\\n const errorText = await response.text();\\n console.error('TextRazor API error:', errorText);\\n }\\n\\n if (i < textChunks.length - 1) {\\n await delay(1000);\\n }\\n }\\n\\n return 
entities;\\n}\\n\\nfunction isValidEntity(entity) {\\n const invalidTypes = ['Number', 'Cookie', 'Email', 'Date'];\\n const entityId = entity.entityId || entity.matchedText;\\n\\n if (entity.type && Array.isArray(entity.type) && entity.type.length > 0) {\\n if (invalidTypes.includes(entity.type[0]) || /^[0-9]+$/.test(entityId)) {\\n return false;\\n }\\n } else if (/^[0-9]+$/.test(entityId)) {\\n return false;\\n }\\n\\n return true;\\n}\\n\\nfunction processEntities(entities) {\\n const entitiesDict = {};\\n\\n entities.forEach(entity => {\\n if (isValidEntity(entity)) {\\n const entityId = entity.entityId || entity.matchedText;\\n const entityName = entity.matchedText.toLowerCase(); // Convert entity name to lowercase\\n const freebaseLink = entity.freebaseId ? `https://www.google.com/search?kgmid=${entity.freebaseId}` : '';\\n const wikiLink = entity.wikiLink || ''; // Ensure we're capturing the Wikipedia link correctly\\n\\n if (entityId !== 'None' && isNaN(entityName)) { // Filter out numeric-only entities\\n const key = entityName + freebaseLink; // Unique key based on name and link\\n if (!entitiesDict[key]) {\\n entitiesDict[key] = {\\n entity: entityName,\\n count: 1,\\n freebaseLink: freebaseLink,\\n wikiLink: wikiLink\\n };\\n } else {\\n entitiesDict[key].count += 1;\\n }\\n }\\n }\\n });\\n\\n const result = Object.values(entitiesDict).filter(item => item.entity && item.entity !== 'None'); // Filter out empty or 'None' entities\\n\\n return JSON.stringify(result);\\n}\\n\\nreturn extractEntitiesWithDelay()\\n .then(entities => {\\n if (entities.length === 0) {\\n console.warn('No entities found in the response.');\\n }\\n return seoSpider.data(processEntities(entities));\\n })\\n .catch(error => seoSpider.error(error));\\n", + "name": "TextRazor Entity Extraction", + "comments": "", + "type": "EXTRACTION", + "actionTimeoutSecs": 1, + "contentTypes": "text/html" } ] } \ No newline at end of file