ipullrank · denstopa · Aug 21, 2024
diff --git a/snippets.json b/snippets.json
@@ -45,6 +45,15 @@
       "type": "EXTRACTION",
       "actionTimeoutSecs": 1,
       "contentTypes": "text/html"
+    },
+    {
+      "version": 1,
+      "javascript": "// TextRazor Entity Extraction\\n// IMPORTANT:\\n// You will need to supply your API key below which will be stored\\n// as part of your SEO Spider configuration in plain text.\\n// TextRazor's free plan is limited to 500 requests per day.\\n// TextRazor's free plan is capped at two concurrent requests, so a delay is built in.\\n\\n// Dennis Stopa / Nodeyn :: https://dennisstopa.com / https://nodeyn.nl\\n\\n// API key and language override variables (keep your API key static in Screaming Frog)\\nconst TEXTRAZOR_API_KEY = '';\\nconst languageOverride = ''; // Leave blank if you don't want to override. Check https://www.textrazor.com/languages for supported languages\\n\\n// No more modifications needed\\nconst userContent = document.body.innerText;\\n\\nlet requestCounter = 0; // Counts the requests sent by this run of the script\\nconst maxRequestsPerDay = 500; // Maximum number of requests; the free plan allows 500 requests per day\\n\\n// Resolves after the given number of milliseconds; used to space out the requests\\nfunction delay(ms) {\\n    return new Promise(resolve => setTimeout(resolve, ms));\\n}\\n\\n// Splits the page text into chunks, sends each chunk to TextRazor in turn\\n// and returns the entities collected from all successful responses.\\n// Throws when no API key has been configured.\\nasync function extractEntitiesWithDelay() {\\n    if (!TEXTRAZOR_API_KEY) {\\n        throw new Error('TextRazor API key is missing. Set TEXTRAZOR_API_KEY at the top of the snippet.');\\n    }\\n\\n    const entities = [];\\n    const chunkSize = 5000;\\n    const textChunks = [];\\n\\n    for (let i = 0; i < userContent.length; i += chunkSize) {\\n        const chunk = userContent.substring(i, i + chunkSize);\\n        // Skip chunks without any visible text so they do not use up a request\\n        if (chunk.trim()) {\\n            textChunks.push(chunk);\\n        }\\n    }\\n\\n    for (let i = 0; i < textChunks.length; i++) {\\n        if (requestCounter >= maxRequestsPerDay) {\\n            console.log('Reached the maximum number of requests for the day.');\\n            break;\\n        }\\n\\n        const text = textChunks[i];\\n        console.log('Sending text chunk to TextRazor:', text.slice(0, 200));\\n\\n        const bodyParams = new URLSearchParams({\\n            text: text,\\n            extractors: 'entities,topics',\\n        });\\n\\n        // Conditionally add the language override if it's provided\\n        if (languageOverride) {\\n            bodyParams.append('languageOverride', languageOverride);\\n        }\\n\\n        // Count every request that is sent, not only those that return entities\\n        requestCounter++;\\n        const response = await fetch('https://api.textrazor.com/', {\\n            method: 'POST',\\n            headers: {\\n                'x-textrazor-key': TEXTRAZOR_API_KEY,\\n                'Content-Type': 'application/x-www-form-urlencoded'\\n            },\\n            body: bodyParams.toString()\\n        });\\n\\n        if (response.ok) {\\n            const data = await response.json();\\n            console.log('TextRazor response:', data); // Log the response for debugging\\n\\n            // Guard against payloads that carry no entities list\\n            const found = data && data.response && data.response.entities;\\n            if (Array.isArray(found)) {\\n                entities.push(...found);\\n            }\\n        } else {\\n            const errorText = await response.text();\\n            console.error('TextRazor API error:', errorText);\\n        }\\n\\n        if (i < textChunks.length - 1) {\\n            await delay(1000);\\n        }\\n    }\\n\\n    return entities;\\n}\\n\\n// Returns false for entities with a purely numeric id and for entities\\n// whose first type is one of the excluded types.\\nfunction isValidEntity(entity) {\\n    const invalidTypes = ['Number', 'Cookie', 'Email', 'Date'];\\n    const entityId = entity.entityId || entity.matchedText;\\n\\n    if (/^[0-9]+$/.test(entityId)) {\\n        return false;\\n    }\\n\\n    const hasType = Array.isArray(entity.type) && entity.type.length > 0;\\n    return !(hasType && invalidTypes.includes(entity.type[0]));\\n}\\n\\n// Counts the valid entities per lowercased name plus freebase link\\n// and returns the aggregated list as a JSON string.\\nfunction processEntities(entities) {\\n    // A Map is used so that names such as 'constructor' cannot collide\\n    // with properties inherited by plain objects\\n    const entitiesByKey = new Map();\\n\\n    entities.forEach(entity => {\\n        if (!isValidEntity(entity)) {\\n            return;\\n        }\\n\\n        const entityId = entity.entityId || entity.matchedText;\\n        const entityName = (entity.matchedText || '').toLowerCase(); // Convert entity name to lowercase\\n        const freebaseLink = entity.freebaseId ? `https://www.google.com/search?kgmid=${entity.freebaseId}` : '';\\n        const wikiLink = entity.wikiLink || ''; // Ensure we're capturing the Wikipedia link correctly\\n\\n        // Filter out 'None' ids and numeric-only entities\\n        if (entityId === 'None' || !Number.isNaN(Number(entityName))) {\\n            return;\\n        }\\n\\n        const key = entityName + freebaseLink; // Unique key based on name and link\\n        const existing = entitiesByKey.get(key);\\n        if (existing) {\\n            existing.count += 1;\\n        } else {\\n            entitiesByKey.set(key, {\\n                entity: entityName,\\n                count: 1,\\n                freebaseLink: freebaseLink,\\n                wikiLink: wikiLink\\n            });\\n        }\\n    });\\n\\n    const result = Array.from(entitiesByKey.values()).filter(item => item.entity && item.entity !== 'None'); // Filter out empty or 'None' entities\\n\\n    return JSON.stringify(result);\\n}\\n\\nreturn extractEntitiesWithDelay()\\n    .then(entities => {\\n        if (entities.length === 0) {\\n            console.warn('No entities found in the response.');\\n        }\\n        return seoSpider.data(processEntities(entities));\\n    })\\n    .catch(error => seoSpider.error(error));\\n",
+      "name": "TextRazor Entity Extraction",
+      "comments": "",
+      "type": "EXTRACTION",
+      "actionTimeoutSecs": 1,
+      "contentTypes": "text/html"
     }
   ]
 }