diff --git a/snippets.json b/snippets.json index 9f22f4c..050416e 100644 --- a/snippets.json +++ b/snippets.json @@ -45,6 +45,15 @@ "type": "EXTRACTION", "actionTimeoutSecs": 1, "contentTypes": "text/html" + }, + { + "version": 1, + "javascript": "// TextRazor Entity Extraction\\n// IMPORTANT:\\n// You will need to supply your API key below which will be stored\\n// as part of your SEO Spider configuration in plain text. \\n// TextRazor's free plan is limited to 500 requests per day.\\n// TextRazor's free plan is capped at two concurrent requests; a delay is built in.\\n\\n// Dennis Stopa / Nodeyn :: https://dennisstopa.com / https://nodeyn.nl\\n\\n// API key and language override variables (keep your API key static in Screaming Frog)\\nconst TEXTRAZOR_API_KEY = '';\\nconst languageOverride = ''; // Leave blank if you don't want to override. Check https://www.textrazor.com/languages for supported languages\\n\\n// No more modifications needed\\nconst userContent = document.body.innerText; \\n\\nlet requestCounter = 0; // Initialize request counter\\nconst maxRequestsPerDay = 500; // Set the maximum requests per day; the free plan allows 500 requests per day\\n\\n// The free plan has a limit of two concurrent requests; the delay will handle this\\nfunction delay(ms) {\\n return new Promise(resolve => setTimeout(resolve, ms));\\n}\\n\\nasync function extractEntitiesWithDelay() {\\n const entities = [];\\n const chunkSize = 5000;\\n const textChunks = [];\\n\\n for (let i = 0; i < userContent.length; i += chunkSize) {\\n textChunks.push(userContent.substring(i, i + chunkSize));\\n }\\n\\n for (let i = 0; i < textChunks.length; i++) {\\n if (requestCounter >= maxRequestsPerDay) {\\n console.log('Reached the maximum number of requests for the day.');\\n break;\\n }\\n\\n const text = textChunks[i];\\n console.log('Sending text chunk to TextRazor:', text.slice(0, 200)); \\n\\n const bodyParams = new URLSearchParams({\\n text: text,\\n extractors: 'entities,topics',\\n 
});\\n\\n // Conditionally add the language override if it's provided\\n if (languageOverride) {\\n bodyParams.append('languageOverride', languageOverride);\\n }\\n\\n const response = await fetch('https://api.textrazor.com/', {\\n method: 'POST',\\n headers: {\\n 'x-textrazor-key': TEXTRAZOR_API_KEY, \\n 'Content-Type': 'application/x-www-form-urlencoded'\\n },\\n body: bodyParams.toString()\\n });\\n\\n if (response.ok) {\\n const data = await response.json();\\n console.log('TextRazor response:', data); // Log the response for debugging\\n\\n if (data.response.entities) {\\n entities.push(...data.response.entities);\\n requestCounter++;\\n }\\n } else {\\n const errorText = await response.text();\\n console.error('TextRazor API error:', errorText);\\n }\\n\\n if (i < textChunks.length - 1) {\\n await delay(1000);\\n }\\n }\\n\\n return entities;\\n}\\n\\nfunction isValidEntity(entity) {\\n const invalidTypes = ['Number', 'Cookie', 'Email', 'Date'];\\n const entityId = entity.entityId || entity.matchedText;\\n\\n if (entity.type && Array.isArray(entity.type) && entity.type.length > 0) {\\n if (invalidTypes.includes(entity.type[0]) || /^[0-9]+$/.test(entityId)) {\\n return false;\\n }\\n } else if (/^[0-9]+$/.test(entityId)) {\\n return false;\\n }\\n\\n return true;\\n}\\n\\nfunction processEntities(entities) {\\n const entitiesDict = {};\\n\\n entities.forEach(entity => {\\n if (isValidEntity(entity)) {\\n const entityId = entity.entityId || entity.matchedText;\\n const entityName = entity.matchedText.toLowerCase(); // Convert entity name to lowercase\\n const freebaseLink = entity.freebaseId ? 
`https://www.google.com/search?kgmid=${entity.freebaseId}` : '';\\n const wikiLink = entity.wikiLink || ''; // Ensure we're capturing the Wikipedia link correctly\\n\\n if (entityId !== 'None' && isNaN(entityName)) { // Filter out numeric-only entities\\n const key = entityName + freebaseLink; // Unique key based on name and link\\n if (!entitiesDict[key]) {\\n entitiesDict[key] = {\\n entity: entityName,\\n count: 1,\\n freebaseLink: freebaseLink,\\n wikiLink: wikiLink\\n };\\n } else {\\n entitiesDict[key].count += 1;\\n }\\n }\\n }\\n });\\n\\n const result = Object.values(entitiesDict).filter(item => item.entity && item.entity !== 'None'); // Filter out empty or 'None' entities\\n\\n return JSON.stringify(result);\\n}\\n\\nreturn extractEntitiesWithDelay()\\n .then(entities => {\\n if (entities.length === 0) {\\n console.warn('No entities found in the response.');\\n }\\n return seoSpider.data(processEntities(entities));\\n })\\n .catch(error => seoSpider.error(error));\\n", + "name": "TextRazor Entity Extraction", + "comments": "", + "type": "EXTRACTION", + "actionTimeoutSecs": 1, + "contentTypes": "text/html" } ] } \ No newline at end of file