Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions snippets.json
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,15 @@
"type": "EXTRACTION",
"actionTimeoutSecs": 1,
"contentTypes": "text/html"
},
{
"version": 1,
"javascript": "// TextRazor Entity Extraction\\n// IMPORTANT:\\n// You will need to supply your API key below which will be stored\\n// as part of your SEO Spider configuration in plain text. \\n// TextRazor's free plan is limited to 500 requests per day.\\n// TextRazor's free plan is capped to two concurrent requests, a delay is built in.\\n\\n// Dennis Stopa / Nodeyn :: https://dennisstopa.com / https://nodeyn.nl\\n\\n// API key and language override variables (keep your API key static in Screaming Frog)\\nconst TEXTRAZOR_API_KEY = '';\\nconst languageOverride = ''; // Leave blank if you don't want to override. Check https://www.textrazor.com/languages for supported languages\\n\\n// No more modifications needed\\nconst userContent = document.body.innerText; \\n\\nlet requestCounter = 0; // Initialize request counter\\nconst maxRequestsPerDay = 500; // Set the maximum requests per day, the free plan has a 500 requests per day\\n\\n// The free plan has a limit of two concurrent requests, the delay will handle this\\nfunction delay(ms) {\\n return new Promise(resolve => setTimeout(resolve, ms));\\n}\\n\\nasync function extractEntitiesWithDelay() {\\n const entities = [];\\n const chunkSize = 5000;\\n const textChunks = [];\\n\\n for (let i = 0; i < userContent.length; i += chunkSize) {\\n textChunks.push(userContent.substring(i, i + chunkSize));\\n }\\n\\n for (let i = 0; i < textChunks.length; i++) {\\n if (requestCounter >= maxRequestsPerDay) {\\n console.log('Reached the maximum number of requests for the day.');\\n break;\\n }\\n\\n const text = textChunks[i];\\n console.log('Sending text chunk to TextRazor:', text.slice(0, 200)); \\n\\n const bodyParams = new URLSearchParams({\\n text: text,\\n extractors: 'entities,topics',\\n });\\n\\n // Conditionally add the language override if it's provided\\n if (languageOverride) {\\n bodyParams.append('languageOverride', languageOverride);\\n }\\n\\n const response = await fetch('https://api.textrazor.com/', {\\n method: 'POST',\\n headers: {\\n 'x-textrazor-key': TEXTRAZOR_API_KEY, \\n 'Content-Type': 'application/x-www-form-urlencoded'\\n },\\n body: bodyParams.toString()\\n });\\n\\n if (response.ok) {\\n const data = await response.json();\\n console.log('TextRazor response:', data); // Log the response for debugging\\n\\n if (data.response.entities) {\\n entities.push(...data.response.entities);\\n requestCounter++;\\n }\\n } else {\\n const errorText = await response.text();\\n console.error('TextRazor API error:', errorText);\\n }\\n\\n if (i < textChunks.length - 1) {\\n await delay(1000);\\n }\\n }\\n\\n return entities;\\n}\\n\\nfunction isValidEntity(entity) {\\n const invalidTypes = ['Number', 'Cookie', 'Email', 'Date'];\\n const entityId = entity.entityId || entity.matchedText;\\n\\n if (entity.type && Array.isArray(entity.type) && entity.type.length > 0) {\\n if (invalidTypes.includes(entity.type[0]) || /^[0-9]+$/.test(entityId)) {\\n return false;\\n }\\n } else if (/^[0-9]+$/.test(entityId)) {\\n return false;\\n }\\n\\n return true;\\n}\\n\\nfunction processEntities(entities) {\\n const entitiesDict = {};\\n\\n entities.forEach(entity => {\\n if (isValidEntity(entity)) {\\n const entityId = entity.entityId || entity.matchedText;\\n const entityName = entity.matchedText.toLowerCase(); // Convert entity name to lowercase\\n const freebaseLink = entity.freebaseId ? `https://www.google.com/search?kgmid=${entity.freebaseId}` : '';\\n const wikiLink = entity.wikiLink || ''; // Ensure we're capturing the Wikipedia link correctly\\n\\n if (entityId !== 'None' && isNaN(entityName)) { // Filter out numeric-only entities\\n const key = entityName + freebaseLink; // Unique key based on name and link\\n if (!entitiesDict[key]) {\\n entitiesDict[key] = {\\n entity: entityName,\\n count: 1,\\n freebaseLink: freebaseLink,\\n wikiLink: wikiLink\\n };\\n } else {\\n entitiesDict[key].count += 1;\\n }\\n }\\n }\\n });\\n\\n const result = Object.values(entitiesDict).filter(item => item.entity && item.entity !== 'None'); // Filter out empty or 'None' entities\\n\\n return JSON.stringify(result);\\n}\\n\\nreturn extractEntitiesWithDelay()\\n .then(entities => {\\n if (entities.length === 0) {\\n console.warn('No entities found in the response.');\\n }\\n return seoSpider.data(processEntities(entities));\\n })\\n .catch(error => seoSpider.error(error));\\n",
"name": "TextRazor Entity Extraction",
"comments": "",
"type": "EXTRACTION",
"actionTimeoutSecs": 1,
"contentTypes": "text/html"
}
]
}