Skip to content
Draft
3 changes: 2 additions & 1 deletion dev/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@ main();
*/
async function main() {
const json = {
input: "dev/doc-content.dita",
input: "https://www.doc-detective.com/sitemap.xml",
crawl: true,
logLevel: "debug",
runOn: [
{
Expand Down
40 changes: 40 additions & 0 deletions src/config.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -502,3 +502,43 @@ describe("resolveConcurrentRunners", function () {
expect(result.concurrentRunners).to.equal(4);
});
});

// Verifies that setConfig keeps the `crawl` flag intact through validation
// and defaults it to false when callers leave it out.
describe("crawl config field", function () {
  // Builds a minimal input config; `crawl` is only set when a value is given.
  const buildConfig = (crawl) => {
    const config = {
      input: ["https://example.com"],
      logLevel: "info",
      fileTypes: ["markdown"],
    };
    if (crawl !== undefined) {
      config.crawl = crawl;
    }
    return config;
  };

  it("should preserve crawl field through validation", async function () {
    const result = await setConfig({ config: buildConfig(true) });
    expect(result.crawl).to.equal(true);
  });

  it("should handle crawl field set to false", async function () {
    const result = await setConfig({ config: buildConfig(false) });
    expect(result.crawl).to.equal(false);
  });

  it("should default crawl field to false when not specified", async function () {
    const result = await setConfig({ config: buildConfig() });
    expect(result.crawl).to.equal(false);
  });
});
190 changes: 190 additions & 0 deletions src/crawler.integration.test.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,190 @@
const assert = require("assert");
const sinon = require("sinon");
const proxyquire = require("proxyquire");

// chai is ESM-only in recent versions, so load `expect` dynamically once
// before any suite runs and expose it globally for every test in this file.
before(async function () {
  const chai = await import("chai");
  global.expect = chai.expect;
});

// Integration tests for sitemap handling in qualifyFiles(). The utils module
// is loaded through proxyquire so that HTTP (axios), the filesystem (fs),
// the crawler module, and doc-detective-common are all replaced with stubs —
// no real network or disk access happens in these tests.
describe("crawler integration", function () {
  let qualifyFiles, axiosStub, fsStub, crawlSitemapStub, readFileStub;

  beforeEach(function () {
    // Stubbed HTTP client; per-URL behavior is configured below.
    axiosStub = {
      get: sinon.stub(),
    };

    // Stubbed fs surface covering the calls qualifyFiles is expected to make.
    fsStub = {
      statSync: sinon.stub(),
      readdirSync: sinon.stub(),
      existsSync: sinon.stub(),
      mkdirSync: sinon.stub(),
      writeFileSync: sinon.stub(),
    };

    crawlSitemapStub = sinon.stub();
    readFileStub = sinon.stub().resolves({});

    // Mock fetchFile behavior
    // URLs ending in "sitemap.xml" return a minimal two-entry sitemap;
    // every other URL returns an empty body.
    axiosStub.get.callsFake(async (url) => {
      if (url.endsWith("sitemap.xml")) {
        return {
          data: `<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<url><loc>https://example.com/page1</loc></url>
<url><loc>https://example.com/page2</loc></url>
</urlset>`,
        };
      }
      return { data: "" };
    });

    // Rewire ./utils against the stubs above. Validation is short-circuited
    // to always pass so only the crawl wiring is exercised.
    const utilsModule = proxyquire("./utils", {
      axios: axiosStub,
      fs: fsStub,
      "./crawler": { crawlSitemap: crawlSitemapStub },
      "doc-detective-common": {
        validate: () => ({ valid: true }),
        resolvePaths: (x) => x,
        transformToSchemaKey: (x) => x,
        readFile: readFileStub,
      },
    });

    qualifyFiles = utilsModule.qualifyFiles;
  });

  afterEach(function () {
    // Reset all sinon fakes so stub state never leaks between tests.
    sinon.restore();
  });

  it("should process sitemap.xml URLs when crawl is true", async function () {
    const config = {
      input: ["https://example.com/sitemap.xml"],
      crawl: true,
      logLevel: "info",
      fileTypes: [],
    };

    crawlSitemapStub.resolves([
      "https://example.com/page1",
      "https://example.com/page2",
    ]);

    // Mock file system calls for fetched files
    fsStub.existsSync.returns(true);
    fsStub.statSync.returns({ isFile: () => true, isDirectory: () => false });

    await qualifyFiles({ config });

    // The crawler must be invoked exactly once, with the original sitemap URL.
    expect(crawlSitemapStub.calledOnce).to.be.true;
    expect(crawlSitemapStub.firstCall.args[0].sitemapUrl).to.equal("https://example.com/sitemap.xml");
  });

  it("should not process non-sitemap URLs", async function () {
    // `crawl` is omitted here; a plain HTML URL must never reach the crawler.
    const config = {
      input: ["https://example.com/page.html"],
      logLevel: "info",
      fileTypes: [],
    };

    // Mock file system calls for fetched files
    fsStub.existsSync.returns(true);
    fsStub.statSync.returns({ isFile: () => true, isDirectory: () => false });

    await qualifyFiles({ config });

    expect(crawlSitemapStub.called).to.be.false;
  });

  it("should disable processing when crawl is false", async function () {
    // Even a sitemap URL must be left alone when crawl is explicitly false.
    const config = {
      input: ["https://example.com/sitemap.xml"],
      crawl: false,
      logLevel: "info",
      fileTypes: [],
    };

    // Mock file system calls for fetched files
    fsStub.existsSync.returns(true);
    fsStub.statSync.returns({ isFile: () => true, isDirectory: () => false });

    await qualifyFiles({ config });

    expect(crawlSitemapStub.called).to.be.false;
  });

  it("should enable processing when crawl is true", async function () {
    const config = {
      input: ["https://example.com/sitemap.xml"],
      crawl: true,
      logLevel: "info",
      fileTypes: [],
    };

    crawlSitemapStub.resolves([
      "https://example.com/page1",
      "https://example.com/page2",
    ]);

    // Mock file system calls for fetched files
    fsStub.existsSync.returns(true);
    fsStub.statSync.returns({ isFile: () => true, isDirectory: () => false });

    await qualifyFiles({ config });

    expect(crawlSitemapStub.calledOnce).to.be.true;
  });

  it("should not process file:// URLs", async function () {
    // NOTE(review): input is empty here, so this actually exercises the
    // no-input path rather than a real file:// URL — consider adding one.
    const config = {
      input: [],
      logLevel: "info",
      fileTypes: [],
    };

    await qualifyFiles({ config });

    expect(crawlSitemapStub.called).to.be.false;
  });

  it("should log sitemap processing activity", async function () {
    const config = {
      input: ["https://example.com/sitemap.xml"],
      crawl: true,
      logLevel: "info",
      fileTypes: [],
    };

    crawlSitemapStub.resolves([
      "https://example.com/page1",
      "https://example.com/page2",
    ]);

    // Mock file system calls for fetched files
    fsStub.existsSync.returns(true);
    fsStub.statSync.returns({ isFile: () => true, isDirectory: () => false });

    // Capture console output
    // console.log is wrapped (not replaced) so output still reaches the
    // terminal while being recorded for the assertions below.
    const originalConsoleLog = console.log;
    const logOutput = [];
    console.log = (...args) => {
      logOutput.push(args.join(" "));
      originalConsoleLog(...args);
    };

    try {
      await qualifyFiles({ config });

      // Check that processing info was logged
      const hasProcessingLog = logOutput.some((msg) => msg.includes("Processing") && msg.includes("sitemap"));
      const hasDiscoveredLog = logOutput.some((msg) => msg.includes("Discovered"));

      expect(hasProcessingLog).to.be.true;
      expect(hasDiscoveredLog).to.be.true;
    } finally {
      // Always restore console.log, even if an assertion above throws.
      console.log = originalConsoleLog;
    }
  });
});
111 changes: 111 additions & 0 deletions src/crawler.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
const axios = require("axios");

exports.extractXmlSitemapUrls = extractXmlSitemapUrls;
exports.isSameOrigin = isSameOrigin;
exports.crawlSitemap = crawlSitemap;

/**
* Extracts URLs from XML sitemap.
*
* @param {string} xml - The XML sitemap content to parse
* @returns {string[]} - Array of extracted URLs
*/
/**
 * Extracts URLs from XML sitemap content.
 *
 * Per the sitemap protocol, <loc> values are XML-entity-escaped (e.g. "&"
 * in a query string appears as "&amp;"), so predefined entities and numeric
 * character references are decoded before the URLs are returned.
 *
 * @param {string} xml - The XML sitemap content to parse
 * @returns {string[]} - Array of extracted, entity-decoded URLs
 */
function extractXmlSitemapUrls(xml) {
  // Non-string input (null, Buffer, parsed object) yields no URLs.
  if (typeof xml !== "string") {
    return [];
  }

  const urls = [];
  // Match <loc> tags in XML sitemaps. A fresh regex literal per call avoids
  // stateful `lastIndex` leaking between invocations of a shared /g regex.
  const locRegex = /<loc>([^<]+)<\/loc>/gi;
  let match;

  while ((match = locRegex.exec(xml)) !== null) {
    const url = decodeXmlEntities(match[1].trim());
    if (url) {
      urls.push(url);
    }
  }

  return urls;
}

/**
 * Decodes the five predefined XML entities and numeric character references
 * that may appear inside sitemap <loc> values.
 *
 * @param {string} text - XML-escaped text
 * @returns {string} - Decoded text
 */
function decodeXmlEntities(text) {
  return text
    .replace(/&#x([0-9a-f]+);/gi, (_, hex) => String.fromCodePoint(Number.parseInt(hex, 16)))
    .replace(/&#(\d+);/g, (_, dec) => String.fromCodePoint(Number.parseInt(dec, 10)))
    .replace(/&lt;/g, "<")
    .replace(/&gt;/g, ">")
    .replace(/&quot;/g, '"')
    .replace(/&apos;/g, "'")
    // &amp; is decoded last so double-escaped input stays singly-escaped.
    .replace(/&amp;/g, "&");
}

/**
* Compares two URLs for strict origin matching.
*
* @param {string} url1 - First URL to compare
* @param {string} url2 - Second URL to compare
* @returns {boolean} - True if origins match strictly (protocol, hostname, and port)
*/
/**
 * Determines whether two URLs share the same origin.
 *
 * Origins match only when protocol, hostname, and port are all equal.
 *
 * @param {string} url1 - First URL to compare
 * @param {string} url2 - Second URL to compare
 * @returns {boolean} - True when both URLs parse and share an origin
 */
function isSameOrigin(url1, url2) {
  let first;
  let second;
  try {
    first = new URL(url1);
    second = new URL(url2);
  } catch (error) {
    // Unparseable URLs can never share an origin.
    return false;
  }

  // Guard clauses: bail out on the first mismatched component.
  if (first.protocol !== second.protocol) {
    return false;
  }
  if (first.hostname !== second.hostname) {
    return false;
  }
  return first.port === second.port;
}

/**
* Processes an XML sitemap and extracts all URLs.
*
* @param {Object} options - Crawling options
* @param {Object} options.config - Configuration object
* @param {string} options.sitemapUrl - URL of the sitemap to process
* @param {Function} options.log - Logging function (optional)
* @returns {Promise<string[]>} - Promise resolving to array of all discovered URLs
*/
/**
 * Fetches an XML sitemap and returns every same-origin URL it lists.
 *
 * @param {Object} options - Crawling options
 * @param {Object} options.config - Configuration object
 * @param {string} options.sitemapUrl - URL of the sitemap to process
 * @param {Function} options.log - Logging function (optional)
 * @returns {Promise<string[]>} - Promise resolving to array of all discovered URLs
 */
async function crawlSitemap({ config, sitemapUrl, log }) {
  // Fall back to a no-op so callers may omit the logger entirely.
  const logger = log || (() => {});

  logger(config, "debug", `Processing sitemap: ${sitemapUrl}`);

  // Fetch the sitemap; on failure, report and return no URLs.
  let content;
  let finalUrl = sitemapUrl;
  try {
    const response = await axios.get(sitemapUrl, {
      timeout: 30000,
      maxRedirects: 5,
    });
    content = response.data;

    // Origin checks must use the post-redirect URL, not the one requested.
    const redirectedUrl = response.request?.res?.responseUrl;
    if (redirectedUrl) {
      finalUrl = redirectedUrl;
      logger(config, "debug", `Sitemap redirected to: ${finalUrl}`);
    }
  } catch (error) {
    logger(config, "warn", `Failed to fetch sitemap ${sitemapUrl}: ${error.message}`);
    return [];
  }

  // Keep only same-origin URLs from the sitemap body (string bodies only).
  const discoveredUrls = [];
  if (typeof content === "string") {
    for (const url of extractXmlSitemapUrls(content)) {
      if (!isSameOrigin(url, finalUrl)) {
        logger(config, "debug", `Skipping cross-origin URL: ${url}`);
        continue;
      }
      discoveredUrls.push(url);
    }
  }

  logger(config, "info", `Discovered ${discoveredUrls.length} URL(s) from sitemap`);

  return discoveredUrls;
}
Loading