diff --git a/dev/index.js b/dev/index.js index 847624b..03afb33 100644 --- a/dev/index.js +++ b/dev/index.js @@ -12,7 +12,8 @@ main(); */ async function main() { const json = { - input: "dev/doc-content.dita", + input: "https://www.doc-detective.com/sitemap.xml", + crawl: true, logLevel: "debug", runOn: [ { diff --git a/src/config.test.js b/src/config.test.js index 3880c5b..dc8e283 100644 --- a/src/config.test.js +++ b/src/config.test.js @@ -502,3 +502,43 @@ describe("resolveConcurrentRunners", function () { expect(result.concurrentRunners).to.equal(4); }); }); + +describe("crawl config field", function () { + it("should preserve crawl field through validation", async function () { + const inputConfig = { + input: ["https://example.com"], + crawl: true, + logLevel: "info", + fileTypes: ["markdown"] + }; + + const result = await setConfig({ config: inputConfig }); + + expect(result.crawl).to.equal(true); + }); + + it("should handle crawl field set to false", async function () { + const inputConfig = { + input: ["https://example.com"], + crawl: false, + logLevel: "info", + fileTypes: ["markdown"] + }; + + const result = await setConfig({ config: inputConfig }); + + expect(result.crawl).to.equal(false); + }); + + it("should default crawl field to false when not specified", async function () { + const inputConfig = { + input: ["https://example.com"], + logLevel: "info", + fileTypes: ["markdown"] + }; + + const result = await setConfig({ config: inputConfig }); + + expect(result.crawl).to.equal(false); + }); +}); diff --git a/src/crawler.integration.test.js b/src/crawler.integration.test.js new file mode 100644 index 0000000..ad885c3 --- /dev/null +++ b/src/crawler.integration.test.js @@ -0,0 +1,190 @@ +const assert = require("assert"); +const sinon = require("sinon"); +const proxyquire = require("proxyquire"); + +before(async function () { + const { expect } = await import("chai"); + global.expect = expect; +}); + +describe("crawler integration", function () { + let 
qualifyFiles, axiosStub, fsStub, crawlSitemapStub, readFileStub; + + beforeEach(function () { + axiosStub = { + get: sinon.stub(), + }; + + fsStub = { + statSync: sinon.stub(), + readdirSync: sinon.stub(), + existsSync: sinon.stub(), + mkdirSync: sinon.stub(), + writeFileSync: sinon.stub(), + }; + + crawlSitemapStub = sinon.stub(); + readFileStub = sinon.stub().resolves({}); + + // Mock fetchFile behavior + axiosStub.get.callsFake(async (url) => { + if (url.endsWith("sitemap.xml")) { + return { + data: ` + + https://example.com/page1 + https://example.com/page2 + `, + }; + } + return { data: "" }; + }); + + const utilsModule = proxyquire("./utils", { + axios: axiosStub, + fs: fsStub, + "./crawler": { crawlSitemap: crawlSitemapStub }, + "doc-detective-common": { + validate: () => ({ valid: true }), + resolvePaths: (x) => x, + transformToSchemaKey: (x) => x, + readFile: readFileStub, + }, + }); + + qualifyFiles = utilsModule.qualifyFiles; + }); + + afterEach(function () { + sinon.restore(); + }); + + it("should process sitemap.xml URLs when crawl is true", async function () { + const config = { + input: ["https://example.com/sitemap.xml"], + crawl: true, + logLevel: "info", + fileTypes: [], + }; + + crawlSitemapStub.resolves([ + "https://example.com/page1", + "https://example.com/page2", + ]); + + // Mock file system calls for fetched files + fsStub.existsSync.returns(true); + fsStub.statSync.returns({ isFile: () => true, isDirectory: () => false }); + + await qualifyFiles({ config }); + + expect(crawlSitemapStub.calledOnce).to.be.true; + expect(crawlSitemapStub.firstCall.args[0].sitemapUrl).to.equal("https://example.com/sitemap.xml"); + }); + + it("should not process non-sitemap URLs", async function () { + const config = { + input: ["https://example.com/page.html"], + logLevel: "info", + fileTypes: [], + }; + + // Mock file system calls for fetched files + fsStub.existsSync.returns(true); + fsStub.statSync.returns({ isFile: () => true, isDirectory: () => false }); 
+ + await qualifyFiles({ config }); + + expect(crawlSitemapStub.called).to.be.false; + }); + + it("should disable processing when crawl is false", async function () { + const config = { + input: ["https://example.com/sitemap.xml"], + crawl: false, + logLevel: "info", + fileTypes: [], + }; + + // Mock file system calls for fetched files + fsStub.existsSync.returns(true); + fsStub.statSync.returns({ isFile: () => true, isDirectory: () => false }); + + await qualifyFiles({ config }); + + expect(crawlSitemapStub.called).to.be.false; + }); + + it("should enable processing when crawl is true", async function () { + const config = { + input: ["https://example.com/sitemap.xml"], + crawl: true, + logLevel: "info", + fileTypes: [], + }; + + crawlSitemapStub.resolves([ + "https://example.com/page1", + "https://example.com/page2", + ]); + + // Mock file system calls for fetched files + fsStub.existsSync.returns(true); + fsStub.statSync.returns({ isFile: () => true, isDirectory: () => false }); + + await qualifyFiles({ config }); + + expect(crawlSitemapStub.calledOnce).to.be.true; + }); + + it("should not process file:// URLs", async function () { + const config = { + input: [], + logLevel: "info", + fileTypes: [], + }; + + await qualifyFiles({ config }); + + expect(crawlSitemapStub.called).to.be.false; + }); + + it("should log sitemap processing activity", async function () { + const config = { + input: ["https://example.com/sitemap.xml"], + crawl: true, + logLevel: "info", + fileTypes: [], + }; + + crawlSitemapStub.resolves([ + "https://example.com/page1", + "https://example.com/page2", + ]); + + // Mock file system calls for fetched files + fsStub.existsSync.returns(true); + fsStub.statSync.returns({ isFile: () => true, isDirectory: () => false }); + + // Capture console output + const originalConsoleLog = console.log; + const logOutput = []; + console.log = (...args) => { + logOutput.push(args.join(" ")); + originalConsoleLog(...args); + }; + + try { + await qualifyFiles({ 
/**
 * The five entity names predefined by XML, mapped to the characters they encode.
 * A Map (rather than a plain object) keeps lookups such as "constructor" from
 * resolving to inherited properties.
 */
const XML_NAMED_ENTITIES = new Map([
  ["amp", "&"],
  ["lt", "<"],
  ["gt", ">"],
  ["quot", '"'],
  ["apos", "'"],
]);

/**
 * Decodes XML entity and numeric character references in a text value.
 *
 * Sitemap URLs must be entity-escaped (for example `&amp;` between query
 * parameters), so the raw text of a `<loc>` element is not a usable URL until
 * it has been decoded. Unknown or out-of-range references are left untouched.
 *
 * @param {string} text - Raw element text
 * @returns {string} - Text with references replaced by their characters
 */
function decodeXmlEntities(text) {
  // Single pass on purpose: "&amp;lt;" must decode to "&lt;", not "<".
  return text.replace(/&(#x[0-9a-f]+|#\d+|[a-z]+);/gi, (reference, body) => {
    if (body[0] !== "#") {
      return XML_NAMED_ENTITIES.has(body)
        ? XML_NAMED_ENTITIES.get(body)
        : reference;
    }
    const isHex = body[1] === "x" || body[1] === "X";
    const codePoint = Number.parseInt(
      body.slice(isHex ? 2 : 1),
      isHex ? 16 : 10
    );
    // String.fromCodePoint throws above U+10FFFF; keep such references as-is.
    return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : reference;
  });
}

/**
 * Extracts URLs from an XML sitemap.
 *
 * Only un-prefixed `<loc>` elements are considered, so extension elements such
 * as `<image:loc>` are ignored. Values may be plain text (entity references
 * are decoded) or wrapped in a CDATA section (taken verbatim). Surrounding
 * whitespace is trimmed and empty values are skipped.
 *
 * @param {string} xml - The XML sitemap content to parse
 * @returns {string[]} - Extracted URLs in document order; empty for non-string input
 */
function extractXmlSitemapUrls(xml) {
  if (typeof xml !== "string") {
    return [];
  }

  // Group 1: CDATA payload. Group 2: plain text (no nested markup allowed).
  // A fresh literal per call keeps the stateful `lastIndex` of the /g flag local.
  const locRegex =
    /<loc(?:\s[^>]*)?>(?:\s*<!\[CDATA\[([\s\S]*?)\]\]>\s*|([^<]*))<\/loc\s*>/gi;

  const urls = [];
  let match;
  while ((match = locRegex.exec(xml)) !== null) {
    const value =
      match[1] !== undefined ? match[1] : decodeXmlEntities(match[2]);
    const url = value.trim();
    if (url) {
      urls.push(url);
    }
  }

  return urls;
}

/**
 * Compares two URLs for strict origin matching.
 *
 * Default ports are normalized by the URL parser, so `https://host:443/` and
 * `https://host/` are considered the same origin.
 *
 * @param {string} url1 - First URL to compare
 * @param {string} url2 - Second URL to compare
 * @returns {boolean} - True if protocol, hostname, and port all match; false
 *   if they differ or either value cannot be parsed as an absolute URL
 */
function isSameOrigin(url1, url2) {
  let first;
  let second;
  try {
    first = new URL(url1);
    second = new URL(url2);
  } catch (error) {
    // An unparseable URL has no origin to compare against.
    return false;
  }

  return (
    first.protocol === second.protocol &&
    first.hostname === second.hostname &&
    first.port === second.port
  );
}

/**
 * Fetches an XML sitemap and returns the same-origin URLs it lists.
 *
 * The origin used for filtering is the final URL after redirects (when the
 * HTTP client exposes it), so a sitemap requested at `example.com` that
 * redirects to `www.example.com` still yields its own pages. Each URL is
 * returned once, in first-seen order. Fetch failures are logged at "warn"
 * level and produce an empty result instead of throwing.
 *
 * @param {Object} options - Crawling options
 * @param {Object} options.config - Configuration object (passed through to the logger)
 * @param {string} options.sitemapUrl - URL of the sitemap to process
 * @param {Function} [options.log] - Logging function called as log(config, level, message)
 * @returns {Promise<string[]>} - Promise resolving to the discovered URLs
 */
async function crawlSitemap({ config, sitemapUrl, log }) {
  // Fall back to a no-op so callers may omit the logger.
  const logger = typeof log === "function" ? log : () => {};

  const discoveredUrls = [];

  logger(config, "debug", `Processing sitemap: ${sitemapUrl}`);

  // Fetch the sitemap content
  let content;
  let finalUrl = sitemapUrl;
  try {
    const response = await axios.get(sitemapUrl, {
      timeout: 30000,
      maxRedirects: 5,
    });
    content = response.data;

    // Use the final URL after redirects for origin comparison
    const responseUrl =
      response.request &&
      response.request.res &&
      response.request.res.responseUrl;
    if (responseUrl) {
      // Only report a redirect when the URL actually changed.
      if (responseUrl !== sitemapUrl) {
        logger(config, "debug", `Sitemap redirected to: ${responseUrl}`);
      }
      finalUrl = responseUrl;
    }
  } catch (error) {
    logger(
      config,
      "warn",
      `Failed to fetch sitemap ${sitemapUrl}: ${error.message}`
    );
    return discoveredUrls;
  }

  // A binary response body still carries the XML text; decode it.
  if (typeof Buffer !== "undefined" && Buffer.isBuffer(content)) {
    content = content.toString("utf8");
  }

  if (typeof content === "string") {
    const seen = new Set();

    // Keep only same-origin URLs (relative to the final URL), once each
    for (const url of extractXmlSitemapUrls(content)) {
      if (seen.has(url)) {
        continue;
      }
      seen.add(url);

      if (isSameOrigin(url, finalUrl)) {
        discoveredUrls.push(url);
      } else {
        logger(config, "debug", `Skipping cross-origin URL: ${url}`);
      }
    }
  } else {
    logger(
      config,
      "warn",
      `Sitemap ${sitemapUrl} did not return text content; no URLs extracted`
    );
  }

  logger(
    config,
    "info",
    `Discovered ${discoveredUrls.length} URL(s) from sitemap`
  );

  return discoveredUrls;
}
await import("chai"); + global.expect = expect; +}); + +describe("crawler", function () { + describe("extractXmlSitemapUrls", function () { + let extractXmlSitemapUrls; + + beforeEach(function () { + const crawler = require("./crawler"); + extractXmlSitemapUrls = crawler.extractXmlSitemapUrls; + }); + + it("should extract single URL from XML sitemap", function () { + const xml = ` + + + https://example.com/page1 + + `; + const urls = extractXmlSitemapUrls(xml); + expect(urls).to.deep.equal(["https://example.com/page1"]); + }); + + it("should extract multiple URLs from XML sitemap", function () { + const xml = ` + + + https://example.com/page1 + + + https://example.com/page2 + + + https://example.com/page3 + + `; + const urls = extractXmlSitemapUrls(xml); + expect(urls).to.deep.equal([ + "https://example.com/page1", + "https://example.com/page2", + "https://example.com/page3", + ]); + }); + + it("should handle empty string", function () { + const urls = extractXmlSitemapUrls(""); + expect(urls).to.deep.equal([]); + }); + + it("should handle non-string input", function () { + const urls = extractXmlSitemapUrls(null); + expect(urls).to.deep.equal([]); + }); + + it("should handle XML without loc tags", function () { + const xml = "test"; + const urls = extractXmlSitemapUrls(xml); + expect(urls).to.deep.equal([]); + }); + }); + + describe("isSameOrigin", function () { + let isSameOrigin; + + beforeEach(function () { + const crawler = require("./crawler"); + isSameOrigin = crawler.isSameOrigin; + }); + + it("should return true for same protocol, domain, and port", function () { + const result = isSameOrigin( + "https://example.com:443/page1", + "https://example.com:443/page2" + ); + expect(result).to.be.true; + }); + + it("should return true for same origin with default ports", function () { + const result = isSameOrigin( + "https://example.com/page1", + "https://example.com/page2" + ); + expect(result).to.be.true; + }); + + it("should return false for different 
protocol", function () { + const result = isSameOrigin( + "http://example.com/page1", + "https://example.com/page2" + ); + expect(result).to.be.false; + }); + + it("should return false for different domain", function () { + const result = isSameOrigin( + "https://example.com/page1", + "https://other.com/page2" + ); + expect(result).to.be.false; + }); + + it("should return false for different port", function () { + const result = isSameOrigin( + "https://example.com:443/page1", + "https://example.com:8080/page2" + ); + expect(result).to.be.false; + }); + + it("should return false for subdomain differences", function () { + const result = isSameOrigin( + "https://example.com/page1", + "https://subdomain.example.com/page2" + ); + expect(result).to.be.false; + }); + + it("should return false for malformed URLs", function () { + const result = isSameOrigin("not a url", "https://example.com"); + expect(result).to.be.false; + }); + + it("should handle query parameters", function () { + const result = isSameOrigin( + "https://example.com/page?foo=bar", + "https://example.com/page?baz=qux" + ); + expect(result).to.be.true; + }); + + it("should handle fragments", function () { + const result = isSameOrigin( + "https://example.com/page#section1", + "https://example.com/page#section2" + ); + expect(result).to.be.true; + }); + }); + + describe("crawlSitemap", function () { + let crawlSitemap, axiosStub, logStub; + + beforeEach(function () { + axiosStub = { + get: sinon.stub(), + }; + logStub = sinon.stub(); + + const crawlerModule = proxyquire("./crawler", { + axios: axiosStub, + }); + crawlSitemap = crawlerModule.crawlSitemap; + }); + + afterEach(function () { + sinon.restore(); + }); + + it("should process sitemap and extract same-origin URLs", async function () { + const config = { logLevel: "info" }; + const sitemapUrl = "https://example.com/sitemap.xml"; + const sitemapContent = ` + + https://example.com/page1 + https://example.com/page2 + `; + + axiosStub.get.resolves({ 
data: sitemapContent }); + + const urls = await crawlSitemap({ + config, + sitemapUrl, + log: logStub, + }); + + expect(urls).to.deep.equal([ + "https://example.com/page1", + "https://example.com/page2", + ]); + expect(axiosStub.get.calledOnce).to.be.true; + }); + + it("should filter out cross-origin URLs", async function () { + const config = { logLevel: "info" }; + const sitemapUrl = "https://example.com/sitemap.xml"; + const sitemapContent = ` + + https://example.com/page1 + https://other.com/page2 + `; + + axiosStub.get.resolves({ data: sitemapContent }); + + const urls = await crawlSitemap({ + config, + sitemapUrl, + log: logStub, + }); + + expect(urls).to.deep.equal(["https://example.com/page1"]); + }); + + it("should handle fetch errors gracefully", async function () { + const config = { logLevel: "info" }; + const sitemapUrl = "https://example.com/sitemap.xml"; + + axiosStub.get.rejects(new Error("404 Not Found")); + + const urls = await crawlSitemap({ + config, + sitemapUrl, + log: logStub, + }); + + expect(urls).to.deep.equal([]); + expect(logStub.calledWith(config, "warn")).to.be.true; + }); + + it("should handle non-string content", async function () { + const config = { logLevel: "info" }; + const sitemapUrl = "https://example.com/sitemap.xml"; + + axiosStub.get.resolves({ data: { json: "object" } }); + + const urls = await crawlSitemap({ + config, + sitemapUrl, + log: logStub, + }); + + expect(urls).to.deep.equal([]); + }); + }); +}); diff --git a/src/utils.js b/src/utils.js index 8b0a61c..39bcd78 100644 --- a/src/utils.js +++ b/src/utils.js @@ -11,6 +11,7 @@ const { transformToSchemaKey, readFile, } = require("doc-detective-common"); +const { crawlSitemap } = require("./crawler"); exports.qualifyFiles = qualifyFiles; exports.parseTests = parseTests; @@ -153,7 +154,11 @@ async function fetchFile(fileURL) { } else { response.data = response.data.toString(); } - const fileName = fileURL.split("/").pop(); + let fileName = fileURL.split("/").pop(); + // 
If fileName doesn't have an extension, add ".html" + if (!path.extname(fileName)) { + fileName += ".html"; + } const hash = crypto.createHash("md5").update(response.data).digest("hex"); const filePath = `${os.tmpdir}/doc-detective/${hash}_${fileName}`; // If doc-detective temp directory doesn't exist, create it @@ -184,6 +189,52 @@ async function qualifyFiles({ config }) { const cleanup = config.afterAll; if (cleanup) sequence = sequence.concat(cleanup); + // Collect sitemap.xml URLs that should be crawled + const sitemapsToProcess = []; + for (const source of sequence) { + const isHttpUrl = + typeof source === "string" && + (source.startsWith("http://") || source.startsWith("https://")); + + const isSitemapUrl = typeof source === "string" && source.endsWith("sitemap.xml"); + + if (isHttpUrl && isSitemapUrl) { + // Check if crawling is enabled (defaults to false in config) + if (config.crawl === true) { + sitemapsToProcess.push(source); + } + } + } + + // Process sitemaps if there are any to crawl + if (sitemapsToProcess.length > 0) { + log(config, "info", `Processing ${sitemapsToProcess.length} sitemap(s)...`); + try { + const allDiscoveredUrls = []; + + // Process each sitemap + for (const sitemapUrl of sitemapsToProcess) { + const discoveredUrls = await crawlSitemap({ + config, + sitemapUrl, + log, + }); + allDiscoveredUrls.push(...discoveredUrls); + } + + // Add newly discovered URLs to the sequence + // Filter out URLs that were already in the initial sequence + const newUrls = allDiscoveredUrls.filter((url) => !sequence.includes(url)); + log(config, "info", `Discovered ${newUrls.length} additional URL(s) from sitemap(s)`); + + // Add new URLs after the input section but before cleanup + const cleanupStartIndex = cleanup ? 
sequence.indexOf(cleanup[0]) : sequence.length; + sequence.splice(cleanupStartIndex, 0, ...newUrls); + } catch (error) { + log(config, "error", `Sitemap processing failed: ${error.message}`); + } + } + for (let source of sequence) { log(config, "debug", `source: ${source}`); // Check if source is a URL