Skip to content
Draft
3 changes: 2 additions & 1 deletion dev/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@ main();
*/
async function main() {
const json = {
input: "dev/doc-content.dita",
input: "https://www.doc-detective.com/sitemap.xml",
crawl: true,
logLevel: "debug",
runOn: [
{
Expand Down
40 changes: 40 additions & 0 deletions src/config.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -502,3 +502,43 @@ describe("resolveConcurrentRunners", function () {
expect(result.concurrentRunners).to.equal(4);
});
});

// Verifies that setConfig keeps the `crawl` flag intact through validation
// and defaults it to false when callers leave it out.
describe("crawl config field", function () {
  // Builds a minimal input config; `crawl` is only set when a value is given.
  const buildConfig = (crawl) => {
    const config = {
      input: ["https://example.com"],
      logLevel: "info",
      fileTypes: ["markdown"],
    };
    if (crawl !== undefined) {
      config.crawl = crawl;
    }
    return config;
  };

  it("should preserve crawl field through validation", async function () {
    const result = await setConfig({ config: buildConfig(true) });
    expect(result.crawl).to.equal(true);
  });

  it("should handle crawl field set to false", async function () {
    const result = await setConfig({ config: buildConfig(false) });
    expect(result.crawl).to.equal(false);
  });

  it("should default crawl field to false when not specified", async function () {
    const result = await setConfig({ config: buildConfig() });
    expect(result.crawl).to.equal(false);
  });
});
190 changes: 190 additions & 0 deletions src/crawler.integration.test.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,190 @@
const assert = require("assert");
const sinon = require("sinon");
const proxyquire = require("proxyquire");

// chai is ESM-only in recent versions, so load `expect` dynamically once
// before any suite runs and expose it globally for every test in this file.
before(async function () {
  const chai = await import("chai");
  global.expect = chai.expect;
});

// Integration tests for sitemap handling in qualifyFiles(). The utils module
// is loaded through proxyquire so that HTTP (axios), the filesystem (fs),
// the crawler module, and doc-detective-common are all replaced with stubs —
// no real network or disk access happens in these tests.
describe("crawler integration", function () {
  let qualifyFiles, axiosStub, fsStub, crawlSitemapStub, readFileStub;

  beforeEach(function () {
    // Stubbed HTTP client; per-URL behavior is configured below.
    axiosStub = {
      get: sinon.stub(),
    };

    // Stubbed fs surface covering the calls qualifyFiles is expected to make.
    fsStub = {
      statSync: sinon.stub(),
      readdirSync: sinon.stub(),
      existsSync: sinon.stub(),
      mkdirSync: sinon.stub(),
      writeFileSync: sinon.stub(),
    };

    crawlSitemapStub = sinon.stub();
    readFileStub = sinon.stub().resolves({});

    // Mock fetchFile behavior
    // URLs ending in "sitemap.xml" return a minimal two-entry sitemap;
    // every other URL returns an empty body.
    axiosStub.get.callsFake(async (url) => {
      if (url.endsWith("sitemap.xml")) {
        return {
          data: `<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<url><loc>https://example.com/page1</loc></url>
<url><loc>https://example.com/page2</loc></url>
</urlset>`,
        };
      }
      return { data: "" };
    });

    // Rewire ./utils against the stubs above. Validation is short-circuited
    // to always pass so only the crawl wiring is exercised.
    const utilsModule = proxyquire("./utils", {
      axios: axiosStub,
      fs: fsStub,
      "./crawler": { crawlSitemap: crawlSitemapStub },
      "doc-detective-common": {
        validate: () => ({ valid: true }),
        resolvePaths: (x) => x,
        transformToSchemaKey: (x) => x,
        readFile: readFileStub,
      },
    });

    qualifyFiles = utilsModule.qualifyFiles;
  });

  afterEach(function () {
    // Reset all sinon fakes so stub state never leaks between tests.
    sinon.restore();
  });

  it("should process sitemap.xml URLs when crawl is true", async function () {
    const config = {
      input: ["https://example.com/sitemap.xml"],
      crawl: true,
      logLevel: "info",
      fileTypes: [],
    };

    crawlSitemapStub.resolves([
      "https://example.com/page1",
      "https://example.com/page2",
    ]);

    // Mock file system calls for fetched files
    fsStub.existsSync.returns(true);
    fsStub.statSync.returns({ isFile: () => true, isDirectory: () => false });

    await qualifyFiles({ config });

    // The crawler must be invoked exactly once, with the original sitemap URL.
    expect(crawlSitemapStub.calledOnce).to.be.true;
    expect(crawlSitemapStub.firstCall.args[0].sitemapUrl).to.equal("https://example.com/sitemap.xml");
  });

  it("should not process non-sitemap URLs", async function () {
    // `crawl` is omitted here; a plain HTML URL must never reach the crawler.
    const config = {
      input: ["https://example.com/page.html"],
      logLevel: "info",
      fileTypes: [],
    };

    // Mock file system calls for fetched files
    fsStub.existsSync.returns(true);
    fsStub.statSync.returns({ isFile: () => true, isDirectory: () => false });

    await qualifyFiles({ config });

    expect(crawlSitemapStub.called).to.be.false;
  });

  it("should disable processing when crawl is false", async function () {
    // Even a sitemap URL must be left alone when crawl is explicitly false.
    const config = {
      input: ["https://example.com/sitemap.xml"],
      crawl: false,
      logLevel: "info",
      fileTypes: [],
    };

    // Mock file system calls for fetched files
    fsStub.existsSync.returns(true);
    fsStub.statSync.returns({ isFile: () => true, isDirectory: () => false });

    await qualifyFiles({ config });

    expect(crawlSitemapStub.called).to.be.false;
  });

  it("should enable processing when crawl is true", async function () {
    const config = {
      input: ["https://example.com/sitemap.xml"],
      crawl: true,
      logLevel: "info",
      fileTypes: [],
    };

    crawlSitemapStub.resolves([
      "https://example.com/page1",
      "https://example.com/page2",
    ]);

    // Mock file system calls for fetched files
    fsStub.existsSync.returns(true);
    fsStub.statSync.returns({ isFile: () => true, isDirectory: () => false });

    await qualifyFiles({ config });

    expect(crawlSitemapStub.calledOnce).to.be.true;
  });

  it("should not process file:// URLs", async function () {
    // NOTE(review): input is empty here, so this actually exercises the
    // no-input path rather than a real file:// URL — consider adding one.
    const config = {
      input: [],
      logLevel: "info",
      fileTypes: [],
    };

    await qualifyFiles({ config });

    expect(crawlSitemapStub.called).to.be.false;
  });

  it("should log sitemap processing activity", async function () {
    const config = {
      input: ["https://example.com/sitemap.xml"],
      crawl: true,
      logLevel: "info",
      fileTypes: [],
    };

    crawlSitemapStub.resolves([
      "https://example.com/page1",
      "https://example.com/page2",
    ]);

    // Mock file system calls for fetched files
    fsStub.existsSync.returns(true);
    fsStub.statSync.returns({ isFile: () => true, isDirectory: () => false });

    // Capture console output
    // console.log is wrapped (not replaced) so output still reaches the
    // terminal while being recorded for the assertions below.
    const originalConsoleLog = console.log;
    const logOutput = [];
    console.log = (...args) => {
      logOutput.push(args.join(" "));
      originalConsoleLog(...args);
    };

    try {
      await qualifyFiles({ config });

      // Check that processing info was logged
      const hasProcessingLog = logOutput.some((msg) => msg.includes("Processing") && msg.includes("sitemap"));
      const hasDiscoveredLog = logOutput.some((msg) => msg.includes("Discovered"));

      expect(hasProcessingLog).to.be.true;
      expect(hasDiscoveredLog).to.be.true;
    } finally {
      // Always restore console.log, even if an assertion above throws.
      console.log = originalConsoleLog;
    }
  });
});
111 changes: 111 additions & 0 deletions src/crawler.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
const axios = require("axios");

exports.extractXmlSitemapUrls = extractXmlSitemapUrls;
exports.isSameOrigin = isSameOrigin;
exports.crawlSitemap = crawlSitemap;

/**
* Extracts URLs from XML sitemap.
*
* @param {string} xml - The XML sitemap content to parse
* @returns {string[]} - Array of extracted URLs
*/
/**
 * Extracts URLs from XML sitemap content.
 *
 * Per the sitemap protocol, <loc> values are XML-entity-escaped (e.g. "&"
 * in a query string appears as "&amp;"), so predefined entities and numeric
 * character references are decoded before the URLs are returned.
 *
 * @param {string} xml - The XML sitemap content to parse
 * @returns {string[]} - Array of extracted, entity-decoded URLs
 */
function extractXmlSitemapUrls(xml) {
  // Non-string input (null, Buffer, parsed object) yields no URLs.
  if (typeof xml !== "string") {
    return [];
  }

  const urls = [];
  // Match <loc> tags in XML sitemaps. A fresh regex literal per call avoids
  // stateful `lastIndex` leaking between invocations of a shared /g regex.
  const locRegex = /<loc>([^<]+)<\/loc>/gi;
  let match;

  while ((match = locRegex.exec(xml)) !== null) {
    const url = decodeXmlEntities(match[1].trim());
    if (url) {
      urls.push(url);
    }
  }

  return urls;
}

/**
 * Decodes the five predefined XML entities and numeric character references
 * that may appear inside sitemap <loc> values.
 *
 * @param {string} text - XML-escaped text
 * @returns {string} - Decoded text
 */
function decodeXmlEntities(text) {
  return text
    .replace(/&#x([0-9a-f]+);/gi, (_, hex) => String.fromCodePoint(Number.parseInt(hex, 16)))
    .replace(/&#(\d+);/g, (_, dec) => String.fromCodePoint(Number.parseInt(dec, 10)))
    .replace(/&lt;/g, "<")
    .replace(/&gt;/g, ">")
    .replace(/&quot;/g, '"')
    .replace(/&apos;/g, "'")
    // &amp; is decoded last so double-escaped input stays singly-escaped.
    .replace(/&amp;/g, "&");
}

/**
* Compares two URLs for strict origin matching.
*
* @param {string} url1 - First URL to compare
* @param {string} url2 - Second URL to compare
* @returns {boolean} - True if origins match strictly (protocol, hostname, and port)
*/
/**
 * Determines whether two URLs share the same origin.
 *
 * Origins match only when protocol, hostname, and port are all equal.
 *
 * @param {string} url1 - First URL to compare
 * @param {string} url2 - Second URL to compare
 * @returns {boolean} - True when both URLs parse and share an origin
 */
function isSameOrigin(url1, url2) {
  let first;
  let second;
  try {
    first = new URL(url1);
    second = new URL(url2);
  } catch (error) {
    // Unparseable URLs can never share an origin.
    return false;
  }

  // Guard clauses: bail out on the first mismatched component.
  if (first.protocol !== second.protocol) {
    return false;
  }
  if (first.hostname !== second.hostname) {
    return false;
  }
  return first.port === second.port;
}

/**
* Processes an XML sitemap and extracts all URLs.
*
* @param {Object} options - Crawling options
* @param {Object} options.config - Configuration object
* @param {string} options.sitemapUrl - URL of the sitemap to process
* @param {Function} options.log - Logging function (optional)
* @returns {Promise<string[]>} - Promise resolving to array of all discovered URLs
*/
/**
 * Fetches an XML sitemap and returns every same-origin URL it lists.
 *
 * @param {Object} options - Crawling options
 * @param {Object} options.config - Configuration object
 * @param {string} options.sitemapUrl - URL of the sitemap to process
 * @param {Function} options.log - Logging function (optional)
 * @returns {Promise<string[]>} - Promise resolving to array of all discovered URLs
 */
async function crawlSitemap({ config, sitemapUrl, log }) {
  // Fall back to a no-op so callers may omit the logger entirely.
  const logger = log || (() => {});

  logger(config, "debug", `Processing sitemap: ${sitemapUrl}`);

  // Fetch the sitemap; on failure, report and return no URLs.
  let content;
  let finalUrl = sitemapUrl;
  try {
    const response = await axios.get(sitemapUrl, {
      timeout: 30000,
      maxRedirects: 5,
    });
    content = response.data;

    // Origin checks must use the post-redirect URL, not the one requested.
    const redirectedUrl = response.request?.res?.responseUrl;
    if (redirectedUrl) {
      finalUrl = redirectedUrl;
      logger(config, "debug", `Sitemap redirected to: ${finalUrl}`);
    }
  } catch (error) {
    logger(config, "warn", `Failed to fetch sitemap ${sitemapUrl}: ${error.message}`);
    return [];
  }

  // Keep only same-origin URLs from the sitemap body (string bodies only).
  const discoveredUrls = [];
  if (typeof content === "string") {
    for (const url of extractXmlSitemapUrls(content)) {
      if (!isSameOrigin(url, finalUrl)) {
        logger(config, "debug", `Skipping cross-origin URL: ${url}`);
        continue;
      }
      discoveredUrls.push(url);
    }
  }

  logger(config, "info", `Discovered ${discoveredUrls.length} URL(s) from sitemap`);

  return discoveredUrls;
}
Loading